#!python

import argparse
import datetime
import json
import time
import sys
import os
import shlex
import shutil
import subprocess
import requests
from dotenv import load_dotenv

import nudecrawler 
from nudecrawler import Page, Unbuffered
from nudecrawler.page import  get_processed_images, context_fields
from nudecrawler.version import version
from nudecrawler.cache import cache
from nudecrawler.verbose import printv, bugreport
import nudecrawler.tgru

import transliterate.discover 

# Let transliterate discover its language packs at import time. check_word()
# translates with the 'tgru' language code; presumably that pack is the one
# provided by the nudecrawler.tgru import above -- TODO confirm.
transliterate.discover.autodiscover()

# Run-wide state. save_stats() dumps this dict as JSON and load_stats() reads
# it back for --resume, so every value has to stay JSON-serialisable.
stats = {
    'cmd': None,                    # command line of the run (stored by main())
    # page/image filter settings handed to every Page in analyse()
    'filter': dict(
        expr='True',
        min_image_size=None,
        min_total_images=0,
        min_content_length=None,
        max_pictures=None,
        image_extensions=None,
        max_errors=None,
    ),
    'uptime': 0,                    # seconds since start, refreshed on save
    'urls': 0,                      # number of URLs passed to analyse()
    'words': 0,                     # number of check_word() calls
    'word': None,                   # word currently being processed
    # most recently analysed page
    'last': dict(url=None, status=None, detailed=None),
    # most recent page whose status started with INTERESTING
    'last_interesting': dict(url=None, status=None, detailed=None),
    'now': None,                    # timestamp of the last stats write
    'processed_images': 0,
    'ignored_pages': 0,
    'found_interesting_pages': 0,
    'found_nude_images': 0,
    'found_new_nude_images': 0,
    'found_new_total_images': 0,
    'resume': {},                   # word / month / day / count to resume from
    'gap_max': 0,                   # longest 404 run that was followed by a non-404 page
    'gap_url': None,                # URL that ended that run
    'cache_path': None,
}

# Content length of the previously analysed page; analyse() passes it to the
# next Page as ignore_content_length and check_word() resets it per word/day.
previous_content_length = None

# Periodic statistics: save_stats() is a no-op while stats_file is None.
stats_file = None
stats_period = 60       # seconds between two stats writes

stats_next_write = time.time() + stats_period

started = time.time()   # start time, used for uptime and the final summary

# Runtime options; main() fills them in from the parsed arguments.
logfile = stop_after = stop_each = refresh = detect_image = detect_url = None

# NOTE(review): expr, nude and video are not read anywhere in the visible part
# of this file (the active expression lives in stats['filter']['expr']).
expr = 'True'

nude = video = 1
verbose = False
all_found = True

# Known values for --detect: name -> (kind, detector).
# kind "builtin" uses a ':'-prefixed detector name; kind "image" names an
# external script that main() looks up on $PATH.
filter_methods = dict(
    true=("builtin", ":true"),
    false=("builtin", ":false"),
    nudepy=("builtin", ":nude"),
    nudenetb=("builtin", ":nudenet"),
    aid=("image", "detect-image-aid.py"),
    nsfwapi=("image", "detect-image-nsfw-api.py"),
    nudenet=("image", "detect-image-nudenet.py"),
)

def get_args(argv=None):
    """Build the command-line parser and parse the arguments.

    A ``.env`` file is loaded first (``load_dotenv()``); several defaults are
    then taken from the environment: NUDE_DETECT, NUDE_CACHE, NUDE_LOG,
    NUDE_STATS, NUDE_UNBUF, NUDE_DIR and NUDE_TOTAL.

    Args:
        argv: list of argument strings to parse. ``None`` lets argparse fall
            back to ``sys.argv[1:]``; main() passes an explicit list when it
            re-parses the command line stored in a stats file.

    Returns:
        argparse.Namespace holding the parsed options.
    """

    load_dotenv()

    def_detect = os.getenv('NUDE_DETECT')
    def_cache = os.getenv('NUDE_CACHE')
    def_log = os.getenv('NUDE_LOG')
    def_stats = os.getenv('NUDE_STATS', '/tmp/nudecrawler-stats.txt')
    # any non-empty NUDE_UNBUF value (even "0") turns unbuffered output on
    def_unbuf = bool(os.getenv('NUDE_UNBUF'))

    parser = argparse.ArgumentParser(description=f'Nudecrawler: Telegra.ph Spider {version}\nhttps://github.com/yaroslaff/nudecrawler', formatter_class=argparse.RawTextHelpFormatter)

    def_expr = '(total_images>5 and new_nude_images>0) or total_video>0'
    def_workdir = os.getenv('NUDE_DIR', '.')

    def_total = int(os.getenv('NUDE_TOTAL', '1'))
    def_errors = 5
    def_minsize = 10

    methods_list = ', '.join(filter_methods.keys())

    parser.add_argument('words', nargs='*')
    parser.add_argument('-d', '--days', type=int, default=30)
    parser.add_argument('--url1', metavar="URL", help='process only one url')
    parser.add_argument('-f', '--fails', type=int, default=5, help='stop searching next pages with same words after N failures')
    parser.add_argument('--day', nargs=2, type=int, metavar=('MONTH', 'DAY'), help='Current date (default is today) example: --day 12 31')

    # These options belong to the group: they were previously attached to the
    # parser itself, which left the group empty in the --help output.
    g = parser.add_argument_group('Page filtering options')
    g.add_argument('--expr', '-e', metavar='EXPR', default=def_expr,
                   help=f'Interesting if EXPR is True. def: {def_expr!r}\nFields: ' + ' '.join(context_fields))
    g.add_argument('--total', metavar='N', type=int, default=def_total, help=f'Boring if less then N total images ({def_total})')
    g.add_argument('--max-errors', metavar='N', type=int, default=def_errors, help=f'Max allowed errors on page ({def_errors})')
    g.add_argument('--min-content-length', metavar='N', type=int, default=None, help=f'Skip page if content-length less then N (try 5000 or higher)')

    g = parser.add_argument_group('Image filtering options')
    g.add_argument('-a', '--all', default=False, action='store_true', help='do not detect, print all found pages')
    g.add_argument('--detect-image', metavar='SCRIPT', help='explicitly use this script to detect nudity on image file')
    g.add_argument('--detect-url', metavar='SCRIPT', help='explicitly use this script to detect nudity on image URL')
    g.add_argument('--detect', metavar='METHOD', default=def_detect, help=f'One of {methods_list}')
    g.add_argument('--extensions', nargs='*', default=['.jpeg', '.jpg', '.png'], help='interesting extensions (with dot, like .jpg)')
    g.add_argument('--minsize', type=int, default=def_minsize, help=f'min size of image in Kb ({def_minsize})')
    g.add_argument('--max-pictures', type=int, metavar='N', help=f'Detect only among first prefiltered N pictures')
    g.add_argument('--cache', metavar='PATH', default=def_cache, help=f'path to cache file (will create if missing)')

    g = parser.add_argument_group('Output options')
    g.add_argument('-v', '--verbose', default=False, action='store_true', help='verbose')
    g.add_argument('--unbuffered', '-b', default=def_unbuf, action='store_true', help='Use unbuffered stdout')
    g.add_argument('--urls', default=False, action='store_true', help='Do not detect, just generate and print URLs')
    g.add_argument('--log', default=def_log, help='print all precious treasures to this logfile')
    # help text used to be a copy of the --log help
    g.add_argument('--bugreport', default=False, action='store_true', help='enable bugreports (see nudecrawler.verbose)')
    g.add_argument('--workdir', default=def_workdir, help=f'Use all files (log, wordlist, cache) in this dir. def: {def_workdir}')

    g = parser.add_argument_group('list-related options')
    g.add_argument('-w', '--wordlist', help='wordlist (urllist) file')
    g.add_argument('--stats', metavar='STATS_FILE', default=def_stats, help='periodical statistics file')
    g.add_argument('--resume', metavar='STATS_FILE', help='resume from STATS_FILE (other args are not needed)')
    g.add_argument('--stop', type=int, metavar='NUM_IMAGES', help='stop (or --refresh) after N images processed (or little after)')
    g.add_argument('--refresh', metavar=('SCRIPT', 'ARG'), nargs='+', help='run this refresh script every --stop NUM_IMAGES images')

    return parser.parse_args(argv)




def analyse(url):
    """Run a full Page check for ``url`` and account for the result.

    Builds a Page from the current detector/filter settings, runs
    ``check_all()``, updates the counters in ``stats``, prints the page when
    it is INTERESTING (or always in verbose mode), appends INTERESTING pages
    to ``logfile`` and triggers periodic stats/cache saving. When the --stop
    threshold is exceeded it either runs the --refresh command and schedules
    the next threshold, or exits the process with status 2.

    Returns:
        The checked Page object.
    """
    global stop_after, previous_content_length

    filters = stats['filter']
    page = Page(
        url,
        all_found=all_found,
        detect_url=detect_url,
        detect_image=detect_image,
        ignore_content_length=previous_content_length,
        min_images_size=filters['min_image_size'],
        image_extensions=filters['image_extensions'],
        min_total_images=filters['min_total_images'],
        max_errors=filters['max_errors'],
        max_pictures=filters['max_pictures'],
        expr=filters['expr'],
        min_content_length=filters['min_content_length'],
    )

    stats['urls'] += 1

    page.check_all()

    last = stats['last']
    last['url'] = url
    last['status'] = page._status
    last['detailed'] = page._status_detailed

    stats['found_new_total_images'] += page.new_total_images
    stats['found_new_nude_images'] += page.new_nude_images

    # remembered so the next Page can be told which content length to ignore
    previous_content_length = page.content_length

    if page.status().startswith('INTERESTING'):
        stats['found_interesting_pages'] += 1
        stats['found_nude_images'] += page.nude_images

        hit = stats['last_interesting']
        hit['url'] = url
        hit['status'] = page._status
        hit['detailed'] = page._status_detailed

        if logfile:
            with open(logfile, "a") as log:
                print(page, file=log)

    if page.status().startswith("INTERESTING") or verbose:
        print(page)

    if page.status().startswith("IGNORED"):
        stats['ignored_pages'] += 1

    save_stats(force=False)

    if stats['cache_path']:
        cache.save_conditional(stats['cache_path'])

    # --stop handling: nothing to do until the image budget is exceeded
    if stop_after is None or get_processed_images() <= stop_after:
        return page

    print("Stop/refresh after processed", get_processed_images(), "images...")
    if not refresh:
        print("No --refresh, exiting with code 2")
        sys.exit(2)

    subprocess.run(refresh)
    # schedule next stop
    stop_after = get_processed_images() + stop_each

    return page


def save_stats(force=False):
    """Write ``stats`` to ``stats_file`` as indented JSON.

    Does nothing when no stats file is configured. Otherwise the file is
    rewritten when the write period has elapsed or when ``force`` is true, and
    the next write is scheduled ``stats_period`` seconds later.
    """
    global stats_next_write

    if stats_file is None:
        return

    period_elapsed = time.time() > stats_next_write
    if not (period_elapsed or force):
        return

    # refresh the volatile fields just before dumping
    stats['now'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
    stats['uptime'] = int(time.time() - started)
    stats['processed_images'] = get_processed_images()

    stats['cache'] = cache.status()

    with open(stats_file, "w") as out:
        json.dump(stats, out, indent=4)
        stats_next_write = time.time() + stats_period

    


def check_word(word, day, fails, print_urls=False, resumecount=None):
    """Probe the pages for one word on one day.

    The word has spaces replaced by '-' and the letters 'ь'/'ъ' removed.
    A word starting with "https://" is used as the base URL as is; any other
    word is transliterated with the 'tgru' language pack and appended to
    https://telegra.ph/. Pages <base>-MM-DD, <base>-MM-DD-2, <base>-MM-DD-3,
    ... are analysed until ``fails`` consecutive pages return HTTP 404.

    Args:
        word: search word or full https:// base URL.
        day: object with ``month`` and ``day`` attributes (a datetime).
        fails: number of consecutive 404 responses that ends the scan.
        print_urls: only print the first URL and return, analyse nothing.
        resumecount: when truthy, skip the unnumbered page and continue from
            this page number.
    """
    global previous_content_length

    # str.maketrans with a third argument builds a "delete these" table
    word = word.replace(' ', '-').translate(str.maketrans('', '', 'ьъ'))

    if word.startswith("https://"):
        baseurl = word
    else:
        trans_word = transliterate.translit(word, 'tgru', reversed=True)
        baseurl = f'https://telegra.ph/{trans_word}'

    stats['word'] = word
    stats['words'] += 1

    dated = f'{baseurl}-{day.month:02}-{day.day:02}'
    url = dated

    # keep the resume position current so a saved stats file can restart here
    position = stats['resume']
    position['month'] = day.month
    position['day'] = day.day
    position['count'] = resumecount

    if print_urls:
        print(url)
        return

    previous_content_length = None

    if resumecount:
        counter = resumecount
        print(f"Resume from word {word} count {counter}")
    else:
        analyse(url)
        counter = 2

    misses = 0
    while misses < fails:
        url = f'{dated}-{counter}'
        page = analyse(url)

        if page.http_code == 404:
            misses += 1
        else:
            # end of gap: remember the longest 404 run that was survived
            if misses > stats['gap_max']:
                stats['gap_max'] = misses
                stats['gap_url'] = url
            misses = 0

        counter += 1
        stats['resume']['count'] = counter


def sanity_check(args):
    """Placeholder for argument validation; currently accepts anything."""
    return None

def load_stats(path):
    """Replace the global ``stats`` with the JSON document stored at ``path``.

    Top-level keys that exist in the current ``stats`` but are missing from
    the file are copied over, so a file written by an older version still
    carries every expected key. Nested dicts are not merged.
    """
    global stats

    with open(path) as source:
        restored = json.load(source)

    for key, default in stats.items():
        if key in restored:
            continue
        restored[key] = default

    stats = restored

def abort(msg):
    """Print ``msg`` to stderr and terminate the process with exit status 1."""
    print(msg, file=sys.stderr)
    raise SystemExit(1)

def main():
    """Command-line entry point: configure the crawler and process all words.

    Steps, in order:
      1. Parse the arguments. With --resume, the saved stats are loaded and
         the command line stored in them is parsed instead.
      2. Prefix the file arguments with --workdir.
      3. Copy the options into module globals and ``stats['filter']``.
      4. Pick the detector (--detect, --detect-image, --detect-url or -a).
      5. Load the cache if its file exists.
      6. Analyse the single --url1 page, or walk every word through
         ``--days`` days, going backwards from the start day.

    Unusable input (bad stats file, unknown detector, missing detector
    script, no words) is reported on stderr and ends the process with exit
    status 1.
    """
    global verbose, all_found, stats_file, stats, logfile, \
        stop_after, stop_each, detect_image, detect_url, \
        refresh

    words = None
    args = get_args()
    sanity_check(args)

    # when fastforward, we go to specific word/day/count quickly
    fastforward = False

    if args.unbuffered:
        sys.stdout = Unbuffered(sys.stdout)

    if args.resume:
        if args.workdir:
            args.resume = os.path.join(args.workdir, args.resume)

        print("Resume from", args.resume)
        try:
            load_stats(args.resume)
        except FileNotFoundError:
            abort(f"Missing status file {args.resume}")
        except json.JSONDecodeError as e:
            abort(f"Cannot parse status file {args.resume}: {e}")

        # A stats file without a saved command line or resume position cannot
        # be continued; say so instead of failing with a traceback later.
        cmd = stats['cmd']
        if not cmd:
            abort(f"Status file {args.resume} has no saved command line, cannot resume")
        missing = [k for k in ('word', 'month', 'day', 'count') if k not in stats['resume']]
        if missing:
            abort(f"Status file {args.resume} has no resume position (missing: {', '.join(missing)})")

        # re-parse the original command line (minus the program name)
        args = get_args(shlex.split(cmd)[1:])
        fastforward = True
    else:
        stats['cmd'] = shlex.join(sys.argv)

    if args.workdir:
        for attr in ['cache', 'wordlist', 'log', 'resume', 'stats']:
            old = getattr(args, attr)
            if old is not None:
                new = os.path.join(args.workdir, old)
                setattr(args, attr, new)

    verbose = args.verbose
    all_found = args.all
    matched_resume = False
    skipped_words = 0
    stop_after = args.stop
    stop_each = args.stop
    refresh = args.refresh
    detect_url = args.detect_url
    detect_image = args.detect_image
    stats['filter']['expr'] = args.expr
    stats['filter']['min_content_length'] = args.min_content_length
    stats['filter']['max_errors'] = args.max_errors
    stats['filter']['max_pictures'] = args.max_pictures
    stats['cache_path'] = args.cache

    if args.bugreport:
        nudecrawler.verbose.send_bugreports = True

    if args.detect:
        try:
            kind, basename = filter_methods[args.detect]
        except KeyError:
            abort(f"Do not know detector {args.detect!r}, use one of known detectors: ({ ', '.join(filter_methods.keys()) }) or explicitly specify script with --detect-image or --detect-url")

        if kind in ['image', 'url']:
            if shutil.which(basename) is None:
                abort(f"Cannot find {basename}, maybe not in $PATH?")

        if kind == 'builtin':
            if basename in [':nude', ':nudenet']:
                detect_image = basename
            else:
                detect_url = basename
        elif kind == 'image':
            detect_image = basename
            print(f"# Will use script {shutil.which(basename)} for filtering images")
        elif kind == 'url':
            detect_url = basename
            print(f"# Will use script {shutil.which(basename)} for filtering images")

    # fix arguments
    if not any([detect_image, detect_url, all_found]):
        print("# No filter, using built-in :nude by default")
        detect_image = ':nude'

    nudecrawler.verbose.verbose = verbose

    if args.extensions:
        stats['filter']['image_extensions'] = args.extensions

    if args.minsize:
        # --minsize is given in Kb, the filter works in bytes
        stats['filter']['min_image_size'] = args.minsize * 1024

    if args.total:
        stats['filter']['min_total_images'] = args.total

    if stats['cache_path']:
        if os.path.exists(stats['cache_path']):
            cache.load(stats['cache_path'])
        else:
            print(f"# No cache file {stats['cache_path']}, start with empty cache")

    # processing could start here
    # --url1
    if args.url1:
        p = analyse(args.url1)
        print(p.status())
        for msg in p._log:
            print(" ", msg)
        return

    ## wordlist
    if args.wordlist:
        # periodic stats are only written for wordlist runs
        stats_file = args.stats
        with open(args.wordlist) as fh:
            # skip blank lines: an empty word would yield a bogus URL
            words = [w for w in (line.rstrip() for line in fh) if w]

    # words given on the command line take precedence over the wordlist
    if args.words:
        words = args.words

    if not words:
        abort("Need either --url1 URL or words like 'nude' or -w wordlist.txt")

    logfile = args.log

    for w in words:
        if fastforward and not matched_resume:
            # skip words until the one recorded in the stats file
            if w == stats['resume']['word']:
                matched_resume = True
            else:
                skipped_words += 1
                continue

        stats['resume']['word'] = w

        # Only month and day end up in the URLs, so the year is a dummy;
        # 2020 is a leap year, which makes Feb 29 a valid start day.
        if fastforward:
            day = datetime.datetime(2020, stats['resume']['month'], stats['resume']['day'])
        elif args.day is None:
            day = datetime.datetime.now()
        else:
            day = datetime.datetime(2020, args.day[0], args.day[1])

        days_tried = 0
        while days_tried < args.days:
            if fastforward:
                resumecount = stats['resume']['count']
            else:
                resumecount = None
            # stop fastforward
            fastforward = False
            check_word(w, day, args.fails, print_urls=args.urls, resumecount=resumecount)

            days_tried += 1
            day = day - datetime.timedelta(days=1)

    print(f"Finished {len(words)} (skipped {skipped_words}) words in {time.time() - started:.2f} seconds, found {stats['found_interesting_pages']} pages")
    # fastforward is still set only if the resume word never showed up
    if fastforward and not matched_resume:
        abort(f"Did not found word {stats['resume']['word']}")

if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt as interrupt:
        # Ctrl-C: write the stats file one last time so the current position
        # is on disk for a later --resume.
        print("KEYBOARD INTERRUPT")
        print(interrupt)
        save_stats(force=True)
        