#!python

# asdf3

import argparse
import datetime
import json
import time
import sys
import nudecrawler 
from nudecrawler import Page, Unbuffered

import transliterate.discover 
from transliterate.base import TranslitLanguagePack, registry

transliterate.discover.autodiscover()

# Threshold handed to Page by analyse(); main() replaces it with --threshold.
nudity_threshold = 0.5

# Run statistics.  save_stats() dumps this dict as JSON, and main() reloads
# it from the stats file when --resume is given.
stats = {
    'cmd': None,
    'uptime': 0,
    'urls': 0,
    'words': 0,
    'word': None,
    'url': None,
    'now': None,
    'interesting': 0,
    'resume': {},
}

# Path of the statistics file (None until main() sets it).
stats_file = None
# Minimum number of seconds between two statistics writes.
stats_period = 60

# Earliest time at which save_stats() will write again.
stats_next_write = time.time() + stats_period

# Start time of the process, used to compute stats['uptime'].
started = time.time()

# File that analyse() appends interesting pages to (None: no log file).
logfile = None

class TgRuLanguagePack(TranslitLanguagePack):
    """Russian <-> Latin transliteration pack registered under the code "tgru".

    check_word() in this file applies it with ``reversed=True`` to turn a
    Cyrillic word into the Latin spelling used in a telegra.ph URL.
    """

    language_code = "tgru"
    language_name = "tgru"

    # Character-for-character table: Latin characters on the first line, the
    # Cyrillic character at the same index on the second line.
    mapping = (
        "abvgdezijklmnoprstufhcC'y'ABVGDEZIJKLMNOPRSTUFH'Y'",
        "абвгдезийклмнопрстуфхцЦъыьАБВГДЕЗИЙКЛМНОПРСТУФХЪЫЬ",
    )

    # Extra table for the reversed (Cyrillic -> Latin) direction; same
    # index-to-index layout as ``mapping``.
    reversed_specific_mapping = (
        "эЭъьЪЬ",
        "eE''''"
    )

    # Multi-letter Latin sequences and the single Cyrillic letter each one
    # stands for.  Entry order is kept exactly as it was.
    pre_processor_mapping = {
        "zh": "ж",
        "yo": "ё",
        "ts": "ц",
        "ch": "ч",
        "sh": "ш",
        "sch": "щ",
        "yu": "ю",
        "ya": "я",
        "Yo": "Ё",
        "Zh": "Ж",
        "Ts": "Ц",
        "Ch": "Ч",
        "Sh": "Ш",
        "Sch": "Щ",
        "Yu": "Ю",
        # Was "Ja": every other entry spells this sound with "y" ("ya", "Yu",
        # "Yo"), so a capital "Я" was the only letter rendered with "J".
        "Ya": "Я",
    }

# Register the pack so that transliterate.translit(..., 'tgru') can use it.
registry.register(TgRuLanguagePack)


# Run-time settings read by analyse(); main() overwrites all four from the
# parsed command line (--nude, --video, --verbose, --all).
# nude is passed to Page as neednnudes.
nude = 1
# video is passed to Page as neednvideo.
video = 1
verbose = False
# all_found is passed to Page as all_found.
all_found = True

def get_args(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse.  ``None`` (the
            default) parses the process command line, exactly as before.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(description='Telegra.ph Spider')
    parser.add_argument('words', nargs='*')
    parser.add_argument('-d', '--days', type=int, default=30)
    parser.add_argument('--nude', type=int, default=1, help='Interesting if N nude images')
    parser.add_argument('--video', type=int, default=1, help='Interesting if N video')
    parser.add_argument('-u', '--url', help='process one url')
    parser.add_argument('-a', '--all', default=False, action='store_true', help='do not detect, print all found pages')
    parser.add_argument('-f', '--fails', type=int, default=0, help='stop searching next pages with same words after N failures')
    parser.add_argument('-t', '--threshold', type=float, default=0.2, help='nudity threshold (0..1), 0 will match almost everything')
    parser.add_argument('--day', nargs=2, type=int, metavar=('MONTH', 'DAY'), help='Current date (default is today) example: --day 12 31')

    g = parser.add_argument_group('Output options')
    g.add_argument('-v', '--verbose', default=False, action='store_true', help='verbose')
    g.add_argument('--unbuffered', '-b', default=False, action='store_true', help='Use unbuffered stdout')
    g.add_argument('--urls', default=False, action='store_true', help='Do not check, just generate and print URLs')
    g.add_argument('--log', help='print all precious treasures to this logfile')

    g = parser.add_argument_group('list-related options')
    g.add_argument('-w', '--wordlist', help='wordlist (urllist) file')
    g.add_argument('--stats', default='/tmp/nudecrawler-stats.txt', help='periodical statistics file')
    g.add_argument('--resume', default=False, action='store_true', help='skip all words before WORD in list, resume starting from it')

    # parse_args(None) falls back to sys.argv[1:], so existing callers that
    # pass nothing are unaffected.
    return parser.parse_args(argv)




def analyse(url):
    """Load one URL as a Page, check it and report it when interesting.

    Every call increments ``stats['urls']``.  When the page has no error it is
    checked with ``check_all()``; if its status then starts with
    ``INTERESTING`` the hit is counted in ``stats['interesting']``, appended
    to ``logfile`` (when one is configured) and printed to stdout.

    Args:
        url: Address of the page to analyse.

    Returns:
        The Page object; callers inspect its ``error`` attribute.
    """
    page = Page(
        url,
        neednnudes=nude,
        neednvideo=video,
        all_found=all_found,
        threshold=nudity_threshold,
    )
    stats['urls'] += 1

    if not page.error:
        page.check_all()
        if page.status().startswith('INTERESTING'):
            stats['interesting'] += 1
            if logfile:
                with open(logfile, "a") as log:
                    print(page, file=log)
            print(page)

    return page


def save_stats():
    """Dump ``stats`` to ``stats_file`` as JSON, at most once per period.

    Nothing is written when no stats file is configured or when
    ``stats_next_write`` has not been reached yet.  After a successful write
    the next write is scheduled ``stats_period`` seconds later.
    """
    global stats_next_write

    # stats_file stays None unless main() was given a wordlist; without this
    # guard open(None, "w") raises TypeError as soon as the first period ends.
    if stats_file is None:
        return

    now = time.time()
    if now <= stats_next_write:
        return

    stats['now'] = datetime.datetime.now().strftime("%m/%d/%Y %H:%M:%S")
    stats['uptime'] = int(now - started)

    with open(stats_file, "w") as fh:
        json.dump(stats, fh, indent=4)
    stats_next_write = time.time() + stats_period


def check_word(word, day, fails, print_urls=False, resumecount=None):
    """Probe the pages for one word on one date.

    The word has spaces turned into dashes and the letters 'ь'/'ъ' removed.
    Unless it already starts with ``https://`` it is transliterated with the
    'tgru' pack and appended to ``https://telegra.ph/``.  The dated URL
    ``<base>-MM-DD`` is analysed first, then ``<base>-MM-DD-2``, ``-3``, ...
    until ``fails`` consecutive pages report an error.  Progress is recorded
    in ``stats`` and ``save_stats()`` is called after every analysed page.

    Args:
        word: Search word, or a full ``https://`` base URL.
        day: Object whose ``month`` and ``day`` attributes form the suffix.
        fails: Number of consecutive errors that ends the numbered probing.
        print_urls: When true, only print the dated URL and return.
        resumecount: When truthy, skip the dated URL and start the numbered
            probing at this counter.
    """
    word = word.replace(' ', '-').translate({ord('ь'): '', ord('ъ'): ''})

    if word.startswith("https://"):
        baseurl = word
    else:
        latin = transliterate.translit(word, 'tgru', reversed=True)
        baseurl = f'https://telegra.ph/{latin}'

    stats['word'] = word
    stats['words'] += 1

    dated_url = f'{baseurl}-{day.month:02}-{day.day:02}'
    stats['url'] = dated_url
    stats['resume'].update(month=day.month, day=day.day, count=None)

    if print_urls:
        print(dated_url)
        return

    if resumecount:
        counter = resumecount
        print(f"Resume from word {word} count {counter}")
    else:
        page = analyse(dated_url)
        save_stats()
        if page.error:
            return
        counter = 2

    consecutive_errors = 0
    while consecutive_errors < fails:
        numbered_url = f'{dated_url}-{counter}'
        stats['url'] = numbered_url
        page = analyse(numbered_url)
        save_stats()
        consecutive_errors = consecutive_errors + 1 if page.error else 0
        counter += 1
        # The stored counter is the next one to try after a resume.
        stats['resume']['count'] = counter


def main():
    """Parse the command line and run the crawl.

    With ``--url`` a single page is analysed and the function returns.
    Otherwise words come from ``--wordlist`` and/or the positional arguments
    (positional words win when both are given), and each word is checked for
    ``--days`` consecutive dates going backwards from the start date.

    With ``--resume`` the saved statistics are loaded from the ``--stats``
    file; words before the saved one are skipped and the saved word continues
    from its saved month, day and page counter.

    Exits with status 1 when no words are given, when the stats file needed
    for ``--resume`` cannot be read, or when the saved word is not in the
    word list.
    """
    global nude, video, verbose, all_found, nudity_threshold, stats_file, stats, logfile

    words = None
    args = get_args()

    nude = args.nude
    video = args.video
    verbose = args.verbose
    all_found = args.all
    nudity_threshold = args.threshold
    matched_resume = False
    skipped_words = 0

    # While fastforward is set we are still seeking the saved word/day/count.
    fastforward = False

    nudecrawler.verbose.verbose = verbose

    stats['cmd'] = ' '.join(sys.argv)

    if args.unbuffered:
        sys.stdout = Unbuffered(sys.stdout)

    if args.url:
        p = analyse(args.url)
        if p.error:
            print("ERROR")
        return

    if args.wordlist:
        stats_file = args.stats
        with open(args.wordlist) as fh:
            # Skip blank lines: an empty word would be turned into the
            # meaningless URL "https://telegra.ph/-MM-DD".
            words = [line.rstrip() for line in fh if line.strip()]

    if args.words:
        words = args.words

    if not words:
        print("Need either -u URL or words like 'nude' or -w wordlist.txt")
        sys.exit(1)

    logfile = args.log

    if args.resume:
        if not args.stats:
            print("--resume work only with --stats")
            sys.exit(1)

        # A missing or corrupt stats file is a usage error, not a crash.
        try:
            with open(args.stats) as fh:
                stats = json.load(fh)
        except (OSError, ValueError) as exc:
            print(f"Cannot resume from {args.stats}: {exc}")
            sys.exit(1)
        fastforward = True

    for w in words:
        if fastforward and not matched_resume:
            if w == stats['resume']['word']:
                matched_resume = True
            else:
                skipped_words += 1
                continue

        stats['resume']['word'] = w

        # The year is a fixed placeholder: check_word() only reads month and
        # day.  2020 is a leap year, so 02-29 can be represented.
        if fastforward:
            day = datetime.datetime(2020, stats['resume']['month'], stats['resume']['day'])
        elif args.day is None:
            day = datetime.datetime.now()
        else:
            day = datetime.datetime(2020, args.day[0], args.day[1])

        days_tried = 0
        while days_tried < args.days:
            # Only the very first date after a resume continues from the
            # saved page counter.
            if fastforward:
                resumecount = stats['resume']['count']
            else:
                resumecount = None
            # stop fastforward
            fastforward = False
            check_word(w, day, args.fails, print_urls=args.urls, resumecount=resumecount)

            days_tried += 1
            day = day - datetime.timedelta(days=1)

    if args.resume and not matched_resume:
        # Every word was skipped; say so instead of finishing silently.
        print(f"Resume word {stats['resume']['word']!r} not found in word list ({skipped_words} words skipped)")
        sys.exit(1)

if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt as interrupt:
        # Ctrl-C ends the run without a traceback.
        print(interrupt)