import os
import io
import yaml
import logging
import time
from datetime import timedelta, datetime
from importlib import import_module

from memorious import settings, signals
from memorious.core import session, local_queue
from memorious.model import Tag, Event, Result
from memorious.reporting import get_last_run, is_running, cleanup_crawler
from memorious.logic.context import handle
from memorious.logic.stage import CrawlerStage

# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)


class Crawler(object):
    """A processing graph that constitutes a crawler.

    A crawler is described by a YAML file. The file is read once, at
    construction time, and its top-level keys are exposed as attributes.
    The ``pipeline`` mapping is turned into one ``CrawlerStage`` per entry,
    stored in ``self.stages`` under the stage name.
    """

    # Maps the value of the ``schedule`` config key to the minimum interval
    # between two runs. Any other value (or no value) means "not scheduled".
    SCHEDULES = {
        'daily': timedelta(days=1),
        'weekly': timedelta(weeks=1),
        'monthly': timedelta(weeks=4)
    }

    def __init__(self, manager, source_file):
        """Load the crawler definition from ``source_file``.

        :param manager: the object that owns this crawler; only stored.
        :param source_file: path to the UTF-8 encoded YAML definition.
        """
        self.manager = manager
        self.source_file = source_file
        with io.open(source_file, encoding='utf-8') as fh:
            self.config_yaml = fh.read()

        # NOTE: ``yaml.load`` without an explicit Loader can construct
        # arbitrary Python objects and raises TypeError on PyYAML >= 6.
        # ``safe_load`` only builds plain YAML types (mappings, lists,
        # scalars), which is all that is read from the config below.
        # An empty file parses to None, so fall back to an empty mapping.
        self.config = yaml.safe_load(self.config_yaml) or {}

        # The file name is only the fallback; an explicit ``name`` wins.
        self.name = os.path.basename(source_file)
        self.name = self.config.get('name', self.name)
        self.description = self.config.get('description', self.name)
        self.category = self.config.get('category', 'scrape')
        self.schedule = self.config.get('schedule')
        self.disabled = self.config.get('disabled', False)
        self.init_stage = self.config.get('init', 'init')
        # None when the schedule is missing or not one of SCHEDULES.
        self.delta = Crawler.SCHEDULES.get(self.schedule)
        self.delay = int(self.config.get('delay', 0))
        self.expire = int(self.config.get('expire', settings.EXPIRE))
        self.stealthy = self.config.get('stealthy', False)
        # ``or {}`` also covers keys that are present but left empty (null).
        self.cleanup_config = self.config.get('cleanup') or {}

        self.stages = {}
        pipeline = self.config.get('pipeline') or {}
        for name, stage in pipeline.items():
            self.stages[name] = CrawlerStage(self, name, stage)

    def check_due(self):
        """Check if the last execution of this crawler is older than
        the scheduled interval.

        :returns: False for disabled or unscheduled crawlers; True if the
            crawler has no recorded last run or the last run is older than
            the scheduled interval; False otherwise.
        """
        if self.disabled or self.delta is None:
            return False
        last_run = get_last_run(self)
        if last_run is None:
            return True
        # NOTE(review): compared against a naive UTC "now", so ``last_run``
        # is presumably a naive UTC datetime as well -- confirm.
        return datetime.utcnow() > last_run + self.delta

    def flush(self):
        """Delete all run-time data generated by this crawler.

        Removes the tags, events and results stored under this crawler's
        name, commits the session and then sends the ``crawler_flush``
        signal with this crawler as the sender.
        """
        Tag.delete(self.name)
        Event.delete(self.name)
        Result.delete(self.name)
        session.commit()
        signals.crawler_flush.send(self)

    def run(self, incremental=None):
        """Queue the execution of a particular crawler.

        :param incremental: overrides ``settings.INCREMENTAL`` for this run
            when it is not None.
        :raises AttributeError: if the configured init stage is not part of
            the pipeline (``self.get`` returns None in that case).
        """
        state = {
            'crawler': self.name,
            'incremental': settings.INCREMENTAL
        }
        if incremental is not None:
            state['incremental'] = incremental
        stage = self.get(self.init_stage)
        handle.delay(state, stage.name, {})

        # If running in eager mode, we need to block until all the queued
        # tasks are finished.
        while not local_queue.is_empty:
            time.sleep(1)

    def replay(self, stage):
        """Re-run all tasks issued to a particular stage.

        This sort of requires a degree of idempotence for each operation.
        Usually used to re-parse a set of crawled documents.

        :param stage: name of the stage whose stored results are re-queued.
        """
        query = Result.by_crawler_next_stage(self.name, stage)
        for result in query:
            state = {'crawler': self.name}
            handle.delay(state, stage, result.data)

    @property
    def cleanup_method(self):
        """The callable named by the ``method`` key of the cleanup config.

        The name is looked up in ``memorious.helpers.export``.

        :returns: the callable, or None when no cleanup is configured or
            the cleanup section does not name a method.
        :raises AttributeError: if the export module has no such attribute.
        """
        method = self.cleanup_config.get('method')
        if not method:
            # Reading a property should not blow up with a KeyError just
            # because the cleanup section is empty or incomplete.
            return None
        module = import_module('memorious.helpers.export')
        return getattr(module, method)

    def _should_cleanup(self):
        """Decide whether ``cleanup`` has any work to do right now."""
        # Run cleanup if the crawler has finished running
        if not is_running(self):
            return True
        # Run cleanup if the last operation of the crawler was more than half
        # a day ago and it's just hanging in running state since then.
        last_run = get_last_run(self)
        if last_run is None:
            return False
        return datetime.utcnow() > last_run + timedelta(hours=12)

    def cleanup(self):
        """Run a cleanup method after the crawler finishes running.

        Does nothing while the crawler is running and was active within the
        last 12 hours. Otherwise ``cleanup_crawler`` is called and, if a
        cleanup method is configured, it is invoked with the ``params``
        value of the cleanup config.

        :raises KeyError: if a cleanup method is configured without
            ``params``.
        """
        if not self._should_cleanup():
            return

        cleanup_crawler(self)
        # Resolve the property once; every access re-imports the module.
        method = self.cleanup_method
        if not method:
            if self.cleanup_config:
                log.warning("Cleanup configured for %s without a method",
                            self.name)
            return
        log.info("Running clean up for %s", self.name)
        method(self.cleanup_config["params"])

    def get(self, name):
        """Return the stage called ``name``, or None if there is none."""
        return self.stages.get(name)

    def __iter__(self):
        """Iterate over the stages of this crawler."""
        return iter(self.stages.values())

    def __repr__(self):
        return '<Crawler(%s)>' % self.name
