#!/usr/bin/env python

import sys
import os
import getopt
import logging
import string
import stat

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from esgcet.publish import extractFromDataset, aggregateVariables, publishDataset, filelistIterator, fnmatchIterator, fnIterator, directoryIterator, progressCallback, StopEvent, generateThredds, readDatasetMap, datasetMapIterator
from esgcet.config import loadConfig, getHandler, getHandlerByName, initLogging, registerHandlers, CFHandler
from esgcet.exceptions import *
from esgcet.messaging import debug, info, warning, error, critical, exception

# Help text printed for -h/--help and when option parsing fails.
# Backslashes are escaped explicitly because this is not a raw string.
usage = """Usage:
    esgextract [options] path [path ...]

    Extract metadata from a list of files representing a dataset, write it to a database, and optionally generate a THREDDS catalog for the dataset.

Arguments:
    path: A file to scan. Ignored if --map, -f, -r, or --directory is used.

Options:

    -a aggregate_dimension_name:
        Name of the aggregate dimension. Defaults to 'time'

    --aggregate-only:
        Just aggregate variables from a dataset already in the database.

    --append
        Append the files to an existing dataset.

    -d dataset_name: Specify the name of the dataset. By default the name is generated.

    --directory directory_name:
        Scan all files in a directory, recursively. Also see --filter

    --echo-sql: Echo SQL commands

    --experiment experiment_id:
        Experiment identifier.

    -f filelist: Read the path names from filelist, a file with one path per line

    --filter regular_expression:
        Filter files matching the regular expression. Used with --directory. Default is '.*\\.nc'
        Regular expression syntax is defined by the Python re module.

    -h, --help: Print a help message.

    -i init_dir:
        Directory containing all initialization files.
        Recommended: one initialization file for the default sections (esg.ini) and one per project, must match the name format esg.<project>.ini
        If not specified, the default installed init files are read.

    --map dataset_map
        Read path names from a dataset map, as generated by esgscan_directory.

    --model model_id:
        Model identifier.

    --offline:
        Just publish filenames and sizes.

    -p, --property 'name=value':
        Add a property/value pair. This option can be used multiple times.
        Note: the property must also be configured in the initialization file
        and project handler.

    --product product_id:
        Product identifier. For example, IPCC4 products are 3hourly, daily, monthly, etc.

    --project project_id:
        Project identifier.

    --publish
        Publish the dataset if there are no errors.

    -r filename_expression:
                 Scan all files matching the filename expression, using
                 Unix shell-style patterns

    --run run_id
        Run identifier, of the form 'runN', e.g., run5.

    --thredds output_file
        Generate a THREDDS configuration file
"""

def main(argv):
    """Run the esgextract command line.

    Parses the options in argv, builds a SQLAlchemy session factory from the
    'dburl' value of the 'extract' configuration section, selects the source
    of input files, and then, depending on the options, extracts metadata,
    aggregates variables, publishes the dataset and/or writes a THREDDS
    catalog.

    Arguments:
        argv: List of command-line arguments, without the program name.

    Returns:
        None. Returns early, after logging an info message, when the
        selected file source yields no files.

    Exits:
        Status 0 after printing the usage text for -h/--help.
        Status 2 after reporting an option-parsing error on stderr.

    Raises:
        ESGPublishError: --project was not given and no handler could be
            determined from the first file.
        ValueError: a -p/--property argument does not contain '='.
    """

    longOptions = [
        'aggregate-only', 'append', 'directory=', 'echo-sql', 'experiment=',
        'filter=', 'help', 'map=', 'model=', 'offline', 'product=',
        'project=', 'property=', 'publish', 'run=', 'test-progress',
        'thredds=',
    ]
    try:
        args, lastargs = getopt.getopt(argv, "a:d:f:hi:p:r:", longOptions)
    except getopt.error:
        # A bad command line is a failure: report it on stderr and exit
        # with a nonzero status so that calling scripts can detect it.
        sys.stderr.write("%s\n" % sys.exc_info()[1])
        sys.stderr.write("%s\n" % usage)
        sys.exit(2)

    aggregateDimension = "time"
    aggregateOnly = False
    appendOpt = False
    context = {}
    datasetMapfile = None
    datasetName = None
    directory = None
    echoSql = False
    filefilt = r'.*\.nc'
    filelist = None
    init_dir = '/esg/config/esgcet/'
    offline = False
    projectName = None
    properties = {}
    publish = False
    regexp = None
    testProgress1 = testProgress2 = None
    thredds = False
    threddsOutputPath = None
    for flag, arg in args:
        if flag == '-a':
            aggregateDimension = arg
        elif flag == '--aggregate-only':
            aggregateOnly = True
        elif flag == '--append':
            appendOpt = True
        elif flag == '-d':
            datasetName = arg
        elif flag == '--directory':
            directory = arg
        elif flag == '--echo-sql':
            echoSql = True
        elif flag == '--experiment':
            context['experiment'] = arg
        elif flag == '-f':
            filelist = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print(usage)
            sys.exit(0)
        elif flag == '-i':
            init_dir = arg
        elif flag == '--map':
            datasetMapfile = arg
        elif flag == '--model':
            context['model'] = arg
        elif flag == '--offline':
            offline = True
        elif flag == '--product':
            context['product'] = arg
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            # Split on the first '=' only, so that the value may itself
            # contain '=' characters.
            name, value = arg.split('=', 1)
            properties[name] = value
        elif flag == '--publish':
            publish = True
        elif flag == '-r':
            regexp = arg
        elif flag == '--run':
            context['run_name'] = arg
        elif flag == '--test-progress':
            # One callback tuple is handed to extraction and the other to
            # aggregation; presumably the numbers are the progress range
            # each phase reports -- confirm against progressCallback.
            testProgress1 = (progressCallback, 0, 50)
            testProgress2 = (progressCallback, 50, 100)
        elif flag == '--thredds':
            thredds = True
            threddsOutputPath = arg

    # Load the configuration and set up a database connection
    config = loadConfig(init_dir)
    engine = create_engine(config.get('extract', 'dburl'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Create a file iterator. Sources are tried in the order --map, -f, -r,
    # --directory, then positional paths. In each case a throwaway iterator
    # is used to peek at the first file, and a fresh iterator is created for
    # the actual scan.
    if datasetMapfile is not None:
        datasetMap = readDatasetMap(datasetMapfile)
        datasetNames = sorted(datasetMap.keys())
        if not datasetNames:
            info("No files specified in dataset map: %s"%datasetMapfile)
            return
        if len(datasetNames) > 1:
            warning("Multiple datasets found, only publishing the first one.")
        # The dataset name from the map takes precedence over -d.
        datasetName = datasetNames[0]
        try:
            firstFile = datasetMapIterator(datasetMap, datasetName).next()[0]
        except StopIteration:
            info("No files specified in dataset map: %s"%datasetMapfile)
            return
        fileiter = datasetMapIterator(datasetMap, datasetName)
    elif filelist is not None:
        try:
            firstFile = filelistIterator(filelist).next()[0]
        except StopIteration:
            info("No files specified in filelist: %s"%filelist)
            return
        fileiter = filelistIterator(filelist)
    elif regexp is not None:
        try:
            firstFile = fnmatchIterator(regexp).next()[0]
        except StopIteration:
            info("No files found that match the regular expression: %s"%regexp)
            return
        fileiter = fnmatchIterator(regexp)
    elif directory is not None:
        try:
            firstFile = directoryIterator(directory, filefilt).next()[0]
        except StopIteration:
            info("No matching files found in directory: %s"%directory)
            return
        fileiter = directoryIterator(directory, filefilt)
    else:
        if len(lastargs) == 0:
            info("No files specified.")
            return
        firstFile = lastargs[0]
        fileiter = fnIterator(lastargs)

    # Aggregation is skipped when offline, so extraction gets the whole
    # test-progress tuple in that case.
    if offline and testProgress1 is not None:
        testProgress1 = (progressCallback, 0, 100)

    # Register project handlers
    registerHandlers()

    # If the project is not specified, try to read it from the first file
    if projectName is not None:
        handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline)
    else:
        # NOTE(review): unlike getHandlerByName above, 'offline' is not
        # passed here -- confirm whether that is intentional.
        handler = getHandler(firstFile, Session, validate=True)
        if handler is None:
            raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)

    # Load the rest of the context from the first file, if possible
    context = handler.getContext(**context)

    # Add properties from the command line; names the handler does not
    # know about are reported and skipped.
    fieldNames = handler.getFieldNames()
    for name, value in properties.items():
        if name not in fieldNames:
            warning('Property not configured: %s, was ignored'%name)
        else:
            context[name] = value

    # Ensure that fields are valid:
    handler.validateContext(context)

    # Generate the dataset name if not specified
    if datasetName is None:
        datasetName = handler.generateNameFromContext('dataset_id')

    # Create a CFHandler for validation of standard names, checking time axes, etc.
    cfHandler = CFHandler(Session)

    dataset = None
    if not aggregateOnly:
        dataset = extractFromDataset(datasetName, fileiter, Session, cfHandler, aggregateDimensionName=aggregateDimension, offline=offline, append=appendOpt, progressCallback=testProgress1, **context)
    if not offline:
        aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset)

    if publish:
        publishDataset(datasetName, Session)

    if thredds:
        threddsOutput = open(threddsOutputPath, "w")
        try:
            generateThredds(datasetName, Session, threddsOutput, handler, offline=offline)
        finally:
            # Close the catalog file even if generation fails.
            threddsOutput.close()

# Script entry point: hand the command-line arguments, minus the program
# name, to main() when this file is executed directly.
if __name__ == '__main__':
    cliArguments = sys.argv[1:]
    main(cliArguments)
