#!/usr/bin/env python

import sys
import logging
import os
import getopt
import string
import stat
import re

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from esgcet.publish import extractFromDataset, aggregateVariables, filelistIterator, fnmatchIterator, fnIterator, directoryIterator, multiDirectoryIterator, progressCallback, StopEvent, readDatasetMap, datasetMapIterator, iterateOverDatasets, publishDatasetList, processIterator, processNodeMatchIterator, CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP, REPLACE_OP
from esgcet.config import loadConfig, getHandler, getHandlerByName, initLogging, registerHandlers, splitLine, getOfflineLister
from esgcet.exceptions import *
from esgcet.messaging import debug, info, warning, error, critical, exception
from esgcet.model import Dataset
from esgcet.query import queryDatasetMap, updateDatasetFromContext

# import pdb

usage = """Usage:
    esgpublish [operation] [options] directory [directory ...]

    -or-

    esgpublish [operation] [options] --map dataset_map


    Extract metadata from a list of directories representing one or more datasets, into a database. The directories should relate to one project only. Generate a THREDDS configuration catalog for each dataset, and publish the catalogs to an index node.

Arguments:
    dataset_map: A file mapping dataset_ids to directories, as generated by esgscan_directory.

    directory: Directory path to scan recursively.

    operation: One of:
        --create (default)
        --replace
        --update
        --delete-files
        --rename-files

Options:

    -a aggregate_dimension_name:
        Name of the aggregate dimension. Defaults to 'time'

    -c, --create
        Create and publish a dataset containing the files listed in the directory or dataset map.

    --echo-sql: Echo SQL commands

    --dataset dataset_name:
        String name of the dataset. If specified, all files will belong to the specified dataset,
        regardless of path. If omitted, paths are matched to the directory_format, as specified in the
        configuration file, to determine the dataset name.

    -d, --delete-files
        Delete the files listed in the dataset map or directory, and republish the dataset.
        Note: This differs from the action of esgunpublish, where the entire dataset is deleted on both
        data node and index node. Also note this operation does not affect the physical files, just the node
        database entries.

    --experiment experiment_id:
        Experiment identifier. All datasets will have this experiment ID, regardless of informtion
        in the dataset map or directory names.

    --filter regular_expression:
        Filter files matching the regular expression. The default is '.*\.nc$'
        Regular expression syntax is defined by the Python re module.

    -h, --help: Print a help message.

    -i init_dir:
        Directory containing all initialization files.
        Recommended: one initialization file for the default sections (esg.ini) and one per project, must match the name format esg.<project>.ini
        If not specified, the default installed init files are read.

    --keep-version:
        Keep the dataset version number the same for an existing dataset. By default the version number
        is incremented by 1. This option is ignored for new datasets.

    --log log_file:
        Name of output log file. Overrides the configuration log_filename option. Default is standard output.

    --map dataset_map: Read input from a dataset map, as generated by esgscan_directory.
        The directory arguments are ignored.

    -m, --message comment:
        Comment to associate with the latest version of the dataset(s). If no new version
        is created, the comment is ignored.

    --model model_id:
        Model identifier. All datasets will have this model ID, regardless of informtion
        in the dataset map or directory names.

    --new-version version_number
        Specify the dataset version number, a positive integer. If unspecified, the version number is
        set to 1 for new datasets, and is incremented by 1 for existing datasets. Use this option
        with caution, as the version number will apply to all datasets processed. See --keep-version and --version-list.

    --nodbwrite
         Scan the files, but do not write the dataset to the postgres database.  This option should not be used with --noscan, --thredds, --publish, as it is intended for "dry-runs" to validate metadata.

    --noscan
        Skip the scan phase and just publish. Assumes that the scan has already been done!

    --offline
        The datasets are offline. A minimal amount of information is published, including file size.
        The datafiles are not scanned, and no aggregations are published.

        Note: The project_id and dataset_id must be specified with this option (see --project and
        --dataset).

    -p, --property 'name=value':
        Add a property/value pair. This option can be used multiple times.

        Note: the property must also be configured in the initialization file
        and project handler.

    --parent parent_id:
        DEPRECATED: The P2P system does not support hierarchical datasets. This option is ignored.
        Name of the parent dataset of ALL the datasets. If not specified, the parent identifier is generated
        for each dataset from the parent_id option of the initialization file. Use this option with caution.

    --per-time
    --per-variable
        Specify how THREDDS catalogs are generated. If per variable, create a dataset and aggregation for
        each variable. If per time, all variables are contained in a single dataset. The options are
        mutually exclusive, and override the configuration option 'variable_per_file'. Offline datasets
        are always written as per time.

    --project project_id:
        Project identifier. If not specified, the project is determined from the dataset map
        if the --map form is used, otherwise the project is determined from the first file found
        that matches the file filter (see --filter).

        Note: This option is mandatory for offline datasets.

    --publish
        Publish the dataset if there are no errors. Implies --thredds.

    -e, --read-directories:
        Read dataset identification information from the directory
        names. THIS ASSUMES THAT EACH FILE IN A LEAF DIRECTORY BELONGS
        TO THE SAME DATASET. See --read-files, and Notes. This option
        is the default, and is generally faster than --read-files.

    --read-files:
        Read dataset identification information from each individual
        file. If not set, the dataset ID is generated by matching the
        directory with the config file option 'directory_format'.  See
        --read-directories and Notes.

    --rename-files
        Rename one or more files in a dataset. The --map form of the command must be used, and each
        line of the dataset map should have the form:

            dataset_id | to_file | size_in_bytes  | *from_file*=path

    -r, --replace
        Replace the dataset. If the dataset exists, all file entries not in the 'new' dataset are removed,
        existing files are replaced, and new files are added. If the dataset does not exist, the operation
        is the same as --create.

    --replica master_gateway_id
        DEPRECATED: Use --set-replica instead.
        Flag the dataset(s) as replicated. master_gateway_id is ignored.

    --rest-api
        Publish using the RESTful publication services. The configuration file option ``rest_service_url''
        defines the service location. If it is undefined, the service location is
        https://HOST/esg-search/ws, where HOST is derived from configuration option ``hessian_service_url''.

    --service service_name
        Specify a THREDDS service name to associate with an offline dataset. If omitted, the name of the
        first offline service in the configuration ''thredds_offline_services'' is used. This determines
        which offline lister to use.

    --set-replica
        Flag the dataset(s) as replicated.

    --summarize-errors
        Print a summary of errors for each dataset scanned.

    --thredds
        Generate THREDDS files. Implies --thredds-reinit.

    --thredds-reinit | --no-thredds-reinit
        Toggle reinitialization of the THREDDS server.

        If --thredds-reinit is specified, the TDS master catalog is regenerated and the THREDDS server is
        reinitialized to read the catalog hierarchy. This option is implied if --thredds is also specified.
        So the option is mainly useful to reinitialize the server without generating any new dataset
        catalogs.

        If --no-thredds-reinit is specified, the THREDDS server is not reinitialized, regardless of
        whether --thredds is used. This option is useful to generate one or more new dataset catalogs
        without the overhead of a TDS reinitialization call, followed by a single reinitialization call.
        Use this option with caution, as it will leave the database and THREDDS catalogs in an inconsistent
        state.

    -u, --update
        If a dataset exists, update (replace or append) listed files
        to the dataset. If the --map form of the command is used, each
        line of the dataset map has the form:

            dataset_id | to_file | size_in_bytes [ | *from_file*=path]

        If from_file is specified, the file from_file is replaced by to_file. If from_file is not specified,
        the file to_file replaces the dataset file with the same path. Note: in contrast to --replace,
        any existing file entries not in the 'new' dataset remain in the dataset.

    --use-existing dataset_name
        Run the scan phase based on dataset and file information already in the database.
        This option may be used more than once. Compare with --map, which takes a mapfile.
        To republish an existing or older version, specify the dataset as dataset_name#version.

    --use-list filelist
        Like --use-existing, but read the list of dataset names from a
        file, containing one dataset name per line. If the filelist is '-',
        read from standard input.

    --validate schema_name
        Validate the published catalog against a schema, on the server side. Implies --rest-api.
        By default no schema-specific validation is performed. The schema_name for CMIP5 is 'cmip5'.

    --version-list versionlist
        Use the version indicated in the version list. versionlist is a file, each line of which
        has the form:

            dataset_id | version

        This option is useful for publishing replica data in bulk, where the datasets must retain
        the replica version.

Examples:

    Publish data in directory /foo/bar, for project test. Obtain metadata by opening and reading the files,
    instead of matching directory names:

        esgpublish --read-files --project test --thredds --publish /foo/bar

    Create a mapfile, and run the scan, thredds, and publish phases separately:

        esgscan_directory --project test --read-files -o test.map /esg/data/test
        esgpublish --map test.map --read-files --project test
        esgpublish --map test.map --project test --noscan --thredds
        esgpublish --map test.map --project test --noscan --publish

    Publish using the RESTful publication API, and validate against the CMIP5 schema.

        esgpublish --read-files --project test --thredds --publish --rest-api --validate cmip5 /foo/bar

"""

def getDatasetUpdate(filename, session):
    
    if not os.path.exists(filename):
        raiseESGPublishError("File "+ filename + " not found.")
    f = open(filename)
    datasetList = []

    for line in f:
        parts = splitLine(line)
        
        datasetName = parts[0]
        ddict = {}
        for n in parts[1:]:
            kv = n.split("=")
            ddict[kv[0]] = kv[1]
        updateDatasetFromContext(ddict, datasetName, session)
        
        datasetList.append((datasetName, -1))


    return datasetList

def main(argv):

    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru", ['append', 'create', 'dataset=', 'delete-files', 'echo-sql', 'experiment=', 'filter=', 'help', 'keep-version', 'log=', 'map=', 'message=', 'model=', 'offline',  'parent=', 'per-time', 'per-variable', 'project=', 'property=', 'publish', 'new-version=', 'no-thredds-reinit', 'noscan', 'read-directories', 'read-files', 'rename-files', 'replace', 'replica=', 'rest-api', 'service=', 'set-replica', 'summarize-errors', 'thredds', 'thredds-reinit', 'update', 'use-existing=', 'use-list=', 'validate=', 'version-list=', 'nodbwrite'])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    aggregateDimension = "time"
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = '.*\.nc$'
    init_dir = '/esg/config/esgcet/'
    initcontext = {}
    keepVersion = True

    las = False
    log_filename = None
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None

    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False

    rescanDatasetName = []

    restApi = None
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False

    for flag, arg in args:
        if flag=='-a':
            aggregateDimension = arg
        elif flag=='--append':
            publishOp = UPDATE_OP
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag=='--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag=='--echo-sql':
            echoSql = True
        elif flag=='--experiment':
            initcontext['experiment'] = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_dir = arg
        elif flag=='--keep-version':
            keepVersion = True
        elif flag=='--log':
            log_filename = arg
        elif flag=='--map':
            datasetMapfile = arg
        elif flag in ['-m', '--message']:
            message = arg
        elif flag=='--model':
            initcontext['model'] = arg
        elif flag=='--nodbwrite':
            nodbwrite = True
        elif flag=='--new-version':
            try:
                version = string.atoi(arg)
                if version <=0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s"%arg)
        elif flag=='--no-thredds-reinit':
            threddsReinit = False
        elif flag=='--noscan':
            publishOnly = True
        elif flag=='--offline':
            offline = True
        elif flag=='--parent':
            parent = arg
        elif flag=='--per-time':
            perVariable = False
        elif flag=='--per-variable':
            perVariable = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag=='--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag=='--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag=='--rest-api':
            restApi = True
        elif flag=='--service':
            service = arg
        elif flag=='--set-replica':
            masterGateway = 'DEFAULT'
        elif flag=='--summarize-errors':
            summarizeErrors = True
        elif flag=='--thredds':
            thredds = True
        elif flag=='--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag=='--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag=='--use-list':
            rescan = True
            if arg=='-':
                namelist=sys.stdin
            else:
                namelist = open(arg)
            for line in namelist.readlines():
                line = line.strip()
                if line[0]!='#':
                    rescanDatasetName.append(line)
        elif flag=='--validate':
            schema = arg
            restApi = True
        elif flag=='--version-list':
            versionList = arg

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None and versionList is None:
        raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")


    # Load the configuration and set up a database connection
    config = loadConfig(init_dir)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('DEFAULT', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None




    # ... otherwise generate the directory map.
#        pdb.set_trace()

    datasetNames = getDatasetUpdate(datasetMapfile, Session)
    datasetNames.sort()

    if len(datasetNames)==0:
        warning("No datasets found.")
#    datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles, nodbwrite=nodbwrite)


    result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service, perVariable=perVariable, reinitThredds=threddsReinit, restInterface=restApi, schema=schema)

    # print `result`

    # if summarizeErrors:
    #     print 'Summary of errors:'
    #     for name,versionno in datasetNames:
    #         dset = Dataset.lookup(name, Session)
    #         print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session)
    #         if dset.has_warnings(Session):
    #             print '=== Dataset: %s ==='%dset.name
    #             for line in dset.get_warnings(Session):
    #                 print line


if __name__=='__main__':
    main(sys.argv[1:])
