#!python

import sys
import logging
import os
import getopt
import string
import stat
import re

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from esgcet.publish import extractFromDataset, aggregateVariables, filelistIterator, fnmatchIterator, fnIterator, directoryIterator, \
    multiDirectoryIterator, progressCallback, StopEvent, readDatasetMap, datasetMapIterator, iterateOverDatasets, publishDatasetList, \
    processIterator, processNodeMatchIterator, CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP, REPLACE_OP, reinitializeThredds, \
    updateThreddsMasterCatalog, establish_pid_connection, bcolors, create_cim_from_dmap, setup_cdf2cim_environment, set_verbose_cim_errors
from esgcet.config import loadConfig, getHandler, getHandlerByName, initLogging, registerHandlers, splitLine, getOfflineLister, getThreddsServiceSpecs
from esgcet.exceptions import *
from esgcet.messaging import debug, info, warning, error, critical, exception
from esgcet.model import Dataset
from esgcet.query import queryDatasetMap
from esgcet.logon import check_cert, logon, get_myproxy_value_from_config

import esgcet


# User-facing help text printed for --help and on option errors.
# (Typo fixes only: "informtion" -> "information", "cd2fcim" -> "cdf2cim",
# "reinitalized" -> "reinitialized".)
usage = """Usage:
    esgpublish [operation] [options] directory [directory ...]

    -or-

    esgpublish [operation] [options] --map <dataset_mapfile or mapfile_directory>

        Extract metadata from a list of directories representing one or more datasets, into a database. The directories should relate to one project only. Generate a THREDDS configuration catalog for each dataset, and publish the catalogs to an index node.

    -or-

    esgpublish --thredds-reinit

        Reinitialize the THREDDS Server.
        Please note: This option cannot be combined with other options, the script will exit right after the THREDDS initialization.


Arguments:
    dataset_map: A file mapping dataset_ids to directories, as generated by esgscan_directory.

    directory: Directory path to scan recursively.

    operation: One of:
        --create (default)
        --replace
        --update
        --delete-files
        --rename-files

Options:

    -a aggregate_dimension_name:
        Name of the aggregate dimension. Defaults to 'time'

    --commit-every n
        Commit file info to database every n files when adding a dataset. By default commit once
        after all files are scanned.

        This option can reduce memory usage, but if it is used, the dataset should be unpublished
        if publication is interrupted.

    -c, --create
        Create and publish a dataset containing the files listed in the directory or dataset map.

    --create-cim
        Can only be used in conjunction with --map. Calls cdf2cim as an initial step before
        publication to the database, to create CIM records.

        The cdf2cim script will scan for data files below a given starting directory, which will 
        be the deepest common parent directory of the data files listed in the map file.  If 
        filesystem layout recommendations for CMIP6 are followed, it will find the same set of
        data files as listed in the map file.

        The decision whether to call cdf2cim is determined as follows:

            - Command line option --create-cim or --no-create-cim take precedence.

            - If no command-line option, and the project ini file has create_cim=true, then call
              cdf2cim if --map is specified and --noscan is not supplied, ensuring that it is called
              before the scan phase but is not repeated if publication is done in stages.

            - If neither of the above apply, default is not to run cdf2cim.

    --create-cim-only
        As --create-cim, but does not actually publish the dataset.

    --echo-sql: Echo SQL commands

    --dataset dataset_name:
        String name of the dataset. If specified, all files will belong to the specified dataset,
        regardless of path. If omitted, paths are matched to the directory_format, as specified in the
        configuration file, to determine the dataset name.

    -d, --delete-files
        Delete the files listed in the dataset map or directory, and republish the dataset.
        Note: This differs from the action of esgunpublish, where the entire dataset is deleted on both
        data node and index node. Also note this operation does not affect the physical files, just the node
        database entries.

    --experiment experiment_id:
        Experiment identifier. All datasets will have this experiment ID, regardless of information
        in the dataset map or directory names.

    --filter regular_expression:
        Filter files matching the regular expression. The default is '.*\.nc$'
        Regular expression syntax is defined by the Python re module.

    -h, --help: Print a help message.

    --hessian-api: Use the legacy publishing API. See also --rest-api.

    -i init_dir:
        Directory containing all initialization files.
        Recommended: one initialization file for the default sections (esg.ini) and one per project, must match the name format esg.<project>.ini
        If not specified, the default installed init files are read.

    --keep-credentials:
        Do not renew myproxy credentials.

    --keep-version:
        Keep the dataset version number the same for an existing dataset. By default the version number
        is incremented by 1. This option is ignored for new datasets.

    --log log_file:
        Name of output log file. Overrides the configuration log_filename option. Default is standard output.

    --map dataset_mapfile or mapfile_directory: Read input from a dataset mapfile, as generated by 'esgprep mapfile'.
        Use either a single mapfile as input or scan a directory recursively to publish all containing mapfiles.

    -m, --message comment:
        Comment to associate with the latest version of the dataset(s). If no new version
        is created, the comment is ignored.

    --model model_id:
        Model identifier. All datasets will have this model ID, regardless of information
        in the dataset map or directory names.

    --new-version version_number
        Specify the dataset version number, a positive integer. If unspecified, the version number is
        set to 1 for new datasets, and is incremented by 1 for existing datasets. Use this option
        with caution, as the version number will apply to all datasets processed. See --keep-version and --version-list.

    --no-create-cim
        Do not call cdf2cim.  See --create-cim for more details.

    --nodbwrite
         Scan the files, but do not write the dataset to the postgres database.  This option should not be used with --noscan, --thredds, --publish, as it is intended for "dry-runs" to validate metadata.

    --noscan
        Skip the scan phase and just publish. Assumes that the scan has already been done!

    --no-thredds-reinit
        The THREDDS server is not reinitialized, regardless of whether --thredds is used.
        This option is useful to generate one or more new dataset catalogs without the overhead of a TDS reinitialization call,
        followed by a single reinitialization call.
        Use this option with caution, as it will leave the database and THREDDS catalogs in an inconsistent state.

    --offline
        The datasets are offline. A minimal amount of information is published, including file size.
        The datafiles are not scanned, and no aggregations are published.

        Note: The project_id and dataset_id must be specified with this option (see --project and
        --dataset).

    -p, --property 'name=value':
        Add a property/value pair. This option can be used multiple times.

        Note: the property must also be configured in the initialization file
        and project handler.

    --parent parent_id:
        DEPRECATED: The P2P system does not support hierarchical datasets. This option is ignored.
        Name of the parent dataset of ALL the datasets. If not specified, the parent identifier is generated
        for each dataset from the parent_id option of the initialization file. Use this option with caution.

    --per-time
    --per-variable
        Specify how THREDDS catalogs are generated. If per variable, create a dataset and aggregation for
        each variable. If per time, all variables are contained in a single dataset. The options are
        mutually exclusive, and override the configuration option 'variable_per_file'. Offline datasets
        are always written as per time.

    --project project_id:
        Project identifier. This option is mandatory.

    --publish
        Publish the dataset if there are no errors. Implies --thredds.

    -e, --read-directories:
        Read dataset identification information from the directory
        names. THIS ASSUMES THAT EACH FILE IN A LEAF DIRECTORY BELONGS
        TO THE SAME DATASET. See --read-files, and Notes. This option
        is the default, and is generally faster than --read-files.

    --read-files:
        Read dataset identification information from each individual
        file. If not set, the dataset ID is generated by matching the
        directory with the config file option 'directory_format'.  See
        --read-directories and Notes.

    --rename-files
        Rename one or more files in a dataset. The --map form of the command must be used, and each
        line of the dataset map should have the form:

            dataset_id | to_file | size_in_bytes  | *from_file*=path

    -r, --replace
        Replace the dataset. If the dataset exists, all file entries not in the 'new' dataset are removed,
        existing files are replaced, and new files are added. If the dataset does not exist, the operation
        is the same as --create.

    --replica master_gateway_id
        DEPRECATED: Use --set-replica instead.
        Flag the dataset(s) as replicated. master_gateway_id is ignored.

    --rest-api
        Publish using the RESTful publication services. The configuration file option ``rest_service_url''
        defines the service location. If it is undefined, the service location is
        https://HOST/esg-search/ws, where HOST is derived from configuration option ``hessian_service_url''.

        If neither --rest-api nor --hessian-api are specified, then config file option use_rest_api is used,
        and if this option is not found, the REST API is used by default.

    --service service_name
        Specify a THREDDS service name to associate with an offline dataset. If omitted, the name of the
        first offline service in the configuration ''thredds_offline_services'' is used. This determines
        which offline lister to use.

    --set-replica
        Flag the dataset(s) as replicated.

    --summarize-errors
        Print a summary of errors for each dataset scanned.

    --test
        Flag the dataset(s) as test publication. This option must be set for publication of test data that use PIDs.
        WARNING: Do not use this option in production, all PIDs flagged as test will be deleted after a while

    --thredds
        Generate THREDDS files. The THREDDS Server will be reinitialized automatically unless "no-thredds-reinit" is set.

    --thredds-reinit
        The TDS master catalog is regenerated and the THREDDS server is reinitialized to read the catalog hierarchy.
        Please note: This option cannot be combined with other options, the script will exit right after the THREDDS initialization.

    -u, --update
        If a dataset exists, update (replace or append) listed files
        to the dataset. If the --map form of the command is used, each
        line of the dataset map has the form:

            dataset_id | to_file | size_in_bytes [ | *from_file*=path]

        If from_file is specified, the file from_file is replaced by to_file. If from_file is not specified,
        the file to_file replaces the dataset file with the same path. Note: in contrast to --replace,
        any existing file entries not in the 'new' dataset remain in the dataset.

    --use-existing dataset_name
        Run the scan phase based on dataset and file information already in the database.
        This option may be used more than once. Compare with --map, which takes a mapfile.
        To republish an existing or older version, specify the dataset as dataset_name#version.

    --use-list filelist
        Like --use-existing, but read the list of dataset names from a
        file, containing one dataset name per line. If the filelist is '-',
        read from standard input.

    --validate schema_name
        Validate the published catalog against a schema, on the server side. Implies --rest-api.
        By default no schema-specific validation is performed. The schema_name for CMIP5 is 'cmip5'.

    --version
        Print version of the software and exit.

    --version-list versionlist
        Use the version indicated in the version list. versionlist is a file, each line of which
        has the form:

            dataset_id | version

        This option is useful for publishing replica data in bulk, where the datasets must retain
        the replica version.

    --verbose-cim-errors
        In the event of failure to publish CIM documents, display the error from the server.

Examples:

    Publish data in directory /foo/bar, for project test. Obtain metadata by opening and reading the files,
    instead of matching directory names:

        esgpublish --read-files --project test --thredds --publish /foo/bar

    Create a mapfile, and run the scan, thredds, and publish phases separately:

        esgscan_directory --project test --read-files -o test.map /esg/data/test
        esgpublish --map test.map --read-files --project test
        esgpublish --map test.map --project test --noscan --thredds
        esgpublish --map test.map --project test --noscan --publish

    Publish using the RESTful publication API, and validate against the CMIP5 schema.

        esgpublish --read-files --project test --thredds --publish --rest-api --validate cmip5 /foo/bar

"""


def summarize_errors(Session, datasetNames):
    print 'Summary of errors:'
    for name,versionno in datasetNames:
        dset = Dataset.lookup(name, Session)
        print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session)
        if dset.has_warnings(Session):
            print '=== Dataset: %s ==='%dset.name
            for line in dset.get_warnings(Session):
                print line


def yield_parsed_mapfiles(mapfileDir):
    """
    Walk mapfileDir recursively (following symlinks) and yield
    (dmap, extraFields) for each file that parses as a valid ESGF mapfile.

    Files that fail to parse, or that parse to an empty dataset map, are
    skipped with an error message.  Directories and files are visited in
    sorted order, so the iteration order is deterministic.
    """
    for root, dirs, files in os.walk(mapfileDir, followlinks=True):
        # Sort in place so os.walk descends subdirectories deterministically.
        dirs.sort()
        files.sort()
        for mapfile in files:
            mapfile_path = os.path.join(root, mapfile)
            try:
                dmap, extraFields = readDatasetMap(mapfile_path, parse_extra_fields=True)
            except Exception:
                # Fix: was a bare "except:", which also swallowed
                # KeyboardInterrupt / SystemExit during long directory walks.
                error("Skipping %s: File does not match ESGF mapfile format." % mapfile_path)
                continue
            if len(dmap) == 0:
                error("Skipping %s: File does not contain any datasets." % mapfile_path)
                continue
            yield dmap, extraFields


def main(argv):
    """
    Command-line entry point for esgpublish.

    Parses the options documented in the module-level ``usage`` string, then
    drives the publication pipeline: optional cdf2cim CIM-record creation,
    dataset scanning into the node database, THREDDS catalog generation and
    (optionally) publication to the index node.  Several option combinations
    short-circuit via sys.exit(): --help, --version, --thredds-reinit and
    --create-cim-only.

    argv: command-line argument list, excluding the program name.
    """

    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru", ['append', 'commit-every=', 'create', 'create-cim', 'create-cim-only', 'dataset=', 'delete-files', 'echo-sql', 'experiment=', 'filter=', 'help', 'hessian-api', 'keep-credentials', 'keep-version', 'log=', 'map=', 'message=', 'model=', 'offline',  'parent=', 'per-time', 'per-variable', 'project=', 'property=', 'publish', 'new-version=', 'no-create-cim', 'no-thredds-reinit', 'noscan', 'read-directories', 'read-files', 'rename-files', 'replace', 'replica=', 'rest-api', 'service=', 'set-replica', 'summarize-errors', 'test', 'thredds', 'thredds-reinit', 'update', 'use-existing=', 'use-list=', 'validate=', 'version-list=', 'verbose-cim-errors', 'nodbwrite', 'version'])
    except getopt.error:
        # Unknown option: show the error and the usage text, then exit.
        print sys.exc_value
        print usage
        sys.exit(0)

    # Defaults for all command-line options.
    aggregateDimension = "time"
    commitEvery = None
    createCim = None  # default (behaviour depends on various options and project handler), not same as False
    createCimOnly = False
    verboseCimErrors = False
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = '.*\.nc$'
    init_dir = '/esg/config/esgcet/'
    initcontext = {}
    keepVersion = False
    las = False
    log_filename = None
    mapfileDir = None
    mapfileProvided = False
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None
    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False
    rescanDatasetName = []
    restApi = None
    restApiDefault = True
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False
    test_publication = False
    keep_credentials = False

    # Translate each parsed option into the corresponding state above.
    for flag, arg in args:
        if flag=='-a':
            aggregateDimension = arg
        elif flag=='--append':
            publishOp = UPDATE_OP
        elif flag=='--commit-every':
            try:
                commitEvery = string.atoi(arg)
                if commitEvery <= 0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError('--commit-every must be a positive integer: %s' % arg)
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag in ('--create-cim', '--no-create-cim', '--create-cim-only') and (createCim != None or createCimOnly):
            raise ESGPublishError('--create-cim/--no-create-cim/--create-cim-only are mutually exclusive')
        elif flag == '--create-cim':
            createCim = True
        elif flag == '--no-create-cim':
            createCim = False
        elif flag == '--create-cim-only':
            createCimOnly = True
        elif flag == '--verbose-cim-errors':
            verboseCimErrors = True
        elif flag=='--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag=='--echo-sql':
            echoSql = True
        elif flag=='--experiment':
            initcontext['experiment'] = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='--hessian-api':
            restApi = False
        elif flag=='-i':
            init_dir = arg
        elif flag == '--keep-credentials':
            keep_credentials = True
        elif flag=='--keep-version':
            keepVersion = True
        elif flag=='--log':
            log_filename = arg
        elif flag=='--map':
            # --map accepts either a single mapfile or a directory of mapfiles.
            if os.path.isfile(arg):
                datasetMapfile = arg
            elif os.path.isdir(arg):
                mapfileDir = arg
            else:
                raise ESGPublishError("Not a valid file or directory: %s" % arg)
            mapfileProvided = True
        elif flag in ['-m', '--message']:
            message = arg
        elif flag=='--model':
            initcontext['model'] = arg
        elif flag=='--nodbwrite':
            nodbwrite = True
        elif flag=='--new-version':
            try:
                version = string.atoi(arg)
                if version <=0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s"%arg)
        elif flag=='--no-thredds-reinit':
            threddsReinit = False
        elif flag=='--noscan':
            publishOnly = True
        elif flag=='--offline':
            offline = True
        elif flag=='--parent':
            parent = arg
        elif flag=='--per-time':
            perVariable = False
        elif flag=='--per-variable':
            perVariable = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            # NOTE(review): a value containing '=' raises ValueError here
            # (split yields more than two parts) — assumes simple name=value.
            name, value = arg.split('=')
            properties[name] = value
        elif flag=='--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag=='--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag=='--rest-api':
            restApi = True
        elif flag=='--service':
            service = arg
        elif flag=='--set-replica':
            masterGateway = 'DEFAULT'
        elif flag=='--summarize-errors':
            summarizeErrors = True
        elif flag=='--test':
            test_publication = True
        elif flag=='--thredds':
            thredds = True
        elif flag=='--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag=='--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag=='--use-list':
            rescan = True
            if arg=='-':
                namelist=sys.stdin
            else:
                namelist = open(arg)
            for line in namelist.readlines():
                line = line.strip()
                # NOTE(review): a blank line would raise IndexError on
                # line[0] — assumes the list file has no empty lines.
                if line[0]!='#':
                    rescanDatasetName.append(line)
        elif flag=='--validate':
            # --validate implies the REST publication interface.
            schema = arg
            restApi = True
        elif flag=='--version-list':
            versionList = arg
        elif flag=='--version':
            print "esg-publisher (esgcet) version {}".format(esgcet.__version__)
            sys.exit(0)

    # Cross-option validation: CIM creation requires a mapfile.
    if createCimOnly and not mapfileProvided:
        raise ESGPublishError("--create-cim-only can only be used with --map")

    if createCim and not mapfileProvided:
        raise ESGPublishError("--create-cim can only be used with --map")


    # The project must be specified
    if projectName is None:
        raise ESGPublishError("Must specify project with --project")

    if version is not None and versionList is not None:
        raise ESGPublishError("Cannot specify both --new-version and --version-list")

    # With --version-list, 'version' becomes a dict mapping dataset_id -> version
    # (elsewhere it is a single integer or None).
    if versionList is not None:
        version = {}
        f = open(versionList)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            dsid, vers = line.split('|')
            dsid = dsid.strip()
            vers = int(vers.strip())
            version[dsid] = vers

    # Load the configuration
    config = loadConfig(init_dir, projectName)
    project_section = "project:%s" % projectName    
    project_config_section = "config:%s" % projectName

    #---------------------------------------------------------------------
    if verboseCimErrors:
        set_verbose_cim_errors()

    # if --create-cim-only specified, just do this and quit without 
    # starting database interaction etc
    if createCimOnly:        

        setup_cdf2cim_environment(config, project_config_section)

        if mapfileDir is not None:
            # Directory of mapfiles: run cdf2cim on each; exit 1 if any failed.
            success = True
            for dmap, extraFields in yield_parsed_mapfiles(mapfileDir):
                if not create_cim_from_dmap(dmap, exception_on_fail=False):
                    success = False
            if success:
                sys.exit(0)
            else:
                error("One or more mapfiles had errors with create CIM")
                sys.exit(1)

        else:
            dmap = readDatasetMap(datasetMapfile)
            create_cim_from_dmap(dmap)
            sys.exit(0)  # assume it raised an exception if it failed

    # set createCim to True or False based on handler and options, 
    # unless argument parsing has already found an explicit option that overrides this 
    # (--create-cim / --no-create-cim / --create-cim-only)
    if createCim == None:
        if config.getboolean(project_section, 'create_cim', default=False):
            createCim = (mapfileProvided and not publishOnly and masterGateway == None)
        else:
            createCim = False

    if createCim:
        try:
            setup_cdf2cim_environment(config, project_config_section)
        except Exception as exc:
            # Missing cdf2cim configuration is non-fatal: warn and disable.
            print exc
            print "Turning off create_cim because related configuration variables are missing."
            createCim = False

    #---------------------------------------------------------------------

    # set up a database connection
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600, client_encoding='utf8')
    initLogging('DEFAULT', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # --thredds-reinit short-circuit: regenerate the master catalog,
    # reinitialize the TDS and exit immediately.
    if threddsReinit:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()
        sys.exit(0)
    elif threddsReinit is None:
        # Not set explicitly: reinit THREDDS iff --thredds was given.
        threddsReinit = thredds

    # Register project handlers
    registerHandlers(projectName)

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=restApiDefault)

    # Set up dictionary of extra handler args.
    # (Any not supported by the handler will be filtered out.)
    handler_extra_args = {'replica': (masterGateway != None)}        

    # get the handler
    handler = getHandlerByName(projectName, None, Session, offline=offline, **handler_extra_args)

    # check cert and generate a new one, if expired
    if publish and not keep_credentials:
        myproxy_username = get_myproxy_value_from_config(config, 'username')
        myproxy_hostname = get_myproxy_value_from_config(config, 'hostname')
        myproxy_password = get_myproxy_value_from_config(config, 'password')
        if not check_cert(config, myproxy_username):
            info('Invalid myproxy certificate, renewing...')
            try:
                logon(config, myproxy_username, myproxy_password, myproxy_hostname)
            except:
                warning('Certificate generation failed, please try to run myproxy-logon manually...')

    # Check if project uses PIDs and start messaging thread
    pid_connector = None
    try:
        if not publishOnly or thredds:
            pid_prefix = handler.check_pid_avail(project_config_section, config, version=version)
            if pid_prefix:
                pid_connector = establish_pid_connection(pid_prefix, test_publication, project_config_section, config, handler, publish=True)
                if thredds:
                    pid_connector.start_messaging_thread()

        # If the dataset map is input, just read it ...
        dmap = None
        directoryMap = None
        extraFields = None

        validate_standard_name = config.getboolean(project_section, 'validate_standard_name', default=True)

        # Process all mapfiles in a directory
        if mapfileDir is not None:
            for dmap, extraFields in yield_parsed_mapfiles(mapfileDir):
                    datasetNames = dmap.keys()
                    datasetNames.sort()

                    if createCim:
                        create_cim_from_dmap(dmap, exception_on_fail=False)

                    # Iterate over datasets
                    if not publishOnly:
                        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp,
                                                       filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version,
                                                       extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles,
                                                       nodbwrite=nodbwrite, pid_connector=pid_connector, test_publication=test_publication,
                                                       handlerExtraArgs=handler_extra_args, commitEvery=commitEvery, validate_standard_name=validate_standard_name)

                    if (not nodbwrite):
                        result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service,
                                                    perVariable=perVariable, reinitThredds=False, restInterface=restApi, schema=schema,
                                                    pid_connector=pid_connector, project_config_section=project_config_section)
                    if summarizeErrors:
                        summarize_errors(Session, datasetNames)

            # reinitialize THREDDS catalog only once after processing all mapfiles
            if threddsReinit:
                updateThreddsMasterCatalog(Session)
                result = reinitializeThredds()

        else:
            if datasetMapfile is not None:
                dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
                datasetNames = dmap.keys()                

                if createCim:
                    create_cim_from_dmap(dmap, exception_on_fail=False)

            elif rescan:
                # Note: No need to get the extra fields, such as mod_time, since
                # they are already in the database, and will be used for file comparison if necessary.
                dmap, offline = queryDatasetMap(rescanDatasetName, Session)
                datasetNames = dmap.keys()

            # ... otherwise generate the directory map.
            else:
                # Online dataset(s)
                if not offline:
                    if len(lastargs)==0:
                        print "No directories specified."
                        print usage
                        sys.exit(0)

                    props = properties.copy()
                    props.update(initcontext)
                    if not readFiles:
                        directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props, datasetName=datasetName)
                    else:
                        directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props, datasetName=datasetName)
                    # -1 is the sentinel version for "no version known".
                    datasetNames = [(item,-1) for item in directoryMap.keys()]

                # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
                else:
                    dmap = {}
                    listerSection = getOfflineLister(config, "project:%s"%projectName, service)
                    offlineLister = config.get(listerSection, 'offline_lister_executable')
                    commandArgs = "--config-section %s "%listerSection
                    commandArgs += " ".join(lastargs)
                    for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
                        size, mtime = sizet
                        if dmap.has_key((dsetName,-1)):
                            dmap[(dsetName,-1)].append((filepath, str(size)))
                        else:
                            dmap[(dsetName,-1)] = [(filepath, str(size))]

                    datasetNames = dmap.keys()

            datasetNames.sort()
            if len(datasetNames)==0:
                warning("No datasets found.")
                min_version = -1
            else:
                min_version = sorted(datasetNames, key=lambda x: x[1])[0][1]

            # Must specify version for replications
            if min_version == -1 and masterGateway is not None and version is None and versionList is None:
                raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")

            # Iterate over datasets
            if not publishOnly:
                datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt,
                                               initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields,
                                               masterGateway=masterGateway, comment=message, readFiles=readFiles, nodbwrite=nodbwrite,
                                               pid_connector=pid_connector, test_publication=test_publication, handlerExtraArgs=handler_extra_args,
                                               commitEvery=commitEvery, validate_standard_name=validate_standard_name)

            if (not nodbwrite):
                result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service,
                                            perVariable=perVariable, reinitThredds=threddsReinit, restInterface=restApi, schema=schema,
                                            pid_connector=pid_connector, project_config_section=project_config_section)

            if summarizeErrors:
                summarize_errors(Session, datasetNames)

        # finish PID messaging queue
        if pid_connector:
            if thredds:
                pid_connector.finish_messaging_thread()
            if test_publication:
                print '\n' + bcolors.WARNING + '********************************************************************' + bcolors.ENDC
                print bcolors.WARNING + '***                                                              ***' + bcolors.ENDC
                print bcolors.WARNING + '***                        !!! WARNING !!!                       ***' + bcolors.ENDC
                print bcolors.WARNING + '***                  Datasets are flagged as TEST                ***' + bcolors.ENDC
                print bcolors.WARNING + '***  PIDs will be deleted from the Handle Server after a while.  ***' + bcolors.ENDC
                print bcolors.WARNING + '***                                                              ***' + bcolors.ENDC
                print bcolors.WARNING + '********************************************************************' + bcolors.ENDC + '\n'
    except:
        # Ensure the PID messaging thread is torn down on any failure,
        # then re-raise the original exception.
        if pid_connector and thredds:
            pid_connector.force_finish_messaging_thread()
        raise

# Script entry point: run main() with the command-line arguments
# (excluding the program name).
if __name__=='__main__':
    main(sys.argv[1:])
