#!/usr/bin/env python

import sys
import os
import getopt
import urlparse
import logging
import string

from esgcet.model import *
from esgcet.publish import deleteDatasetList, updateThreddsMasterCatalog, reinitializeThredds, DELETE, UNPUBLISH, NO_OPERATION, UNINITIALIZED, readDatasetMap,\
    parseDatasetVersionId, generateDatasetVersionId, iterateOverDatasets, UPDATE_OP, publishDatasetList, updateThreddsMasterCatalog, \
    establish_pid_connection
from esgcet.config import loadConfig, initLogging, registerHandlers, getHandlerByName
from esgcet.exceptions import *
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, join
from esgcet.messaging import debug, info, warning, error, critical, exception
from esgcet.query import queryDatasetMap
from esgcet.logon import check_cert, logon, get_myproxy_value_from_config


# Help text printed by main() for -h/--help and when getopt rejects the command line.
# NOTE(review): main() also accepts --skip-gateway, which is mentioned in the introduction but has
# no entry under "Options", and --delete / --retract are listed without a description.
# NOTE(review): the introduction says all versions of a dataset are deleted by default, while the
# "Arguments" paragraph says the latest version is deleted when no version is given -- confirm
# which statement matches deleteDatasetList and correct the other.
usage = """Usage:
    esgunpublish [options] dataset_name [dataset_name ...]

    -or-

    esgunpublish [options] --map <dataset_mapfile or mapfile_directory>

    -or-

    esgunpublish [options] --rest-api 'dataset_name|data_node' ['dataset_name|data_node' ...]

    Delete or retract dataset(s) or specific versions of dataset(s) in this order:

      - Remove the dataset from the gateway (see --skip-gateway)
      - Remove the LAS entry (see --las)
      - Remove the THREDDS entry (see --skip-thredds)
      - Remove the local database entry, if --database-delete is set. By default the local database entry is
        kept intact, so that data files do not have to be rescanned.

    This script does not remove the data files, but does remove metadata associated
    with the dataset(s). The details of what metadata are removed depends on the options --gateway-retract
    and --database-delete.

    By default esgunpublish deletes all versions of the dataset. To unpublish a specific version n, specify
    the dataset as dataset_name#n.

    IMPORTANT: Note the difference in the way datasets are identified when the --rest-api option is used.
    With --rest-api, dataset IDs have the form 'dataset_name|data_node' where dataset_name includes
    the version number, for example, 'pcmdi.test.v1|pcmdi9.llnl.gov'. Using the other (non-RESTful) options,
    datasets do not include the data_node portion, for example 'pcmdi.test' or 'pcmdi.test#1'. If the wrong
    form is used, the dataset may not be deleted from the SOLR index correctly.

Arguments:
    dataset_name: The string name of the dataset(s) to be unpublished. To indicate a specific version,
    use the syntax dataset_name#n. If no version is indicated, the latest version is deleted.

Options:

    --delete
    --retract

    --data-node data_node:
        Unpublish only from the given data_node, only for Solr publication.
        If data_node is set to "all" it will delete all records in Solr related to the dataset_id(s).
        Default is to extract the data_node from the thredds_url in esg.ini.

    --database-delete: Delete the associated local database entry for this dataset. By default, the
      database information is left intact.

    --database-only: Short for --skip-gateway --skip-thredds --database-delete

    --echo-sql: Echo SQL commands

    -h, --help: Print a help message.

    --hessian-api: Use the legacy publishing API. See also --rest-api.

    -i init_dir:
        Directory containing all initialization files.
        Recommended: one initialization file for the default sections (esg.ini) and one per project, must match the name format esg.<project>.ini
        If not specified, the default installed init files are read.

    --keep-credentials:
        Do not renew myproxy credentials.

    --las: Do not reinitialize the LAS server.

    --log log_file:
        Name of output log file. Overrides the configuration log_filename option. Default is standard output.

    --map dataset_mapfile or mapfile_directory: Delete all datasets in dataset_map. 'dataset_map' is a map file as generated by esgscan_directory.
        Use either a single mapfile as input or scan a directory recursively to unpublish all containing mapfiles.
        If this option is used, trailing command line arguments (dataset_names) are ignored.

    --no-republish: Do not republish the previous dataset version if the latest version is deleted. By default,
      if this option is not set, and the latest dataset version is deleted, the most recent version will be
      rescanned and republished.

    --no-thredds-reinit
        The THREDDS server is not reinitialized. This option is useful to remove one or more dataset catalogs
        without the overhead of a TDS reinitialization call, followed by a single reinitialization call.
        Use this option with caution, as it will leave the database and THREDDS catalogs in an inconsistent
        state.

    --project project_id:
        Project identifier. This option is mandatory.

    --rest-api
        Publish using the RESTful publication services. The configuration file option ``rest_service_url''
        defines the service location. If it is undefined, the service location is
        https://HOST/esg-search/ws, where HOST is derived from configuration option ``hessian_service_url''.

        Note: With this option, the dataset_name should have the form 'dataset_id|data_node'.

        If neither --rest-api nor --hessian-api are specified, then config file option use_rest_api is used,
        and if this option is not found, the **legacy** API is used by default.
        ** This contrasts with 'esgpublish' but it is intended to change this behavior in future. **

    --skip-index: Perform local (node) operations, skip SOLR index dataset delete.

    --skip-thredds: Do not remove associated THREDDS catalogs, or reinitialize the TDS server.
      By default, THREDDS catalogs are removed.

    --sync-thredds: Delete all THREDDS catalogs ('orphaned' catalogs) without a corresponding database entry.

    --test
        Flag the dataset(s) as test publication. This option must be set for unpublication of test data that use PIDs.
        WARNING: Do not use this option in production, all PIDs flagged as test will be deleted after a while.

    --use-list filelist
        Read the list of dataset names from a file, containing one
        dataset name per line. If the filelist is '-', read from
        standard input.

"""

def cleanupCatalogs(arg, dirname, names):
    """
    Directory visitor (os.path.walk callback signature) that interactively deletes
    'orphaned' THREDDS catalog files.

    arg
      Container of the full paths of the catalogs to keep; only membership is tested.

    dirname
      Directory being visited.

    names
      Names of the entries found in dirname.

    Every name with the suffix .xml, other than catalog.xml, whose full path is not in arg
    is offered for deletion on the console and is unlinked when the answer is 'y'.
    Returns None.
    """
    for name in names:
        suffix = os.path.splitext(name)[1]
        if suffix != ".xml" or name == "catalog.xml":
            continue

        fullname = os.path.join(dirname, name)
        # Membership test instead of the deprecated dict.has_key().
        if fullname in arg:
            continue

        ans = raw_input("The catalog %s is 'orphaned', delete? [y|n]: " % fullname)
        # Tolerate stray whitespace around the typed answer.
        if ans.strip().lower() == 'y':
            # Single-argument print(...) prints the same text under Python 2 and 3.
            print("Deleting %s" % fullname)
            os.unlink(fullname)


def republishDataset(result, Session, thredds, gatewayOp):
    """
    Republish the datasets that a delete operation flagged for republication.

    result is a two-element sequence whose second element is the list of dataset
    tuples to republish (main() passes the return value of deleteDatasetList).
    Republication is needed when the latest version of a dataset was deleted from
    the database and it was not the only version: the previous version is rescanned
    to generate the aggregations (unless the user has overridden this with
    skip_aggregations in the project config) and is then published again.

    Nothing is done when the republish list is empty.
    """
    _, pending = result
    if len(pending) == 0:
        return

    # Register project handlers.
    registerHandlers()
    info("Republishing modified datasets:")

    versionIds = [generateDatasetVersionId(entry) for entry in pending]
    datasetMap, offlineMap, extras = queryDatasetMap(versionIds, Session, True)
    namesToPublish = datasetMap.keys()

    # Rescan with forced aggregation; the returned dataset objects are not needed here.
    iterateOverDatasets(None, datasetMap, None, pending, Session, "time", UPDATE_OP, None, {}, offlineMap, {},
                        forceAggregate=True, extraFields=extras)

    # Don't republish if skipping the gateway op
    publishDatasetList(namesToPublish, Session, publish=(gatewayOp != NO_OPERATION), thredds=thredds)


def _readDatasetNameList(path):
    """
    Read the dataset identifiers for the --use-list option.

    path is the name of a file with one dataset name per line, or '-' to read
    standard input. Each non-blank line is stripped and converted with
    parseDatasetVersionId; the converted values are returned as a list, in input order.
    """
    if path == '-':
        lines = sys.stdin.readlines()
    else:
        # Close the list file deterministically instead of leaking the handle.
        with open(path) as namelist:
            lines = namelist.readlines()

    # Blank lines are skipped so that they are not handed on as empty dataset names.
    return [parseDatasetVersionId(line.strip()) for line in lines if line.strip()]


def main(argv):
    """
    Command-line entry point of esgunpublish.

    argv is the argument list without the program name. The options are parsed,
    the configuration is loaded and a database session factory is created, the
    myproxy certificate is renewed if necessary, and the datasets named on the
    command line, in a list file (--use-list), in a mapfile or in a directory of
    mapfiles (--map) are passed to deleteDatasetList. With --sync-thredds, catalog
    files below thredds_root that have no database entry are then offered for deletion.

    Exits with status 1 on invalid options and raises ESGPublishError when --project
    is missing, the --map argument is neither a file nor a directory, or no
    configuration is found.
    """

    try:
        args, lastargs = getopt.getopt(argv, "hi:", ['data-node=', 'database-delete', 'database-only', 'delete', 'echo-sql', 'help', 'hessian-api',
                                                     'keep-credentials', 'map=', 'no-republish', 'no-thredds-reinit', 'skip-gateway', 'skip-index',
                                                     'las', 'log=', 'project=', 'rest-api', 'retract', 'skip-thredds', 'sync-thredds', 'test', 'use-list='])
    except getopt.error as e:
        print(e)
        print(usage)
        # An invalid command line is an error, so do not report success to the caller.
        sys.exit(1)

    deleteAll = False
    datasetMap = None
    mapfileDir = None
    deleteDset = False
    echoSql = False
    init_dir = '/esg/config/esgcet/'
    gatewayOp = UNINITIALIZED
    las = False
    log_filename = None
    republish = True
    restApi = None
    restApiDefault = True
    thredds = True
    syncThredds = False
    useList = False
    useListPath = None
    threddsReinit = True
    projectName = None
    keep_credentials = False
    test_publication = False
    data_node = None

    for flag, arg in args:
        if flag == '--data-node':
            data_node = arg
        elif flag=='--database-delete':
            deleteDset = True
        elif flag=='--database-only':
            gatewayOp = NO_OPERATION
            thredds = False
            deleteDset = True
        elif flag=='--echo-sql':
            echoSql = True
        elif flag in ['-h', '--help']:
            print(usage)
            sys.exit(0)
        elif flag=='--hessian-api':
            # Must assign restApi (not a misspelled local), otherwise the explicit flag
            # is silently overridden by the use_rest_api configuration option below.
            restApi = False
            restApiDefault = False
        elif flag=='-i':
            init_dir = arg
        elif flag == '--keep-credentials':
            keep_credentials = True
        elif flag=='--map':
            if os.path.isfile(arg):
                datasetMap = readDatasetMap(arg)
            elif os.path.isdir(arg):
                mapfileDir = arg
            else:
                raise ESGPublishError("Not a valid file or directory: %s" % arg)
        elif flag=='--skip-gateway':
            gatewayOp = NO_OPERATION
        elif flag=='--skip-index':
            gatewayOp = NO_OPERATION
        elif flag=='--delete':
            if gatewayOp == UNPUBLISH:
                print("Error: invalid arguments: must use either --delete or --retract but not both!")
                sys.exit(1)
            gatewayOp = DELETE
        elif flag=='--retract':
            if gatewayOp == DELETE:
                print("Error: invalid arguments: must use either --delete or --retract but not both!")
                sys.exit(1)
            gatewayOp = UNPUBLISH
        elif flag=='--las':
            las = True
        elif flag=='--log':
            log_filename = arg
        elif flag=='--no-republish':
            republish = False
        elif flag=='--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--project':
            projectName = arg
        elif flag=='--rest-api':
            restApi = True
        elif flag=='--skip-thredds':
            thredds = False
        elif flag=='--sync-thredds':
            syncThredds = True
        elif flag == '--test':
            test_publication = True
        elif flag=='--use-list':
            useList = True
            useListPath = arg

    # The project must be specified
    if projectName is None:
        raise ESGPublishError("Must specify project with --project")

    # Report the missing unpublication mode before touching the configuration, the
    # database or the myproxy credentials.
    if gatewayOp == UNINITIALIZED:
        print("esgunpublish usage error:  index unpublication mode not set.  Must use one of following options:")
        print("--skip-index, --database-only, --delete, --retract")
        sys.exit(1)

    # Load the configuration and set up a database connection
    config = loadConfig(init_dir)
    # Check before the first use of config, so a missing configuration gives this
    # message rather than an AttributeError.
    if config is None:
        raise ESGPublishError("No configuration file found.")

    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('DEFAULT', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    if gatewayOp != NO_OPERATION and not keep_credentials:
        # check cert and generate a new one, if expired
        myproxy_username = get_myproxy_value_from_config(config, 'username')
        myproxy_hostname = get_myproxy_value_from_config(config, 'hostname')
        myproxy_password = get_myproxy_value_from_config(config, 'password')
        if not check_cert(config, myproxy_username):
            info('Invalid myproxy certificate, renewing...')
            try:
                logon(config, myproxy_username, myproxy_password, myproxy_hostname)
            except Exception:
                # Best effort: continue with a warning, but let KeyboardInterrupt and
                # SystemExit propagate.
                warning('Certificate generation failed, please try to run myproxy-logon manually...')

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=restApiDefault)

    if not data_node:
        data_node = urlparse.urlparse(config.get('DEFAULT', 'thredds_url')).netloc
    elif data_node == 'all':
        if not restApi:
            data_node = None
        else:
            data_node = urlparse.urlparse(config.get('DEFAULT', 'thredds_url')).netloc
            warning('Cannot unpublish from multiple data nodes using REST, unpublishing from %s.' % (data_node))

    # Register project handlers
    registerHandlers(projectName)
    handler = getHandlerByName(projectName, None, Session)

    # Check if project uses PIDs and start messaging thread
    pid_connector = None
    try:
        project_config_section = "config:%s" % projectName
        pid_prefix = handler.check_pid_avail(project_config_section, config)
        if pid_prefix:
            pid_connector = establish_pid_connection(pid_prefix, test_publication, project_config_section, config, handler, publish=False)
            pid_connector.start_messaging_thread()

        def deleteDatasets(names, reinit):
            # Single place for the options shared by every deleteDatasetList call below.
            return deleteDatasetList(names, Session, gatewayOp, thredds, las, deleteDset, deleteAll=deleteAll,
                                     republish=republish, reinitThredds=reinit, restInterface=restApi,
                                     pid_connector=pid_connector, project_config_section=project_config_section,
                                     data_node=data_node)

        if datasetMap is None and mapfileDir is None:
            if useList:
                datasetNames = _readDatasetNameList(useListPath)
            else:
                datasetNames = [parseDatasetVersionId(item) for item in lastargs]
            # NOTE(review): unlike the two mapfile branches, republishDataset is not called
            # here even when republish is set -- confirm whether that is intentional.
            deleteDatasets(datasetNames, threddsReinit)

        elif mapfileDir is not None:
            for root, dirs, files in os.walk(mapfileDir, followlinks=True):
                for mapfile in files:
                    mapfilePath = os.path.join(root, mapfile)
                    try:
                        mapfileContents = readDatasetMap(mapfilePath)
                    except Exception:
                        error("Skipping %s: File does not match ESGF mapfile format." % mapfilePath)
                        continue
                    # THREDDS is reinitialized once after the walk, not per mapfile.
                    result = deleteDatasets(sorted(mapfileContents.keys()), False)

                    if republish:
                        republishDataset(result, Session, thredds, gatewayOp)

            # reinitialize THREDDS catalog only once after processing all mapfiles;
            # --skip-thredds / --database-only must not trigger a reinitialization.
            if thredds and threddsReinit:
                updateThreddsMasterCatalog(Session)
                reinitializeThredds()

        else:
            result = deleteDatasets(sorted(datasetMap.keys()), threddsReinit)

            if republish:
                republishDataset(result, Session, thredds, gatewayOp)

        # Synchronize database and THREDDS catalogs
        if syncThredds:
            threddsRoot = config.get('DEFAULT', 'thredds_root')

            # Make a dictionary of catalogs from the database
            session = Session()
            try:
                subcatalogs = session.query(Catalog).select_from(join(Catalog, Dataset, Catalog.dataset_name==Dataset.name)).all()
                catdict = {}
                for catalog in subcatalogs:
                    location = os.path.join(threddsRoot, catalog.location)
                    catdict[location] = 1
            finally:
                # Release the session even if the query fails.
                session.close()

            # Scan all XML files in the threddsroot
            os.path.walk(threddsRoot, cleanupCatalogs, catdict)

        # finish PID messaging queue
        if pid_connector:
            pid_connector.finish_messaging_thread()

    except:
        # Deliberately bare: the messaging thread must be stopped on any exit path,
        # including KeyboardInterrupt; the exception is re-raised unchanged.
        if pid_connector:
            pid_connector.force_finish_messaging_thread()
        raise

# Script entry point: run main() with the command-line arguments, excluding the program name.
if __name__=='__main__':
    main(sys.argv[1:])
