#!/usr/bin/env python

import sys
import os
import getopt

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from esgcet.publish import readDatasetMap
from esgcet.publish.utility import getTypeAndLen
from esgcet.config import loadConfig, initLogging, registerHandlers

from esgcet.exceptions import *
from esgcet.messaging import debug, info, warning, error, critical, exception
from esgcet.model import Dataset

# Help text for the command line tool. main() prints it when option parsing
# fails or when the command line does not name exactly one mapfile.
usage = """Usage:
    esgupdate_metadata mapfile

    Update the local database with metadata defined in a mapfile. This is particularly useful
    for publishing checksums for existing datasets.

Arguments:
    mapfile: A file mapping dataset_ids to absolute paths, as generated by esgscan_directory.

Options:

    --echo-sql: Echo SQL commands

    -v, --verbose
        Print messages.

Notes:

    (1) The mapfile is a text file, with one line per file. Each line has the form:

        dataset_name | absolute_path | byte_length [ | property=value [ | property=value ...]]

        For adding checksum information, the properties are:

        checksum=...
        checksum_type=MD5

        Modification times (epochal times) are indicated with the mod_time property. For example:

        cmip5.output1.INM.inmcm4.1pctCO2.day.atmos.day.r1i1p1 | /foo/bar/output1/INM/inmcm4/1pctCO2/day/atmos/day/r1i1p1/huss/1/huss_day_inmcm4_1pctCO2_r1i1p1_20900101-20991231.nc | 315462976 | mod_time=1276872789.000000 | checksum=7fcd959a4bb57e4079c8e65a7a5d0499 | checksum_type=MD5
        cmip5.output1.INM.inmcm4.1pctCO2.day.atmos.day.r1i1p1 | /foo/bar/output1/INM/inmcm4/1pctCO2/day/atmos/day/r1i1p1/huss/1/huss_day_inmcm4_1pctCO2_r1i1p1_21000101-21091231.nc | 315462976 | mod_time=1276872919.000000 | checksum=6f805cbee324d7151c95c752f5d8352e | checksum_type=MD5

"""

def _buildVersionIndex(session, datasetKeys):
    """Look up the database objects for every dataset version named in the mapfile.

    Arguments:
        session: SQLAlchemy session used to query Dataset rows.
        datasetKeys: iterable of (dataset_name, version) tuples.

    Returns a pair of dictionaries:
        (dataset_name, version, basename) => file_version_obj
        (dataset_name, version) => dataset_version_obj

    Raises ESGPublishError if a dataset or dataset version cannot be found, or
    if two files of the same dataset version share a basename.
    """
    fileVersionDict = {}
    dsetVersionDict = {}
    for datasetName, version in datasetKeys:
        dset = session.query(Dataset).filter_by(name=datasetName).first()
        if dset is None:
            # first() yields None when no row matches; report it explicitly
            # rather than failing with an AttributeError on the next line.
            raise ESGPublishError("Dataset not found: %s"%datasetName)
        dsetVersion = dset.getVersionObj(version)
        if dsetVersion is None:
            # Defensive: guards the .files access below in case the lookup
            # reports a missing version by returning None.
            raise ESGPublishError("Dataset version not found: dataset=%s, version=%s"%(datasetName, version))
        dsetVersionDict[(datasetName, version)] = dsetVersion
        for fvobj in dsetVersion.files:
            basename = os.path.basename(fvobj.getLocation())
            key = (datasetName, version, basename)
            if key in fileVersionDict:
                raise ESGPublishError("Duplicate file: %s, dataset=%s, version=%s"%(basename, datasetName, version))
            fileVersionDict[key] = fvobj
    return fileVersionDict, dsetVersionDict


def _applyExtraFields(extraFields, fileVersionDict, dsetVersionDict, verbose):
    """Copy checksum and tech-notes properties from the mapfile onto database objects.

    Arguments:
        extraFields: dictionary (dataset_name, version, path, attribute) => attribute_value.
        fileVersionDict: dictionary (dataset_name, version, basename) => file_version_obj.
        dsetVersionDict: dictionary (dataset_name, version) => dataset_version_obj.
        verbose: if true, print a message for every update.

    Only the 'checksum', 'tech_notes' and 'dataset_tech_notes' attributes
    trigger an update; their companion attributes ('checksum_type',
    'tech_notes_title', 'dataset_tech_notes_title') are read alongside them,
    and every other attribute is ignored.

    Raises ESGPublishError if a mapfile entry refers to a file that is not in
    the index, or if a checksum is given without a checksum_type.
    """
    for key, value in extraFields.items():
        datasetName, version, location, attrname = key
        basename = os.path.basename(location)
        try:
            fvobj = fileVersionDict[(datasetName, version, basename)]
        except KeyError:
            raise ESGPublishError("File not found in database: %s, dataset=%s, version=%s"%(location, datasetName, version))
        dsetVersionObj = dsetVersionDict[(datasetName, version)]
        if attrname=='checksum':
            try:
                checksum_type = extraFields[(datasetName, version, location, 'checksum_type')]
            except KeyError:
                raise ESGPublishError("checksum_type not set for file=%s, dataset=%s, version=%s"%(location, datasetName, version))
            current_csum = fvobj.getChecksum()
            # Warn only when an existing, different checksum is overwritten.
            if current_csum not in (None, value):
                warning("checksum = %s for dataset=%s, file=%s, replacing with new checksum=%s"%(current_csum, datasetName, basename, value))
            if verbose:
                print("Setting checksum=%s, type=%s for dataset=%s, file=%s"%(value, checksum_type, datasetName, basename))
            fvobj.checksum = value
            fvobj.checksum_type = checksum_type
        elif attrname=='tech_notes':
            tech_notes_title = extraFields.get((datasetName, version, location, 'tech_notes_title'))
            if verbose:
                print("Setting tech_notes=%s, tech_notes_title=%s for dataset=%s, file=%s"%(value, tech_notes_title, datasetName, basename))
            fvobj.tech_notes = value
            fvobj.tech_notes_title = tech_notes_title
        elif attrname=='dataset_tech_notes':
            dataset_tech_notes_title = extraFields.get((datasetName, version, location, 'dataset_tech_notes_title'))
            if verbose:
                print("Setting dataset_tech_notes=%s, dataset_tech_notes_title=%s for dataset=%s"%(value, dataset_tech_notes_title, datasetName))
            dsetVersionObj.tech_notes = value
            dsetVersionObj.tech_notes_title = dataset_tech_notes_title


def main(argv):
    """Update file and dataset metadata in the local database from a mapfile.

    Arguments:
        argv: command line arguments, excluding the program name. Accepts the
            options --echo-sql and -v/--verbose followed by exactly one
            mapfile path.

    Prints the usage text and exits if the options cannot be parsed or the
    number of positional arguments is not one. Raises ESGPublishError if the
    mapfile refers to datasets, versions or files that are not in the
    database, or gives a checksum without a checksum_type. All updates are
    committed together; nothing is committed if an error occurs.
    """
    try:
        args, lastargs = getopt.getopt(argv, "v", ['echo-sql', 'verbose'])
    except getopt.error as exc:
        print(exc)
        print(usage)
        # NOTE: exit status 0 is kept for compatibility with existing callers.
        sys.exit(0)

    echoSql = False
    verbose = False
    for flag, arg in args:
        if flag=='--echo-sql':
            echoSql = True
        elif flag in ['-v', '--verbose']:
            verbose = True

    if len(lastargs)!=1:
        print(usage)
        sys.exit(0)

    datasetMapfile = lastargs[0]
    init_dir = '/esg/config/esgcet/'

    # Load the configuration and set up a database connection
    config = loadConfig(init_dir)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('DEFAULT', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)
    session = Session()

    try:
        # Register project handlers
        registerHandlers()

        # Read the mapfile: (dataset_name, version) => [(path, size), (path, size), ...]
        # and extraFields: (dataset_name, version, path, attribute) => attribute_value
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = sorted(dmap.keys())

        # Index the existing database objects, then apply the mapfile entries.
        fileVersionDict, dsetVersionDict = _buildVersionIndex(session, datasetNames)
        _applyExtraFields(extraFields, fileVersionDict, dsetVersionDict, verbose)

        session.commit()
    finally:
        # Always release the connection; closing without a commit discards
        # any pending changes.
        session.close()

# Script entry point: pass the command line arguments, without the program
# name, to main() when this file is run directly.
if __name__=='__main__':
    main(sys.argv[1:])
