#!python

import sys
import getopt
import string
import re

from lxml import etree
from urllib2 import urlopen, HTTPError

DATASET = 1
FILE = 2
MODEL = 3
AGGREGATION = 4

typeCode = {'d': DATASET,
            'f': FILE,
            'm': MODEL,
            'a': AGGREGATION,
            }
typeName = {DATASET:'Dataset',
            FILE:'File',
            MODEL:'Model',
            AGGREGATION:'Aggregation',
            }
DEFAULT_FORMAT = "narrow"
DEFAULT_CHUNKSIZE = 1000
DEFAULT_QUERY_SERVICE = "http://pcmdi.llnl.gov/esg-search/search"
DEFAULT_WGET_SERVICE = "http://pcmdi.llnl.gov/esg-search/wget"
MAX_RECORDS = 10000000                  # Maximum number of records
WGET_MAX_RECORDS = 1000

usage = """Usage:
    esgquery_index [Options]

    Query an ESGF index node. Return the results in tabular form as defined by --format.

    Various types of queries are supported:

    - faceted search (--facet-query)
    - free text search (--free-text)
    - search for available facet values (--facets)
    - specification of result type (--type)
    - return counts only (--count)
    - wget scripts (--format wget)

    The different search types can be combined in a single query. The script formulates
    an HTTP query to the SOLR query service defined by --service-url. The resulting XML
    document is parsed, and the results are extracted and displayed in the format defined
    by --format.

    By default, all possible metadata fields associated with the result objects are displayed.
    To limit the output fields, list them with the --fields option. Similarly the --limit
    option specifies the maximum number of results.

Options:

    --count
       Return the record count only.

    -d character
    --delimiter character
       Print output separated by delimiter character. Turns off pretty-print option.

    --facets [FIELD | *]
       Return all possible values of the facet that apply to the search results. If the argument is *,
       return the field names available for querying.

    --fields FIELD,FIELD,...
       Limit output to the listed fields.

    --format [narrow | wide | wget | xml]
       Output format. Default is narrow. Note: Some fields may have multiple values associated with an
       object. For example, datasets may contain multiple variables. In that case the
       wide format will only show one of the values for that field.

       narrow (default): Each line is (id, field, value)
       wide: Each line is (id, field1_value, field2_value, ...)
       wget: A wget script is returned. The script will download all files matching the query. See note.
       xml: An XML file is returned, as generated from the SOLR query. At most 1000 results may be requested.

       NOTE: wget format implies:
       - The object type is File: --type f
       - The url field is returned: --fields url
       - The number of files is the minimum of 1000 and the --limit setting.
       - --count and --facets options are ignored.

       HINT: To avoid downloading duplicate (replicated) files, include the query 'replica=false'. See Examples.

    -h, --help
       Help string.

    --limit limit
       Limit the number of records returned. Note: The limit may be larger than the internal per-query
       limit, currently 1000. If so, multiple queries are issued and the results combined.
       The default is to read all records. Also note this option does not apply to facet option
       queries (--facets).

    -o path
       Save output to a file.

    --offset num
       Start at the given record number.

    -p, --pretty-print
       Align results and print field headers. See --delimiter.

    -q FIELD=VALUE
    --facet-query FIELD=VALUE[,FIELD=VALUE...]
       Faceted query. Return all records matching the queries. This option may be used more than once.
       See Notes for more details.

    --service-url URL
       Query the service at URL. If the esgcet publisher package is present, the value of parameter
       'solr_search_service_url' is used, otherwise the default is %s.

       Similarly, if the return format is wget (--format wget), the value of parameter
       'solr_wget_service_url' is used, otherwise the default is %s.

    -t TEXT
    --free-text TEXT
       Free text query.

    --type f|d|a
       List files (f), datasets (d), or aggregations (a). Default is dataset.

    -v, --verbose
       Echo processing information.

Notes:

    - For fields that are different, the queries are combined with the AND operator. For fields that are the same,
      the queries are combined with OR. For example:

      experiment=historical,experiment=piControl,model=CanCM4

      searches for all records where (experiment is historical OR experiment is piControl) AND (model is CanCM4)

Examples:

    - Find all CMIP5 datasets for model INM-CM4. List the model, experiment, and time frequency

      esgquery_index -q project=CMIP5,model=INM-CM4 --fields model,experiment,time_frequency -p

    - Find all CMIP5 monthly decadal2001 files for model CanCM4, variable specific humidity

      esgquery_index -q project=CMIP5,experiment=decadal2001,model=CanCM4,time_frequency=mon,cf_standard_name=specific_humidity --fields id --type f -p

    - Find all metadata associated with a specific dataset  

      esgquery_index -q id='cmip5.output1.INM.inmcm4.historical.mon.atmos.Amon.r1i1p1.v20111201|pcmdi.llnl.gov'

    - Determine if a file with a specific tracking_id is in the latest version of a dataset

      esgquery_index --type f -t 'tracking_id:5d70965b-18c5-4bc1-9e88-ae4cc6b147e5' -q latest=true --fields latest

      or equivalently:

      esgquery_index --type f -q latest=true,tracking_id=5d70965b-18c5-4bc1-9e88-ae4cc6b147e5 --fields latest

    - Find all files in a given dataset

      esgquery_index --type f -q dataset_id='cmip5.output1.CCCma.CanCM4.decadal2001.mon.atmos.Amon.r9i2p1.v20111022|dap.cccma.uvic.ca' --fields id -p

    - Get the checksum of a file

      esgquery_index -v --type f -q id='cmip5.output1.CCCma.CanCM4.decadal2001.mon.atmos.Amon.r9i2p1.v20111022.cl_Amon_CanCM4_decadal2001_r9i2p1_200201-201112.nc|dap.cccma.uvic.ca' --fields checksum

    - Get the datasets stored locally on a non-default node

      esgquery_index --service-url http://esg-datanode.jpl.nasa.gov/esg-search/search -q distrib=false --fields id -p

    - Find all data nodes with any version of a dataset

      esgquery_index -q master_id=cmip5.output1.INM.inmcm4.rcp85.fx.land.fx.r0i0p0 --fields id,data_node -p

    - Find the data node having the master copy of a dataset

      esgquery_index -q master_id=cmip5.output1.INM.inmcm4.rcp85.fx.land.fx.r0i0p0,replica=false --fields version -p

    - Find the URLs of files containing variable fgco2 for experiment rcp85

      esgquery_index -v -q variable=fgco2,experiment=rcp85 --fields url --type f -p

    - Find all versions of a dataset

      esgquery_index -q master_id='cmip5.output1.NOAA-GFDL.GFDL-CM3.historicalNat.mon.atmos.Amon.r3i1p1' --fields id,version,latest -p

    - List all CMIP5 experiments

      esgquery_index --facets experiment -q project=CMIP5 -p

    - Find the CMIP5 models for which data has been published since 3/19/2012

      esgquery_index -q from=2012-03-19T00:00:00Z,project=CMIP5 --facets=model -p

    - List all facet fields

      esgquery_index --facets='*' -p

    - Find the aggregations associated with a dataset

      esgquery_index --type a -q dataset_id='obs4MIPs.CNES.AVISO.mon.v1|esg-datanode.jpl.nasa.gov' -p -v --service-url http://esg-datanode.jpl.nasa.gov/esg-search/search --fields id

    - Download a wget script

      esgquery_index -q project=CMIP5,time_frequency=fx,model=MIROC5,experiment=1pctCO2,replica=false,realm=atmos --fields id --format wget

"""%(DEFAULT_QUERY_SERVICE, DEFAULT_WGET_SERVICE)

def leng(item):
    if item is not None:
        return len(item)
    else:
        return 4

def getItemCount(header, tuples):
    itemCount = [len(item) for item in header]
    for item in tuples:
        try:
            itemCount = map(lambda x,y: max(x,y), itemCount, [leng(t) for t in item])
        except:
            print item
            raise
    return itemCount

def printQueryResult(header, tuples, out=sys.stdout, printHeaders=True):
    """Print a query result.

    header
      A list of property names.

    tuples
      A list of result tuples. Each tuple has the same length as the header, and each item in the tuple corresponds to the
      respective header item. For example, if the header is ['field1', 'field2'] then ``tuples`` might have the value [(dataset1_field1, dataset1_field2), (dataset2_field1, dataset2_field2), ...]

    out
      Output file object.

    printHeaders
      Boolean flag. If True, print header and trailer lines.
      
    """
    itemCount = getItemCount(header, tuples)
    width = reduce(lambda x,y: x+y, itemCount)+3*len(itemCount)-1
    breakline = '+'+width*'-'+'+'
    if printHeaders:
        format = '| '+reduce(lambda x,y: x+' | '+y, ["%%-%ds"%item for item in itemCount])+' |'
    else:
        format = reduce(lambda x,y: x+' | '+y, ["%%-%ds"%item for item in itemCount])
    if printHeaders:
        print >>out, breakline
        print >>out, format%tuple(header)
        print >>out, breakline
    count = 0
    for item in tuples:
        count += 1
        try:
            print >>out, format%item
        except:
            print >>out, 'Barf!'
            print >>out, item
            sys.exit(0)
    if printHeaders:
        print >>out, breakline
        print >>out, '%d results found'%count

try:
    from esgcet.query import printResult
except:
    printResult = printQueryResult
    
# Formulate a SOLR query, returning the http query string.
#
# facets: list of (string,value) tuples
# fields: list of strings
# freetext: string arg to free text query, or None
# objtype: DATASET, FILE, or MODEL
# service: search service URL
# facetValues: list of strings
def formulateQuery(facets, fields, format, freetext, objtype, service, offset, limit, facetValues=None):
    queryList = ["type=%s"%typeName[objtype]]
    if facetValues is not None:
        facetString = string.join(facetValues,',')
        queryList.append("facets=%s"%facetString)
    if freetext is not None:
        queryList.append("query=%s"%freetext)
    if len(fields)>0:
        if 'id' not in fields:
            fields.append('id')
        fieldString = string.join(fields,',')
        queryList.append("fields=%s"%fieldString)
    if len(facets)>0:
        for field,value in facets:
            queryList.append("%s=%s"%(field,value))
    queryList.append("offset=%d"%offset)
    queryList.append("limit=%d"%limit)
    queryString = string.join(queryList,'&')
    query = "%s?%s"%(service,queryString)
    return query

def readChunk(service, query):
    try:
        tree = etree.parse(query)
    except IOError as xe:
        print xe
        try:
            u = urlopen(query)
        except HTTPError as he:
            errorHtml = he.read()
            m = re.search(r'message.*?<u>(.*?)</u>', errorHtml)
            if m is not None:
                print 'Error:', m.group(1)
            sys.exit(1)
    return tree

def downloadResult(query, outpath, outpathIsStdout):
    try:
        u = urlopen(query)
    except HTTPError as he:
        errorHtml = he.read()
        m = re.search(r'message.*?<u>(.*?)</u>', errorHtml)
        if m is not None:
            print 'Error:', m.group(1)
        sys.exit(1)
    result = u.read()
    outpath.write(result)
    u.close()
    if not outpathIsStdout:
        outpath.close()

# Parse the response header for available facet values
# Returns (([valueList],), header), numFound as for parseTrailer
def parseHeader(tree):
    root = tree.getroot()
    headerElem = root[0]
    for item in headerElem.iter('arr'):
        if item.get('name')=='facet.field':
            valueList = [(sub.text,) for sub in item]
            header = ['field']
            break
    else:
        raise RuntimeError("No facet.field list found")
    result = root[1]
    numFound = int(result.get('numFound'))
    return ([valueList], header), numFound

# Parse a SOLR XML response
# tree is an lxml ElementTree
# If includeId is true, the response records include (id,'id',id) where applicable.
# Returns (results, number_found, number_documents) where
# results is a list [(objid, field, value), (objid, field, value), ...], 
# number_found is the total number of results independent of chunk size,
# and number_documents is the number of results parsed by this call.
def parseResponse(tree, includeId, scoreInFields=False):
    root = tree.getroot()
    result = root[1]
    numFound = int(result.get('numFound'))
    start = int(result.get('start'))
    newResults = []
    numDocs = 0
    for doc in result:
        objid = ''
        interResults = []               # [(field, value), (field, value), ...]
        for elem in doc:
            name = elem.get('name')
            if not scoreInFields and name=='score':
                continue
            if name=='id':
                objid = elem.text
                if includeId:
                    interResults.append(('id', objid))
            elif elem.tag=='arr':
                for sub in elem:
                    if sub.tag=='str':
                        interResults.append((name, sub.text))
            else:
                interResults.append((name, elem.text))
        results = [(objid, field, value) for field, value in interResults]
        newResults.extend(results)
        numDocs += 1
    return newResults, numFound, numDocs

# Parse the trailer for facet values and counts.
# Returns ((valueLists, header), numFound where
#   valueLists = [list1, list2, ... listn]
#   listn is a list [(facetOption, facetCount), (facetOption, facetCount), ...]
#   header = [facetName1, facetName2, ...] of same length as valueLists
#   numFound is the total number of records that would be returned
def parseTrailer(tree, facetValues, includeId):
    root = tree.getroot()
    result = root[1]
    numFound = int(result.get('numFound'))
    numResults = 0                      # No actual data records returned
    valueLists = []
    header = []
    trailer = root[2]
    if trailer.get('name') != 'facet_counts':
        raise RuntimeError("Could not find facet counts in response")
    facetFields = trailer[1]
    if facetFields.get('name')!='facet_fields':
        raise RuntimeError("Could not find facet fields in response")
    for item in facetFields:
        field = item.get('name')
        subResults = [(sub.get('name'), sub.text) for sub in item]
        valueLists.append(subResults)
        header.append(item.get('name'))
    return (valueLists, header), numFound

# Print a list of results.
# results = [(id,field,value), (id,field,value), ...]
def outputResults(results, format, header=None, prettyPrint=False, printHeaders=True, delimiter=None, out=sys.stdout):
    if format=='narrow':
        if prettyPrint:
            header = ['id', 'field', 'value']
            printResult(header, results, printHeaders=printHeaders, out=out)
        else:
            if delimiter is not None:
                for item in results:
                    print >>out, delimiter.join(item)
            else:
                for item in results:
                    print >>out, item
    elif format=='wide':
        # valuedict: (id, field) => value
        objset = set()
        fieldset = set()
        valueDict = {}
        for objid,field,value in results:
            objset.add(objid)
            fieldset.add(field)
            valueDict[(objid, field)] = value

        objlist = list(objset)
        objlist.sort()
        if 'id' in fieldset:
            fieldset.remove('id')
        fieldlist = list(fieldset)
        fieldlist.sort()
        wideResults = []
        for obj in objlist:
            wideResults.append(tuple([obj]+[valueDict.get((obj,field), '') for field in fieldlist]))
        if prettyPrint:
            printResult(['id']+fieldlist, wideResults, printHeaders=printHeaders, out=out)
        else:
            if delimiter is not None:
                for item in wideResults:
                    print >>out, delimiter.join(item)
            else:
                for item in wideResults:
                    print >>out, item
    else:
        print 'Format not yet implemented:', format
        sys.exit(1)

# Print facet results.
# results = [(v1, ..., vn), (v1, ..., vn), ...]
# header = [f1, f2, ..., fn]
def outputFacetResults(results, header, prettyPrint=False, printHeaders=True, delimiter=None, out=sys.stdout):
    if prettyPrint:
        printResult(header, results, printHeaders=printHeaders, out=out)
    else:
        if delimiter is not None:
            for item in results:
                print delimiter.join(item)
        else:
            for item in results:
                print item

def main(argv):

    global DEFAULT_QUERY_SERVICE
    global DEFAULT_WGET_SERVICE

    try:
        args, lastargs = getopt.getopt(argv, "d:ho:pq:t:v", ['count', 'delimiter=', 'facet-query=', 'facets=', 'fields=', 'format=', 'free-text=', 'help', 'limit=', 'offset=', 'pretty-print', 'service-url=', 'type=', 'verbose'])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    # Get the search URL from the publisher configuration if possible
    try:
        from esgcet.config import loadConfig

        config = loadConfig(None)
        DEFAULT_QUERY_SERVICE = config.get("DEFAULT", "solr_search_service_url", default=DEFAULT_QUERY_SERVICE)
        DEFAULT_WGET_SERVICE = config.get("DEFAULT", "solr_wget_service_url", default=DEFAULT_WGET_SERVICE)
    except:
        pass

    allFacets = False                   # facets=*
    countOnly = False
    delim = None
    facets = []
    facetValues = None
    fields = []
    format = DEFAULT_FORMAT
    freetext = None
    includeId = False
    objtype = DATASET
    offset = 0
    outpath = sys.stdout
    outpathIsStdout = True
    prettyPrint = False
    service = DEFAULT_QUERY_SERVICE
    userLimit = MAX_RECORDS
    verbose = False
    wgetScript = False
    wgetService = DEFAULT_WGET_SERVICE
    for flag, arg in args:
        if flag=='--count':
            countOnly = True
        elif flag in ['-d', '--delimiter']:
            delim = arg
            prettyPrint = False
        elif flag=='--facets':
            facetList = arg.split(',')
            facetValues = [item.strip() for item in facetList]
            allFacets = (facetValues[0]=='*')
        elif flag=='--fields':
            fieldList = arg.split(',')
            fields = [item.strip() for item in fieldList]
        elif flag=='--format':
            if arg not in ['narrow', 'wide', 'wget', 'xml']:
                raise RuntimeError("Invalid format: %s"%arg)
            format = arg
            if arg=='wget':
                wgetScript = True
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='--limit':
            try:
                userLimit = int(arg)
            except:
                raise RuntimeError("Invalid limit: %s"%arg)
        elif flag=='-o':
            outpath = open(arg, 'w')
            outpathIsStdout = False
        elif flag in ['-p', '--pretty-print']:
            prettyPrint = True
        elif flag=='--offset':
            offset = int(arg)
        elif flag in ['-q', '--facet-query']:
            queries = arg.split(',')
            for q in queries:
                f,v = q.split('=')
                facets.append((f.strip(), v.strip()))
        elif flag=='--service-url':
            service = arg
            wgetService = arg
        elif flag in ['-t', '--free-text']:
	    freetext = arg
	elif flag=='--type':
            try:
                objtype = typeCode[arg]
            except:
                raise RuntimeError("Invalid return type: %s"%arg)
        elif flag in ['-v', '--verbose']:
            verbose = True

    # If a wget script is requested:
    # - Object type is File
    # - fields are url
    # - limit is min(1000, limit)
    if wgetScript:
        objtype = FILE
        fields = ['url']
        userLimit = min(userLimit, WGET_MAX_RECORDS)
        countOnly = False
        allFacets = False
        facetValues = None
        query = formulateQuery(facets, fields, format, freetext, objtype, wgetService, offset, userLimit, facetValues=facetValues)
        if verbose:
            print >>sys.stderr, 'Query: ', query
        downloadResult(query, outpath, outpathIsStdout)
        return

    # XML output
    if format=='xml':
        userLimit = min(userLimit, DEFAULT_CHUNKSIZE)
        query = formulateQuery(facets, fields, format, freetext, objtype, service, offset, userLimit, facetValues=facetValues)
        if verbose:
            print >>sys.stderr, 'Query: ', query
        downloadResult(query, outpath, outpathIsStdout)
        return

    # If returning id only, use wide format
    if fields==['id']:
        format = 'wide'
        includeId = True

    # For facet value queries, use wide format.
    if facetValues is not None:
        format = 'wide'

    # While remaining data:
    fullResults = []
    numFound = 0
    moredata = True
    nread = 0
    nleft = userLimit
    chunksize = DEFAULT_CHUNKSIZE
    while moredata:

        # Formulate a query
        if not (countOnly or facetValues is not None):
            limit = min(nleft, chunksize)
        else:
            limit = 0
        query = formulateQuery(facets, fields, format, freetext, objtype, service, offset, limit, facetValues=facetValues)
        if verbose:
            print >>sys.stderr, 'Query: ', query

        # Read a chunk
        chunk = readChunk(service, query)

        # Parse the response. For facet value searches, parse the response trailer
        if facetValues is None:
            scoreInFields = ('score' in fields)
            results, numFound, numResults = parseResponse(chunk, includeId, scoreInFields)
            fullResults.extend(results)
        else:
            numResults = 0
            if allFacets:
                fullResults, numFound = parseHeader(chunk)
            else:
                fullResults, numFound = parseTrailer(chunk, facetValues, includeId)

        # More data if some results were found and the number of records read < total
        nread += numResults
        nleft -= limit
        moredata = (numResults>0) and (nread<min(numFound, userLimit))
        offset += limit

    # Output the results
    if not (countOnly or facetValues is not None):
        outputResults(fullResults, format, prettyPrint=prettyPrint, printHeaders=True, delimiter=delim, out=outpath)
    elif facetValues is not None:
        for valueList, head in zip(fullResults[0], fullResults[1]):
            if allFacets:
                header = (head,)
            else:
                header=(head,'count')
            outputFacetResults(valueList, header, prettyPrint=prettyPrint, printHeaders=True, delimiter=delim, out=outpath)
    else:
        print numFound

    if not outpathIsStdout:
        outpath.close()

if __name__=='__main__':
    main(sys.argv[1:])
