#!python

# Copyright 2013-2015 University of Warsaw
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A utility for browsing and simple manipulation of Avro-based files"""

from __future__ import print_function

import argparse
import sys

from argparse import RawTextHelpFormatter
from collections import OrderedDict

import avro.schema

from avroknife.file_system import FileSystemPathFactory, hdfs_filesystem_warning
from avroknife.printer import FilePrinter, StdoutPrinter
from avroknife.error import error
from avroknife.data_store import DataStore
from avroknife.operations import extract, copy, count, get_schema, to_json
from avroknife.record_selector import RecordSelector, Range, EqualitySelection
from avroknife import __version__, __description__

class ModesWithOptions(object):
    """Registry of execution modes and the command-line options each accepts.

    The registry is used both to generate the "valid in modes" suffix of the
    option help texts and to reject option/mode combinations that make no
    sense after the arguments have been parsed.
    """

    def __init__(self):
        # Insertion order is the order in which modes are listed in
        # user-facing messages.
        self.__modes = OrderedDict([
            ('getschema', ['output']),
            ('tojson', ['output', 'limit', 'select', 'index', 'pretty', 'schema']),
            ('copy', ['output', 'limit', 'select', 'index', 'schema']),
            ('extract', ['output', 'limit', 'select', 'index', 'schema', 'value_field', 'name_field', 'create_dirs']),
            ('count', ['output', 'limit', 'select', 'index'])])
        # Inverted mapping: option name -> list of modes accepting it
        # (modes keep the order they have in self.__modes).
        # `items()` is used instead of `iteritems()` so that the class works
        # on both Python 2 and Python 3.
        self.__options = {}
        for mode, options in self.__modes.items():
            for option in options:
                self.__options.setdefault(option, []).append(mode)

    def __get_modes(self):
        """Return the names of all known modes, in declaration order."""
        return list(self.__modes)

    def get_modes_for_option_string(self, option):
        """Return a help-text sentence naming the modes that accept `option`.

        Args:
            option: Name of the option as stored by argparse (e.g. 'limit').

        Returns:
            'Can be used in all modes.', 'Can be used only in "<mode>" mode.'
            or 'Valid in modes: "<m1>", "<m2>".' depending on how many modes
            accept the option.
        """
        modes = self.__options.get(option, [])
        if len(modes) == len(self.__modes):
            return 'Can be used in all modes.'
        quoted_modes = ['"{}"'.format(m) for m in modes]
        if len(modes) == 1:
            return 'Can be used only in {} mode.'.format(quoted_modes[0])
        return 'Valid in modes: {}.'.format(', '.join(quoted_modes))

    def check_if_proper_options_are_used(self, args):
        """Validate that the parsed options are legal for the selected mode.

        Args:
            args: Parsed arguments namespace. It must expose `mode` and one
                attribute for every option known to this registry.

        Returns:
            None when the combination is valid.

        Raises:
            SystemExit: with status 2, after printing a message to stderr,
                when the mode is unknown, an option is used in a mode that
                does not accept it, or a mandatory option is missing.
        """
        mode = args.mode
        if mode not in self.__modes:
            self.__parsing_error(
                'Unknown mode selected. Choose one of these: {}.'
                .format(', '.join(self.__get_modes())))
        given = vars(args)
        for option, valid_modes in self.__options.items():
            option_value = given[option]
            # None means a value option was not supplied; False means a flag
            # was not set. Only these two count as "option not used".
            if option_value is None or option_value is False:
                continue
            if mode not in valid_modes:
                self.__incorrect_option_error(option, valid_modes)
        # The error helper terminates the process, so no `raise` is needed.
        if mode == 'copy' and args.output is None:
            self.__parsing_error(
                'The "output" option is mandatory in "copy" mode')
        if mode == 'extract' and args.value_field is None:
            self.__parsing_error(
                'The "value_field" is mandatory in "extract" mode')
        if mode == 'extract' and args.output is None and \
                (args.name_field is not None or args.create_dirs is True):
            self.__parsing_error(
                'You can use "name_field" and "create_dirs" file system-related '
                'options in the "extract" mode only if the "output" is specified')

    @staticmethod
    def __incorrect_option_error(option, valid_modes):
        """Report that `option` was used outside of `valid_modes` and exit."""
        quoted_modes = ['"{}"'.format(m) for m in valid_modes]
        ModesWithOptions.__parsing_error(
            'The "{}" option can be used only in '
            'conjunction with the following modes: {}'
            .format(option, ', '.join(quoted_modes)))

    @staticmethod
    def __parsing_error(message):
        """Print an argument-parsing error to stderr and exit with status 2."""
        print('ERROR when parsing command-line arguments: {}'.format(message),
                file=sys.stderr)
        sys.exit(2)

def parse(argv=None):
    """Parse and validate the command-line arguments.

    Args:
        argv: Optional list of argument strings to parse. When it is None
            (the default) argparse reads the process arguments, which is
            what the script entry point relies on.

    Returns:
        The argparse namespace with `mode`, `data_store_dir` and one
        attribute per optional argument. Value options default to None and
        flags default to False.

    Note:
        argparse exits the process on malformed input, and the mode/option
        consistency check performed at the end exits with status 2 on an
        invalid combination, so this function returns only valid arguments.
    """
    # The backslash in the example path is escaped explicitly; the rendered
    # help text still shows a single backslash.
    description = (
        '{}.\n'.format(__description__) +
        'Version: {}\n'.format(__version__) +
        '\n'
        'All the parameters that expect a path can be given either a local path\n'
        'or HDFS path; by default the path is assumed to be a HDFS path.\n'
        'If you want to refer to a path in local file system, you have to prefix it with\n'
        '"local:", e.g. "local:dir1\\dir2".')
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('mode',
        help='Mode of the execution. The following ones are available:\n'
            'getschema - prints out schema of data store.\n'
            'tojson\t- dumps data store as JSON, one record per line,\n'
            '\t  "bytes" fields are dumped in Base64 format.\n'
            'copy\t- dumps selected records from data store\n'
            '\t  as a new data store.\n'
            'extract\t- dumps a field from selected records\n'
            '\t  to files or to stdout, one field per file.\n'
            '\t  By default, filenames are equal to indices of\n'
            '\t  the records but they can be equal to\n'
            '\t  values of a selected field as well.\n'
            'count\t- prints number of records inside a data store.\n'
            '\n')
    parser.add_argument('data_store_dir',
        help='Path to directory corresponding to data store')

    modes_spec = ModesWithOptions()

    def valid_modes(option):
        # Trailing help sentence listing the modes that accept the option.
        return modes_spec.get_modes_for_option_string(option)

    parser.add_argument('--index', default=None, metavar='RANGE',
        help='Specifies indexes of records to be processed\n'
            '(the values start from "0").\n'
            'This can have the form of, e.g.\n'
            '"0","45","2-45","-45","2-".\n'
            'When not set, all records are processed.\n' +
            valid_modes('index'))
    parser.add_argument('--output', default=None, metavar='PATH',
        help='Path to the produced data.\n' +
            valid_modes('output'))
    parser.add_argument('--limit', default=None, metavar='NUMBER',
        help='Maximum number of records to be used.\n' +
            valid_modes('limit'))
    parser.add_argument('--select', default=None, metavar='FIELD=VALUE',
        help='Retrieve records matching the condition.\n' +
            valid_modes('select'))
    parser.add_argument('--value_field', default=None, metavar='NAME',
        help='Field with values to be extracted.\n' +
            valid_modes('value_field'))
    parser.add_argument('--name_field', default=None, metavar='NAME',
        help='Field which values will be used as names\n'
            'of the files containing extracted data.\n' +
            valid_modes('name_field'))
    parser.add_argument('--create_dirs', default=False, action='store_true',
        help='Extracted fields corresponding to the same name\n'
            'are to be placed in the same directory.\n'
            'Name of directory is equal to the `name_field`;\n'
            'names of the files inside are consecutive numbers.\n' +
            valid_modes('create_dirs'))
    parser.add_argument('--schema', default=None, metavar='PATH',
        help='Specifies a schema onto which a data store will\n'
            'be projected (i.e. the reader schema).\n' +
            valid_modes('schema'))
    parser.add_argument('--pretty', default=False, action='store_true',
        help='Produce output as a pretty-printed, valid JSON document.\n' +
            valid_modes('pretty'))
    # With argv=None argparse falls back to sys.argv[1:].
    args = parser.parse_args(argv)
    modes_spec.check_if_proper_options_are_used(args)
    return args

def main():
    """Entry point: parse the arguments and execute the selected mode.

    The path-like arguments are converted into file system path objects,
    the data store location is checked, `--limit` is converted to an integer
    and finally the operation matching `args.mode` is run. Textual results
    go to stdout unless `--output` is given.

    Raises:
        SystemExit: with status 2 when the data store path does not exist
            or is not a directory.
        ValueError: when the value given to `--limit` is not an integer.
        TypeError: re-raised when parsing the supplied schema raises it.
    """
    printer = StdoutPrinter()
    args = parse()
    if args.data_store_dir is not None:
        args.data_store_dir = FileSystemPathFactory.create(args.data_store_dir)
    if args.output is not None:
        args.output = FileSystemPathFactory.create(args.output)
        printer = FilePrinter(args.output)
    if args.schema is not None:
        args.schema = FileSystemPathFactory.create(args.schema)

    if not args.data_store_dir.exists():
        error('"{}" does not exist; {}'
                .format(args.data_store_dir, hdfs_filesystem_warning()))
        sys.exit(2)
    if not args.data_store_dir.is_dir():
        error('"{}" is not a directory.'.format(args.data_store_dir))
        sys.exit(2)

    if args.limit is not None:
        # An explicitly supplied value must be an integer; an empty string
        # is reported as invalid instead of silently meaning "no limit".
        try:
            args.limit = int(args.limit)
        except ValueError:
            error('argument supplied to "--limit" option is not a valid integer!')
            raise
    else:
        # No limit requested: use the largest native integer. `sys.maxint`
        # exists only on Python 2, `sys.maxsize` is the portable fallback.
        args.limit = getattr(sys, 'maxint', sys.maxsize)

    if args.schema:
        # Parse the schema up front so that a broken schema is reported
        # before any data is touched; the parsed result itself is not used.
        # NOTE(review): the object returned by open() is not closed here,
        # and only TypeError is reported -- confirm which exceptions
        # avro.schema.parse raises for malformed input.
        try:
            avro.schema.parse(args.schema.open().read())
        except TypeError:
            error('supplied schema cannot be parsed!')
            raise

    equality_selection = None
    if args.select is not None:
        equality_selection = EqualitySelection(args.select)

    record_selector = RecordSelector(
            Range(args.index), equality_selection, args.limit)
    data_store = DataStore(args.data_store_dir, args.schema)

    mode = args.mode
    if mode == 'getschema':
        printer.print(get_schema(data_store))
    elif mode == 'tojson':
        to_json(data_store, record_selector, printer, args.pretty)
    elif mode == 'copy':
        copy(data_store, record_selector, args.output)
    elif mode == 'extract':
        extract(data_store, record_selector, args.value_field, args.name_field,
            args.create_dirs, args.output)
    elif mode == 'count':
        printer.print(str(count(data_store, record_selector)))

# Run the tool only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()
