#!/usr/bin/env python

import sys
import argparse
from glob import glob

from lxml import etree

from hocr.parse import hocr_page_iterator
from hocr.util import get_header_footer


# Namespace declaration that lxml emits when a page is serialized on its own;
# it is stripped from the output (see _serialize_page).
_XHTML_NS_DECL = b' xmlns="http://www.w3.org/1999/xhtml"'


def _renumber_page(page, page_no):
    """Rewrite the ids of *page* and its descendants in place.

    The page element is retagged as ``div`` and given the id
    ``page_<page_no>``.  Blocks (``ocr_carea``), paragraphs (``ocr_par``),
    lines (every direct child of a paragraph) and words (``ocrx_word``) get
    ids of the form ``<kind>_<page_no>_<n>``, where ``n`` counts elements of
    that kind from zero within this page.
    """
    page.tag = 'div'
    page.attrib['id'] = 'page_%.06d' % page_no

    # These counters run across the whole page; they are not reset per
    # block / paragraph / line.
    par_no = 0
    line_no = 0
    word_no = 0

    blocks = page.xpath("*[@class='ocr_carea']")
    for block_no, block in enumerate(blocks):
        block.attrib['id'] = 'block_%.06d_%.06d' % (page_no, block_no)

        for par in block.xpath("*[@class='ocr_par']"):
            par.attrib['id'] = 'par_%.06d_%.06d' % (page_no, par_no)
            par_no += 1

            # Iterate the element directly: getchildren() is deprecated
            # in lxml.  All direct children are treated as lines.
            for line in par:
                line.attrib['id'] = 'line_%.06d_%.06d' % (page_no, line_no)
                line_no += 1

                for word in line.xpath("*[@class='ocrx_word']"):
                    word.attrib['id'] = 'word_%.06d_%.06d' % (page_no,
                                                              word_no)
                    word_no += 1


def _serialize_page(page, is_first):
    """Return *page* as pretty-printed UTF-8 bytes ready to be streamed.

    The xmlns declaration is removed textually.  Yes, this is horrible, but
    cleanup_namespaces doesn't help since as far as tostring knows, this
    element is the root.  The pattern is pure ASCII, so replacing it on the
    encoded bytes gives the same result as replacing it on decoded text.

    The first page is prefixed with two spaces of indentation, every other
    page with one.
    """
    data = etree.tostring(page, pretty_print=True, method='xml',
                          encoding='utf-8')
    data = data.replace(_XHTML_NS_DECL, b'')
    indent = b'  ' if is_first else b' '
    return indent + data


def process_files(files_to_process):
    """Stream a combined hOCR document for *files_to_process* to stdout.

    The header and footer are taken from the first file via
    ``get_header_footer`` and written verbatim to ``sys.stdout.buffer``
    (so they are presumably bytes -- confirm against hocr.util).  In
    between, every page yielded by ``hocr_page_iterator`` for each file, in
    the given order, is renumbered and written out.  Page numbers start at
    zero and keep counting across files.

    Args:
        files_to_process: Non-empty sequence of hOCR file paths.

    Raises:
        IndexError: If *files_to_process* is empty.
    """
    top, bottom = get_header_footer(files_to_process[0])

    out = sys.stdout.buffer
    out.write(top)

    page_no = 0
    for f in files_to_process:
        for page in hocr_page_iterator(f):
            _renumber_page(page, page_no)
            out.write(_serialize_page(page, is_first=(page_no == 0)))
            page_no += 1

    out.write(bottom)

if __name__ == '__main__':
    # Command-line entry point: expand the glob, sort the matches and
    # stream the combined hOCR document to stdout.
    parser = argparse.ArgumentParser(description='Combine hOCR files '
                                     '- streaming version')
    # The glob is mandatory: without it glob(None) would raise a TypeError,
    # so let argparse report the missing option with a usage message.
    parser.add_argument('-g', '--glob', help='Glob of files to parse',
                        type=str, required=True)
    args = parser.parse_args()

    # Sorted so that pages are combined in a deterministic, name-based order.
    files_to_process = sorted(glob(args.glob))

    if not files_to_process:
        print('No files to process!', file=sys.stderr)
        sys.exit(1)

    process_files(files_to_process)
