#!/usr/bin/env python

import sys
import argparse

from lxml import etree

from hocr.parse import hocr_page_iterator
from hocr.util import open_if_required, get_header_footer


def process_files(infile):
    """Fold character-level hOCR into word-level hOCR and write it to stdout.

    The header and footer returned by ``get_header_footer`` are copied
    through unchanged.  In between, every page yielded by
    ``hocr_page_iterator`` is rewritten so that each ``ocrx_word`` element
    holds the concatenated text of its ``ocrx_cinfo`` children instead of
    the child elements themselves; the page is then serialised and written
    to ``sys.stdout.buffer``.

    Args:
        infile: Input hOCR, handed as-is to ``open_if_required``
            (presumably a path or an already-open binary file -- confirm
            against ``hocr.util``).
    """
    fd = open_if_required(infile)
    top, bottom = get_header_footer(fd)

    out = sys.stdout.buffer
    out.write(top)

    xmlns = b' xmlns="http://www.w3.org/1999/xhtml"'

    # get_header_footer has consumed (part of) the stream; rewind before
    # iterating over the pages.
    fd.seek(0)
    # Count pages from 1 so that the first page can be told apart below.
    for page_no, page in enumerate(hocr_page_iterator(fd), start=1):
        for word in page.xpath(".//*[@class='ocrx_word']"):
            chars = word.xpath("*[@class='ocrx_cinfo']")
            if not chars:
                # Nothing to fold: leave words without character info
                # untouched instead of blanking their existing text.
                continue

            # An empty character element has ``text`` set to None.
            txt = ''.join(char.text or '' for char in chars)

            # Iterate over a copy, since removal mutates the child list.
            for child in list(word):
                word.remove(child)
            word.text = txt

        s = etree.tostring(page, pretty_print=True, method='xml',
                           encoding='utf-8')
        # Let's remove the xmlns in the div.
        # Yes, this is horrible, but cleanup_namespaces doesn't help
        # since as far as tostring knows, this is the root.
        s = s.replace(xmlns, b'')
        # Let's also add two spaces for indentation for the first
        # page, and one for all the other pages.
        indent = b'  ' if page_no == 1 else b' '
        out.write(indent + s)

    out.write(bottom)

if __name__ == '__main__':
    # Command-line entry point: parse -f/--infile and hand the value to
    # process_files, which writes its result to standard output.
    description = 'Fold character-based hOCR into word-based hOCR'
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument('-f', '--infile', type=str, default=None,
                     help='Input file')
    options = cli.parse_args()

    process_files(options.infile)
