#!/usr/bin/env python

import sys
import argparse

from lxml import etree

from hocr.searching import hocr_load_lookup_table, hocr_lookup_page_by_dat
from hocr.parse import hocr_page_iterator
from hocr.util import open_if_required, get_header_footer

def _close_if_opened(handle, source):
    """Close *handle* unless it is the caller-supplied *source* object."""
    if handle is None or handle is source:
        return
    close = getattr(handle, 'close', None)
    if close is not None:
        close()


def process_file(filepath, pageno, tablepath):
    """Write one hOCR page, wrapped in the document header/footer, to stdout.

    The page is located either through a lookup table (when ``tablepath`` is
    truthy) or by iterating over the pages of the document until the
    zero-based index ``pageno`` is reached.

    Args:
        filepath: Input passed to ``open_if_required`` (the command line
            supplies a filename).
        pageno: Zero-based index of the page to extract.
        tablepath: Path of a lookup table passed to
            ``hocr_load_lookup_table``, or a falsy value to scan the
            document instead.

    Raises:
        IndexError: If the document is scanned and contains no page with
            index ``pageno``. (Lookup-table misses raise whatever
            ``lookup_table[pageno]`` raises.)
    """
    fd = open_if_required(filepath)
    fp = None
    try:
        top, bottom = get_header_footer(fd)

        if tablepath:
            lookup_table = hocr_load_lookup_table(tablepath)
            # A second handle is used for the table lookup, independent of
            # the one already consumed by get_header_footer.
            fp = open_if_required(filepath)

            dat = lookup_table[pageno]
            page = hocr_lookup_page_by_dat(fp, dat)
        else:
            for idx, page in enumerate(hocr_page_iterator(fd)):
                if idx == pageno:
                    break
            else:
                # Without this, an out-of-range page number silently emitted
                # whichever page happened to be iterated last.
                raise IndexError('page %r not found in document' % (pageno,))

        # Serialise while the handles are still open, in case the page
        # element depends on them -- TODO confirm whether it does.
        s = etree.tostring(page, pretty_print=True, method='xml',
                           encoding='utf-8').decode('utf-8')
    finally:
        # Only close handles this function opened; a handle passed in by the
        # caller (open_if_required returning its argument) is left alone.
        _close_if_opened(fp, filepath)
        _close_if_opened(fd, filepath)

    # Drop the XHTML namespace declaration from the serialised page and
    # prefix a single space (presumably to match the indentation of the
    # surrounding document -- TODO confirm).
    s = ' ' + s.replace(' xmlns="http://www.w3.org/1999/xhtml"', '')

    # Nothing is written until the page has been found, so a failed lookup
    # does not leave a truncated document on stdout.
    out = sys.stdout.buffer
    out.write(top)
    out.write(s.encode('utf-8'))
    out.write(bottom)


if __name__ == '__main__':
    # Command line entry point: parse the options and extract a single page
    # of the given hOCR file to standard output.
    parser = argparse.ArgumentParser(description='hOCR single page extractor')
    parser.add_argument('-f', '--infile', help='Filename to read',
                        type=str, default=None)
    parser.add_argument('-t', '--table', help='Table to use',
                        type=str, default=None)
    # --page is mandatory: when omitted, None was passed on as the page
    # number, which never equals a page index, so no page was ever selected.
    # Let argparse report the missing option as a usage error instead.
    parser.add_argument('-p', '--page', help='Page number to extract (zero based)',
                        type=int, required=True)
    args = parser.parse_args()

    process_file(args.infile, args.page, args.table)

