#!/usr/bin/env python

import sys
import argparse

from lxml import etree

from hocr.searching import hocr_load_lookup_table, hocr_lookup_page_by_dat
from hocr.parse import hocr_page_iterator
from hocr.util import open_if_required, get_header_footer

def process_file(filepath, outfmt):
    """Write every page of an hOCR document to its own file.

    Each output file contains, in order: the header returned by
    ``get_header_footer``, the serialized page element (with the XHTML
    namespace declaration stripped), and the footer returned by
    ``get_header_footer``.

    Args:
        filepath: Input handed to ``open_if_required``.
        outfmt: printf-style template for the output file names. It is
            formatted with the zero-based page index, e.g.
            ``'page_%04d.html'``.
    """
    fd = open_if_required(filepath)
    # NOTE(review): fd is not closed here because it is unclear from this
    # file whether open_if_required hands back a caller-owned file object.
    top, bottom = get_header_footer(fd)

    xhtml_ns_attr = b' xmlns="http://www.w3.org/1999/xhtml"'

    for pageno, page in enumerate(hocr_page_iterator(fd)):
        # tostring() with an explicit encoding returns UTF-8 bytes; the
        # namespace attribute is pure ASCII, so it can be removed directly
        # on the bytes without a decode/encode round trip.
        serialized = etree.tostring(page, pretty_print=True, method='xml',
                                    encoding='utf-8')
        # The single leading space is part of the existing output format.
        body = b' ' + serialized.replace(xhtml_ns_attr, b'')

        # The context manager guarantees the file is closed even if a
        # write fails part-way through.
        with open(outfmt % pageno, 'wb') as fp:
            fp.write(top)
            fp.write(body)
            fp.write(bottom)


if __name__ == '__main__':
    # Command-line entry point: parse the input path and the output file
    # name template, then split the input into per-page files.
    parser = argparse.ArgumentParser(description='hOCR single page extractor')
    parser.add_argument('-f', '--infile', help='Filename to read',
                        type=str, default=None)
    # Required: process_file applies the `%` operator to this value to
    # build each output path, so a missing value would otherwise surface
    # as a TypeError traceback instead of a usage error.
    parser.add_argument('-o', '--out-format', help='Outfile format - make sure it '
                        'takes an int as format',
                        type=str, required=True)
    args = parser.parse_args()

    process_file(args.infile, args.out_format)

