#!/usr/bin/env python

import sys
import argparse

from hocr.parse import hocr_page_to_word_data_fast
from hocr.searching import hocr_load_lookup_table, hocr_lookup_page_by_dat, \
        hocr_lookup_by_plaintext_offset
from hocr.util import open_if_required
from hocr.text import hocr_paragraph_text


def _close_if_opened(fp, source):
    """Close *fp* unless it is the very object the caller handed in.

    A handle that differs from the argument given to ``open_if_required`` is
    treated as opened on our behalf and is released here; an object supplied
    by the caller is left open for the caller to manage.
    """
    # presumably open_if_required returns an already-open file unchanged -
    # TODO confirm against hocr.util
    if fp is not source:
        fp.close()


def process_file(hocrfile, textfile, tablepath):
    """Validate a plaintext file against an hOCR file, paragraph by paragraph.

    Each line of the text is compared with the text of the next paragraph of
    the hOCR page currently selected through the lookup table. When every
    line matches, the two running counts and 'All OK' are printed.

    Args:
        hocrfile: Passed to ``open_if_required`` to obtain the hOCR handle.
        textfile: Passed to ``open_if_required``; its content is read and
            decoded as UTF-8.
        tablepath: Passed to ``hocr_load_lookup_table``.

    Raises:
        Exception: If a text line differs from the hOCR paragraph text.
        IndexError: If the lookup table is empty, or the current page has no
            paragraph left for a text line.
    """
    lookup_table = hocr_load_lookup_table(tablepath)

    hocrfp = open_if_required(hocrfile)
    try:
        textfp = open_if_required(textfile)
        try:
            text = textfp.read().decode('utf-8')
        finally:
            # The text is fully in memory now; release the handle early.
            _close_if_opened(textfp, textfile)

        # NOTE(review): despite the "byte" names, both counters advance by
        # len() of decoded strings, i.e. by characters - confirm that the
        # lookup table offsets use the same unit.
        recon_byte_count = 0
        text_byte_count = 0
        current_dat = lookup_table[0]

        page = hocr_lookup_page_by_dat(hocrfp, current_dat)
        paragraphs = hocr_page_to_word_data_fast(page)
        paragraph_count = 0

        # Drop the final character of the text: it is the trailing newline,
        # which is not part of the document and would otherwise yield an
        # extra empty line after the split.
        for line in text[:-1].split('\n'):
            if text_byte_count >= current_dat[1]:
                # We could skip a part of the lookup_table by only seeing
                # beyond our current dat as an optimisation, later
                _, current_dat = hocr_lookup_by_plaintext_offset(
                    lookup_table, text_byte_count)

                page = hocr_lookup_page_by_dat(hocrfp, current_dat)
                paragraphs = hocr_page_to_word_data_fast(page)
                paragraph_count = 0

            try:
                paragraph = paragraphs[paragraph_count]
            except IndexError:
                # Same exception type as before, but say what went wrong.
                raise IndexError('No hOCR paragraph left on the current page '
                                 'for text line %r' % (line,))

            hocrtxt = hocr_paragraph_text(paragraph)

            if line != hocrtxt:
                print('hocrtxt != line')
                print('hocrtxt', repr(hocrtxt))
                print('line', repr(line))
                raise Exception('Line mismatch')

            paragraph_count += 1

            recon_byte_count += len(hocrtxt) + 1  # '\n'
            text_byte_count += len(line) + 1  # '\n'
    finally:
        # Runs on success and on any failure above, so the hOCR handle is
        # never leaked.
        _close_if_opened(hocrfp, hocrfile)

    print(recon_byte_count, text_byte_count)
    print('All OK')

if __name__ == '__main__':
    # Command-line entry point: parse the three input paths and run the
    # validation.
    parser = argparse.ArgumentParser(description='hOCR lookup table '
                                                 'paragraph-level validator')
    # All three inputs are needed by process_file; marking them as required
    # makes argparse report a usage error instead of passing None along.
    parser.add_argument('--hocr', help='hOCR Filename to read',
                        type=str, required=True)
    parser.add_argument('--text', help='Fulltext Filename to read',
                        type=str, required=True)
    parser.add_argument('--table', help='Table to use',
                        type=str, required=True)
    args = parser.parse_args()

    process_file(args.hocr, args.text, args.table)
