import zipfile
import os
import shutil
import xml.dom.minidom as minidom
from xml.dom.minidom import getDOMImplementation
import re
import time

def make_docx_schemas(output_filename, source_dir):
    """Zip the unpacked docx tree in *source_dir*, then delete that tree.

    Archive member names are computed relative to *source_dir*, so the package
    parts (``[Content_Types].xml``, ``word/document.xml``, ...) sit at the root
    of the archive whatever the platform's path separator is. Sub-directories
    get their own directory entry; *source_dir* itself does not.

    Args:
        output_filename: Path of the archive to create (overwritten if it
            already exists).
        source_dir: Directory holding the unpacked docx parts. The whole
            directory is removed once the archive has been written, whatever
            top-level folders the package contains.
    """
    with zipfile.ZipFile(output_filename, "w", zipfile.ZIP_DEFLATED) as archive:
        for root, _dirs, files in os.walk(source_dir):
            rel_root = os.path.relpath(root, source_dir)
            if rel_root == os.curdir:
                # The package root must not appear as an entry of its own.
                rel_root = ""
            else:
                archive.write(root, rel_root)
            for name in files:
                filename = os.path.join(root, name)
                if os.path.isfile(filename):
                    archive.write(filename, os.path.join(rel_root, name))
    # Remove the whole tree rather than a fixed list of known folders, so that
    # packages with extra parts do not leave the directory behind.
    shutil.rmtree(source_dir)

def unzip(input_filename,dir_tmp):
    """Extract every member of the archive *input_filename* into *dir_tmp*.

    Members are written byte-for-byte; ``word/_rels/document.xml.rels`` gets no
    special treatment and does not have to be present in the archive.

    Args:
        input_filename: Path of the zip/docx archive to read.
        dir_tmp: Destination directory for the extracted members.
    """
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(input_filename) as archive:
        archive.extractall(path=dir_tmp)

def unzip_docx(input_filename,dir_tmp):
    """Extract the docx *input_filename* into the ``zipdocx`` folder of *dir_tmp*.

    Args:
        input_filename: Path of the docx archive to unpack.
        dir_tmp: Working directory; the ``zipdocx`` sub-folder is created in it
            when it does not exist yet.
    """
    dir_tmp_zip = os.path.join(dir_tmp, "zipdocx")
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(dir_tmp_zip, exist_ok=True)
    unzip(input_filename, dir_tmp_zip)

def extract_canvas_docx(
        file_to_extract,
        final_file,
        canvas,
        empty_page=False,
        canvas_br=True,
        canvas_end=""
        ):
    """Build *final_file* from the parts of *file_to_extract* matching *canvas*.

    The source docx is unpacked into a temporary directory created next to
    *final_file*, its ``word/document.xml`` is filtered by ``extract_canvas``,
    and the result is zipped into *final_file*. The temporary directory is
    removed afterwards, whether or not the processing succeeded.

    Args:
        file_to_extract: Path of the source docx.
        final_file: Path of the docx to create.
        canvas: Canvas patterns, forwarded to ``extract_canvas``.
        empty_page: Forwarded to ``extract_canvas``.
        canvas_br: Forwarded to ``extract_canvas``.
        canvas_end: End canvas pattern, forwarded to ``extract_canvas``.
    """
    import tempfile

    relroot = os.path.abspath(os.path.join(final_file, os.pardir))
    # mkdtemp guarantees a unique name; a timestamp with one-second resolution
    # collides when two extractions start within the same second.
    dir_tmp = tempfile.mkdtemp(prefix='tmp', dir=relroot)
    try:
        unzip_docx(file_to_extract, dir_tmp)
        extract_canvas(dir_tmp, canvas, canvas_end, empty_page, canvas_br)
        make_docx_schemas(final_file, os.path.join(dir_tmp, 'zipdocx'))
    finally:
        # Never leave the working directory behind, even on failure.
        shutil.rmtree(dir_tmp, ignore_errors=True)

# Returns a page break.
def get_sautpage():
    """Build a ``w:p`` element wrapping a ``w:r`` run with a page-type ``w:br``.

    Each of the three elements is created as the root of its own DOM document
    and then nested: ``w:br`` inside ``w:r`` inside ``w:p``.

    Returns:
        The ``w:p`` element.
    """
    dom_impl = getDOMImplementation()

    def new_root(qualified_name):
        # Root element of a fresh document in the "w" namespace.
        return dom_impl.createDocument("w", qualified_name, None).documentElement

    paragraph = new_root("w:p")
    run = new_root("w:r")
    page_break = new_root("w:br")
    page_break.setAttribute("w:type", "page")
    run.appendChild(page_break)
    paragraph.appendChild(run)
    return paragraph

def extract_canvas(dir_tmp,canvas,canvas_end="",empty_page=False, canvas_br=False, file_rel='document.xml.rels'):
    """Rewrite ``word/document.xml`` keeping only the nodes selected by a canvas.

    The document and its relationships file are read from the ``zipdocx``
    folder of *dir_tmp*. Each canvas is tried in order with ``get_elements``;
    the first one that yields at least one ``"element"`` entry wins and the
    others are ignored. A new ``w:document``/``w:body`` tree is then built
    from the collected nodes and written back over ``word/document.xml``.

    Args:
        dir_tmp: Working directory containing the unpacked ``zipdocx`` tree.
        canvas: Iterable of XML pattern strings. Every ``:`` is replaced by
            ``-`` before the string is parsed.
        canvas_end: Optional XML pattern string for the end element, parsed
            the same way; ``""`` disables it.
        empty_page: When true, a page break (``get_sautpage``) is appended
            after the collected nodes.
        canvas_br: Forwarded to ``get_elements``.
        file_rel: File name of the relationships file inside ``word/_rels``.
    """
    word_dir = os.path.join(dir_tmp, 'zipdocx', 'word')
    document_path = os.path.join(word_dir, 'document.xml')
    document_rel = os.path.join(word_dir, '_rels', file_rel)
    dom = minidom.parse(document_path)
    dom_rel = minidom.parse(document_rel)
    # Keep element nodes only: whitespace text nodes between <Relationship>
    # entries have no getAttribute() and would break the lookup by Id.
    dom_rel_elements = [
        node
        for node in dom_rel.getElementsByTagName("Relationships")[0].childNodes
        if node.nodeType == node.ELEMENT_NODE
    ]

    # Create the output document, copying the attributes of the source root.
    impl = getDOMImplementation()
    doc = impl.createDocument("w", "w:document", None)
    doc_element = doc.documentElement
    body_element = impl.createDocument("w", "w:body", None).documentElement
    doc_element.appendChild(body_element)
    source_root = dom.getElementsByTagName("w:document")[0]
    for attr, val in source_root.attributes.items():
        doc_element.setAttribute(attr, val)
    body = dom.getElementsByTagName("w:body")[0]
    if canvas_end != "":
        canvas_end = minidom.parseString(canvas_end.replace(':','-')).documentElement

    result = []
    tp_result = []
    for cnvs in canvas:
        canvas_dom = minidom.parseString(cnvs.replace(':','-'))
        get_elements(body,"",canvas_dom.documentElement,canvas_dom,"",result,True,0,canvas_end,tp_result,dom_rel_elements,canvas_br)
        if "element" in tp_result:
            break
        # No match for this canvas: drop whatever was collected and try the next.
        result = []
        tp_result = []

    # Index of the last matched element (or end element), and the last
    # section-properties node seen.
    last_element_id = 0
    sec_f = ""
    for idx, (elt, kind) in enumerate(zip(result, tp_result)):
        if kind in ("element", "elt_end"):
            last_element_id = idx
        if kind == "sec_f":
            sec_f = elt

    # Rebuild the body: every collected node up to the last match, plus the
    # first node that follows it.
    for idx, elt in enumerate(result):
        body_element.appendChild(elt)
        if idx > last_element_id:
            break

    # Optional page break.
    if empty_page==True:
        body_element.appendChild(get_sautpage())

    # appendChild moves the node, so the section properties end up last.
    if sec_f != "":
        body_element.appendChild(sec_f)

    xml = doc_element.toxml()
    with open(document_path, "wb") as f:
        f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' + xml.encode("utf-8"))

# fonction récursive pour l'élément xml de fin appelé dans la fct récursive de recherche pattern
def test_element_end(elt_end, cnv_end, test):
    i=0
    while i < len(elt_end.childNodes) :
        i=i+1
        if cnv_end.nodeType == cnv_end.TEXT_NODE :
            try:
               if re.search(cnv_end.data, elt_end.childNodes[i-1].data) :
                   test = True
                   break
            except:
                testb = False
        elif cnv_end.tagName.replace("-",":") == elt_end.childNodes[i-1].tagName :
            elt_end = elt_end.childNodes[i-1]
            cnv_end=cnv_end.childNodes[0]
            test = test_element_end(elt_end, cnv_end,test)
    return test
    
# Recursive search for XML elements matching the canvas.
def get_elements(element,tir,canvas_elt,canvas_dom,current_test,result,start,id,canvas_end,tp_result,dom_rel_elements,canvas_br):
    """Walk the children of *element* recursively and record nodes of interest.

    Nodes are appended to *result* and a type tag is appended to *tp_result*
    at the same position: ``"element"`` (canvas matched), ``"elt_end"`` (end
    canvas matched), ``"saut_l"`` (line break), ``"saut_p"`` (page break) or
    ``"sec_f"`` (``w:sectPr``). The function returns nothing.

    Canvas tag and attribute names use ``-`` where the document uses ``:``;
    the replacement is done before each tag comparison.

    Args:
        element: DOM node whose child nodes are scanned.
        tir: String extended by one ``-`` per level; only referenced by a
            commented-out debug print.
        canvas_elt: Canvas node the next document element is compared with.
        canvas_dom: Parsed canvas document; its root element is used to reset
            *canvas_elt* when a match is abandoned or completed.
        current_test: Depth-1 element currently being matched; it is the node
            appended to *result* when the canvas matches.
        result: List receiving the recorded nodes.
        start: True while a canvas match is in progress.
        id: Depth of the caller; incremented on entry, so the value 1 means
            the direct children of the node passed to the outermost call.
        canvas_end: ``""`` or the root element of the end canvas.
        tp_result: List receiving the type tag of each recorded node.
        dom_rel_elements: Relationship nodes; their ``Id`` and ``Target``
            attributes are read.
        canvas_br: When true, line-break and page-break paragraphs found at
            depth 1 are recorded.
    """
    tir = tir + "-"
    id = id + 1
    # At most one end element is recorded per call.
    cnv_end_ok = False
    if len(element.childNodes)>0:
        # NOTE: the loop variable rebinds the `element` parameter.
        for element in element.childNodes:
            if element.nodeType != element.TEXT_NODE:
                if id==1 :
                    # End element, if an end canvas was given
                    elt_end = element
                    cnv_end = canvas_end
                    if canvas_end!="" and elt_end.tagName == cnv_end.tagName.replace("-",":"):
                        i=0
                        cnv_end=cnv_end.childNodes[0]
                        if test_element_end(elt_end, cnv_end, False) == True and cnv_end_ok==False:
                            cnv_end_ok = True
                            result.append(element)
                            tp_result.append("elt_end")
                            #print("End element")

                    if element.tagName == canvas_elt.tagName.replace("-",":"):
                        # Tag matches the current canvas node: start matching
                        # and remember this element as the candidate.
                        start = True
                        current_test = element
                        canvas_elt = canvas_elt.childNodes[0]
                    else :
                        # detection of line breaks and page breaks
                        if element.tagName == "w:p" and canvas_br==True :
                            # Paragraph without any child node
                            if len(element.childNodes)==0 :
                                result.append(element)
                                tp_result.append("saut_l")
                                #print("Line break")
                            # Paragraph whose only child is w:pPr: recorded once per
                            # w:rPr child of that w:pPr, and only if something was
                            # already recorded.
                            if len(element.childNodes)==1 and element.childNodes[0].tagName == "w:pPr" :
                                for child in element.childNodes[0].childNodes:
                                    if child.tagName == "w:rPr" and len(result) > 0 : 
                                        result.append(element)
                                        tp_result.append("saut_l")
                                        #print("Line break")

                            # Paragraph holding a single run with a single page-type w:br
                            if len(element.childNodes)==1 and element.childNodes[0].tagName == "w:r" :
                                if len(element.childNodes[0].childNodes)==1 and element.childNodes[0].childNodes[0].tagName == "w:br" and element.childNodes[0].childNodes[0].getAttribute("w:type")=="page" :
                                    result.append(element)
                                    tp_result.append("saut_p")
                                    #print("Page break")

                        # detection of the section properties (w:sectPr)
                        if element.tagName == "w:sectPr":
                            result.append(element)
                            tp_result.append("sec_f")
                            #print("Final section")
                        
                        # No tag match at depth 1: restart from the canvas root.
                        start= False
                        canvas_elt = canvas_dom.documentElement

                if id>1 and start==True :
                    if element.tagName == canvas_elt.tagName.replace("-",":"):
                        #print(str(id) + tir + element.tagName)
                        # Each canvas attribute value is a regular expression searched
                        # in the attribute of the same name on the document element.
                        for attr,val in canvas_elt.attributes.items():
                            if not re.search(val, element.getAttribute(attr)) :
                                start= False
                                canvas_elt = canvas_dom.documentElement
                                break
                        if start==True :
                            # Move one level down in the canvas.
                            canvas_elt = canvas_elt.childNodes[0]
                            if canvas_elt.nodeType == canvas_elt.TEXT_NODE :
                                # A "*" text node ends the canvas: the candidate matches.
                                if canvas_elt.data =="*" :
                                    result.append(current_test)
                                    tp_result.append("element")
                                    #print("element")
                                    start= False
                                    canvas_elt = canvas_dom.documentElement  
                            elif canvas_elt.getAttribute("link-rel")!="":
                                # link-rel has the form "<attribute>#<regex>": the named
                                # attribute holds a relationship Id and the regex is
                                # searched in that relationship's Target.
                                link_rel =canvas_elt.getAttribute("link-rel").split("#")
                                search = link_rel[1]
                                attr_link = link_rel[0].replace("-",":")
                                
                                if len(element.childNodes)>0:
                                    # NOTE: this inner loop rebinds `element` again, so the
                                    # recursive call below receives the last child visited.
                                    for element in element.childNodes:
                                        if element.tagName==canvas_elt.tagName.replace("-",":") and element.getAttribute(attr_link)!="":
                                            id_link = element.getAttribute(attr_link)
                                            for domrel in dom_rel_elements:
                                                if domrel.getAttribute("Id")==id_link:
                                                    if re.search(search,domrel.getAttribute("Target")) :
                                                        result.append(current_test)
                                                        tp_result.append("element")
                                                        #print("element")
                                                        # Leaves the relationship loop only.
                                                        break
                                    start= False
                                    canvas_elt = canvas_dom.documentElement 
                                else :
                                    start= False
                                    canvas_elt = canvas_dom.documentElement 

                # Descend into the current node with the current matching state.
                get_elements(element,tir,canvas_elt,canvas_dom,current_test,result,start,id,canvas_end,tp_result,dom_rel_elements,canvas_br)



