#!python 

###############################################################
# metabolisHMM - A tool for exploring and visualizing the distribution and evolutionary histories of metabolic markers
# create-genome-phylogeny: to create a phylogeny based on ribosomal proteins
# Written by Elizabeth McDaniel emcdaniel@wisc.edu
# November 2018
# This program is free software under the GNU General Public License version 3.0
###############################################################

import glob, argparse, subprocess, os, sys, tempfile, re
from subprocess import Popen, DEVNULL
from distutils.spawn import find_executable
from Bio import BiopythonExperimentalWarning
import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore', BiopythonExperimentalWarning)
    from Bio import SearchIO, SeqIO, AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import UnknownSeq, Seq
from Bio.SeqRecord import SeqRecord
from collections import defaultdict, Counter

# Arguments
parser = argparse.ArgumentParser(description = "Create ribosomal phylogenies using specific ribosomal markers for archaea or bacteria")
parser._action_groups.pop()
required = parser.add_argument_group("required arguments")
optional = parser.add_argument_group("optional arguments")
metadata = parser.add_argument_group("metadata output files for ITOL")
required.add_argument('--input', metavar='INPUT', help='Directory where genomes to be screened are held')
required.add_argument('--output', metavar='OUTPUT', help='Directory to store results and intermediate files')
required.add_argument('--domain', metavar='DOMAIN', help='archaea, bacteria')
required.add_argument('--phylogeny', metavar='PHY', help='fastree, raxml')
optional.add_argument('--threads',metavar='THREADS',help='Optional: number of threads for calculating a tree using RAxML. This is not taken into account using Fastree')
optional.add_argument('--loci', metavar='LOCI', default='12', help='Output genomes with less than x number of loci. By default prints genomes that have less than 12 ribosomal loci markers.')
metadata.add_argument('--metadata', metavar='METADATA',help='Option for outputting ITOL formatted metadata files. ON or OFF')
metadata.add_argument('--names', metavar='NAMES', help="Provided .csv formatted metadata file of filenames and corresponding taxonomical or group names")
metadata.add_argument('--itol_file', metavar='ITOL', default="itol_metadata.txt", help="Output iTOL formatted metadata file for changing leaf labels to taxonomical or group names")

# check for required installed dependencies
# prodigal
if find_executable('prodigal') is not None:
    pass
else:
    print('You do not have prodigal installed in your path. Please fix this.')
    sys.exit()
# hmmsearch
if find_executable('hmmsearch') is not None:
    pass
else:
    print('You do not have the hmmsearch executable from HMMER in your path. Please fix this.')
    sys.exit()
# mafft
if find_executable('mafft') is not None:
    pass
else: 
    print('You do not have MAFFT installed in your path. Please fix this.')
    sys.exit()
# fasttree, later check if phytool is raxml and check for correct raxml argument
if find_executable('FastTree') is not None:
    pass
else: 
    print('You do not have FastTree installed in your path. Please fix this.')
    sys.exit()

# if no arguments given, print help message
if len(sys.argv) < 2:
    parser.print_help()
    sys.exit(1)

# version to print
VERSION = '2.21'

# Beginning message
print('')
print('#############################################')
print('metabolisHMM v' + VERSION)

# arguments setup
args = parser.parse_args()
GENOMEDIR = args.input
GENOMEFILES = GENOMEDIR + "/**"
DOMAIN = args.domain
PHYTOOL = args.phylogeny
THREADS = args.threads
OUTPUT = args.output
out_intm = OUTPUT + "/out"
out_results = OUTPUT + "/results"
out_genomes = OUTPUT + "/genomes"
METADATA = args.metadata
NAMES = args.names
ITOL_FILE = args.itol_file

if PHYTOOL == 'raxml':
    if find_executable('raxmlHPC-PTHREADS') is not None:
        pass
    else:
        print('You have opted to use raxml for building trees, but do not have the correct executable, raxmlHPC-PTHREADS installed in your path. Please fix this.')
        sys.exit()

# check if directory exists
if os.path.isdir(OUTPUT) == True:
    print("Directory "+ OUTPUT +" already exists! Please create different directory or remove the existing one.")
    sys.exit()

# check for ribosomal markers directory
if os.path.isdir("curated_markers/ribosomal_markers/") == False:
    print("     The ribosomal markers directory could not be found."+"\n"+"     Please either download the markers from https://github.com/elizabethmcd/metabolisHMM/releases/download/v2.0/metabolisHMM_v2.0_markers.tgz and decompress the tarball, or move the directory to where you are running the workflow from.")
    sys.exit()

# setup directories
genomes = glob.glob(GENOMEFILES)
os.makedirs(out_intm)
os.makedirs(out_results)
os.makedirs(out_genomes)
# turns off printing to stdout
FNULL = open(os.devnull, 'w')
DEVNULL = open(os.devnull, 'wb')

# if metadata option on, check the names file and itol path provided
if METADATA == 'ON':
    if os.path.exists(NAMES) == True:
    # header for itol output
        ITOL_PATH = OUTPUT + "/results/" + ITOL_FILE
        OUT_ITOL = open(ITOL_PATH, "w")
        OUT_ITOL.write("LABELS\nSEPARATOR TAB\nDATA\n")
        with open(NAMES, "r") as f:
            for record in f:
                filename, labelname = record.rstrip().split(',')
                OUT_ITOL.write('%s\t%s\n' % (filename, labelname))
elif METADATA == 'OFF':
    pass


# different ribosomal markers for archaea/bacteria/all
bacteria_list = ['rpL14_bact','rpL15_bact','rpL16_bact','rpL18_bact','rpL22_bact','rpL24_bact','rpL2_bact','rpL3_bact','rpL4_bact','rpL5_bact','rpL6_bact','rpS10_bact','rpS17_bact','rpS19_bact','rpS3_bact','rpS8_bact']
archaea_list = ['rpL14_arch','rpL15_arch','rpL16_arch','rpL18_arch','rpL22_arch','rpL24_arch','rpL2_arch','rpL3_arch','rpL4_arch','rpL5_arch','rpL6_arch', 'rpS10_arch','rpS17_arch','rpS19_arch','rpS3_arch','rpS8_arch']


if DOMAIN == 'archaea':    
    prot_list=archaea_list
elif DOMAIN == 'bacteria':
    prot_list=bacteria_list


# if .fna predict CDS and reformat header names because prodigal makes them stupid
# if .faa reformat the headers just in case contains weirdness
# if the user didn't provide the right files tell them
n = 0
print("Reformatting fasta files...")
for genome in genomes:
    if genome.endswith('.fna'):
        name = os.path.basename(genome).replace(".fna", "").strip().splitlines()[0]
        out_prot = OUTPUT + "/genomes/" + name + ".faa"
        out_gbk = OUTPUT + "/genomes/" + name + ".gbk"
        out_reformatted = OUTPUT + "/genomes/" + name + ".reformatted.faa"
        prodigal_cmd = "prodigal -q -i "+genome+" -a "+out_prot +" -o "+out_gbk
        os.system(prodigal_cmd)
        for seq_record in SeqIO.parse(out_prot, "fasta"):
            n = n + 1
            a = str(n).zfill(5)
            with open(out_reformatted, "a") as outre:
                outre.write(">" + name + "_" + str(a) + "\n")
                outre.write(str(seq_record.seq).replace("*","") + "\n")
    elif genome.endswith('.faa'):
        name = os.path.basename(genome).replace(".faa", "").strip().splitlines()[0]
        out_reformatted = OUTPUT + "/genomes/" + name + ".reformatted.faa"
        for seq_record in SeqIO.parse(genome, "fasta"):
            n = n + 1
            a = str(n).zfill(5)
            with open(out_reformatted, "a") as outre:
                outre.write(">" + name + "_" + str(a) + "\n")
                outre.write(str(seq_record.seq).replace("*","") + "\n")
    else:
        print("These do not look like fasta files that end in .fna or .faa. Please check your genome files.")
        sys.exit()
reformatted_path = OUTPUT + "/genomes/" + "*.reformatted.faa"
reformatted_genomes = glob.glob(reformatted_path)

# setup hmmsearch run depending on HMM list
print("Running ribosomal protein HMM searches...")
for genome in reformatted_genomes:
    name=os.path.basename(genome).replace(".reformatted.faa", "").strip().splitlines()[0]
    dir=name
    os.makedirs(OUTPUT+"/out/"+dir)
    for prot in prot_list:
        marker ="curated_markers/ribosomal_markers/"+prot+".hmm"
        outname= OUTPUT + "/out/"+dir+"/"+name + "-" + prot + ".out"
        cmd = ["hmmsearch", "--tblout="+outname, marker, genome]
        subprocess.call(cmd, stdout=FNULL)

# Parse HMM outputs
print("Parsing results...")
if DOMAIN == 'archaea':    
    prot_list=archaea_list
elif DOMAIN == 'bacteria':
    prot_list=bacteria_list
elif DOMAIN == 'all': 
    prot_list=all_list
result_dirs = os.walk(OUTPUT +"/out/")
for prot in prot_list:
    for path, dirs, files in result_dirs: 
        for file in files:
            genome=file.split("-")[0]
            marker=file.replace(".out", "").split("-")[1]
            result=OUTPUT + "/out/"+genome+"/"+file
            outfasta=OUTPUT + "/results/"+marker+".faa"
            genome_file = OUTPUT + "/genomes/"+genome+".reformatted.faa"
            with open(outfasta, "a") as outf:
                with open(genome_file, "r") as input_fasta:
                    with open(result, "r") as input:
                        for qresult in SearchIO.parse(input, "hmmer3-tab"):
                            hits=qresult.hits
                            num_hits=len(hits)
                            if num_hits >0:
                                for i in range(0,1):
                                    hit_id=hits[i].id
                                for record in SeqIO.parse(input_fasta, "fasta"):
                                    if record.id in hit_id:
                                        outf.write(">"+genome+"\n"+str(record.seq)+"\n")
                       
# Make alignment file for each marker
print("Aligning ribosomal hits...")
prots = OUTPUT + "/results/*.faa"
fastas = glob.glob(prots)
for fasta in fastas:
    outname = os.path.basename(fasta).replace(".faa", "").strip().splitlines()[0]
    output= OUTPUT + "/results/"+outname+".aln"
    mafft_cmd = "mafft --quiet "
    mafft_cmd += fasta+" > "+output
    os.system(mafft_cmd)

# Concatenate alignments
print("Concatenating alignments...")
# referred to the biopython cookbook for concatenating multiple sequence alignments 
# adds question marks for missing loci for a given genome
prot_alignments = OUTPUT + "/results/*.aln"
infiles = glob.glob(prot_alignments)
concatout = OUTPUT + "/out/"+DOMAIN+"-concatenated-ribosomal-alignment.fasta"
alignments = [AlignIO.read(open(f, "r"), "fasta") for f in infiles]
all_labels = set(seq.id for aln in alignments for seq in aln)
tmp = defaultdict(list)
for aln in alignments:
    length = aln.get_alignment_length()
    these_labels = set(rec.id for rec in aln)
    missing = all_labels - these_labels
    for label in missing:
        new_seq = UnknownSeq(length) # prints ? marks for missing
        tmp[label].append(str(new_seq))
    for rec in aln:
        tmp[rec.id].append(str(rec.seq))
msa = MultipleSeqAlignment(SeqRecord(Seq(''.join(v)), id=k)
            for (k,v) in tmp.items())
AlignIO.write(msa,concatout,"fasta")

# check number of loci for a genome and print if less than a certain number to alert the user
counts = dict()
for file in infiles:
    with open(file) as f:
        for line in f:
            if line.startswith(">"):
                id = line.strip('\n').strip('>')
                counts[id] = counts.get(id, 0) + 1
x = int(args.loci)
under_cutoff = OUTPUT + "/results/genomes-under-cutoff.txt"
for (k,v) in counts.items():
    if v < x:
        with open(under_cutoff, 'a') as uc:
            uc.write("Genome "+ k + " has fewer than " + str(x) + ' hits!' + "\n")
print('Find list of genomes with less than ' + str(x) + ' ribosomal markers in ' + under_cutoff)

# get rid of "unknown description" descriptor in alignment file that biopython adds and I haven't figured out how to remove
reformatout=OUTPUT + "/results/"+DOMAIN+"-concatenated-ribosomal-alignment-reformatted.fasta"
filter_cmd = "sed 's/<unknown description>//g' "+concatout+" > "+reformatout
os.system(filter_cmd)

# Create tree
if PHYTOOL == 'fastree':
    print("Calculating tree using FastTree...")
    fileIn=OUTPUT + "/results/"+DOMAIN+"-concatenated-ribosomal-alignment-reformatted.fasta"
    outname = OUTPUT + "/results/"+DOMAIN+"-fastTree-ribosomal-tree.tre"
    logfile = OUTPUT + "/results/"+"ribosomal-tree"+".fastree.logfile"
    fastCmd = ["FastTree","-quiet","-log",logfile,"-out",outname,fileIn]
    subprocess.Popen(fastCmd, stdout=DEVNULL, stderr=DEVNULL)
elif PHYTOOL == "raxml":
    print("Calculating tree with RaxML... be patient...")
    outname= DOMAIN+"-raxml-ribo"
    fileIn=OUTPUT + "/results/"+DOMAIN+"concatenated-ribosomal-alignment-reformatted.fasta"
    raxCmd = "raxmlHPC-PTHREADS -f a -m PROTGAMMAAUTO -p 12345 -x 12345 -# 100 -s "+fileIn+" -T "+THREADS+" -n "+outname
    os.system(raxCmd)

# end message
print("Done! Find your results in "+ OUTPUT + "/results/")
print('#############################################')
