# Pipeline-wide settings: working directory, run configuration, and the rules
# that are executed locally rather than submitted as cluster jobs.
workdir: '{{workdir}}'
configfile: 'config.yaml'
localrules: all, generate_yield, combine_seqdata, qc_snippy, index_reference, calculate_iqtree_command_core, combine_assembly_metrics, assembly_statistics, collate_report, write_html_report

# Isolate identifiers, supplied in the config as one whitespace-separated string.
SAMPLE = config['isolates'].split()

# Alignment-percentage cutoff applied to each isolate by qc_snippy.
min_aln = int(config['min_perc'])
REFERENCE = config['reference']
GUBBINS = config['gubbins']
# Distances and the tree are built from the gubbins alignment when gubbins is enabled.
CORE_OUTPUT = 'gubbins.aln' if GUBBINS else 'core.aln'
	
# Default target: every per-isolate and pipeline-wide file the run must produce,
# followed by any optional kraken outputs supplied by the template.
# The input list is indented with tabs throughout; mixing spaces and tabs here
# makes the nesting depend on how the tokenizer expands tabs.
rule all:
	input:{% raw %}
		expand("{sample}/seqdata.tab", sample = SAMPLE),
		"report/seqdata.tab",
		expand("{sample}/{sample}.fa", sample = SAMPLE),
		expand("{sample}/resistome.tab", sample = SAMPLE),
		expand("prokka/{sample}/{sample}.gff", sample = SAMPLE),
		expand("prokka/{sample}/{sample}.txt", sample = SAMPLE),
		"mlst.tab",
		"denovo.tab",
		"assembly.tab",
		"resistome.tab",
		"report/assembly.tab",
		"report/mlst.tab",
		"report/resistome.tab",
		"ref.fa",
		"ref.fa.fai",
		expand("{sample}/snps.vcf", sample = SAMPLE),
		expand("{sample}/snps.aligned.fa", sample = SAMPLE),
		"core.vcf",
		"distances.tab",
		"core.treefile",
		"report/core_genome.tab",
		"report/core.treefile",
		"report/distances.tab",
		"report/core.tab",
		"report/report.html",
		"roary/gene_presence_absence.csv",
		"pan_genome.svg",
		"report/pan_genome.svg",
		"report/summary_statistics.txt",{% endraw %}
		{{kraken_output}}

{{kraken_rule}}
{{kraken_summary}}
{% raw %}

# Per-isolate read statistics: runs seqtk fqchk on the paired reads and stores
# its output in <sample>/seqdata.tab.
# The read files are wrapped in ancient() so their timestamps do not trigger re-runs.
# NOTE(review): seqtk fqchk is documented as taking a single input file - confirm
# that the R2 reads passed as the second argument are actually counted.
rule seqdata:
	input:
		ancient('READS/{sample}/R1.fq.gz'),
		ancient('READS/{sample}/R2.fq.gz')
	output:
		"{sample}/seqdata.tab"
	singularity:{% endraw %}"{{singularity_dir}}/seqtk"{% raw %}
	shell:
		"""
		seqtk fqchk {input[0]} {input[1]} > {output}
		"""


# Sketch both read files with mash and capture everything it prints (stdout and
# stderr) in <sample>/mash.txt, which generate_yield consumes.
# The sketch itself is written inside the isolate directory: a fixed name in the
# working directory would be overwritten by every job running at the same time.
rule estimate_coverage:
	input:
		ancient("READS/{sample}/R1.fq.gz"),
		ancient("READS/{sample}/R2.fq.gz")
	output:
		"{sample}/mash.txt"
	singularity:{% endraw %}"{{singularity_dir}}/mash_kmc"{% raw %}
	shell:
		"""
		mash sketch -r {input[0]} {input[1]} -m 3 -k 31 -o {wildcards.sample}/mash  &> {output}
		"""


# Combine the seqtk table and the mash log into <sample>/yield.tab.
# generate_yield.py receives, in order: the seqtk table, the mash log and the
# output path (note the inputs are passed in the reverse of their declaration).
rule generate_yield:
	input:
		"{sample}/mash.txt",
		"{sample}/seqdata.tab"
	output:
		"{sample}/yield.tab"
	shell:
		"""
		python3 {% endraw %}{{script_path}}/generate_yield.py{% raw %} {input[1]} {input[0]} {output}
		"""



# Concatenate the per-isolate yield tables into a single seqdata.tab and flag
# each isolate PASS/FAIL on its estimated depth (threshold 40).
rule combine_seqdata:
	input:
		expand("{sample}/yield.tab", sample = SAMPLE)
	output:
		"seqdata.tab"
	run:
		import pathlib, pandas, numpy
		# read one table per isolate; the isolate name is the first path component
		frames = []
		for sd in f"{input}".split():
			df = pandas.read_csv(sd, sep = "\t")
			df['Isolate'] = f"{pathlib.Path(sd).parts[0]}"
			frames.append(df)
		# DataFrame.append was removed in pandas 2.0; a single concat is the
		# supported equivalent and avoids re-copying the table for every isolate
		seqdata = pandas.concat(frames, sort = True)
		seqdata['Quality'] = numpy.where(seqdata['Estimated depth'] >= 40, 'PASS','FAIL')
		# fix the column order of the final table
		seqdata = seqdata[['Isolate','Reads','Yield','GC content','Min len','Avg len','Max len','Avg Qual','Estimated depth', 'Quality']]
		seqdata.to_csv(f"{output}", sep = '\t', index = False)
	
# Run snippy for one isolate against REFERENCE using its paired reads.
# Results go into the isolate's own directory (--outdir); the rule tracks the
# VCF and the aligned FASTA found there.
rule snippy:
	input:
		ancient('READS/{sample}/R1.fq.gz'),
		ancient('READS/{sample}/R2.fq.gz')
	output:
		'{sample}/snps.vcf',
		'{sample}/snps.aligned.fa'
	threads:
		8
	singularity:{% endraw %}"{{singularity_dir}}/snippy"{% raw %}
	shell:
		"""
		snippy --outdir {wildcards.sample} --ref {REFERENCE} --R1 {input[0]} --R2 {input[1]} --force --cpus {threads}
		"""
	

# Decide which isolates are aligned well enough to enter the core alignment.
# Each record in <sample>/snps.aligned.fa is scored as the percentage of
# positions that are not '-', 'N' or 'n'. Isolates scoring above min_aln are
# written to core_isolates.txt; the others are marked as failed in
# isolates.log when that log exists.
rule qc_snippy:
	input:
		expand('{sample}/snps.aligned.fa', sample = SAMPLE)

	output:
		'core_isolates.txt'

	run:
		from Bio import SeqIO
		import pathlib
		import pandas
		isolate_list = []
		excluded_list = []
		outfile = pathlib.Path(f"{output[0]}")
		logpath = pathlib.Path('isolates.log')
		for aln in f"{input}".split():
			p = pathlib.Path(aln)
			# the isolate name is the directory holding the alignment
			isolate = p.parts[-2]
			if not p.exists():
				continue
			# context manager so the handle is closed once the records are read
			with p.open() as fasta:
				# NOTE(review): the decision is taken per FASTA record, so with a
				# multi-sequence reference one isolate can land in both lists -
				# confirm whether a whole-isolate percentage was intended
				for record in SeqIO.parse(fasta, 'fasta'):
					length = len(record.seq)
					nocov = record.seq.count('-')
					lowcov = record.seq.count('N')
					het = record.seq.count('n')
					unaln = nocov + lowcov + het
					# an empty record has nothing aligned; avoid dividing by zero
					perc_aln = 100*(length - unaln) / length if length else 0
					if perc_aln > min_aln:
						isolate_list.append(f"{isolate}")
					else:
						excluded_list.append(isolate)
						print(f"{isolate} has been excluded from the analysis due to poor alignement with reference")

		# de-duplicate and sort so the file content is reproducible between runs
		isolate_list = sorted(set(isolate_list))
		with open(outfile, 'w') as f:
			f.write('\n'.join(isolate_list))
		# record the failures in the run log, writing the log back once
		if excluded_list and logpath.exists():
			lf = pandas.read_csv(logpath, sep = '\t', index_col = False)
			for e in excluded_list:
				lf.loc[lf['Isolate'] == e.strip('#'), 'Status'] = f"(FAILED ALIGNMENT (<{min_aln}% ALIGNMENT))"
				lf.loc[lf['Isolate'] == e.strip('#'), 'Date'] = f"{config['day']}"
			lf.to_csv(logpath, sep = '\t', index=False)

	

# Build the core SNP outputs with snippy-core from the isolate directories
# listed in core_isolates.txt (one name per line, expanded on the command line).
rule run_snippy_core:
	input:
		'core_isolates.txt'
	output:
		'core.vcf',
		'core.txt',
		'core.aln',
		'core.full.aln',
		'core.tab'
	singularity:{% endraw %}"{{singularity_dir}}/snippy"{% raw %}
	shell:
		"""
		snippy-core {% endraw %}{{mask_string}}{% raw %} --ref {REFERENCE}  $(cat {input})
		"""

# Optional step, only defined when gubbins is enabled in the config: clean the
# full alignment, run gubbins on it and extract the variable sites from the
# gubbins result into gubbins.aln.
# The core count is declared through threads so the scheduler accounts for it
# (and scales it down on smaller machines) instead of gubbins always taking 36.
if GUBBINS:

	rule run_gubbins:
		input:
			'core.full.aln'
		output:
			'clean.full.aln',
			'gubbins.aln'
		threads:
			36
		shell:
			"""
			snippy-clean_full_aln {input} > {output[0]}
			run_gubbins.py -c {threads}  --prefix core {output[0]}
			snp-sites -c core.filtered_polymorphic_sites.fasta > {output[1]}
			"""

# Run snp-dists on the selected core alignment (CORE_OUTPUT, i.e. the gubbins
# alignment when gubbins is enabled) and store its output in distances.tab.
rule run_snpdists:
	input:
		CORE_OUTPUT
	output:
		'distances.tab' 
	singularity:{% endraw %}"{{singularity_dir}}/snippy"{% raw %}
	shell:
		"""
		snp-dists {input} > {output}
		"""
	

# Provide the reference as ref.fa - converted from GenBank, or symlinked when
# it is already FASTA - and index it with samtools faidx.
rule index_reference:
	input:
		REFERENCE
	output:
		"ref.fa",
		"ref.fa.fai"
	run:
		from Bio import SeqIO
		import pathlib, subprocess
		ref = f"{output[0]}"
		# test the file name only, so a '.fa' in a parent directory cannot make
		# a GenBank reference look like FASTA
		if '.fa' not in pathlib.Path(f"{REFERENCE}").name:
			print(f"converting {REFERENCE}")
			SeqIO.convert(f"{input[0]}", 'genbank', ref, 'fasta')
			print(f"converted {REFERENCE}")
		else:
			# argument lists avoid shell word-splitting on unusual paths, and
			# check=True turns a failed command into a failed job
			subprocess.run(['ln', '-sf', f"{REFERENCE}", ref], check = True)
		subprocess.run(['samtools', 'faidx', ref], check = True)


# Generate run_iqtree_core.sh by capturing the stdout of iqtree_generator.sh.
# The script receives the reference, the core alignment, the literal 'core'
# and the literal 20, in that order.
# NOTE(review): 'core' looks like an output prefix and 20 like a thread count -
# confirm against iqtree_generator.sh.
rule calculate_iqtree_command_core:
	input:
		CORE_OUTPUT,
		"ref.fa"
	output:
		'run_iqtree_core.sh'
	shell:
		"bash {% endraw %}{{script_path}}/iqtree_generator.sh{% raw %} {input[1]} {input[0]} core 20 > {output}"

	

# Execute the generated IQ-TREE script and tidy up its intermediate files.
# rm -f is used because the shell runs in strict mode: a plain rm would fail
# the whole job whenever one of the patterns matches no file.
rule run_iqtree_core:
	input:
		'run_iqtree_core.sh'

	output:
		'core.iqtree',
		'core.treefile',

	singularity:{% endraw %}"{{singularity_dir}}/iqtree"{% raw %}
	shell:
		"""
		bash {input}

		rm -f *.ckp.gz *.contree *.bionj
		"""
		
	

# Produce <sample>/<sample>.fa for one isolate.
# If a contigs.fa already exists under the prefill path for this isolate it is
# copied; otherwise the reads are assembled with shovill (--minlen 500) and the
# resulting contigs.fa is moved to the output name.
rule assemble:
	input:
		ancient('READS/{sample}/R1.fq.gz'),
		ancient('READS/{sample}/R2.fq.gz')
	output:
		'{sample}/{sample}.fa'
	threads:
		16
	singularity:{% endraw %}"{{singularity_dir}}/assemblers"{% raw %}
	shell:
		"""
		ASSEMBLEPATH={% endraw %}{{prefill_path}}{% raw %}{wildcards.sample}
		if [ -f $ASSEMBLEPATH/contigs.fa ]; then
			cp $ASSEMBLEPATH/contigs.fa {output}

		else
			echo No assembly found. Assembling {wildcards.sample} with shovill
			
			shovill --outdir {wildcards.sample} --R1 {input[0]} --R2 {input[1]} --force --minlen 500 --cpus {threads}
			mv {wildcards.sample}/contigs.fa {output}

		fi		
		"""
	

# Run abricate on one isolate's assembly and store its table as
# <sample>/resistome.tab.
rule resistome:
	input:
		'{sample}/{sample}.fa'
	output:
		'{sample}/resistome.tab'
	singularity:{% endraw %}"{{singularity_dir}}/abricate"{% raw %}
	shell:
		"""
		abricate --nopath {input} > {output}
		"""

	

# Run mlst once over all assemblies. The first sed strips every '.fa' from the
# output and the second inserts a tab-separated header line
# (Isolate, Scheme, ST, Alleles) before the first row.
rule mlst:
	input:
		expand('{sample}/{sample}.fa', sample = SAMPLE)
	output:
		'mlst.tab'
	singularity:{% endraw %}"{{singularity_dir}}/mlst"{% raw %}
	shell:
		"""
		mlst --nopath {input} | sed 's/\.fa//g' | sed '1iIsolate\tScheme\tST\tAlleles' > {output}
		"""

	
 

# Summarise the per-isolate abricate tables into one resistome.tab.
# The sed calls remove the '/resistome.tab' suffix from the file names and
# rename the '#FILE' header to 'Isolate'.
rule combine_results:
	input:
		expand('{sample}/resistome.tab', sample = SAMPLE)
		
	output:
		'resistome.tab'
		
	singularity:{% endraw %}"{{singularity_dir}}/abricate"{% raw %}
	shell:
		"""
		abricate --summary {input} | sed 's/\/resistome.tab//g' | sed 's/\#FILE/Isolate/g' > {output[0]}
		
		"""
	

# Run assembly_stat.py over all assemblies with '-m 500' and capture its stdout
# as denovo.tab.
# NOTE(review): -m 500 presumably mirrors the 500 bp minimum contig length used
# when assembling - confirm against assembly_stat.py.
rule assembly_statistics:
	input:
		expand("{sample}/{sample}.fa", sample = SAMPLE)
	output:
		"denovo.tab"
	shell:
		"""
		 python3 {% endraw %}{{script_path}}/assembly_stat.py{% raw %} {input} -m 500 > {output}
		"""
	


# Annotate one assembly with prokka into prokka/<sample>/. The .gff is used by
# run_roary and the .txt summary by combine_assembly_metrics.
rule run_prokka:
	input:
		"{sample}/{sample}.fa"
	output:
		"prokka/{sample}/{sample}.gff",
		"prokka/{sample}/{sample}.txt"
	singularity:{% endraw %}"{{singularity_dir}}/prokka"{% raw %}
	shell:
		"""
		prokka --outdir prokka/{wildcards.sample} --prefix {wildcards.sample} --mincontiglen 500 --notrna --fast --force {input}
		"""

# Run roary over all prokka GFF files. After the run, the contents of any
# roary_* directory are moved into roary/ and those directories are removed.
# NOTE(review): presumably roary writes to a suffixed roary_* directory because
# roary/ already exists when the job starts - confirm.
rule run_roary:
    input:
        expand("prokka/{sample}/{sample}.gff", sample = SAMPLE)
    output:
        "roary/gene_presence_absence.csv", "roary/summary_statistics.txt"
    threads:
        36
    singularity:
        "{% endraw %}{{singularity_dir}}/roary"{% raw %}
    shell:
        """
        roary -p {threads} -f roary {input}
        mv roary_*/* roary
        rm -r roary_*
        """

# Draw the pan-genome figure: roary2svg.pl reads the gene presence/absence
# table and its stdout is saved as pan_genome.svg.
rule pan_figure:
    input:
        "roary/gene_presence_absence.csv"
    output:
        "pan_genome.svg"
    shell:
        """
        perl {% endraw %}{{script_path}}/{% raw %}roary2svg.pl {input} > {output}
        """

# Join the prokka feature counts (CDS and rRNA rows) onto the assembly
# statistics and write the result, keyed by isolate, to assembly.tab.
rule combine_assembly_metrics:
	input:
		prokka = expand("prokka/{sample}/{sample}.txt",sample = SAMPLE), 
		assembly = "denovo.tab"
	output:
		"assembly.tab"
	run:
		import pandas, pathlib

		# every prokka summary is read as a colon-separated two-column table;
		# the value column is named after the isolate directory
		counts = pandas.DataFrame()
		for summary in f"{input.prokka}".split():
			summary_path = pathlib.Path(summary)
			isolate_counts = pandas.read_csv(summary_path, sep = ':', header = None, names = ['cond', f"{summary_path.parts[1]}"])
			counts = isolate_counts if counts.empty else counts.merge(isolate_counts, how = 'outer')

		# keep the CDS and rRNA rows, then transpose so each isolate is a row
		# and promote the former 'cond' row to column headers
		counts = counts[counts['cond'].isin(['CDS', 'rRNA'])].T
		counts.columns = counts.iloc[0]
		counts = counts.iloc[1:]

		stats = pandas.read_csv(pathlib.Path(f"{input.assembly}"), sep = '\t')

		# match the assembly 'Name' column against the isolate names in the index
		assembly = stats.merge(counts, left_on = ['Name'], right_on= counts.index)
		assembly = assembly.rename(columns={'Name':'Isolate'})
		assembly.to_csv(f"{output}", sep = '\t', index = False)



 

# Gather everything the HTML report needs under report/: two tables are
# recomputed here (core genome usage and assembly quality) and the remaining
# files are copied unchanged.
rule collate_report:
	input:{% endraw %}
		'seqdata.tab', 'assembly.tab', 'mlst.tab', 'resistome.tab', 'core.txt', 'core.treefile', 'core.tab', 'distances.tab', 'pan_genome.svg','roary/summary_statistics.txt',{{species_summary}}
	output:
		'report/seqdata.tab', 'report/assembly.tab', 'report/mlst.tab',  'report/resistome.tab', 'report/core_genome.tab', 'report/core.treefile','report/distances.tab','report/core.tab','report/pan_genome.svg', 'report/summary_statistics.txt', 
		{{species_report}}{% raw %}
	run:
		import pandas, pathlib, subprocess, numpy

		# core.txt: add the percentage of the reference used for each isolate
		df = pandas.read_csv(pathlib.Path("core.txt"), sep = '\t')
		df['% USED'] = 100 * (df['LENGTH'] - df['UNALIGNED'])/ df['LENGTH']
		df['% USED'] = df['% USED'].round(2)
		df = df.rename(columns={'ID':'Isolate'})
		df.to_csv("report/core_genome.tab", sep='\t', index = False)

		# assembly.tab: isolates whose contig count exceeds mean + 2 SD are
		# flagged FAIL; the prokka count columns get report-friendly names
		dfass = pandas.read_csv(pathlib.Path("assembly.tab"), sep = '\t')
		cut = dfass['# Contigs'].mean() + (2* dfass['# Contigs'].std())
		dfass['Quality'] = numpy.where(dfass['# Contigs'] <= cut, 'PASS','FAIL')
		dfass = dfass.rename(columns={'rRNA':'# rRNA', 'CDS':'# CDS'})
		dfass.to_csv("report/assembly.tab", sep= '\t', index=False)

		# plain copies; 'set -e' stops at the first failing command and
		# check=True below then fails the job instead of hiding the error
		cmd = f"""
set -e
cp seqdata.tab report/seqdata.tab
cp core.treefile report/core.treefile
cp distances.tab report/distances.tab
cp core.tab report/core.tab
cp mlst.tab report/mlst.tab
cp resistome.tab report/resistome.tab
cp pan_genome.svg report/pan_genome.svg
cp roary/summary_statistics.txt report/summary_statistics.txt
{% endraw %}{{copy_species_id}}

"""
		subprocess.run(cmd, shell = True, check = True)




# Build report/report.html by running write_report.py once all collated report
# files exist. The script arguments (working directory, template path, the
# literal 'all', job id, assembler and kraken flag) are filled in when this
# template is rendered, not by Snakemake.
rule write_html_report:
	input:
		'report/seqdata.tab', 'report/assembly.tab','report/mlst.tab', 'report/resistome.tab', 'report/core_genome.tab', 'report/core.treefile', 'report/distances.tab','report/pan_genome.svg', 'report/summary_statistics.txt' ,{{species_report}}
	output:
		'report/report.html'
	
	shell:
		"""
		python3 {{script_path}}/write_report.py {{workdir}} {{template_path}} all {{job_id}} {{assembler}} {{run_kraken}}
		"""

