# Workflow-wide settings: working directory, configuration file and the rules
# that always run on the submitting host rather than being dispatched.
workdir: '{{workdir}}'
configfile: 'config.yaml'
localrules: all, generate_yield, combine_seqdata, qc_snippy, index_reference, calculate_iqtree_command_core, collate_report, write_html_report

# The isolates are supplied in the config as one whitespace-separated string.
SAMPLE = config['isolates'].split()

# Reference passed to snippy / snippy-core and indexed by index_reference.
REFERENCE = config['reference']
# Threshold (percent) compared against each isolate's alignment in qc_snippy.
min_aln = int(config['min_perc'])

# When truthy, the run_gubbins rule is defined and its alignment is the one
# fed to the distance and tree rules; otherwise the snippy-core alignment is.
GUBBINS = config['gubbins']
CORE_OUTPUT = 'gubbins.aln' if GUBBINS else 'core.aln'
	

# Default target: requests every final product of the workflow.
rule all:
	input:{% raw %}
		# read statistics, per isolate and combined
		expand("{sample}/seqdata.tab", sample=SAMPLE),
		"report/seqdata.tab",
		# reference copy and its index
		"ref.fa",
		"ref.fa.fai",
		# per-isolate snippy results
		expand("{sample}/snps.vcf", sample=SAMPLE),
		expand("{sample}/snps.aligned.fa", sample=SAMPLE),
		# core genome, distances and tree
		"core.vcf",
		"distances.tab",
		"core.treefile",
		# files gathered under report/
		"report/core_genome.tab",
		"report/core.treefile",
		"report/distances.tab",
		"report/core.tab",
		"report/report.html",{% endraw %}
		{{kraken_output}}

{{kraken_rule}}
{{kraken_summary}}
{% raw %}

# Runs seqtk fqchk on an isolate's reads and stores the text output as
# <sample>/seqdata.tab.
# NOTE(review): both read files are given on the command line; confirm that
# seqtk fqchk really consumes more than one input file.
rule seqdata:
	input:
		r1 = ancient("READS/{sample}/R1.fq.gz"),
		r2 = ancient("READS/{sample}/R2.fq.gz")
	output:
		"{sample}/seqdata.tab"
	singularity:{% endraw %}"{{singularity_dir}}/seqtk"{% raw %}
	shell:
		"""
		seqtk fqchk {input.r1} {input.r2} > {output}
		"""


# Sketches an isolate's reads with mash and captures everything mash prints
# (stdout and stderr) in <sample>/mash.txt, which generate_yield reads.
rule estimate_coverage:
	input:
		ancient("READS/{sample}/R1.fq.gz"),
		ancient("READS/{sample}/R2.fq.gz")
	output:
		"{sample}/mash.txt"
	singularity:{% endraw %}"{{singularity_dir}}/mash_kmc"{% raw %}
	shell:
		# The sketch itself is written under the isolate's own directory; a
		# shared prefix in the working directory would make concurrently
		# running samples overwrite each other's sketch file.
		"""
		mash sketch -r {input[0]} {input[1]} -m 3 -k 31 -o {wildcards.sample}/mash  &> {output}
		"""


# Calls generate_yield.py with the seqtk table, the mash log and the output
# path (in that order) to produce <sample>/yield.tab.
rule generate_yield:
	input:
		mash = "{sample}/mash.txt",
		seqdata = "{sample}/seqdata.tab"
	output:
		"{sample}/yield.tab"
	shell:
		"""
		python3 {% endraw %}{{script_path}}/generate_yield.py{% raw %} {input.seqdata} {input.mash} {output}
		"""



# Merges the per-isolate yield tables into a single seqdata.tab, tagging each
# row with its isolate and a PASS/FAIL flag based on the estimated depth.
rule combine_seqdata:
	input:
		expand("{sample}/yield.tab", sample = SAMPLE)
	output:
		"seqdata.tab"
	run:
		import pathlib, pandas, numpy
		frames = []
		# iterate the input files directly; splitting their joined string on
		# whitespace would break any path that contains a space
		for sd in input:
			df = pandas.read_csv(str(sd), sep = "\t")
			# the isolate name is the directory that holds the yield table
			df['Isolate'] = pathlib.Path(str(sd)).parts[0]
			frames.append(df)
		# DataFrame.append no longer exists in pandas >= 2.0; concat is the
		# supported way to stack the tables
		seqdata = pandas.concat(frames, sort = True)
		# isolates with an estimated depth below 40 are flagged as FAIL
		seqdata['Quality'] = numpy.where(seqdata['Estimated depth'] >= 40, 'PASS','FAIL')
		seqdata = seqdata[['Isolate','Reads','Yield','GC content','Min len','Avg len','Max len','Avg Qual','Estimated depth', 'Quality']]
		seqdata.to_csv(f"{output}", sep = '\t', index = False)
	
# Runs snippy for one isolate against REFERENCE, writing into the isolate's
# own directory (--force lets snippy reuse a directory that already exists).
rule snippy:
	input:
		r1 = ancient("READS/{sample}/R1.fq.gz"),
		r2 = ancient("READS/{sample}/R2.fq.gz")
	output:
		"{sample}/snps.vcf",
		"{sample}/snps.aligned.fa"
	threads: 8
	singularity:{% endraw %}"{{singularity_dir}}/snippy"{% raw %}
	shell:
		"""
		snippy --outdir {wildcards.sample} --ref {REFERENCE} --R1 {input.r1} --R2 {input.r2} --force --cpus {threads}
		"""
	

# Selects the isolates that aligned well enough to the reference.
# Writes the accepted isolate names (one per line) to core_isolates.txt and,
# when isolates.log exists, marks every rejected isolate in that log.
rule qc_snippy:
	input:
		expand('{sample}/snps.aligned.fa', sample = SAMPLE)
	output:
		'core_isolates.txt'
	run:
		from Bio import SeqIO
		import pathlib
		import pandas
		isolate_list = []
		excluded_list = []
		outfile = pathlib.Path(f"{output[0]}")
		# set the log path
		logpath = pathlib.Path('isolates.log')
		for aln in input: # for each input file
			p = pathlib.Path(str(aln))
			# the isolate name is the directory holding snps.aligned.fa
			isolate = p.parts[-2]
			if not p.exists():
				continue
			# Accumulate over every record so that an isolate is judged once,
			# on its whole alignment. Judging record by record could place the
			# same isolate in both the accepted and the excluded list when the
			# reference has more than one sequence.
			total_length = 0
			total_unaln = 0
			with p.open() as fasta: # closed again as soon as parsing is done
				for record in SeqIO.parse(fasta, 'fasta'):
					nocov = record.seq.count('-')
					lowcov = record.seq.count('N')
					het = record.seq.count('n')
					total_length += len(record.seq)
					total_unaln += nocov + lowcov + het
			# an alignment without any bases counts as 0% aligned
			perc_aln = 100 * (total_length - total_unaln) / total_length if total_length else 0
			# keep the isolate only if it is strictly above the minimum
			if perc_aln > min_aln:
				isolate_list.append(f"{isolate}")
			else:
				excluded_list.append(isolate)
				print(f"{isolate} has been excluded from the analysis due to poor alignement with reference")

		# drop duplicate names while keeping the input order
		isolate_list = list(dict.fromkeys(isolate_list))
		with open(outfile, 'w') as f:
			f.write('\n'.join(isolate_list))
		# update the log if the excluded list has any isolates in it
		if excluded_list and logpath.exists():
			lf = pandas.read_csv(logpath, sep = '\t', index_col = False)
			for e in excluded_list:
				rows = lf['Isolate'] == e.strip('#')
				lf.loc[rows, 'Status'] = f"(FAILED ALIGNMENT (<{min_aln}% ALIGNMENT))"
				lf.loc[rows, 'Date'] = f"{config['day']}"
			# write the log once, after all updates have been applied
			lf.to_csv(logpath, sep = '\t', index=False)

	

# Runs snippy-core on the isolate directories listed in core_isolates.txt.
# The declared outputs are the core.* files the command is expected to leave
# in the working directory.
rule run_snippy_core:
	input:
		"core_isolates.txt"
	output:
		"core.vcf",
		"core.txt",
		"core.aln",
		"core.full.aln",
		"core.tab"
	singularity:{% endraw %}"{{singularity_dir}}/snippy"{% raw %}
	shell:
		"""
		snippy-core {% endraw %}{{mask_string}}{% raw %} --ref {REFERENCE} $(cat {input})
		"""

if GUBBINS:
	# Cleans the full snippy-core alignment, runs gubbins on it and extracts
	# the polymorphic sites into gubbins.aln.
	# NOTE(review): unlike the other tool rules this one declares no
	# singularity image - confirm the tools are available on the host.
	rule run_gubbins:
		input:
			'core.full.aln'
		output:
			'clean.full.aln',
			'gubbins.aln'
		# Declared so the scheduler accounts for the CPUs gubbins uses;
		# Snakemake lowers the value when fewer cores are available.
		threads:
			36
		shell:
			"""
			snippy-clean_full_aln {input} > {output[0]}
			run_gubbins.py -c {threads}  --prefix core {output[0]}
			snp-sites -c core.filtered_polymorphic_sites.fasta > {output[1]}
			"""

# Runs snp-dists on the selected core alignment (CORE_OUTPUT) and stores its
# output as distances.tab.
rule run_snpdists:
	input:
		CORE_OUTPUT
	output:
		"distances.tab"
	singularity:{% endraw %}"{{singularity_dir}}/snippy"{% raw %}
	shell:
		"snp-dists {input} > {output}"
	

# Provides the reference as ref.fa (converted from GenBank, or symlinked when
# it already looks like FASTA) and indexes it with samtools faidx.
rule index_reference:
	input:
		REFERENCE
	output:
		"ref.fa",
		"ref.fa.fai"
	run:
		from Bio import SeqIO
		import pathlib, subprocess
		ref = f"{output[0]}"
		source = f"{input[0]}"
		# Only the file name decides the format; a '.fa' that merely occurs
		# in a directory name must not make a GenBank file look like FASTA.
		if '.fa' not in pathlib.Path(source).name:
			print(f"converting {REFERENCE}")
			SeqIO.convert(source, 'genbank', ref, 'fasta')
			print(f"converted {REFERENCE}")
		else:
			# argument lists keep paths with spaces intact, and check=True
			# turns a failing command into a failing rule
			subprocess.run(['ln', '-sf', source, ref], check = True)
		subprocess.run(['samtools', 'faidx', ref], check = True)


# Captures the output of iqtree_generator.sh (given the reference, the core
# alignment and the literal arguments "core" and "20") as run_iqtree_core.sh.
rule calculate_iqtree_command_core:
	input:
		aln = CORE_OUTPUT,
		ref = "ref.fa"
	output:
		"run_iqtree_core.sh"
	shell:
		"""
		bash {% endraw %}{{script_path}}/iqtree_generator.sh{% raw %} {input.ref} {input.aln} core 20 > {output}
		"""

	

# Executes the generated IQ-TREE script and removes intermediate files.
rule run_iqtree_core:
	input:
		'run_iqtree_core.sh'
	output:
		'core.iqtree',
		'core.treefile',
	singularity:{% endraw %}"{{singularity_dir}}/iqtree"{% raw %}
	shell:
		# rm -f: a pattern that matches nothing must not fail the rule after
		# the tree has been built successfully
		"""
		bash {input}

		rm -f *.ckp.gz *.contree *.bionj
		"""
		

# Gathers the result tables under report/: derives report/core_genome.tab
# from core.txt and copies the remaining files as they are.
rule collate_report:
	input:{% endraw %}
		'seqdata.tab', 'core.txt', 'core.treefile', 'core.tab', 'distances.tab', {{species_summary}}
	output:
		'report/seqdata.tab', 'report/core_genome.tab', 'report/core.treefile','report/distances.tab','report/core.tab', {{species_report}}{% raw %}
	run:
		import pandas, pathlib, subprocess

		# for core.txt: add the percentage of the reference that was aligned
		df = pandas.read_csv(pathlib.Path(f"core.txt"), sep = '\t')
		df['% USED'] = 100 * (df['LENGTH'] - df['UNALIGNED'])/ df['LENGTH']
		df['% USED'] = df['% USED'].round(2)
		df = df.rename(columns={'ID':'Isolate'})
		df.to_csv(f"report/core_genome.tab", sep='\t', index = False)

		# set -e together with check=True makes any failed copy fail the rule
		# instead of being silently ignored
		cmd = f"""
set -e
cp seqdata.tab report/seqdata.tab
cp core.treefile report/core.treefile
cp distances.tab report/distances.tab
cp core.tab report/core.tab
{% endraw %}{{copy_species_id}}

"""
		subprocess.run(cmd, shell = True, check = True)




# Calls write_report.py once the collated report files exist; the rule's
# declared product is report/report.html.
rule write_html_report:
	input:
		"report/seqdata.tab",
		"report/core_genome.tab",
		"report/core.treefile",
		"report/distances.tab",{{species_report}}
	output:
		"report/report.html"
	shell:
		"""
		python3 {{script_path}}/write_report.py {{workdir}} {{template_path}} s {{job_id}} no_assembler {{run_kraken}}
		"""

