'''
Before you run this snakemake workflow, please ensure that:
0. All in all: you must install snakemake at the base environment in order to run this workflow.
1. FragGeneScanRs is on your PATH and fully functional.
2. The Binny snakemake workflow runs correctly, and its conda environments and database have been initialized and have passed Binny's checks.
3. Set up the CAT database.
4. Decide on the names of your input files and provide them as a list [either through snakemake itself or through a controlling bash script].
5. Add CAT's conda environment bin to your path, with intact functionality.
6. A conda environment named "Kronatools" exists, or a krona.yml is present in the envs directory.
'''

import os
import sys
import yaml
import logging
from pathlib import Path

# min_version("6.0")

shell.executable("bash")

# ---------------------------------------------------------------------------
# Workflow-wide settings, all read from the snakemake `config` dictionary.
# ---------------------------------------------------------------------------

# User-supplied inputs.
_inputs_cfg = config['Inputs']
assembly_path = _inputs_cfg['assembly']
ALIGNMENT_FILES = _inputs_cfg['alignment']
DEPTH_TEXT_FILE = _inputs_cfg['depth_txt']

# The depth profile is optional: Path() raises TypeError for a non-path value
# such as None, which is handled as "no depth file available".
try:
    _depth_path = Path(DEPTH_TEXT_FILE)
except TypeError:
    DEPTH_FILE_EXISTS = False
else:
    DEPTH_FILE_EXISTS = _depth_path.exists()

OUTPUT_DIR = Path(config['outputdir'])
BATCH_NAME = _inputs_cfg['batch_name']

# Binny (binning step) settings and global names.
_binning_cfg = config['binning']
BINNY_OUTPUT_DIR = _binning_cfg['outputdir']
BINNY_SNAKEMAKE_ENV = _binning_cfg['snakemake_env']
BINNY_MANTIS_ENV = _binning_cfg['mantis_env']
GLOBAL_CONFIG = "config.ChloroScan.yaml"

# Helper scripts live in the "scripts" folder next to this workflow file.
WORKFLOW_DIR = Path(workflow.basedir)
# BINNY_DIR = WORKFLOW_DIR.parent.parent/"binny_Chloroscan"
SCR_DIR = WORKFLOW_DIR / "scripts"

# CAT taxonomy prediction needs both the database and the taxonomy directory.
_cat_cfg = config['cat']
CAT_DB_DIR = _cat_cfg['database']
CAT_TAXON_DIR = _cat_cfg['taxonomy']
TAXA = os.path.join(CAT_TAXON_DIR, "names.dmp")

# File-name prefixes used for generated results.
GLOBAL_PREFIX = "OUT"
GLOBAL_PREFIX_contigs = "out.CAT"

# Make sure the output directory exists before any rule runs.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Create the temporary directory.
# The configured value is always converted to a Path: an absolute `tmpdir`
# used to remain a plain string, on which the `.mkdir()` call below (and the
# `TMP_DIR/"..."` joins in the rules) would raise.
TMP_DIR = Path(config['tmpdir'])
if not TMP_DIR.is_absolute():
    # Relative temporary directories are placed inside the output directory.
    TMP_DIR = OUTPUT_DIR/TMP_DIR

TMP_DIR.mkdir(parents=True, exist_ok=True) # what if we do want to prepare data and put it here?

# ANVIO_PREPARE=config['anvio']['prepare']
# ANVIO_READS=config['anvio']['reads']

# Write the effective run settings to <outputdir>/arguments.txt so that every
# run keeps a record of the arguments it was started with.
CONFIG_RECORD = os.path.join(OUTPUT_DIR, "arguments.txt")
with open(CONFIG_RECORD, "w") as cr:
    _corgi_cfg = config['corgi']
    _bin_cfg = config['binning']
    _cluster_cfg = _bin_cfg['clustering']
    _quality_cfg = _bin_cfg['bin_quality']
    _record_lines = (
        f"assembly path: {assembly_path}",
        f"abundance profile: {ALIGNMENT_FILES}",
        f"depth profile: {DEPTH_TEXT_FILE}",
        f"output dir: {OUTPUT_DIR}",
        "contig classification settings:",
        f"\tprobability threshold: {_corgi_cfg['pthreshold']}",
        f"\tminimum contig length cutoff: {_corgi_cfg['min_length']}",
        f"\tbatch size for prediction: {_corgi_cfg['batch_size']}",
        "binning settings: # Note: by default binny automatically uses coassembly mode for the input metagenome.",
        f"\tcontig length cutoff for clustering and marker gene detection: {_bin_cfg['universal_length_cutoff']}",
        f"\tHDBSCAN epsilon range: {_cluster_cfg['epsilon_range']}",
        f"\tHDBSCAN minimum sample range: {_cluster_cfg['hdbscan_min_sample_range']}",
        f"\tMAG starting completeness: {_quality_cfg['starting_completeness']}",
        f"\tMAG minimum completeness: {_quality_cfg['min_completeness']}",
        f"\tMAG purity (1-contamination) cutoff: {_quality_cfg['purity']}",
        f"CAT database: {CAT_DB_DIR}",
    )
    # One setting per line, each terminated by a newline.
    cr.writelines(f"{_line}\n" for _line in _record_lines)


# Default target: the workflow is complete once the "ChloroScan.done" marker
# file exists in the output directory.
rule all:
    input:
        OUTPUT_DIR / "ChloroScan.done"

# The most important two rules are: corgi and binny.
# SCRDIR, needs an absolute path?
# Classify the contigs of the input assembly with CORGI and write the contigs
# selected as plastid to a FASTA file.
#
# Inputs : seqs            - the assembly given in config['Inputs']['assembly'].
# Outputs: corgi_out       - CSV written by `corgi --csv`.
#          plastid_contigs - FASTA written by corgi-fasta_filter.py; it is
#                            created empty up front so the output also exists
#                            when no plastid contig is found.
# Side effect: writes <outputdir>/corgi.summary.txt when plastid contigs exist.
#
# The probability threshold (-p) is only passed to the accession script when
# more than 100 "plastid" hits are counted in the CSV.
rule corgi_prediction:
    input:
        seqs = assembly_path
    params:
        batch_size = config['corgi']['batch_size'],
        min_length = config['corgi']['min_length'],
        p_threshold = config['corgi']['pthreshold'],
        bash_plastid_accession_script = SCR_DIR/"corgi-gather_accession.sh",
        python_fasta_writer = SCR_DIR/"corgi-fasta_filter.py",
        # Path(...) around TMP_DIR keeps this join valid for str and Path alike.
        plastid_contigs_identifier = Path(TMP_DIR)/"plastid_accession.txt",
        output_dir=OUTPUT_DIR
    conda:
        "envs/corgi_env.yml"
    output:
        corgi_out = OUTPUT_DIR/"working/corgi/corgi-prediction.csv",
        plastid_contigs = OUTPUT_DIR/"working/corgi/plastid.fasta"
        # here may change the output to a directory for file writing!
    message:
        "Using the machine learning algorithm 'CORGI' to predict contig identity. Don't rise batch size too high."
    resources:
        mem_mb=20000
    log:
        OUTPUT_DIR/"logging_info/corgi.log"
    benchmark:
        OUTPUT_DIR/"logging_info/corgi_benchmarking.tsv"
    shell:
        """
        set -e
        touch {output.plastid_contigs}
        echo "corgi --file {input.seqs} --csv {output.corgi_out} --batch-size {params.batch_size} --min-length {params.min_length} --no-save-filtered"
        corgi --file {input.seqs} --csv {output.corgi_out} --batch-size {params.batch_size} --min-length {params.min_length} --no-save-filtered &> {log}
        ls -lh {output.corgi_out}

        # Count the occurrences of "plastid" in the CSV; one of them is assumed
        # to belong to the header line.
        # grep exits non-zero when nothing matches, which would abort the rule
        # under pipefail, hence the "|| true" guard.
        header=1
        raw_plastid_contig_count=$( (grep -o "plastid" {output.corgi_out} || true) | wc -l )
        plastid_contig_count=$((raw_plastid_contig_count-header))

        if [ "$plastid_contig_count" -le 0 ]; then
            echo "No plastid contigs found, rule exit."
            exit 0
        fi

        # Only apply the probability threshold when there are many candidates.
        threshold_args=""
        if [ "$plastid_contig_count" -gt 100 ]; then
            threshold_args="-p {params.p_threshold}"
        fi

        echo "bash {params.bash_plastid_accession_script} -c {output.corgi_out} -o {params.plastid_contigs_identifier} $threshold_args"
        bash {params.bash_plastid_accession_script} -c {output.corgi_out} -o {params.plastid_contigs_identifier} $threshold_args
        echo "python {params.python_fasta_writer} --assembly {input.seqs} --accession {params.plastid_contigs_identifier} --fasta_file_name {output.plastid_contigs}"
        python {params.python_fasta_writer} --assembly {input.seqs} --accession {params.plastid_contigs_identifier} --fasta_file_name {output.plastid_contigs}

        # Summary numbers: classified contigs (CSV lines minus header) and
        # FASTA records in the plastid file.
        prediction_lines=$(wc -l <{output.corgi_out})
        contig_total=$((prediction_lines-header))

        # grep -c prints 0 but exits 1 for a file without records.
        plastid_match=$(grep -c "^>" {output.plastid_contigs} || true)

        echo "total classified contigs containing all 5 domains: $contig_total"> {params.output_dir}/corgi.summary.txt
        echo "total number of plastid contigs: $plastid_match">>{params.output_dir}/corgi.summary.txt
        """

# The two rules below are only defined when no usable depth profile was
# supplied in the config; they derive one from the alignment files instead.
if not DEPTH_FILE_EXISTS:
    # Runs call_contig_depth.sh on the plastid contigs and the alignment
    # files; results go to the working/call_contig_depth directory.
    rule contig_depth_calculation:
        input:
            assembly = rules.corgi_prediction.output.plastid_contigs,
            alignment = ALIGNMENT_FILES
        output:
            directory(OUTPUT_DIR/"working/call_contig_depth")
        params:
            script_dir = SCR_DIR,
            tmpdir = TMP_DIR,
            perl_script = SCR_DIR/"calcAvgCoverage.pl"
        conda:
            "envs/DepthCall.yml"
        message:
            "Calculate the depth of each contig in the assembly."
        log:
            OUTPUT_DIR/"logging_info/call_contig_depth.log"
        script:
            "{params.script_dir}/call_contig_depth.sh"

    # Runs merge_contig_depth.sh on that directory to produce the single
    # depth profile consumed by the binning rule.
    rule merge_contig_depth:
        input:
            # directory() is an output-only flag, so the input refers to the
            # producing rule's output rather than re-declaring the path.
            depth_files = rules.contig_depth_calculation.output[0]
        output:
            OUTPUT_DIR/"working/depth_profile.txt"
        params:
            script_dir = SCR_DIR
        conda:
            "envs/DepthCall.yml"
        message:
            "Merge the depth of each contig in the assembly."
        script:
            "{params.script_dir}/merge_contig_depth.sh"

# Bin the plastid contigs with binny.
#
# Inputs : plastid_contigs - FASTA produced by corgi_prediction.
#          depth_text      - the user-supplied depth profile if it exists,
#                            otherwise the output of merge_contig_depth.
# Output : dir_binny       - binny's output directory moved to working/binny,
#                            or an empty directory when no plastid contig
#                            was found.
#
# Requires the environment variable BINNY_DIR to point at the binny
# installation (it is read by the shell code below).
rule binny_workflow:
    input:
        plastid_contigs = rules.corgi_prediction.output.plastid_contigs,
        # Evaluated lazily, so rules.merge_contig_depth is only looked up when
        # it is actually defined (i.e. when no depth file was supplied).
        depth_text = lambda wildcards: DEPTH_TEXT_FILE if DEPTH_FILE_EXISTS else rules.merge_contig_depth.output # Binny now only takes the depth text profile as input.

    params:
        bash_create_yaml = SCR_DIR/"create_yaml.sh",
        universal_length_cutoff = config['binning']['universal_length_cutoff'],
        hdbscan_epsilon_range = config['binning']['clustering']['epsilon_range'],
        hdbscan_min_samples_range = config['binning']['clustering']['hdbscan_min_sample_range'],
        quality_min_completeness = config['binning']['bin_quality']['min_completeness'],
        quality_start_completeness = config['binning']['bin_quality']['starting_completeness'],
        quality_min_purity = config['binning']['bin_quality']['purity'],
        outputdir_within_binny = config['binning']['outputdir'], # within binny.
        binny_snakemake_env = BINNY_SNAKEMAKE_ENV,
        binny_mantis_env = BINNY_MANTIS_ENV
    output:
        dir_binny = directory(OUTPUT_DIR/"working/binny")
    message:
        "Use the binning algorithm binny to find all high-coverage bins with completeness>=72.5% and purity>=95%."
    conda:
        "envs/binny.yml"
    threads:
        workflow.cores
    shell:
        """
        set -e
        echo "Running binny with the output binny_output."
        # Check if plastids are present.
        # If so, then run binny workflow.
        if [ -s {input.plastid_contigs} ]; then
            # -f checks that the file exists; the previous -n only tested for a
            # non-empty string and therefore always succeeded.
            if [ -f "$BINNY_DIR/config/config.default.yaml" ]; then
                echo "default config file for binny found."
            else
                echo "WARNING: default config file for binny not found at $BINNY_DIR/config/config.default.yaml" >&2
            fi
            bash {params.bash_create_yaml} -a "{input.plastid_contigs}" -b "$BINNY_DIR/A2K_database" -d "$BINNY_DIR/config/config.default.yaml" -t "{input.depth_text}" -o "{params.outputdir_within_binny}"\
                -e '{params.hdbscan_epsilon_range}' -m '{params.hdbscan_min_samples_range}' -n {params.binny_snakemake_env} -c {params.quality_min_completeness}\
                -s {params.quality_start_completeness} -p {params.quality_min_purity} -u {params.universal_length_cutoff}\
                -y "$BINNY_DIR/config/config.ChloroScan.yaml"

            echo "Configuration done, now run binny snakemake workflow."
            "$BINNY_DIR/binny" -l -n "MMA_plastids" -t {threads} -r "$BINNY_DIR/config/config.ChloroScan.yaml"

            # must remove the config.yaml created at the end, so we need it to be changed with name.
            # rm $BINNY_DIR/config/config.ChloroScan.yaml # finally you have to remove it, you don't want to cause confusion.
            echo 'The binny workflow is done, bins are within the binny folder, that may need to move out.'
            # Same value as the module-level BINNY_OUTPUT_DIR, taken from params.
            mv {params.outputdir_within_binny} {output.dir_binny}
            chmod u+w {output.dir_binny}

            # make sure to remove the bam files within the intermediary folder.
            echo "The binny's output has been moved to the default output directory."
        else
            echo "No plastids present. Create an empty directory for output."
            mkdir -p {output.dir_binny}
        fi
        """

# Add a rule using metabat2 to bin MAGs. 

# Run "CAT contigs" on the formatted assembly found inside the binny output.
# The output directory is always created; CAT itself only runs when the
# assembly file exists, so an empty binny directory yields an empty CAT
# directory.
rule CAT_taxonomy_identification_plus_annotation:
    input:
        binny_output = OUTPUT_DIR/"working/binny"
    output:
        CAT_output = directory(OUTPUT_DIR/"working/CAT")
        # contig_classification=OUTPUT_DIR/"working/CAT/out.CAT.contig2classification.txt"
    params:
        prefix = GLOBAL_PREFIX_contigs,
        path_to_contigs = "intermediary/assembly.formatted.fa"
    conda:
        "envs/CAT_env.yml"
    log:
        OUTPUT_DIR/"logging_info/CAT.log"
    benchmark:
        OUTPUT_DIR/"logging_info/CAT_benchmark.tsv"
    shell:
        """
        set -e
        mkdir -p {output.CAT_output}

        # List the top-level content of the binny directory in the log.
        find {input.binny_output} -mindepth 1 -maxdepth 1 >> {log}

        # Only classify when binny produced a formatted assembly.
        if [ -f "{input.binny_output}/{params.path_to_contigs}" ]; then
            CAT contigs -c {input.binny_output}/{params.path_to_contigs} -d {CAT_DB_DIR} -t {CAT_TAXON_DIR} -o {output.CAT_output}/{params.prefix} --force >> {log}
        fi
        """

# this is the correct one.
# Run summarize_binny_results.py to produce working/summary/cross_ref.tsv.
# The binny result files are handed to the script as params (not inputs), so
# the only declared dependency is the CAT output directory.
rule documenting_binny_results:
    input:
        # bins = OUTPUT_DIR/"working/binny"/BINNY_OUTPUT_DIR/"bins",
        # directory() is an output-only flag; refer to the producing rule's
        # output instead, as krona_taxonomy_plot does.
        contig_level_annotation = rules.CAT_taxonomy_identification_plus_annotation.output.CAT_output
    params:
        bins = OUTPUT_DIR/"working/binny/bins",
        assembly_files = OUTPUT_DIR/"working/binny/intermediary/assembly.formatted.fa",
        assembly_depth = OUTPUT_DIR/"working/binny/intermediary/assembly.contig_depth.txt",
        marker_gene_gff = OUTPUT_DIR/"working/binny/intermediary/annotation_CDS_RNA_hmms_checkm.gff",
        names_dump=TAXA,
        MMA_summary=OUTPUT_DIR/"MMA.summary.txt",
        ANNOT_FILE="out.CAT.contig2classification.txt",
        script_dir = SCR_DIR
    output:
        OUTPUT_DIR/"working/summary/cross_ref.tsv",
    benchmark:
        OUTPUT_DIR/"logging_info/summarize_binny.tsv"
    conda:
        "envs/documenting_binny.yml"
    message:
        "Run python script summarize_binny_results.py to record each bin's contig information."
    script:
        "{params.script_dir}/summarize_binny_results.py"

# Run refine_bins.py on the binny bins, using the cross-reference table and
# the CAT predictions. Produces a directory of refined bins and a per-contig
# summary text file.
rule refine_bins:
    input:
        cross_ref = OUTPUT_DIR / "working/summary/cross_ref.tsv",
        original_bins = OUTPUT_DIR / "working/binny",
        CAT_prediction = OUTPUT_DIR / "working/CAT",
    output:
        refine_dir = directory(OUTPUT_DIR / "working/refined_bins"),
        summary_info = OUTPUT_DIR / "working/refinement_contig_summary.txt"
    params:
        # presumably the NCBI taxonomy id of Bacteria - confirm in refine_bins.py
        BACTERIA_ID = "2",
        script_dir = SCR_DIR
    log:
        OUTPUT_DIR / "logging_info/refinement.log"
    conda:
        "envs/refinement.yml"
    message:
        "Identify potential contaminations in each plastid MAG. Users can decide to use the refined or binny-generated bins."
    script:
        "{params.script_dir}/refine_bins.py"

# Run visualization.py on the cross-reference table and collect the figures
# in working/visualizations.
rule visualize_results:
    input:
        OUTPUT_DIR / "working/summary/cross_ref.tsv",
    output:
        directory(OUTPUT_DIR / "working/visualizations")
    params:
        # The batch name is passed positionally (first params entry).
        BATCH_NAME,
        script_dir = SCR_DIR,
        # NOTE(review): despite its name this points at working/binny, not at
        # working/refined_bins - confirm this is intended.
        refine_bins_dir = OUTPUT_DIR / "working/binny",
    log:
        OUTPUT_DIR / "logging_info/visualizations.log"
    conda:
        "envs/visualization.yml"
    script:
        "{params.script_dir}/visualization.py"

# Run cds-extraction.sh on the binny output directory; results are written
# to working/cds-extraction.
rule cds_extraction:
    input:
        binny_dir = OUTPUT_DIR / "working/binny",
    output:
        CDS_EXTRACTION = directory(OUTPUT_DIR / "working/cds-extraction")
    params:
        batch_name = BATCH_NAME,
        gff_file_flag = "GFFs",
        script_dir = SCR_DIR,
        path_fraggenescanrs = OUTPUT_DIR / "working/FragGeneScanRs",
    conda:
        "envs/gffread.yml"
    message:
        "Use the fast ORF-predictor FragGeneScan Rust to predict gene contents of the MAGs, then extract the cds via gffread."
    script:
        "{params.script_dir}/cds-extraction.sh"

# Build a Krona chart (Krona.html) from the CAT contig classification.
# When the classification file is missing or empty, an empty Krona.html is
# created instead so that the output still exists.
# NOTE(review): the two intermediate tables are written into the CAT
# directory, i.e. into this rule's own input.
rule krona_taxonomy_plot:
    input:
        CAT_pred_whole_data = rules.CAT_taxonomy_identification_plus_annotation.output.CAT_output,
        # summary_info=rules.documenting_binny_results.output,
    output:
        OUTPUT_DIR / "Krona.html"
    params:
        file_to_remodel = "out.CAT.contig2classification.txt",
        file_after_remodel = "out.CAT.krona.intake.tsv",
        intake_krona = "out.CAT.krona.intake2.tsv",
        script_dir = SCR_DIR,
        batch_name = BATCH_NAME
    conda:
        "envs/krona.yml"
    message:
        "essure that in your env there must have pandas, click and of course, Kronatools."
    shell:
        """
        set -e

        if [ -s {input.CAT_pred_whole_data}/{params.file_to_remodel} ]; then
            # Reformat the CAT table, drop its first line, then hand it to Krona.
            python {params.script_dir}/create_taxonomy_profile.py --cat_prediction_txt {input.CAT_pred_whole_data}/{params.file_to_remodel} --output {input.CAT_pred_whole_data}/{params.file_after_remodel}
            sed 1d {input.CAT_pred_whole_data}/{params.file_after_remodel} > {input.CAT_pred_whole_data}/{params.intake_krona}
            ktImportTaxonomy -m 1 -o {output} {input.CAT_pred_whole_data}/{params.intake_krona}
        else
            # Nothing to plot: leave an empty placeholder chart.
            echo "No contig predictions because previous rules did not find any plastid contigs."
            touch {output}
        fi
        """

# if ANVIO_PREPARE:
#     rule reads_remapping:
#         input:
#             assembly = rules.corgi_prediction.output.plastid_contigs,
#             reads = ANVIO_READS # Directory of reads.
#         conda:
#             "envs/prep_anvio.yml"
#         output:
#             directory(OUTPUT_DIR/"working/anvio_prepare")
#         message:
#             "Remap the reads to filtered assemblies and generate bam files for anvio outputs."
#         shell:
#         """
#         mkdir -p {output}
#         minimap2 -d {output}/assembly.mmi {input.assembly}

#         # only map paired-end reads.
#         for f in {input.reads}/*_1.fastq.gz; do
#             basename=$(basename "$f" _1.fastq.gz)
#             reads2="{input.reads}/${basename}_2.fastq.gz"
#             output_sam="{output}/${basename}.sam"
#             output_bam="{output}/${basename}.bam"
#             output_sorted_bam="{output}/${basename}.sorted.bam"
#             minimap2 -ax sr {output}/assembly.mmi $f $reads2 > $output_sam
#             samtools view -bS $output_sam > $output_bam
#             samtools sort $output_bam -o $output_sorted_bam --threads 16
#             rm $output_sam
#             rm $output_bam
#         done
#         """

# Final rule: create the ChloroScan.done marker once every result of the
# workflow exists and is non-empty.
#
# If any result is missing or empty, all offending paths are written to the
# log and the rule fails explicitly. Previously the shell code ended
# successfully without creating the marker, so the run stopped on a
# missing-output error that did not say which result was the problem.
rule check_done:
    input:
        OUTPUT_DIR/"working/visualizations",
        # Same path as working/refined_bins; it was previously listed twice.
        rules.refine_bins.output.refine_dir,
        OUTPUT_DIR/"working/binny",
        OUTPUT_DIR/"working/CAT",
        OUTPUT_DIR/"working/cds-extraction",
        OUTPUT_DIR/"working/summary/cross_ref.tsv",
        OUTPUT_DIR/"Krona.html",
        # anvio_flag = ANVIO_PREPARE
    log:
        OUTPUT_DIR/"logging_info/check_done.log"
    output:
        OUTPUT_DIR/"ChloroScan.done"
    shell:
        """
        # check if all files are present, if yes, then touch the done file.
        flag=0
        for f in {input}; do
            if [ ! -s "$f" ]; then
                echo "Missing or empty result: $f" >> {log}
                flag=1
            fi
        done

        echo "Flag is $flag" >> {log}

        if [ "$flag" -ne 0 ]; then
            echo "ChloroScan did not complete: some results are missing or empty, see {log}." >&2
            exit 1
        fi

        touch {output}
        """