#!/bin/bash
#SBATCH --partition=PARTITION
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=THREADS_VALUE
#SBATCH --mem=MEMORY
#SBATCH --time=TIME
#SBATCH --job-name=emtax
#SBATCH --output=WORKDIR_PATH/emtax_%j.out
#SBATCH --error=WORKDIR_PATH/emtax_%j.err

# emtax workflow script
# Generated by emtax

# Record the start of the run in the job log.
echo "Starting emtax workflow at $(date)"

# Set variables
# All settings are exported so that programs started by this script inherit
# them through the environment.
# NOTE(review): the values on the right-hand side look like template tokens
# that are filled in when emtax generates this script -- confirm against the
# generator before editing these lines.
# Directory the script changes into; it is also where the helper script
# under scripts/ is looked up.
export WORKDIR="WORKDIR_PATH"
# Directory containing the paired-end *_L001_R1/_R2.fastq.gz input reads.
export RAWDATA_DIR="RAWDATA_DIR_PATH"
# Root directory under which the Taxonomic_Profiling output tree is created.
export RESULTS_DIR="RESULTS_DIR_PATH"
# Database directory handed to both kraken2 (--db) and bracken (-d).
export KRAKEN_DB="KRAKEN_DB_PATH"
# Not referenced anywhere else in the visible script; presumably consumed by
# a child process -- TODO confirm.
export CORN_DB="CORN_DB_PATH"
# Thread count passed to kraken2 via --threads.
export THREADS=THREADS_VALUE

# Check if conda is available and, if so, activate the emtax_env environment.
# Every step is best-effort: a problem is reported as a warning and the
# workflow carries on with whatever tools are already on PATH.
if command -v conda &> /dev/null; then
    echo "Conda is available"

    # Initialize conda for bash: conda.sh defines the shell function that
    # makes `conda activate` work in a non-interactive shell.
    if CONDA_BASE=$(conda info --base) && [[ -r "${CONDA_BASE}/etc/profile.d/conda.sh" ]]; then
        # shellcheck source=/dev/null
        source "${CONDA_BASE}/etc/profile.d/conda.sh"
    else
        echo "WARNING: Could not find conda.sh under '${CONDA_BASE}', conda activate may fail"
    fi

    # Activate conda environment. The attempt is made even when conda.sh was
    # not sourced, because conda may already be set up in this shell.
    echo "Activating conda environment: emtax_env"
    if ! conda activate emtax_env; then
        # Without this check a missing or broken environment would go
        # unnoticed until a later tool lookup fails.
        echo "WARNING: Failed to activate conda environment emtax_env, proceeding without environment activation"
    fi
else
    echo "WARNING: Conda not available, proceeding without environment activation"
fi

# Check network connectivity
# One ICMP echo request to 8.8.8.8 decides which of the two messages is
# logged; ping's own output is discarded.
if ! ping -c 1 8.8.8.8 > /dev/null 2>&1; then
    echo "WARNING: Network is unreachable, using offline mode"
else
    echo "Network is reachable, using online mode"
fi

# Change to workflow directory; abort if that is not possible.
cd "${WORKDIR}" || { echo "ERROR: Failed to change to workflow directory"; exit 1; }

echo "Processing samples from config file: ${WORKDIR}/config.yaml"

# Create output directories
# One sub-directory per pipeline stage under Taxonomic_Profiling. A failure
# here is fatal, like the failed cd above, because no later step could write
# its results.
profiling_dir="${RESULTS_DIR}/Taxonomic_Profiling"
for stage_dir in \
    1_DNA_Kraken2 \
    2_DNA_Bracken \
    3_DNA_Krona \
    4_DNA_Krona_HTML \
    5_DNA_Relative_Abundance_Matrix_Python; do
    mkdir -p "${profiling_dir}/${stage_dir}" || {
        echo "ERROR: Failed to create output directory ${profiling_dir}/${stage_dir}"
        exit 1
    }
done

# Process each sample
# For every sample the paired-end reads are classified with kraken2 and the
# resulting report is then passed to bracken for abundance estimation.
# Problems are reported per sample and the loop moves on to the next one.
kraken_dir="${RESULTS_DIR}/Taxonomic_Profiling/1_DNA_Kraken2"
bracken_dir="${RESULTS_DIR}/Taxonomic_Profiling/2_DNA_Bracken"
for SAMPLE in SAMPLES; do
    echo "Processing sample: $SAMPLE"

    kraken_report="${kraken_dir}/${SAMPLE}.report"
    # Set to 1 only when kraken2 was started for this sample and failed.
    kraken_failed=0

    # Run kraken2 if available
    if command -v kraken2 &> /dev/null; then
        echo "Running kraken2 for taxonomic classification for ${SAMPLE}..."
        if ! kraken2 --db "${KRAKEN_DB}" \
                --threads "${THREADS}" \
                --paired \
                --output "${kraken_dir}/${SAMPLE}.kraken" \
                --report "${kraken_report}" \
                "${RAWDATA_DIR}/${SAMPLE}_L001_R1.fastq.gz" \
                "${RAWDATA_DIR}/${SAMPLE}_L001_R2.fastq.gz"; then
            echo "WARNING: kraken2 failed for ${SAMPLE}"
            kraken_failed=1
        fi
    else
        echo "WARNING: kraken2 not available, skipping taxonomic classification for ${SAMPLE}"
    fi

    # Run bracken if available
    if (( kraken_failed )); then
        # A report file found after a failed kraken2 run is either left over
        # from an earlier run or incomplete; feeding it to bracken would
        # produce abundances that do not belong to this run.
        echo "WARNING: kraken2 failed for ${SAMPLE}, skipping abundance estimation to avoid using a stale or incomplete report"
    elif command -v bracken &> /dev/null && [[ -f "${kraken_report}" ]]; then
        echo "Running bracken for abundance estimation for ${SAMPLE}..."
        # -r: read length, -l S: species level, -t: minimum read threshold
        # (see the Bracken documentation).
        bracken -d "${KRAKEN_DB}" \
                -i "${kraken_report}" \
                -o "${bracken_dir}/${SAMPLE}.bracken" \
                -r 150 -l S -t 10 || echo "WARNING: bracken failed for ${SAMPLE}"
    else
        echo "WARNING: bracken not available or kraken2 report missing, skipping abundance estimation for ${SAMPLE}"
    fi
done

# Generate abundance matrix
# Combines all per-sample bracken outputs into one CSV by calling the
# create_abundance_matrix.py helper from the workflow directory.
echo "Generating abundance matrix..."

# Collect the bracken outputs explicitly. When nothing matches, bash leaves
# the glob pattern in place as a literal string, which would otherwise be
# passed to the Python script as if it were a real file name.
bracken_files=()
for bracken_file in "${RESULTS_DIR}"/Taxonomic_Profiling/2_DNA_Bracken/*.bracken; do
    if [[ -e "${bracken_file}" ]]; then
        bracken_files+=("${bracken_file}")
    fi
done

if (( ${#bracken_files[@]} == 0 )); then
    echo "WARNING: No bracken output files found, skipping abundance matrix generation"
else
    python "${WORKDIR}/scripts/create_abundance_matrix.py" \
        --input_files "${bracken_files[@]}" \
        --output "${RESULTS_DIR}/Taxonomic_Profiling/5_DNA_Relative_Abundance_Matrix_Python/abundance_matrix.csv" || \
        echo "WARNING: Failed to generate abundance matrix"
fi

echo "emtax workflow finished at $(date)"
