diff --git a/rules/evaluate.smk b/rules/evaluate.smk
index d82551a2c4fe6bb375d2c00573970d6673e78db1..f996a0c2836ab02dfd95b54537754f14bb5c3b0a 100644
--- a/rules/evaluate.smk
+++ b/rules/evaluate.smk
@@ -1,27 +1,28 @@
-localrules: symlink_UnzippedFastq_R1_HiC, \
-            symlink_UnzippedFastq_R2_HiC, \
-            symlink_UnzippedFasta_asm1, \
-            symlink_UnzippedFasta_asm2, \
-            KeyResults_GenomescopeProfiles, \
-            KeyResults_statistics, \
-            KeyResults_merqury, \
-            KeyResults_busco, \
-            KeyResults_smudgeplot, \
-            all_metrics_table_asm1, \
-            all_metrics_table_asm2, \
-            IndividualResults_md, \
-            PretextMaps_md, \
-            Reports_md, \
-            Reports_pdf, \
-            ColouredTable_html, \
-            HeatmapTable_html, \
-            BothTables_pdf, \
-            ConcatAll_pdfs, \
-            symlink_genomescope, \
-            symlink_smudgeplot, \
-            all_metrics_final_table, \
-            Table_md_all_metrics, \
-            compute_kmer_coverage_ear, \
+localrules: symlink_UnzippedFastq_R1_HiC, \
+            symlink_UnzippedFastq_R2_HiC, \
+            symlink_UnzippedFasta_asm1, \
+            symlink_UnzippedFasta_asm2, \
+            KeyResults_GenomescopeProfiles, \
+            KeyResults_statistics, \
+            KeyResults_merqury, \
+            KeyResults_busco, \
+            KeyResults_smudgeplot, \
+            KeyResults_inspector, \
+            all_metrics_table_asm1, \
+            all_metrics_table_asm2, \
+            IndividualResults_md, \
+            PretextMaps_md, \
+            Reports_md, \
+            Reports_pdf, \
+            ColouredTable_html, \
+            HeatmapTable_html, \
+            BothTables_pdf, \
+            ConcatAll_pdfs, \
+            symlink_genomescope, \
+            symlink_smudgeplot, \
+            all_metrics_final_table, \
+            Table_md_all_metrics, \
+            compute_kmer_coverage_ear, \
             make_ear
 
 def HiC_R1_gzipped(wildcards):
@@ -616,9 +617,20 @@ rule merqury:
 
     priority: 3
     conda:
-        os.path.join(workflow.basedir, "envs/SMUDGEPLOT_MERQURY.yaml")
+        os.path.join(workflow.basedir, "envs/MER_STATS.yaml")
     shell:
-        """
+        r"""
+        # If the module-check line exists, comment out the 5 lines that follow it.
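+        # (The five disabled lines are assumed to be Merqury's environment-module setup in
+        #  $MERQURY/eval/spectra-cn.sh; commenting them out is one way to let the
+        #  conda-provided merqury run on hosts without an HPC module system.)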
+ if grep -q "has_module=\$(check_module)" "$MERQURY/eval/spectra-cn.sh"; then + sed -i '/has_module=\$(check_module)/{{ + n; s/^\([^#]\)/#\1/; + n; s/^\([^#]\)/#\1/; + n; s/^\([^#]\)/#\1/; + n; s/^\([^#]\)/#\1/; + n; s/^\([^#]\)/#\1/; + }}' "$MERQURY/eval/spectra-cn.sh" + fi + ln -fs {input.merylDB_provided} {params.symlink_merylDB} cd {params.outPath} export OMP_NUM_THREADS={threads} @@ -626,7 +638,6 @@ rule merqury: """ - ####################################################################################################################################### def busco_lineage_path(wildcards): @@ -710,7 +721,7 @@ rule make_kmer_histogram: params: meryl_path=lambda wildcards: samples[samples['merylDB_basename'] == wildcards.name]['merylDB'].values[0] conda: - os.path.join(workflow.basedir, "envs/SMUDGEPLOT_MERQURY.yaml") + os.path.join(workflow.basedir, "envs/MER_STATS.yaml") log: os.path.join(config['Results'],"1_evaluation/kmer_profiling/{name}/logs/make_kmer_histogram.log") threads: @@ -737,7 +748,7 @@ rule genomescope2: kmer=lambda wildcards: samples[samples['merylDB_basename'] == wildcards.name]['merylDB_kmer'].values[0], cpHist=os.path.join(config['Results'], "1_evaluation/kmer_profiling/{name}/merylDB_10000.hist") conda: - os.path.join(workflow.basedir, "envs/AUXILIARY_R_SCRIPTS.yaml") + os.path.join(workflow.basedir, "envs/R_SMUDG.yaml") log: os.path.join(config['Results'],"1_evaluation/kmer_profiling/{name}/logs/genomescope2.log") threads: @@ -766,7 +777,7 @@ rule smudgeplot: meryl_path=lambda wildcards: samples[samples['merylDB_basename'] == wildcards.name]['merylDB'].values[0], kmer=lambda wildcards: samples[samples['merylDB_basename'] == wildcards.name]['merylDB_kmer'].values[0] conda: - os.path.join(workflow.basedir, "envs/SMUDGEPLOT_MERQURY.yaml") + os.path.join(workflow.basedir, "envs/R_SMUDG.yaml") threads: resource['smudgeplot']['threads'] resources: @@ -853,7 +864,7 @@ rule gfastatsResults_asm1: params: estGenome = lambda wildcards: dictSamples[wildcards.asmID][6] if dictSamples[wildcards.asmID][6] != 0 else readfile(os.path.join(config['Results'], f"1_evaluation/{wildcards.asmID}/KMER_PROFILING/genomescope/results_sizeEst.txt")) conda: - os.path.join(workflow.basedir, "envs/AUXILIARY_R_SCRIPTS.yaml") + os.path.join(workflow.basedir, "envs/MER_STATS.yaml") log: os.path.join(config['Results'],"1_evaluation/{asmID}/logs/assemblyStats_gfastats_asm1.{asmID}.log") threads: @@ -875,7 +886,7 @@ rule gfastatsResults_asm2: params: estGenome = lambda wildcards: dictSamples[wildcards.asmID][6] if dictSamples[wildcards.asmID][6] != 0 else readfile(os.path.join(config['Results'], f"1_evaluation/{wildcards.asmID}/KMER_PROFILING/genomescope/results_sizeEst.txt")) conda: - os.path.join(workflow.basedir, "envs/AUXILIARY_R_SCRIPTS.yaml") + os.path.join(workflow.basedir, "envs/MER_STATS.yaml") log: os.path.join(config['Results'],"1_evaluation/{asmID}/logs/assemblyStats_gfastats_asm2.{asmID}.log") threads: @@ -888,6 +899,83 @@ rule gfastatsResults_asm2: (gfastats --nstar-report -j {threads} {input.assembly} {params.estGenome} > {output.gfaResults}) &> {log} """ +def getLongReads(wildcards): + return samples.loc[(wildcards.asmID), "Long_Reads"] + + +def getLongReads(wildcards): + return samples.loc[(wildcards.asmID), "Long_Reads"] + +# Helper function to check if long reads are provided +def longReadsProvided(wildcards): + long_reads = samples.loc[(wildcards.asmID), "Long_Reads"] + return long_reads != "None" + +rule inspector: + input: + assembly = os.path.join(config['Results'], 
"1_evaluation/{asmID}/ASSEMBLY_FASTAS/{asmID}.asm1.fasta"), + # Only include long_reads in input if they're provided + long_reads = lambda wildcards: getLongReads(wildcards) if longReadsProvided(wildcards) else [] + output: + pass_flag = os.path.join(config['Results'], "1_evaluation/{asmID}/logs/inspector_asm1.pass") + conda: + os.path.join(workflow.basedir, "envs/INSPECTOR.yaml") + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/inspector_asm1.{asmID}.log") + threads: + resource['inspector']['threads'] + resources: + mem_mb = resource['inspector']['mem_mb'], + time = resource['inspector']['time'] + shell: + """ + long_reads="{input.long_reads}" + if [ "$long_reads" = "" ] || [ "$long_reads" = "None" ]; then + echo "Long_Reads not provided; skipping Inspector." > {log} + touch {output.pass_flag} + else + # Derive the output folder from the dummy flag file path. + out_dir=$(echo {output.pass_flag} | sed 's,/logs/inspector_asm1.pass$,/INSPECTOR/asm1,') + mkdir -p $out_dir + (inspector.py -c {input.assembly} -r {input.long_reads} -o $out_dir --datatype hifi -t {threads}) &> {log} + touch {output.pass_flag} + fi + """ + +rule inspector_alt: + input: + # Only include assembly in input if ALT_asm is not "None" + assembly = lambda wildcards: os.path.join( + config['Results'], "1_evaluation/{asmID}/ASSEMBLY_FASTAS/{asmID}.asm2.fasta" + ).format(asmID=wildcards.asmID) if samples.loc[wildcards.asmID, "ALT_asm"] != "None" else [], + # Only include long_reads in input if they're provided + long_reads = lambda wildcards: getLongReads(wildcards) if longReadsProvided(wildcards) else [] + output: + pass_flag = os.path.join(config['Results'], "1_evaluation/{asmID}/logs/asm2_inspector.pass") + conda: + os.path.join(workflow.basedir, "envs/INSPECTOR.yaml") + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/inspector_asm2.{asmID}.log") + threads: + resource['inspector']['threads'] + resources: + mem_mb=resource['inspector']['mem_mb'], + time=resource['inspector']['time'] + shell: + """ + assembly="{input.assembly}" + long_reads="{input.long_reads}" + if [ -z "$assembly" ] || [ "$long_reads" = "" ] || [ "$long_reads" = "None" ]; then + echo "ALT assembly provided but Long_Reads is not; skipping ALT Inspector." > {log} + touch {output.pass_flag} + else + # Derive the ALT output folder by replacing the dummy flag file path. 
+            alt_out=$(echo {output.pass_flag} | sed 's,/logs/asm2_inspector.pass$,/INSPECTOR/asm2,')
+            mkdir -p $alt_out
+            (inspector.py -c {input.assembly} -r {input.long_reads} -o $alt_out --datatype hifi -t {threads}) &> {log}
+            touch {output.pass_flag}
+        fi
+        """
 
 rule all_metrics_table_asm1:
     input:
@@ -1008,7 +1096,32 @@ rule KeyResults_smudgeplot:
         cp {input.smudgeplot} {output.smudgeplot}
         cp {input.smudgeplotPlot} {output.smudgeplotPlot}
         """
-
+
+rule KeyResults_inspector:
+    input:
+        inspector_asm1_flag = os.path.join(config['Results'], "1_evaluation/{asmID}/logs/inspector_asm1.pass"),
+        inspector_asm2_flag = os.path.join(config['Results'], "1_evaluation/{asmID}/logs/asm2_inspector.pass")
+    output:
+        asm1_summary = os.path.join(config['Results'], "1_evaluation/{asmID}/KEY_RESULTS/{asmID}_asm1_summary_statistics.inspector"),
+        asm2_summary = os.path.join(config['Results'], "1_evaluation/{asmID}/KEY_RESULTS/{asmID}_asm2_summary_statistics.inspector")
+    shell:
+        """
+        stat_asm1="{config[Results]}/1_evaluation/{wildcards.asmID}/INSPECTOR/asm1/summary_statistics"
+        stat_asm2="{config[Results]}/1_evaluation/{wildcards.asmID}/INSPECTOR/asm2/summary_statistics"
+
+        if [ -f "$stat_asm1" ]; then
+            cp "$stat_asm1" {output.asm1_summary}
+        else
+            echo "Inspector asm1 statistics not available" > {output.asm1_summary}
+        fi
+
+        if [ -f "$stat_asm2" ]; then
+            cp "$stat_asm2" {output.asm2_summary}
+        else
+            echo "Inspector asm2 not run or statistics not available" > {output.asm2_summary}
+        fi
+        """
+
 rule KeyResults_statistics:
     input:
         gscopeSum=os.path.join(config['Results'], "1_evaluation/{asmID}/KMER_PROFILING/genomescope/results_summary.txt"),
@@ -1137,7 +1250,7 @@ rule ColouredTable_html:
     output:
         os.path.join(config['Results'],"1_evaluation/finalResults/tables/TABLE_OF_RESULTS_COLOURED.html"),
     conda:
-        os.path.join(workflow.basedir, "envs/AUXILIARY_R_SCRIPTS.yaml")
+        os.path.join(workflow.basedir, "envs/R_SMUDG.yaml")
     script:
         os.path.join(workflow.basedir, "scripts/compare_results/fullTable_heatmap_external_metrics.R")
 
@@ -1149,7 +1262,7 @@ rule HeatmapTable_html:
     output:
         os.path.join(config['Results'],"1_evaluation/finalResults/tables/TABLE_OF_RESULTS_GRADIENT.html")
     conda:
-        os.path.join(workflow.basedir, "envs/AUXILIARY_R_SCRIPTS.yaml")
+        os.path.join(workflow.basedir, "envs/R_SMUDG.yaml")
     script:
         os.path.join(workflow.basedir, "scripts/compare_results/fullTable_heatmap_internalComparison_metrics.R")
 
@@ -1175,7 +1288,7 @@ rule BothTables_pdf:
 
 rule Reports_md:
     input:
-        indivMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/KEY_RESULTS/{asmID}_k{kmer}_markdownForReport.md"), asmID=key, kmer=value6) for key, [value1, value2, value3, value4, value5, value6, value7, value8, value9, value10, value11, value12, value13, value14, value15, value16, value17] in dictSamples.items()],
+        indivMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/KEY_RESULTS/{asmID}_k{kmer}_markdownForReport.md"), asmID=key, kmer=value6) for key, [value1, value2, value3, value4, value5, value6, value7, value8, value9, value10, value11, value12, value13, value14, value15, value16, value17, value18] in dictSamples.items()],
         landingPage=os.path.join(workflow.basedir, "scripts/report/reportLandingPage.md"),
         IndividualPretextMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/KEY_RESULTS/{asmID}_PRETEXT.md"), asmID=list(testDictPRETEXT.keys()))]
     output: