diff --git a/Snakefile b/Snakefile index 2ae29b62c71cef5229fe7bce6b291a1af00a7ebf..38c8cf60903985cfb0e32743eb87af627056edb9 100644 --- a/Snakefile +++ b/Snakefile @@ -15,6 +15,7 @@ configfile: "configuration/config.yaml" samples = pd.read_csv(config['samplesTSV'], dtype=str, index_col="assemblyName", delim_whitespace=True, skip_blank_lines=True) if 'Library_R1' in samples.columns: + whichRule = "rules/rulesNew.smk" samples['combined']=samples['Library_R1'].astype(str)+' '+samples['Library_R2'].astype(str) samples = samples.drop(columns=['Library_R1','Library_R2']) samples['est_bp'] = samples['est_bp'].fillna(0) @@ -34,6 +35,7 @@ if 'Library_R1' in samples.columns: samples=samples.join(samples6['est_bp']) # print("The following assemblies will be evaluated: ", list(samples.index)) elif 'preBuiltMerylDatabase' in samples.columns: + whichRule = "rules/rulesNew_short.smk" samples['est_bp'] = samples['est_bp'].fillna(0) samples["est_bp"] = pd.to_numeric(samples["est_bp"]) # print("The following assemblies will be evaluated: ", list(samples.index)) @@ -107,7 +109,7 @@ if "Results" not in config: report: "report/workflow.rst" -include: "rules/rulesNew_short.smk" +include: whichRule rule all: diff --git a/rules/rulesNew.smk b/rules/rulesNew.smk index c94a9cb6f394c4ad82e1f1b6a3514a9b0dccbdff..7d356edb41a5985097525b7e9ec98e4b33b301e6 100644 --- a/rules/rulesNew.smk +++ b/rules/rulesNew.smk @@ -23,13 +23,13 @@ rule trimAdapters: threads: 8 log: - os.path.join(config['Results'], "logs/0_qualityControl/{sample}_tGalore.log") + os.path.join(config['Results'], "{sample}" + "/0_qualityControl/logs/{sample}_tGalore.log") priority: 15 conda: "../envs/pigz.yaml" shell: - "(trim_galore -q 5 -j {threads} --dont_gzip --fastqc --length 65 -o {params.base} --paired {params.file}) &> {log}" + "(trim_galore -j {threads} --dont_gzip --fastqc --length 65 -o {params.base} --paired {params.file}) &> {log}" rule multiqc: input: @@ -108,7 +108,7 @@ rule busco5: conda: "../envs/busco_and_assembly.yaml" log: - os.path.join(config['Results'], "logs/1_busco5/{sample}_busco5.log") + os.path.join(config['Results'], "{sample}" + "/1_busco5/" + "{sample}" + "/logs/{sample}_busco5.log") priority: 20 shell: @@ -147,23 +147,22 @@ rule meryl_R1: # check = os.path.join(config['Results'], "{sample}" + "/busco5/short_summary.specific." + config['busco5Lineage'] + "_odb10." + "{sample}" + ".txt") params: # script = os.path.join(workflow.basedir, "programs/meryl/build/bin/meryl"), - script = os.path.join(workflow.basedir, "programs/meryl-1.0/Linux-amd64/bin/meryl"), kmer = 21, - r1= "{sample}" + "_R1.21.meryl", path= os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/") threads: - workflow.cores * 0.5 + workflow.cores * 0.25 output: directory(os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/" + "{sample}" + "_R1.21.meryl")) log: - os.path.join(config['Results'], "logs/2_merylAndMerq/{sample}_meryl_R1.log") + os.path.join(config['Results'], "{sample}" + "/2_QVstats_merylAndMerqury/logs/{sample}_meryl_R1.log") priority: 10 conda: - "../envs/merylMerq.yaml" + "../envs/merylMerq_2.yaml" shell: """ - ({params.script} count k={params.kmer} threads={threads} {input.read1} output {output}) &> {log} + cd {params.path} + (meryl count k={params.kmer} threads={threads} {input.read1} output {output}) &> {log} """ rule meryl_R2: @@ -172,23 +171,23 @@ rule meryl_R2: # check = os.path.join(config['Results'], "{sample}" + "/busco5/short_summary.specific." + config['busco5Lineage'] + "_odb10." + "{sample}" + ".txt") params: # script = os.path.join(workflow.basedir, "programs/meryl/build/bin/meryl"), - script = os.path.join(workflow.basedir, "programs/meryl-1.0/Linux-amd64/bin/meryl"), kmer = 21, - r2= "{sample}" + "_R2.21.meryl", +# r2= "{sample}" + "_R2.21.meryl", path= os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/") threads: - workflow.cores * 0.5 + workflow.cores * 0.25 output: directory(os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/" + "{sample}" + "_R2.21.meryl")), log: - os.path.join(config['Results'], "logs/2_merylAndMerq/{sample}_meryl_R2.log") + os.path.join(config['Results'], "{sample}" + "/2_QVstats_merylAndMerqury/logs/{sample}_meryl_R2.log") priority: 10 conda: - "../envs/merylMerq.yaml" + "../envs/merylMerq_2.yaml" shell: """ - ({params.script} count k={params.kmer} threads={threads} {input.read2} output {output}) &> {log} + cd {params.path} + (meryl count k={params.kmer} threads={threads} {input.read2} output {output}) &> {log} """ rule meryl: @@ -198,64 +197,65 @@ rule meryl: # check = os.path.join(config['Results'], "{sample}" + "/busco5/short_summary.specific." + config['busco5Lineage'] + "_odb10." + "{sample}" + ".txt") params: # script = os.path.join(workflow.basedir, "programs/meryl/build/bin/meryl"), - script = os.path.join(workflow.basedir, "programs/meryl-1.0/Linux-amd64/bin/meryl"), kmer = 21, # r1= "{sample}" + "_R1.21.meryl", # r2= "{sample}" + "_R2.21.meryl", # dB= "{sample}" + "_dB.21.meryl", - path= os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/{sample}_dB.21.meryl") + path= os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/") threads: - workflow.cores * 0.5 + workflow.cores * 0.25 output: directory(os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/" + "{sample}" + "_dB.21.meryl")), log: - os.path.join(config['Results'], "logs/2_merylAndMerq/{sample}_meryl_dB.log") + os.path.join(config['Results'], "{sample}" + "/2_QVstats_merylAndMerqury/logs/{sample}_meryl_dB.log") priority: 5 conda: - "../envs/merylMerq.yaml" + "../envs/merylMerq_2.yaml" shell: """ - ({params.script} union-sum {input.read1} {input.read2} threads={threads} output {output}) &> {log} + cd {params.path} + (meryl union-sum {input.read1} {input.read2} threads={threads} output {output}) &> {log} """ + rule merqury: input: - check = os.path.join(config['Results'], "logs/2_merylAndMerq/{sample}_meryl_dB.log"), assembly=lambda wildcards: samples.at[wildcards.sample, 'assembly_fasta'] \ if wildcards.sample in samples.index else '', merylDB=os.path.join(config['Results'],"{sample}"+ "/2_QVstats_merylAndMerqury/" + "{sample}" + "_dB.21.meryl") params: - script = os.path.join(workflow.basedir, "programs/merqury-1.1/merqury.sh"), + # script = os.path.join(workflow.basedir, "programs/merqury-1.1/merqury.sh"), outFile= "{sample}" + "_merq", # outFile= os.path.join(config['Results'], "{sample}" +"/QVstats_merylAndMerqury/{sample}_merq"), # merylPath = os.path.join(workflow.basedir, "programs/meryl/build/bin:$PATH"), - merylPath = os.path.join(workflow.basedir, "programs/meryl-1.0/Linux-amd64/bin:$PATH"), + # merylPath = os.path.join(workflow.basedir, "programs/meryl-1.0/Linux-amd64/bin:$PATH"), outPath= os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury"), - export= os.path.join(workflow.basedir, "programs/merqury-1.1") + symlink_fasta=os.path.join(config['Results'], "{sample}" + "/{sample}.fasta"), + symlink_merylDB=directory(os.path.join(config['Results'], "{sample}" + "/combinedRead1Read2.21.meryl")) + # export= os.path.join(workflow.basedir, "programs/merqury-1.1") threads: - workflow.cores * 0.5 + workflow.cores * 0.25 output: report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.qv"), caption="../report/merqury/qvScore.rst", category="QV Estimate", subcategory="{sample}"), report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.{sample}.spectra-cn.st.png"), caption="../report/merqury/spectra-cn_stacked.rst", category="Copy Number Spectra Plots", subcategory="{sample}"), report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.{sample}.spectra-cn.fl.png"), caption="../report/merqury/spectra-cn_filled.rst", category="Copy Number Spectra Plots", subcategory="{sample}"), - os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/" + "{sample}" + "_dB.21.hist"), - symlink=os.path.join(config['Results'], "{sample}" + "/{sample}.fasta") + os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/combinedRead1Read2.21.hist") log: os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/logs/{sample}_merqury.log") priority: 3 conda: - "../envs/merylMerq.yaml" + "../envs/merylMerq_2.yaml" shell: """ + ln -s {input.assembly} {params.symlink_fasta} + ln -s {input.merylDB} {params.symlink_merylDB} cd {params.outPath} - ln -s {input.assembly} {output.symlink} - export PATH={params.merylPath} - export MERQURY={params.export} export OMP_NUM_THREADS={threads} - (bash {params.script} {input.merylDB} {output.symlink} {params.outFile}) &> {log} + (merqury.sh {params.symlink_merylDB} {params.symlink_fasta} {params.outFile}) &> {log} """ + # bash {params.script} {input.merylDB} {params.symlink} {params.outFile} # """ @@ -265,29 +265,31 @@ rule merqury: # cd {params.outPath} # bash {params.script} {input.merylDB} {input.assembly} {params.outFile} # """ + rule genomescope2: input: - hist=os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/" + "{sample}" + "_dB.21.hist"), + hist=os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/combinedRead1Read2.21.hist"), check1=os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.{sample}.spectra-cn.fl.png"), check2=os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.{sample}.spectra-cn.st.png") params: outFolder=os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/"), - cpHist=os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_dB.21.histo"), + cpHist=os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/combinedRead1Read2_10k.21.histo"), findGscope=os.path.join(workflow.basedir) output: summary=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_summary.txt"), caption="../report/genomescope/summary.rst", category="Genomescope Profile", subcategory="{sample}"), logPlot=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_log_plot.png"), caption="../report/genomescope/logplot.rst", category="Genomescope Profile", subcategory="{sample}"), linearPlot=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_linear_plot.png"), caption="../report/genomescope/linearplot.rst", category="Genomescope Profile", subcategory="{sample}"), - estimatedSize=temp(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_sizeEst.txt")) + estimatedSize=os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_sizeEst.txt") conda: "../envs/genomescope.yaml" shell: """ - expand -t 1 {input.hist} > {params.cpHist} + head -n 10000 {input.hist} > {params.cpHist} export genomescope2script=$(find {params.findGscope} -name genomescope2) Rscript $genomescope2script -i {params.cpHist} -o {params.outFolder} -k 21 -n {wildcards.sample} grep "Genome Haploid Length" {output.summary} | awk {{'print $6'}} | sed 's/,//g'> {output.estimatedSize} """ + # expand -t 1 {input.hist} > {params.cpHist} rule assemblyStats: input: @@ -299,18 +301,20 @@ rule assemblyStats: # report(os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), caption="../report/assemblystats/{sample}_scaffold_rst.rst", category="Assembly Statistics", subcategory="{sample}"), scaffStats=report(os.path.join(config['Results'],"{sample}" + "/4_assemblyStats/{sample}_scaffold_stats.tsv"), caption="../report/assemblystats/stats.rst", category="Assembly Statistics", subcategory="{sample}"), contStats=report(os.path.join(config['Results'],"{sample}" + "/4_assemblyStats/{sample}_contig_stats.tsv"), caption="../report/assemblystats/stats.rst", category="Assembly Statistics", subcategory="{sample}"), - scaff_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_scaffold_rst.rst")), - contig_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_contig_rst.rst")) + #scaff_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_scaffold_rst.rst")), + # contig_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_contig_rst.rst")) params: script = os.path.join(workflow.basedir, "scripts/stats.py"), path = os.path.join(config['Results'], "{sample}" + "/4_assemblyStats/"), filename="{sample}", - given_size=lambda wildcards: samples.at[wildcards.sample, 'EstimatedSize(bps)'] \ + given_size=lambda wildcards: samples.at[wildcards.sample, 'est_bp'] \ if wildcards.sample in samples.index else '' shell: """ - python {params.script} {input.assembly} {input.estGenome} {params.filename} {output.scaff_rst} {output.contig_rst} {params.given_size} {output.scaffStats} {output.contStats} + python {params.script} {input.assembly} {input.estGenome} {params.filename} {params.given_size} {output.scaffStats} {output.contStats} """ +# python {params.script} {input.assembly} {input.estGenome} {params.filename} {output.scaff_rst} {output.contig_rst} {params.given_size} {output.scaffStats} {output.contStats} + # estGenomeSize=$( grep 'Genome Haploid Length' {input.estGenome} | awk {{'print $6'}} | sed 's/,//g' ) # rule moveMerqOutput: @@ -356,18 +360,18 @@ rule saveConfiguration_and_getKeyValues: scaffStats=os.path.join(config['Results'],"{sample}" + "/4_assemblyStats/{sample}_scaffold_stats.tsv"), contStats=os.path.join(config['Results'],"{sample}" + "/4_assemblyStats/{sample}_contig_stats.tsv"), # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), - multiqc=os.path.join(config['Results'],"{sample}/0_qualityControl/02_multiqc/{sample}_multiqc_report.html") +# multiqc=os.path.join(config['Results'],"{sample}/0_qualityControl/02_multiqc/{sample}_multiqc_report.html") params: # sampleSheet= config['samplesTSV'], # config=os.path.join(workflow.basedir, "configuration/config.yaml"), - resultsPath=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results") + resultsPath=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results"), + keyValues=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/results_withoutRowNames.tsv"), + rowNames=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/rowNames.tsv"), # masteraggregate=os.path.join(config['Results'],"{sample}"+ "/keyResults/aggregate.tsv") output: # newSampleSheet=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedSampleSheet.tsv"), # newConfigFile=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedConfig.yaml"), # merqLog=os.path.join(config['Results'],"logs/2_merylAndMerq/" + "{sample}" + "_merqury.spectra-cn.log"), - keyValues=temp(os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/aggregate.tsv")), - rowNames=temp(os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/aggregate_rowNames.tsv")), keyValuesWithRowNames=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/{sample}_aggregatedResults.tsv"), gscopeSum=os.path.join(config['Results'], "{sample}" +"/5_Key_Results/" + "{sample}" + "_gScope_summary.txt"), gscopeLog=os.path.join(config['Results'], "{sample}" +"/5_Key_Results/" + "{sample}" + "_gScope_log_plot.png"), @@ -381,7 +385,7 @@ rule saveConfiguration_and_getKeyValues: scaffStats=os.path.join(config['Results'],"{sample}" + "/5_Key_Results/{sample}_scaffold_stats.tsv"), contStats=os.path.join(config['Results'],"{sample}" + "/5_Key_Results/{sample}_contig_stats.tsv"), # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), - multiqc=os.path.join(config['Results'],"{sample}/5_Key_Results/{sample}_multiqc_report.html") +# multiqc=os.path.join(config['Results'],"{sample}/5_Key_Results/{sample}_multiqc_report.html") # aggregateTsv=os.path.join(config['Results'],"{sample}"+ "/individual_aggregated_results/aggregate.tsv") conda: "../envs/dos2unix.yaml" @@ -396,24 +400,29 @@ rule saveConfiguration_and_getKeyValues: cp {input.spectraFlat} {output.spectraFlat} cp {input.scaffStats} {output.scaffStats} cp {input.contStats} {output.contStats} - cp {input.multiqc} {output.multiqc} - echo "{wildcards.sample}" >> {output.keyValues} - echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {output.keyValues} - echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {output.keyValues} - echo "$(awk {{'print $4'}} {input.qv})" >> {output.keyValues} - echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues} - echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues} - echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues} - echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues} - echo "$(grep 'N100' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues} - echo "$(grep 'NG100' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues} - echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {output.keyValues} - echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues} - dos2unix {output.keyValues} - echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nN50_length\nNG50_length\nN95_length\nNG95_length\nN100_length\nNG100_length\ntotal_num_bases\ntotal_num_scaffolds" > {output.rowNames} - paste -d'\t' {output.rowNames} {output.keyValues} > {output.keyValuesWithRowNames} + echo "{wildcards.sample}" >> {params.keyValues} + echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {params.keyValues} + echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(awk {{'print $4'}} {input.qv})" >> {params.keyValues} + echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'N100' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG100' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} + echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} + dos2unix {params.keyValues} + echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nScaffold_N100_length\nScaffold_NG100_length\nContig_N50_length\nContig_NG50_length\ntotal_num_bases\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps" > {params.rowNames} + paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames} printf "Evaluation complete for {wildcards.sample}, please find these results in {params.resultsPath} folder" + rm {params.rowNames} {params.keyValues} """ +# # cp {input.checkLog} {output.merqLog} #echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} | sed 's/,//g')" >> {output.keyValues} put this to remove commas @@ -433,7 +442,7 @@ rule aggregateAllAssemblies: """ cp {input.sampleSheet} {output.newSampleSheet} cp {input.config} {output.newConfigFile} - echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nN50_length\nNG50_length\nN95_length\nNG95_length\nN100_length\nNG100_length\ntotal_num_bases\ntotal_num_scaffolds" | \ + echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nScaffold_N100_length\nScaffold_NG100_length\nContig_N50_length\nContig_NG50_length\ntotal_num_bases\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps" | \ paste -d'\t' - {input.allResults} | \ awk -F'\t' '{{s="";for (i=1;i<=NF;i+=2) {{s=s?s FS $i:$i}} print s}}' | \ column -t > {output.results}