diff --git a/Snakefile b/Snakefile
index c493bbe4ae1b48cb838b21cbf0eae4fdbf8d86a1..2ae29b62c71cef5229fe7bce6b291a1af00a7ebf 100644
--- a/Snakefile
+++ b/Snakefile
@@ -32,28 +32,15 @@ if 'Library_R1' in samples.columns:
     samples = samples1.set_index('assemblyName')
 #    print(samples)
     samples=samples.join(samples6['est_bp'])
-    print("The following assemblies will be evaluated: ", list(samples.index))
-
+#    print("The following assemblies will be evaluated: ", list(samples.index))
 elif 'preBuiltMerylDatabase' in samples.columns:
     samples['est_bp'] = samples['est_bp'].fillna(0)
     samples["est_bp"] = pd.to_numeric(samples["est_bp"])
-    print("The following assemblies will be evaluated: ", list(samples.index))
+#    print("The following assemblies will be evaluated: ", list(samples.index))
 else:
     raise ValueError('Sample Sheet structure not recognised. Please make sure you are using the correct sample sheet (prebuilt or non-prebuilt meryl db, etc)')
 
-# print(samples)
-
-
-
-# parse arguments
-# parser = argparse.ArgumentParser(description="Download busco lineage determined by prefix", formatter_class=argparse.RawTextHelpFormatter)
-# parser.add_argument("--l", "-lineage", required=True, type=str, help="Lineage prefix to download --> first match will be downloaded")
-# parser.add_argument("--o", "-outputPath", required=True, type=str, help="Location To Download busco databe into")
-# #parser.add_argument("--wkd", "-workingDir", required=True, type=str, help="Location To lineage Dir")
-
-# args = parser.parse_args()
-#os.path.join(workflow.basedir, "buscoLineage/" + config['busco5Lineage'] + "_odb10")
 
 args_o=os.path.join(workflow.basedir, "buscoLineage")
 args_l=config['busco5Lineage']
 checkLineagePath=args_o + "/" + args_l + "_odb10"
@@ -65,14 +52,14 @@ if os.path.isdir(args_l) is True:
 #    subprocess.call("ln -sf %s %s"%(args.l, args.l), shell=True)
     buscoDataBaseName=os.path.basename(args_l)
     buscoDataBaseName=buscoDataBaseName[:-6]
-    print("Lineage path given, basename is:", buscoDataBaseName)
+#    print("Lineage path given, basename is:", buscoDataBaseName)
 elif os.path.isdir(args_l) is False and os.path.isdir(checkLineagePath) is True:
 #    subprocess.call("ln -sf %s %s"%(checkLineagePath, checkLineageDled), shell=True)
     buscoDataBaseName=os.path.basename(checkLineagePath)
     buscoDataBaseName=buscoDataBaseName[:-6]
-    print("Database already in buscoLineage directory, basename is:", buscoDataBaseName)
+#    print("Database already in buscoLineage directory, basename is:", buscoDataBaseName)
 else:
-    print("Database will be downloaded")
+#    print("Database will be downloaded")
     url = "https://busco-data.ezlab.org/v5/data/lineages/"
     html = urlopen(url).read()
     soup = BeautifulSoup(html, features="html.parser")
@@ -87,20 +74,19 @@ else:
     #identify the lineage file
     for line in text.splitlines():
         if line.startswith(args_l):
-            print(line)
+#            print(line)
             linID=line.split(" ")[0]
-            print(linID)
+#            print(linID)
             break
     if not linID==None:
         linLink=url+linID
-        print(linLink)
+#        print(linLink)
         subprocess.run("wget -q -P %s %s"%(outDir, linLink), shell=True, check=True)
         subprocess.run("tar -xvf %s*.tar.gz -C %s"%(args_o + "/" + args_l, outDir), shell=True, check=True)
         subprocess.run("rm %s_*.tar.gz"%(checkLineageDled), shell=True, check=True)
         buscoDataBaseName=args_l
-        print("Database success downloaded and placed in buscoLineage directory, basename is:", buscoDataBaseName)
     else:
-        print("Error - could not identify lineage please check %s for a correct prefix"%(url))
+        raise ValueError("Error - could not identify lineage, please check the busco site for a correct prefix")
 
 
 #Function to check if one of, or both the barcodeP5 and barcodeP7 fasta files are given by the user.
@@ -117,7 +103,7 @@ else:
 #    print("the number of threads to be used is", max_threads)
 
 if "Results" not in config:
-    config["Results"] = "results"
+    config["Results"] = "results"
 
 report: "report/workflow.rst"
diff --git a/rules/rulesNew_short.smk b/rules/rulesNew_short.smk
index 7447a933d3e3a42fe7c11a202d3b90503aa233ba..b45e225f1cd3d8b71d4c32053ee2ca7c999e6f4d 100644
--- a/rules/rulesNew_short.smk
+++ b/rules/rulesNew_short.smk
@@ -1,14 +1,14 @@
-
-
+#
+#
 # import csv
 # aggregateResults = csv.writer(open(os.path.join(config['Results'],"{wildcards.sample}/keyResults/{wildcards.sample}_aggregate"), "w"), delimiter='\t')
-
-
+#
+#
 # rule checkOrDownload_BuscoLineage:
 #     input:
 #         config['buscoDatabase']
 #     output:
-
+#
 # rule trimAdapters:
 #     input:
 #         config['samplesTSV'],
@@ -30,7 +30,7 @@
 #         "../envs/pigz.yaml"
 #     shell:
 #         "(trim_galore -q 5 -j {threads} --dont_gzip --fastqc --length 65 -o {params.base} --paired {params.file}) &> {log}"
-
+#
 # rule multiqc:
 #     input:
 #         os.path.join(config['Results'],"{sample}/0_qualityControl/01_fastqc")
@@ -43,8 +43,8 @@
 #         "../envs/pigz.yaml"
 #     shell:
 #         "multiqc {input} -o {params.folder} -n {params.filename}"
-
-
+#
+#
 # rule combine_illuminaReads_R1:
 #     input:
 #         R1=os.path.join(config['Results'],"{sample}/0_qualityControl/01_fastqc")
@@ -87,11 +87,10 @@
 # #         mv {input.contig} {output.contig}
 # #         mv {input.scaffold} {output.scaffold}
 #         """
-
 rule busco5:
     input:
         assembly=lambda wildcards: samples.at[wildcards.sample, 'assembly_fasta'] \
-                if wildcards.sample in samples.index else '',
+        if wildcards.sample in samples.index else '',
         lineage = os.path.join(workflow.basedir, "buscoLineage/" + buscoDataBaseName + "_odb10")
     params:
         mode = config['buscoMode'],
@@ -108,7 +107,7 @@ rule busco5:
     conda:
         "../envs/busco_and_assembly.yaml"
    log:
-        os.path.join(config['Results'], "logs/1_busco5/{sample}_busco5.log")
+        os.path.join(config['Results'], "{sample}" + "/1_busco5/" + "{sample}" + "/logs/{sample}_busco5.log")
     priority:
         20
     shell:
@@ -230,6 +229,8 @@ rule merqury:
 #        merylPath = os.path.join(workflow.basedir, "programs/meryl/build/bin:$PATH"),
 #        merylPath = os.path.join(workflow.basedir, "programs/meryl-1.0/Linux-amd64/bin:$PATH"),
         outPath= os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury"),
+        symlink_fasta=os.path.join(config['Results'], "{sample}" + "/{sample}.fasta"),
+        symlink_merylDB=directory(os.path.join(config['Results'], "{sample}" + "/combinedRead1Read2.21.meryl"))
 #        export= os.path.join(workflow.basedir, "programs/merqury-1.1")
     threads:
         workflow.cores * 0.5
@@ -237,9 +238,7 @@
         report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.qv"), caption="../report/merqury/qvScore.rst", category="QV Estimate", subcategory="{sample}"),
         report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.{sample}.spectra-cn.st.png"), caption="../report/merqury/spectra-cn_stacked.rst", category="Copy Number Spectra Plots", subcategory="{sample}"),
         report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.{sample}.spectra-cn.fl.png"), caption="../report/merqury/spectra-cn_filled.rst", category="Copy Number Spectra Plots", subcategory="{sample}"),
-        os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/combinedRead1Read2.21.hist"),
-        symlink_fasta=os.path.join(config['Results'], "{sample}" + "/{sample}.fasta"),
-        symlink_merylDB=directory(os.path.join(config['Results'], "{sample}" + "/combinedRead1Read2.21.meryl"))
+        os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/combinedRead1Read2.21.hist")
     log:
         os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/logs/{sample}_merqury.log")
     priority:
@@ -248,11 +247,11 @@
         "../envs/merylMerq_2.yaml"
     shell:
         """
-        ln -s {input.assembly} {output.symlink_fasta}
-        ln -s {input.merylDB} {output.symlink_merylDB}
+        ln -s {input.assembly} {params.symlink_fasta}
+        ln -s {input.merylDB} {params.symlink_merylDB}
         cd {params.outPath}
         export OMP_NUM_THREADS={threads}
-        (merqury.sh {output.symlink_merylDB} {output.symlink_fasta} {params.outFile}) &> {log}
+        (merqury.sh {params.symlink_merylDB} {params.symlink_fasta} {params.outFile}) &> {log}
         """
 
 #        bash {params.script} {input.merylDB} {params.symlink} {params.outFile}
@@ -278,7 +277,7 @@ rule genomescope2:
         summary=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_summary.txt"), caption="../report/genomescope/summary.rst", category="Genomescope Profile", subcategory="{sample}"),
         logPlot=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_log_plot.png"), caption="../report/genomescope/logplot.rst", category="Genomescope Profile", subcategory="{sample}"),
         linearPlot=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_linear_plot.png"), caption="../report/genomescope/linearplot.rst", category="Genomescope Profile", subcategory="{sample}"),
-        estimatedSize=temp(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_sizeEst.txt"))
+        estimatedSize=os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_sizeEst.txt")
     conda:
         "../envs/genomescope.yaml"
     shell:
@@ -300,8 +299,8 @@ rule assemblyStats:
 #        report(os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), caption="../report/assemblystats/{sample}_scaffold_rst.rst", category="Assembly Statistics", subcategory="{sample}"),
         scaffStats=report(os.path.join(config['Results'],"{sample}" + "/4_assemblyStats/{sample}_scaffold_stats.tsv"), caption="../report/assemblystats/stats.rst", category="Assembly Statistics", subcategory="{sample}"),
         contStats=report(os.path.join(config['Results'],"{sample}" + "/4_assemblyStats/{sample}_contig_stats.tsv"), caption="../report/assemblystats/stats.rst", category="Assembly Statistics", subcategory="{sample}"),
-        scaff_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_scaffold_rst.rst")),
-        contig_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_contig_rst.rst"))
+        #scaff_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_scaffold_rst.rst")),
+        # contig_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_contig_rst.rst"))
     params:
         script = os.path.join(workflow.basedir, "scripts/stats.py"),
         path = os.path.join(config['Results'], "{sample}" + "/4_assemblyStats/"),
@@ -310,8 +309,9 @@
         given_size = lambda wildcards: samples.at[wildcards.sample, 'est_bp'] \
                 if wildcards.sample in samples.index else ''
     shell:
         """
-        python {params.script} {input.assembly} {input.estGenome} {params.filename} {output.scaff_rst} {output.contig_rst} {params.given_size} {output.scaffStats} {output.contStats}
+        python {params.script} {input.assembly} {input.estGenome} {params.filename} {params.given_size} {output.scaffStats} {output.contStats}
         """
+#        python {params.script} {input.assembly} {input.estGenome} {params.filename} {output.scaff_rst} {output.contig_rst} {params.given_size} {output.scaffStats} {output.contStats}
 
 #        estGenomeSize=$( grep 'Genome Haploid Length' {input.estGenome} | awk {{'print $6'}} | sed 's/,//g' )
@@ -362,14 +362,14 @@ rule saveConfiguration_and_getKeyValues:
     params:
 #        sampleSheet= config['samplesTSV'],
 #        config=os.path.join(workflow.basedir, "configuration/config.yaml"),
-        resultsPath=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results")
+        resultsPath=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results"),
+        keyValues=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/results_withoutRowNames.tsv"),
+        rowNames=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/rowNames.tsv"),
 #        masteraggregate=os.path.join(config['Results'],"{sample}"+ "/keyResults/aggregate.tsv")
     output:
 #        newSampleSheet=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedSampleSheet.tsv"),
 #        newConfigFile=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedConfig.yaml"),
 #        merqLog=os.path.join(config['Results'],"logs/2_merylAndMerq/" + "{sample}" + "_merqury.spectra-cn.log"),
-        keyValues=temp(os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/aggregate.tsv")),
-        rowNames=temp(os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/aggregate_rowNames.tsv")),
         keyValuesWithRowNames=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/{sample}_aggregatedResults.tsv"),
         gscopeSum=os.path.join(config['Results'], "{sample}" +"/5_Key_Results/" + "{sample}" + "_gScope_summary.txt"),
         gscopeLog=os.path.join(config['Results'], "{sample}" +"/5_Key_Results/" + "{sample}" + "_gScope_log_plot.png"),
@@ -398,27 +398,29 @@
         cp {input.spectraFlat} {output.spectraFlat}
         cp {input.scaffStats} {output.scaffStats}
         cp {input.contStats} {output.contStats}
-        echo "{wildcards.sample}" >> {output.keyValues}
-        echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {output.keyValues}
-        echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(awk {{'print $4'}} {input.qv})" >> {output.keyValues}
-        echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'N100' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG100' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {output.keyValues}
-        echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {output.keyValues}
-        dos2unix {output.keyValues}
-        echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nScaffold_N100_length\nScaffold_NG100_length\nContig_N50_length\nContig_NG50_length\ntotal_num_bases\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps" > {output.rowNames}
"Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nScaffold_N100_length\nScaffold_NG100_length\nContig_N50_length\nContig_NG50_length\ntotal_num_bases\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps" > {output.rowNames} - paste -d'\t' {output.rowNames} {output.keyValues} > {output.keyValuesWithRowNames} + echo "{wildcards.sample}" >> {params.keyValues} + echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {params.keyValues} + echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(awk {{'print $4'}} {input.qv})" >> {params.keyValues} + echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'N100' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG100' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} + echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} + dos2unix {params.keyValues} + echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nScaffold_N100_length\nScaffold_NG100_length\nContig_N50_length\nContig_NG50_length\ntotal_num_bases\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps" > {params.rowNames} + paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames} printf "Evaluation complete for {wildcards.sample}, please find these results in {params.resultsPath} folder" + rm {params.rowNames} {params.keyValues} """ +# # cp {input.checkLog} {output.merqLog} #echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} | sed 's/,//g')" >> {output.keyValues} put this to remove commas diff --git a/scripts/stats.py b/scripts/stats.py index fe220410727db1953acf5050ca4be6e035515ed0..7c2f3315b4192bd78e16bef084841f6f7e85b5ed 100644 --- a/scripts/stats.py +++ b/scripts/stats.py @@ -15,7 +15,7 @@ with open(sys.argv[2]) as f: # # size_bp = estSizeFile.readlines() -estSize=int(float(sys.argv[6])) +estSize=int(float(sys.argv[4])) if estSize == 0: size_bp = ff[0][:-1] @@ -286,7 +286,7 @@ if __name__ == "__main__": # contig = csv.writer(open(naming + "_contig_stats.tsv", "w"), delimiter='\t') # for key, val in contig_stats.items(): # - with open(sys.argv[7], 'w') as outputfile: + with open(sys.argv[5], 'w') as outputfile: # # print('#' + libraryName, file=outputfile) # # print("Total Reads Processed (Paired): " + total_processed + " ( 100 %)", file=outputfile) # # print("Discarded reads (Paired): " + discarded + " ( "+str(discarded_perc)+"%)", file=outputfile) @@ -297,7 +297,7 @@ if __name__ == "__main__": print("", file=outputfile) 
         print(df_scaffold_L_LG.to_string(index=False), file=outputfile)
 
-    with open(sys.argv[8], 'w') as outputfile2:
+    with open(sys.argv[6], 'w') as outputfile2:
 # #        print('#' + libraryName, file=outputfile)
 # #        print("Total Reads Processed (Paired): " + total_processed + " ( 100 %)", file=outputfile)
 # #        print("Discarded reads (Paired): " + discarded + " ( "+str(discarded_perc)+"%)", file=outputfile)
@@ -308,21 +308,21 @@
         print("", file=outputfile2)
         print(df_contig_L_LG.to_string(index=False), file=outputfile2)
 
-    with open(sys.argv[4], 'w') as outputRst:
-        print(tabulate(df_scaffold_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
-        print("", file=outputRst)
-        print(tabulate(df_scaffold_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
-        print("", file=outputRst)
-        print(tabulate(df_scaffold_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
-        print("", file=outputRst)
-
-    with open(sys.argv[5], 'w') as outputRst2:
-        print(tabulate(df_contig_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
-        print("", file=outputRst2)
-        print(tabulate(df_contig_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
-        print("", file=outputRst2)
-        print(tabulate(df_contig_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
-        print("", file=outputRst2)
+    # with open(sys.argv[4], 'w') as outputRst:
+    #     print(tabulate(df_scaffold_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
+    #     print("", file=outputRst)
+    #     print(tabulate(df_scaffold_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
+    #     print("", file=outputRst)
+    #     print(tabulate(df_scaffold_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
+    #     print("", file=outputRst)
+    #
+    # with open(sys.argv[5], 'w') as outputRst2:
+    #     print(tabulate(df_contig_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
+    #     print("", file=outputRst2)
+    #     print(tabulate(df_contig_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
+    #     print("", file=outputRst2)
+    #     print(tabulate(df_contig_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
+    #     print("", file=outputRst2)
 #    list_of_dfs=[df_scaffold_top,df_scaffold_N_NG,df_scaffold_L_LG]
 #    for df in list_of_dfs:
 #        with open('all_dfs.tsv','a') as f: