Commit 4a7ece1b authored by james94

quick fix

parent 1fb5c77a
@@ -32,28 +32,15 @@ if 'Library_R1' in samples.columns:
    samples = samples1.set_index('assemblyName')
    # print(samples)
    samples=samples.join(samples6['est_bp'])
-    print("The following assemblies will be evaluated: ", list(samples.index))
+    # print("The following assemblies will be evaluated: ", list(samples.index))
elif 'preBuiltMerylDatabase' in samples.columns:
    samples['est_bp'] = samples['est_bp'].fillna(0)
    samples["est_bp"] = pd.to_numeric(samples["est_bp"])
-    print("The following assemblies will be evaluated: ", list(samples.index))
+    # print("The following assemblies will be evaluated: ", list(samples.index))
else:
    raise ValueError('Sample Sheet structure not recognised. Please make sure you are using the correct sample sheet (prebuilt or non-prebuilt meryl db, etc)')
# print(samples)
# parse arguments
# parser = argparse.ArgumentParser(description="Download busco lineage determined by prefix", formatter_class=argparse.RawTextHelpFormatter)
# parser.add_argument("--l", "-lineage", required=True, type=str, help="Lineage prefix to download --> first match will be downloaded")
# parser.add_argument("--o", "-outputPath", required=True, type=str, help="Location To Download busco databe into")
# #parser.add_argument("--wkd", "-workingDir", required=True, type=str, help="Location To lineage Dir")
# args = parser.parse_args()
#os.path.join(workflow.basedir, "buscoLineage/" + config['busco5Lineage'] + "_odb10")
args_o=os.path.join(workflow.basedir, "buscoLineage")
args_l=config['busco5Lineage']
checkLineagePath=args_o + "/" + args_l + "_odb10"
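For orientation: the branch above dispatches on the sample-sheet layout, indexing by 'assemblyName' and coercing 'est_bp' to a number in the prebuilt-meryl case. A minimal, self-contained sketch of that dispatch, assuming a tab-separated sheet; the literal filename samples.tsv is illustrative, and the real workflow joins 'est_bp' in from a second frame (samples6) defined outside this hunk:

import pandas as pd

samples = pd.read_csv("samples.tsv", sep="\t")

if "Library_R1" in samples.columns:
    # raw-read layout: est_bp is joined in from a second table later
    samples = samples.set_index("assemblyName")
elif "preBuiltMerylDatabase" in samples.columns:
    # prebuilt-meryl layout: est_bp must be present and numeric
    samples = samples.set_index("assemblyName")
    samples["est_bp"] = pd.to_numeric(samples["est_bp"].fillna(0))
else:
    raise ValueError("Sample sheet structure not recognised")

print("The following assemblies will be evaluated:", list(samples.index))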
@@ -65,14 +52,14 @@ if os.path.isdir(args_l) is True:
    # subprocess.call("ln -sf %s %s"%(args.l, args.l), shell=True)
    buscoDataBaseName=os.path.basename(args_l)
    buscoDataBaseName=buscoDataBaseName[:-6]
-    print("Lineage path given, basename is:", buscoDataBaseName)
+    # print("Lineage path given, basename is:", buscoDataBaseName)
elif os.path.isdir(args_l) is False and os.path.isdir(checkLineagePath) is True:
    # subprocess.call("ln -sf %s %s"%(checkLineagePath, checkLineageDled), shell=True)
    buscoDataBaseName=os.path.basename(checkLineagePath)
    buscoDataBaseName=buscoDataBaseName[:-6]
-    print("Database already in buscoLineage directory, basename is:", buscoDataBaseName)
+    # print("Database already in buscoLineage directory, basename is:", buscoDataBaseName)
else:
-    print("Database will be downloaded")
+    # print("Database will be downloaded")
    url = "https://busco-data.ezlab.org/v5/data/lineages/"
    html = urlopen(url).read()
    soup = BeautifulSoup(html, features="html.parser")
@@ -87,20 +74,19 @@ else:
    #identify the lineage file
    for line in text.splitlines():
        if line.startswith(args_l):
-            print(line)
+            # print(line)
            linID=line.split(" ")[0]
-            print(linID)
+            # print(linID)
            break
    if not linID==None:
        linLink=url+linID
-        print(linLink)
+        # print(linLink)
        subprocess.run("wget -q -P %s %s"%(outDir, linLink), shell=True, check=True)
        subprocess.run("tar -xvf %s*.tar.gz -C %s"%(args_o + "/" + args_l, outDir), shell=True, check=True)
        subprocess.run("rm %s_*.tar.gz"%(checkLineageDled), shell=True, check=True)
        buscoDataBaseName=args_l
        print("Database success downloaded and placed in buscoLineage directory, basename is:", buscoDataBaseName)
    else:
-        print("Error - could not identify lineage please check %s for a correct prefix"%(url))
        raise ValueError("Error - could not identify lineage please check busco site for a correct prefix")
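The download branch above scrapes the BUSCO v5 lineage index, takes the first entry matching the configured prefix, then fetches and unpacks it. A condensed, runnable sketch of that technique, assuming BeautifulSoup is installed and wget/tar are on PATH; "vertebrata" and "buscoLineage" are stand-ins for args_l and args_o:

import subprocess
from urllib.request import urlopen

from bs4 import BeautifulSoup

url = "https://busco-data.ezlab.org/v5/data/lineages/"
text = BeautifulSoup(urlopen(url).read(), features="html.parser").get_text()

linID = None
for line in text.splitlines():
    if line.startswith("vertebrata"):   # stand-in for the args_l prefix
        linID = line.split(" ")[0]      # first match wins
        break

if linID is None:
    raise ValueError("could not identify lineage; check %s for a correct prefix" % url)

# fetch the lineage tarball into buscoLineage/ and unpack it there
subprocess.run("wget -q -P buscoLineage %s" % (url + linID), shell=True, check=True)
subprocess.run("tar -xf buscoLineage/%s -C buscoLineage" % linID, shell=True, check=True)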
#Function to check if one of, or both the barcodeP5 and barcodeP7 fasta files are given by the user.
#
#
# import csv
# aggregateResults = csv.writer(open(os.path.join(config['Results'],"{wildcards.sample}/keyResults/{wildcards.sample}_aggregate"), "w"), delimiter='\t')
#
#
# rule checkOrDownload_BuscoLineage:
# input:
# config['buscoDatabase']
# output:
#
# rule trimAdapters:
# input:
# config['samplesTSV'],
@@ -30,7 +30,7 @@
# "../envs/pigz.yaml"
# shell:
# "(trim_galore -q 5 -j {threads} --dont_gzip --fastqc --length 65 -o {params.base} --paired {params.file}) &> {log}"
#
# rule multiqc:
# input:
# os.path.join(config['Results'],"{sample}/0_qualityControl/01_fastqc")
@@ -43,8 +43,8 @@
# "../envs/pigz.yaml"
# shell:
# "multiqc {input} -o {params.folder} -n {params.filename}"
#
#
# rule combine_illuminaReads_R1:
# input:
# R1=os.path.join(config['Results'],"{sample}/0_qualityControl/01_fastqc")
@@ -87,7 +87,6 @@
# # mv {input.contig} {output.contig}
# # mv {input.scaffold} {output.scaffold}
# """
rule busco5:
    input:
        assembly=lambda wildcards: samples.at[wildcards.sample, 'assembly_fasta'] \
@@ -108,7 +107,7 @@ rule busco5:
    conda:
        "../envs/busco_and_assembly.yaml"
    log:
-        os.path.join(config['Results'], "logs/1_busco5/{sample}_busco5.log")
+        os.path.join(config['Results'], "{sample}" + "/1_busco5/" + "{sample}" + "/logs/{sample}_busco5.log")
    priority:
        20
    shell:
@@ -230,6 +229,8 @@ rule merqury:
        # merylPath = os.path.join(workflow.basedir, "programs/meryl/build/bin:$PATH"),
        # merylPath = os.path.join(workflow.basedir, "programs/meryl-1.0/Linux-amd64/bin:$PATH"),
        outPath= os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury"),
+        symlink_fasta=os.path.join(config['Results'], "{sample}" + "/{sample}.fasta"),
+        symlink_merylDB=directory(os.path.join(config['Results'], "{sample}" + "/combinedRead1Read2.21.meryl"))
        # export= os.path.join(workflow.basedir, "programs/merqury-1.1")
    threads:
        workflow.cores * 0.5
@@ -237,9 +238,7 @@
        report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.qv"), caption="../report/merqury/qvScore.rst", category="QV Estimate", subcategory="{sample}"),
        report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.{sample}.spectra-cn.st.png"), caption="../report/merqury/spectra-cn_stacked.rst", category="Copy Number Spectra Plots", subcategory="{sample}"),
        report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.{sample}.spectra-cn.fl.png"), caption="../report/merqury/spectra-cn_filled.rst", category="Copy Number Spectra Plots", subcategory="{sample}"),
-        os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/combinedRead1Read2.21.hist"),
-        symlink_fasta=os.path.join(config['Results'], "{sample}" + "/{sample}.fasta"),
-        symlink_merylDB=directory(os.path.join(config['Results'], "{sample}" + "/combinedRead1Read2.21.meryl"))
+        os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/combinedRead1Read2.21.hist")
    log:
        os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/logs/{sample}_merqury.log")
    priority:
@@ -248,11 +247,11 @@ rule merqury:
        "../envs/merylMerq_2.yaml"
    shell:
        """
-        ln -s {input.assembly} {output.symlink_fasta}
-        ln -s {input.merylDB} {output.symlink_merylDB}
+        ln -s {input.assembly} {params.symlink_fasta}
+        ln -s {input.merylDB} {params.symlink_merylDB}
        cd {params.outPath}
        export OMP_NUM_THREADS={threads}
-        (merqury.sh {output.symlink_merylDB} {output.symlink_fasta} {params.outFile}) &> {log}
+        (merqury.sh {params.symlink_merylDB} {params.symlink_fasta} {params.outFile}) &> {log}
        """
        # bash {params.script} {input.merylDB} {params.symlink} {params.outFile}
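The substance of the two merqury hunks above: the symlink paths move from output: to params:, so Snakemake no longer tracks them. A plausible reading (inferred, not stated in the commit message) is that declaring the symlinks as outputs made Snakemake delete or pre-check files that exist only as conveniences for merqury.sh. A minimal sketch of the resulting rule shape, with placeholder paths:

rule merqury_sketch:
    input:
        assembly="results/{sample}/assembly.fasta",
        merylDB="results/{sample}/reads.meryl",
    params:
        # symlinks are plain params now: created in the shell block,
        # invisible to Snakemake's output tracking
        symlink_fasta="results/{sample}/{sample}.fasta",
        symlink_merylDB="results/{sample}/combinedRead1Read2.21.meryl",
        outPath="results/{sample}/2_QVstats_merylAndMerqury",
        outFile="{sample}_merq",
    output:
        "results/{sample}/2_QVstats_merylAndMerqury/{sample}_merq.qv",
    shell:
        """
        ln -s {input.assembly} {params.symlink_fasta}
        ln -s {input.merylDB} {params.symlink_merylDB}
        cd {params.outPath}
        (merqury.sh {params.symlink_merylDB} {params.symlink_fasta} {params.outFile}) &> merqury.log
        """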
@@ -278,7 +277,7 @@ rule genomescope2:
        summary=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_summary.txt"), caption="../report/genomescope/summary.rst", category="Genomescope Profile", subcategory="{sample}"),
        logPlot=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_log_plot.png"), caption="../report/genomescope/logplot.rst", category="Genomescope Profile", subcategory="{sample}"),
        linearPlot=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_linear_plot.png"), caption="../report/genomescope/linearplot.rst", category="Genomescope Profile", subcategory="{sample}"),
-        estimatedSize=temp(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_sizeEst.txt"))
+        estimatedSize=os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_sizeEst.txt")
    conda:
        "../envs/genomescope.yaml"
    shell:
@@ -300,8 +299,8 @@ rule assemblyStats:
        # report(os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), caption="../report/assemblystats/{sample}_scaffold_rst.rst", category="Assembly Statistics", subcategory="{sample}"),
        scaffStats=report(os.path.join(config['Results'],"{sample}" + "/4_assemblyStats/{sample}_scaffold_stats.tsv"), caption="../report/assemblystats/stats.rst", category="Assembly Statistics", subcategory="{sample}"),
        contStats=report(os.path.join(config['Results'],"{sample}" + "/4_assemblyStats/{sample}_contig_stats.tsv"), caption="../report/assemblystats/stats.rst", category="Assembly Statistics", subcategory="{sample}"),
-        scaff_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_scaffold_rst.rst")),
-        contig_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_contig_rst.rst"))
+        #scaff_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_scaffold_rst.rst")),
+        # contig_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_contig_rst.rst"))
    params:
        script = os.path.join(workflow.basedir, "scripts/stats.py"),
        path = os.path.join(config['Results'], "{sample}" + "/4_assemblyStats/"),
@@ -310,8 +309,9 @@
            if wildcards.sample in samples.index else ''
    shell:
        """
-        python {params.script} {input.assembly} {input.estGenome} {params.filename} {output.scaff_rst} {output.contig_rst} {params.given_size} {output.scaffStats} {output.contStats}
+        python {params.script} {input.assembly} {input.estGenome} {params.filename} {params.given_size} {output.scaffStats} {output.contStats}
        """
+        # python {params.script} {input.assembly} {input.estGenome} {params.filename} {output.scaff_rst} {output.contig_rst} {params.given_size} {output.scaffStats} {output.contStats}
        # estGenomeSize=$( grep 'Genome Haploid Length' {input.estGenome} | awk {{'print $6'}} | sed 's/,//g' )
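With the two rst outputs dropped from the command above, the positional arguments of stats.py shift down by two; the hunks at the bottom of this diff renumber sys.argv to match. For orientation, the resulting calling convention, inferred from the shell line above and the argv hunks below (the variable names here are descriptive only):

import sys

# sys.argv[1]  assembly FASTA                          ({input.assembly})
# sys.argv[2]  genomescope summary                     ({input.estGenome})
# sys.argv[3]  output filename stem                    ({params.filename})
# sys.argv[4]  user-supplied size estimate, 0 = use
#              the genomescope estimate                ({params.given_size})
# sys.argv[5]  scaffold stats TSV, written by stats.py ({output.scaffStats})
# sys.argv[6]  contig stats TSV, written by stats.py   ({output.contStats})

est_size = int(float(sys.argv[4]))
scaffold_out, contig_out = sys.argv[5], sys.argv[6]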
@@ -362,14 +362,14 @@ rule saveConfiguration_and_getKeyValues:
    params:
        # sampleSheet= config['samplesTSV'],
        # config=os.path.join(workflow.basedir, "configuration/config.yaml"),
-        resultsPath=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results")
+        resultsPath=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results"),
+        keyValues=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/results_withoutRowNames.tsv"),
+        rowNames=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/rowNames.tsv"),
        # masteraggregate=os.path.join(config['Results'],"{sample}"+ "/keyResults/aggregate.tsv")
    output:
        # newSampleSheet=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedSampleSheet.tsv"),
        # newConfigFile=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedConfig.yaml"),
        # merqLog=os.path.join(config['Results'],"logs/2_merylAndMerq/" + "{sample}" + "_merqury.spectra-cn.log"),
-        keyValues=temp(os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/aggregate.tsv")),
-        rowNames=temp(os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/aggregate_rowNames.tsv")),
        keyValuesWithRowNames=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/{sample}_aggregatedResults.tsv"),
        gscopeSum=os.path.join(config['Results'], "{sample}" +"/5_Key_Results/" + "{sample}" + "_gScope_summary.txt"),
        gscopeLog=os.path.join(config['Results'], "{sample}" +"/5_Key_Results/" + "{sample}" + "_gScope_log_plot.png"),
@@ -398,27 +398,29 @@
        cp {input.spectraFlat} {output.spectraFlat}
        cp {input.scaffStats} {output.scaffStats}
        cp {input.contStats} {output.contStats}
-        echo "{wildcards.sample}" >> {output.keyValues}
-        echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {output.keyValues}
-        echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(awk {{'print $4'}} {input.qv})" >> {output.keyValues}
-        echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'N100' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG100' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {output.keyValues}
-        echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {output.keyValues}
-        dos2unix {output.keyValues}
-        echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nScaffold_N100_length\nScaffold_NG100_length\nContig_N50_length\nContig_NG50_length\ntotal_num_bases\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps" > {output.rowNames}
-        paste -d'\t' {output.rowNames} {output.keyValues} > {output.keyValuesWithRowNames}
+        echo "{wildcards.sample}" >> {params.keyValues}
+        echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {params.keyValues}
+        echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues}
+        echo "$(awk {{'print $4'}} {input.qv})" >> {params.keyValues}
+        echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues}
+        echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues}
+        echo "$(grep 'N100' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'NG100' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues}
+        echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues}
+        echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues}
+        echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues}
+        dos2unix {params.keyValues}
+        echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nScaffold_N100_length\nScaffold_NG100_length\nContig_N50_length\nContig_NG50_length\ntotal_num_bases\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps" > {params.rowNames}
+        paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames}
+        printf "Evaluation complete for {wildcards.sample}, please find these results in {params.resultsPath} folder"
+        rm {params.rowNames} {params.keyValues}
        """
#
# cp {input.checkLog} {output.merqLog}
#echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} | sed 's/,//g')" >> {output.keyValues} put this to remove commas
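Two things changed in this rule: the intermediate keyValues/rowNames files moved from temp() outputs to params with an explicit rm at the end (manual cleanup standing in for Snakemake's temp handling), and the aggregate table is still assembled by pasting a fixed label column against the harvested values. A toy restatement of that paste step, with abridged labels and invented values:

# label column + value column -> two-column TSV, as `paste -d'\t'` does above
rows = ["Assembly", "size_estimate", "max_heterozygosity", "qv_score"]
values = ["speciesX", "1230000000", "1.2%", "35.1"]

with open("speciesX_aggregatedResults.tsv", "w") as out:
    for name, val in zip(rows, values):
        print(name, val, sep="\t", file=out)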
The remaining hunks apply to the workflow's stats script (scripts/stats.py, per params.script above), renumbering its positional arguments:
@@ -15,7 +15,7 @@ with open(sys.argv[2]) as f:
#
# size_bp = estSizeFile.readlines()
-estSize=int(float(sys.argv[6]))
+estSize=int(float(sys.argv[4]))
if estSize == 0:
    size_bp = ff[0][:-1]
@@ -286,7 +286,7 @@ if __name__ == "__main__":
    # contig = csv.writer(open(naming + "_contig_stats.tsv", "w"), delimiter='\t')
    # for key, val in contig_stats.items():
    #
-    with open(sys.argv[7], 'w') as outputfile:
+    with open(sys.argv[5], 'w') as outputfile:
    # # print('#' + libraryName, file=outputfile)
    # # print("Total Reads Processed (Paired): " + total_processed + " ( 100 %)", file=outputfile)
    # # print("Discarded reads (Paired): " + discarded + " ( "+str(discarded_perc)+"%)", file=outputfile)
@@ -297,7 +297,7 @@ if __name__ == "__main__":
        print("", file=outputfile)
        print(df_scaffold_L_LG.to_string(index=False), file=outputfile)
-    with open(sys.argv[8], 'w') as outputfile2:
+    with open(sys.argv[6], 'w') as outputfile2:
    # # print('#' + libraryName, file=outputfile)
    # # print("Total Reads Processed (Paired): " + total_processed + " ( 100 %)", file=outputfile)
    # # print("Discarded reads (Paired): " + discarded + " ( "+str(discarded_perc)+"%)", file=outputfile)
@@ -308,21 +308,21 @@ if __name__ == "__main__":
        print("", file=outputfile2)
        print(df_contig_L_LG.to_string(index=False), file=outputfile2)
-    with open(sys.argv[4], 'w') as outputRst:
-        print(tabulate(df_scaffold_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
-        print("", file=outputRst)
-        print(tabulate(df_scaffold_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
-        print("", file=outputRst)
-        print(tabulate(df_scaffold_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
-        print("", file=outputRst)
-    with open(sys.argv[5], 'w') as outputRst2:
-        print(tabulate(df_contig_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
-        print("", file=outputRst2)
-        print(tabulate(df_contig_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
-        print("", file=outputRst2)
-        print(tabulate(df_contig_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
-        print("", file=outputRst2)
+    # with open(sys.argv[4], 'w') as outputRst:
+    #     print(tabulate(df_scaffold_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
+    #     print("", file=outputRst)
+    #     print(tabulate(df_scaffold_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
+    #     print("", file=outputRst)
+    #     print(tabulate(df_scaffold_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
+    #     print("", file=outputRst)
+    #
+    # with open(sys.argv[5], 'w') as outputRst2:
+    #     print(tabulate(df_contig_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
+    #     print("", file=outputRst2)
+    #     print(tabulate(df_contig_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
+    #     print("", file=outputRst2)
+    #     print(tabulate(df_contig_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
+    #     print("", file=outputRst2)
    # list_of_dfs=[df_scaffold_top,df_scaffold_N_NG,df_scaffold_L_LG]
    # for df in list_of_dfs:
    #     with open('all_dfs.tsv','a') as f:
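For reference, the block disabled above rendered the same DataFrames as reStructuredText tables via tabulate. The pattern, shown here with a stand-in DataFrame, is simply:

import pandas as pd
from tabulate import tabulate

# render a DataFrame as an reST grid table, one table per call
df = pd.DataFrame({"metric": ["N50", "N90"], "length": [1500000, 200000]})
with open("scaffold_rst.rst", "w") as out:
    print(tabulate(df, headers="keys", tablefmt="rst", showindex=False), file=out)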