Commit 56ef020f authored by james94

update busco from v4 to v5

parent 6480bd41
@@ -8,7 +8,7 @@ By harnessing the capabilities of snakemake, we present a workflow that incorpor
- Reduce user interaction significantly when compared to running the individual tools by themselves.
2. **Scalability**
- Seamlessly scaled to server, cluster, grid and cloud environments without the need to modify the workflow definition.
3. **Portability**
- Workflows entail a description of required software, which will be automatically deployed to any execution environment.
@@ -22,7 +22,7 @@ The software/tools used as part of our genome evaluation are as follows:
#### Pre-Processing (least biased short-read dataset available):
* Trimmomatic (*Bolger, A. M., Lohse, M., & Usadel, B. (2014)*. http://www.usadellab.org/cms/?page=trimmomatic)
* Trim_galore (*Felix Krueger* bioinformatics.babraham.ac.uk)
* Fastqc (*Simon Andrews* https://github.com/s-andrews/FastQC)
* Multiqc (*Ewels, P., Magnusson, M., Lundin, S., Käller, M. (2016)*. https://doi.org/10.1093/bioinformatics/btw354)
#### Reference-free Genome Profiling
@@ -46,17 +46,17 @@ Variations in sequencing methods/protocols can lead to an increase in bias in th
If your library (or libraries) was sequenced using 10x barcodes (10X Genomics), you should remove the first 25-30bp of the forward read (R1) only. This will remove all barcode content.
**Use trimmomatic**
*Will be incorporated automatically shortly*
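Until this is automated, the R1-only head trim can be sketched in plain Python (equivalent in effect to Trimmomatic's `HEADCROP` step; the 25 bp crop length and file names below are illustrative assumptions):

```python
# Sketch: remove the first 25 bp (barcode content) from every forward read.
# Assumes plain 4-line FASTQ records; paths and crop length are placeholders.
def headcrop_fastq(in_path, out_path, crop=25):
    with open(in_path) as fin, open(out_path, "w") as fout:
        for i, line in enumerate(fin):
            line = line.rstrip("\n")
            # lines 1 (sequence) and 3 (quality) of each record get cropped
            if i % 4 in (1, 3):
                line = line[crop:]
            fout.write(line + "\n")
```

In practice you would run this only on the R1 file of each 10x library, leaving R2 untouched.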
# Using the pipeline
**Step 1. Downloading the workflow**
-
First things first, we want to download/install this workflow. The easiest way to do this would be to clone the git repository, provided you have the git command line tool installed (for instructions on how to do this: https://git-scm.com/book/en/v2/Getting-Started-Installing-Git)
To clone the repository, use the following command:
@@ -68,7 +68,7 @@ git clone https://git.imp.fu-berlin.de/cmazzoni/GEP.git
**Step 2. Conda management**
-
If you already have conda installed on your system, please skip to step 3
We will use a minimal version of conda, miniconda3; it has everything we need.
@@ -83,7 +83,7 @@ bash /<your_path_to>/Miniconda3-latest-Linux-x86_64.sh
```
Hold ENTER for the licensing agreement to print in its entirety and agree to it by typing yes, then pressing ENTER once.
Choose the location you wish to install miniconda3, or use the default-determined location by simply hitting ENTER.
@@ -102,7 +102,7 @@ source ~/.bashrc
**Step 3. Creating our Snakemake conda environment**
-
Inside the main project folder will be a file called `installGEP.yaml`
i.e.
@@ -126,8 +126,8 @@ conda activate GEP
**Step 4. Modifying our configuration files**
-
There are only two files (`config.yaml` and `samples.tsv`) that you as the user are required to *modify*. These are found in the `configuration` folder.
Firstly, we will modify the `samples.tsv`, which consists of the paths to your data files.
@@ -145,7 +145,7 @@ AssemblyZ path/to/AssemblyZ_R1_library1.fastq path/to/AssemblyZ_R2_libra
AssemblyZ path/to/AssemblyZ_R1_library2.fastq path/to/AssemblyZ_R2_library2.fastq path/to/AssemblyZ_genome.fasta 1983101299
```
This is a tab (or just whitespace) separated document where you will fill out the paths to the relevant files. Usually you will not have more than a handful (or sometimes only one pair) of raw Illumina reads, so this document will most of the time be rather 'clean'.
Column1= Assembly name
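As a quick sanity check before running, the whitespace-separated layout can be parsed with a short Python sketch (the dictionary keys are descriptive assumptions, not identifiers used by the workflow itself):

```python
# Sketch: parse the whitespace-separated samples.tsv into a list of records.
# Assumes five columns per row, as in the example above; no header row.
def read_samples(path):
    rows = []
    with open(path) as fh:
        for line in fh:
            fields = line.split()  # tabs or plain whitespace both work
            if not fields:
                continue  # skip blank lines
            name, r1, r2, fasta, size = fields
            rows.append({"assembly": name, "R1": r1, "R2": r2,
                         "fasta": fasta, "size_estimate": int(size)})
    return rows
```

Note that the same assembly name may appear on several rows, one per read-pair library, exactly as in the AssemblyZ example.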
@@ -171,7 +171,7 @@ Results: "/<PathTo>/desiredDestination/Results_AssemblyEval"
samplesTSV: "/<PathTo>/GEP/configuration/samples.tsv"
-busco4Lineage: "vertebrata"
+busco5Lineage: "vertebrata"
buscoMode: "genome"
@@ -183,25 +183,25 @@ This will be where all your results are stored for the run. It does not have to
2. Path to your `samplesTSV`.
This is the path to the aforementioned `samples.tsv` that was created/modified just above. For now, please keep this file inside the `configuration` folder, together with this `config.yaml`
-3. `busco4Lineage` Busco needs a database to be able to run. Here you have a couple of different options.
-- Manually download and unpack your desired database from https://busco-data.ezlab.org/v4/data/lineages/ . In this case (or if you already have the database downloaded to a specific location), you can provide the full path:
+3. `busco5Lineage` Busco needs a database to be able to run. Here you have a couple of different options.
+- Manually download and unpack your desired database from https://busco-data.ezlab.org/v5/data/lineages/ . In this case (or if you already have the database downloaded to a specific location), you can provide the full path:
```
-busco4Lineage: "/<PathTo>/manuallyDownloaded/vertebrata_odb10"
+busco5Lineage: "/<PathTo>/manuallyDownloaded/vertebrata_odb10"
```
- Alternatively, you can just provide the taxonomy name that you wish to use. In this case, the latest database matching the name provided will be automatically downloaded prior to execution, if it doesn't already exist inside the `buscoLineage` directory. If it already exists there, whether from a manual download or from a previous automatic download (from previously executed runs), then the pipeline will skip re-downloading.
```
-busco4Lineage: "vertebrata"
+busco5Lineage: "vertebrata"
```
4. You can change the busco mode, but considering the scope of this evaluation in its current state, this option is rather redundant and will be removed/hidden.
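The two lineage options above boil down to a simple decision, sketched here in Python (the `buscoLineage` directory name follows the description above; the `_odb10` suffix convention is an assumption based on the example paths):

```python
# Sketch: resolve the configured lineage value to a local directory,
# flagging whether a download would be needed (mirrors the described logic).
import os

def resolve_lineage(value, lineage_dir="buscoLineage"):
    if os.path.isdir(value):               # a full path was given and exists
        return value, False
    local = os.path.join(lineage_dir, value + "_odb10")
    if os.path.isdir(local):               # already downloaded previously
        return local, False
    return local, True                     # taxonomy name; needs downloading
```

With `busco5Lineage: "vertebrata"`, this would look for `buscoLineage/vertebrata_odb10` and only trigger a download when it is absent.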
**Step 5. Running the workflow**
-
If everything is set up correctly, we can run the pipeline very simply.
For now (though it should be a simple fix!), you must run the pipeline while inside the project folder. In other words, you must be inside the folder where the file `Snakefile` is directly accessible.
*Make sure your GEP environment is activated.*
@@ -211,7 +211,7 @@ First you should run GEP in drymode:
snakemake -n
```
-Which will check to see if some of your parameters/paths have been modified incorrectly. Further, it will install all the necessary environments to be utilised by the workflow, as well as download the busco4 database if it doesn't already exist. Unfortunately, when downloading the busco4 database there will be lots of output in the terminal - a product of the limitations of the `wget` command used for downloading.
+Which will check to see if some of your parameters/paths have been modified incorrectly. Further, it will install all the necessary environments to be utilised by the workflow, as well as download the busco5 database if it doesn't already exist. Unfortunately, when downloading the busco5 database there will be lots of output in the terminal - a product of the limitations of the `wget` command used for downloading.
After the dry-run and downloading have completed, you can simply run the full pipeline with:
@@ -219,7 +219,7 @@ After the dry-run and downloading has complete, you can simply run the full pipe
snakemake --cores # --use-conda && snakemake --report
```
Where `--cores #` is the maximum number of cores (synonymous with threads in this case) you want to utilise.
For example if you run snakemake with the command:
@@ -245,7 +245,7 @@ Instead of or as well as retrieving the result files directly from the locations
The report will be created in the **main** project directory, the same location as the Snakefile, where you executed the pipeline from.
**Step 6. Results**
-
ALL results can be found in the results directory defined by you in the `config.yaml`. Within this results folder, you will have a directory for each of the assemblies (`assemblyName`) you defined in the `samples.tsv`. The pipeline will produce a large number of files at each step or for each tool, respectively. Taking this into consideration, the results can be considered in three tiers relative to their *importance* or ease of viewing.
***Tier 3***
@@ -260,9 +260,9 @@ The key result files are:
**Multiqc report**
- assemblyName_multiqc_report.html
**QV Score**
- assemblyName_merq.qv
**Copy-Number Spectra plots**
- assemblyName_merq.spectra-cn.fl.png
@@ -270,11 +270,11 @@ The key result files are:
**Genomescope2 profile plots and summary**
- assemblyName_gScope_log_plot.png
- assemblyName_gScope_linear_plot.png
- assemblyName_gScope_summary.txt
**Assembly statistics (N#, NG#, L#, LG#, sequence counts, total bps, etc.)**
- assemblyName_scaffold_stats.tsv
- assemblyName_contig_stats.tsv
**BUSCOv4 results**
@@ -288,5 +288,3 @@ The key result files are:
There is a separately created folder within the main results directory (i.e. `/path/to/Results/allAssemblies_keyResults` )
Within this folder you will find a combined aggregate file (`/path/to/Results/allAssemblies_keyResults/key_results.tsv`), a TSV that combines the aforementioned key values from each assembly evaluated, respectively, into one single file. This is useful for plotting the key values across multiple assemblies.
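For downstream plotting, the aggregate file can be loaded with the standard library alone. This sketch assumes the layout produced by the workflow's `paste` step: metric names in the first column, one further column per assembly, tab-separated:

```python
# Sketch: load key_results.tsv into {metric: [value_per_assembly, ...]}.
# Layout assumed: first column holds metric names (Assembly, qv_score, ...),
# remaining columns hold one value per evaluated assembly.
import csv

def load_key_results(path):
    table = {}
    with open(path) as fh:
        for row in csv.reader(fh, delimiter="\t"):
            if row:
                table[row[0]] = row[1:]
    return table
```

From there, `table["qv_score"]` (for example) gives the QV values across all assemblies, ready for plotting.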
@@ -59,7 +59,7 @@ elif os.path.isdir(args_l) is False and os.path.isdir(checkLineagePath) is True:
    print("Database already in buscoLineage directory, basename is:", buscoDataBaseName)
else:
    print("Database will be downloaded")
-    url = "https://busco-data.ezlab.org/v4/data/lineages/"
+    url = "https://busco-data.ezlab.org/v5/data/lineages/"
    html = urlopen(url).read()
    soup = BeautifulSoup(html, features="html.parser")
    # kill all script and style elements
@@ -4,6 +4,6 @@ Results: "/path/to/desiredDestination/Results_Test"
samplesTSV: "/path/to/configuration/samplesTest.tsv"
-busco4Lineage: "vertebrata"
+busco5Lineage: "vertebrata"
buscoMode: "genome"
@@ -3,6 +3,6 @@ channels:
- conda-forge
- anaconda
dependencies:
-- python=3.7
-- busco=4.0.6
+- python=3.8.8
+- busco=5.0.0
- biopython=1.76
this is a test for multiqc
@@ -12,8 +12,8 @@
rule trimAdapters:
input:
config['samplesTSV'],
-# os.path.join(config['Results'], "{sample}" + "/busco4/short_summary.specific." + config['busco4Lineage'] + "_odb10." + "{sample}" + ".txt")
-# check=os.path.join(config['Results'], "{sample}" + "/busco4/short_summary.specific." + config['busco4Lineage'] + "_odb10." + "{sample}" + ".txt")
+# os.path.join(config['Results'], "{sample}" + "/busco5/short_summary.specific." + config['busco5Lineage'] + "_odb10." + "{sample}" + ".txt")
+# check=os.path.join(config['Results'], "{sample}" + "/busco5/short_summary.specific." + config['busco5Lineage'] + "_odb10." + "{sample}" + ".txt")
params:
file=lambda wildcards: samples.at[wildcards.sample, 'combined'] \
if wildcards.sample in samples.index else '',
@@ -88,7 +88,7 @@ rule combine_illuminaReads_R2:
# mv {input.scaffold} {output.scaffold}
# """
-rule busco4:
+rule busco5:
input:
assembly=lambda wildcards: samples.at[wildcards.sample, 'assembly_fasta'] \
if wildcards.sample in samples.index else '',
@@ -96,19 +96,19 @@ rule busco4:
params:
mode = config['buscoMode'],
assemblyName = "{sample}",
-chngDir = os.path.join(config['Results'], "{sample}" + "/1_busco4")
+chngDir = os.path.join(config['Results'], "{sample}" + "/1_busco5")
threads:
workflow.cores * 0.5
output:
-# report(os.path.join(config['Results'], "{sample}" + "/busco4/" + "{sample}" + "/short_summary.specific." + config['busco4Lineage'] + "_odb10." + "{sample}" + ".txt"), caption="../report/busco.rst", category="Benchmark Universal Single Copy Orthologs", subcategory="{sample}")
-os.path.join(config['Results'], "{sample}" + "/1_busco4/" + "{sample}" + "/short_summary.specific." + buscoDataBaseName + "_odb10." + "{sample}" + ".txt"),
+# report(os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/short_summary.specific." + config['busco5Lineage'] + "_odb10." + "{sample}" + ".txt"), caption="../report/busco.rst", category="Benchmark Universal Single Copy Orthologs", subcategory="{sample}")
+os.path.join(config['Results'], "{sample}" + "/1_busco5/" + "{sample}" + "/short_summary.specific." + buscoDataBaseName + "_odb10." + "{sample}" + ".txt"),
# symlink = os.path.join(config['Results'], "{sample}" + "/{sample}.fasta")
-# mvRunBusco= directory(os.path.join(config['Results'], "{sample}" + "/busco4/" + "{sample}" + "/run_" + config['busco4Lineage'] + "_odb10")),
-# blastDB= directory(os.path.join(config['Results'], "{sample}" + "/busco4/" + "{sample}" + "/blast_db"))
+# mvRunBusco= directory(os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/run_" + config['busco5Lineage'] + "_odb10")),
+# blastDB= directory(os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/blast_db"))
conda:
"../envs/busco_and_assembly.yaml"
log:
-os.path.join(config['Results'], "logs/1_busco4/{sample}_busco4.log")
+os.path.join(config['Results'], "logs/1_busco5/{sample}_busco5.log")
priority:
20
shell:
@@ -120,20 +120,20 @@ rule busco4:
rule moveBuscoOutputs:
input:
-buscoResult=os.path.join(config['Results'], "{sample}" + "/1_busco4/" + "{sample}" + "/short_summary.specific." + buscoDataBaseName + "_odb10." + "{sample}" + ".txt"),
-# mvRunBusco= os.path.join(config['Results'], "{sample}" + "/busco4/" + "{sample}" + "/run_" + config['busco4Lineage'] + "_odb10"),
-# blastDB= os.path.join(config['Results'], "{sample}" + "/busco4/" + "{sample}" + "/blast_db")
+buscoResult=os.path.join(config['Results'], "{sample}" + "/1_busco5/" + "{sample}" + "/short_summary.specific." + buscoDataBaseName + "_odb10." + "{sample}" + ".txt"),
+# mvRunBusco= os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/run_" + config['busco5Lineage'] + "_odb10"),
+# blastDB= os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/blast_db")
params:
-rmDir= os.path.join(config['Results'], "{sample}" + "/1_busco4/" + "{sample}"),
-# logs = os.path.join(config['Results'], "{sample}" + "/busco4/" + "{sample}" + "/logs/*"),
-# mvLogs= os.path.join(config['Results'], "{sample}" + "/busco4/logs/"),
-mvRunBusco= os.path.join(config['Results'], "{sample}" + "/1_busco4/" + "{sample}" + "/run_" + buscoDataBaseName + "_odb10"),
-mvRunBuscoDest= os.path.join(config['Results'], "{sample}" + "/1_busco4"),
-destination= os.path.join(config['Results'], "{sample}" + "/1_busco4/"),
-blastDB= os.path.join(config['Results'], "{sample}" + "/1_busco4/" + "{sample}" + "/blast_db")
-# blastDB= os.path.join(config['Results'], "{sample}" + "/busco4/blast_db")
+rmDir= os.path.join(config['Results'], "{sample}" + "/1_busco5/" + "{sample}"),
+# logs = os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/logs/*"),
+# mvLogs= os.path.join(config['Results'], "{sample}" + "/busco5/logs/"),
+mvRunBusco= os.path.join(config['Results'], "{sample}" + "/1_busco5/" + "{sample}" + "/run_" + buscoDataBaseName + "_odb10"),
+mvRunBuscoDest= os.path.join(config['Results'], "{sample}" + "/1_busco5"),
+destination= os.path.join(config['Results'], "{sample}" + "/1_busco5/"),
+blastDB= os.path.join(config['Results'], "{sample}" + "/1_busco5/" + "{sample}" + "/blast_db")
+# blastDB= os.path.join(config['Results'], "{sample}" + "/busco5/blast_db")
output:
-file = report(os.path.join(config['Results'], "{sample}" + "/1_busco4/short_summary.specific." + buscoDataBaseName + "_odb10." + "{sample}" + ".txt"), caption="../report/busco.rst", category="Benchmark Universal Single Copy Orthologs", subcategory="{sample}")
+file = report(os.path.join(config['Results'], "{sample}" + "/1_busco5/short_summary.specific." + buscoDataBaseName + "_odb10." + "{sample}" + ".txt"), caption="../report/busco.rst", category="Benchmark Universal Single Copy Orthologs", subcategory="{sample}")
shell:
"""
mv -t {params.mvRunBuscoDest} {params.mvRunBusco}
@@ -145,7 +145,7 @@ rule moveBuscoOutputs:
rule meryl_R1:
input:
read1= rules.combine_illuminaReads_R1.output
-# check = os.path.join(config['Results'], "{sample}" + "/busco4/short_summary.specific." + config['busco4Lineage'] + "_odb10." + "{sample}" + ".txt")
+# check = os.path.join(config['Results'], "{sample}" + "/busco5/short_summary.specific." + config['busco5Lineage'] + "_odb10." + "{sample}" + ".txt")
params:
script = os.path.join(workflow.basedir, "programs/meryl-1.0/Linux-amd64/bin/meryl"),
kmer = 21,
@@ -169,7 +169,7 @@ rule meryl_R1:
rule meryl_R2:
input:
read2= rules.combine_illuminaReads_R2.output
-# check = os.path.join(config['Results'], "{sample}" + "/busco4/short_summary.specific." + config['busco4Lineage'] + "_odb10." + "{sample}" + ".txt")
+# check = os.path.join(config['Results'], "{sample}" + "/busco5/short_summary.specific." + config['busco5Lineage'] + "_odb10." + "{sample}" + ".txt")
params:
script = os.path.join(workflow.basedir, "programs/meryl-1.0/Linux-amd64/bin/meryl"),
kmer = 21,
@@ -194,7 +194,7 @@ rule meryl:
input:
read1=os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/" + "{sample}" + "_R1.21.meryl"),
read2=os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/" + "{sample}" + "_R2.21.meryl")
-# check = os.path.join(config['Results'], "{sample}" + "/busco4/short_summary.specific." + config['busco4Lineage'] + "_odb10." + "{sample}" + ".txt")
+# check = os.path.join(config['Results'], "{sample}" + "/busco5/short_summary.specific." + config['busco5Lineage'] + "_odb10." + "{sample}" + ".txt")
params:
script = os.path.join(workflow.basedir, "programs/meryl-1.0/Linux-amd64/bin/meryl"),
kmer = 21,
@@ -341,7 +341,7 @@ rule saveConfiguration_and_getKeyValues:
gscopeSum=os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_summary.txt"),
gscopeLog=os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_log_plot.png"),
gscopeLin=os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_linear_plot.png"),
-busco=os.path.join(config['Results'], "{sample}" + "/1_busco4/short_summary.specific." + buscoDataBaseName + "_odb10." + "{sample}" + ".txt"),
+busco=os.path.join(config['Results'], "{sample}" + "/1_busco5/short_summary.specific." + buscoDataBaseName + "_odb10." + "{sample}" + ".txt"),
qv=os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.qv"),
spectraStacked=os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.spectra-cn.st.png"),
spectraFlat=os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.spectra-cn.fl.png"),
@@ -406,7 +406,7 @@ rule saveConfiguration_and_getKeyValues:
echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
dos2unix {output.keyValues}
echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nN50_length\nNG50_length\nN95_length\nNG95_length\nN100_length\nNG100_length\ntotal_num_bases\ntotal_num_scaffolds" > {output.rowNames}
-paste -d'\t' {output.rowNames} {output.keyValues} > {output.keyValuesWithRowNames}
+paste -d'\t' {output.rowNames} {output.keyValues} | column -t > {output.keyValuesWithRowNames}
printf "Evaluation complete for {wildcards.sample}, please find these results in {params.resultsPath} folder"
"""