Commit 4a7ece1b authored by james94

quick fix

parent 1fb5c77a
@@ -32,28 +32,15 @@ if 'Library_R1' in samples.columns:
    samples = samples1.set_index('assemblyName')
    # print(samples)
    samples=samples.join(samples6['est_bp'])
-    print("The following assemblies will be evaluated: ", list(samples.index))
+    # print("The following assemblies will be evaluated: ", list(samples.index))
elif 'preBuiltMerylDatabase' in samples.columns:
    samples['est_bp'] = samples['est_bp'].fillna(0)
    samples["est_bp"] = pd.to_numeric(samples["est_bp"])
-    print("The following assemblies will be evaluated: ", list(samples.index))
+    # print("The following assemblies will be evaluated: ", list(samples.index))
else:
    raise ValueError('Sample Sheet structure not recognised. Please make sure you are using the correct sample sheet (prebuilt or non-prebuilt meryl db, etc)')
# print(samples)
# parse arguments
# parser = argparse.ArgumentParser(description="Download busco lineage determined by prefix", formatter_class=argparse.RawTextHelpFormatter)
# parser.add_argument("--l", "-lineage", required=True, type=str, help="Lineage prefix to download --> first match will be downloaded")
# parser.add_argument("--o", "-outputPath", required=True, type=str, help="Location To Download busco databe into")
# #parser.add_argument("--wkd", "-workingDir", required=True, type=str, help="Location To lineage Dir")
# args = parser.parse_args()
#os.path.join(workflow.basedir, "buscoLineage/" + config['busco5Lineage'] + "_odb10")
args_o=os.path.join(workflow.basedir, "buscoLineage")
args_l=config['busco5Lineage']
checkLineagePath=args_o + "/" + args_l + "_odb10"
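For orientation: the branch above dispatches on the sample-sheet layout, indexing by 'assemblyName' and coercing 'est_bp' to a number in the prebuilt-meryl case. A minimal, self-contained sketch of that dispatch, assuming a tab-separated sheet; the literal filename samples.tsv is illustrative, and the real workflow joins 'est_bp' in from a second frame (samples6) defined outside this hunk:

import pandas as pd

samples = pd.read_csv("samples.tsv", sep="\t")

if "Library_R1" in samples.columns:
    # raw-read layout: est_bp is joined in from a second table later
    samples = samples.set_index("assemblyName")
elif "preBuiltMerylDatabase" in samples.columns:
    # prebuilt-meryl layout: est_bp must be present and numeric
    samples = samples.set_index("assemblyName")
    samples["est_bp"] = pd.to_numeric(samples["est_bp"].fillna(0))
else:
    raise ValueError("Sample sheet structure not recognised")

print("The following assemblies will be evaluated:", list(samples.index))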
@@ -65,14 +52,14 @@ if os.path.isdir(args_l) is True:
    # subprocess.call("ln -sf %s %s"%(args.l, args.l), shell=True)
    buscoDataBaseName=os.path.basename(args_l)
    buscoDataBaseName=buscoDataBaseName[:-6]
-    print("Lineage path given, basename is:", buscoDataBaseName)
+    # print("Lineage path given, basename is:", buscoDataBaseName)
elif os.path.isdir(args_l) is False and os.path.isdir(checkLineagePath) is True:
    # subprocess.call("ln -sf %s %s"%(checkLineagePath, checkLineageDled), shell=True)
    buscoDataBaseName=os.path.basename(checkLineagePath)
    buscoDataBaseName=buscoDataBaseName[:-6]
-    print("Database already in buscoLineage directory, basename is:", buscoDataBaseName)
+    # print("Database already in buscoLineage directory, basename is:", buscoDataBaseName)
else:
-    print("Database will be downloaded")
+    # print("Database will be downloaded")
    url = "https://busco-data.ezlab.org/v5/data/lineages/"
    html = urlopen(url).read()
    soup = BeautifulSoup(html, features="html.parser")
@@ -87,20 +74,19 @@ else:
    #identify the lineage file
    for line in text.splitlines():
        if line.startswith(args_l):
-            print(line)
+            # print(line)
            linID=line.split(" ")[0]
-            print(linID)
+            # print(linID)
            break
    if not linID==None:
        linLink=url+linID
-        print(linLink)
+        # print(linLink)
        subprocess.run("wget -q -P %s %s"%(outDir, linLink), shell=True, check=True)
        subprocess.run("tar -xvf %s*.tar.gz -C %s"%(args_o + "/" + args_l, outDir), shell=True, check=True)
        subprocess.run("rm %s_*.tar.gz"%(checkLineageDled), shell=True, check=True)
        buscoDataBaseName=args_l
        print("Database success downloaded and placed in buscoLineage directory, basename is:", buscoDataBaseName)
    else:
-        print("Error - could not identify lineage please check %s for a correct prefix"%(url))
        raise ValueError("Error - could not identify lineage please check busco site for a correct prefix")
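The download branch above scrapes the BUSCO v5 lineage index, takes the first entry matching the configured prefix, then fetches and unpacks it. A condensed, runnable sketch of that technique, assuming BeautifulSoup is installed and wget/tar are on PATH; "vertebrata" and "buscoLineage" are stand-ins for args_l and args_o:

import subprocess
from urllib.request import urlopen

from bs4 import BeautifulSoup

url = "https://busco-data.ezlab.org/v5/data/lineages/"
text = BeautifulSoup(urlopen(url).read(), features="html.parser").get_text()

linID = None
for line in text.splitlines():
    if line.startswith("vertebrata"):   # stand-in for the args_l prefix
        linID = line.split(" ")[0]      # first match wins
        break

if linID is None:
    raise ValueError("could not identify lineage; check %s for a correct prefix" % url)

# fetch the lineage tarball into buscoLineage/ and unpack it there
subprocess.run("wget -q -P buscoLineage %s" % (url + linID), shell=True, check=True)
subprocess.run("tar -xf buscoLineage/%s -C buscoLineage" % linID, shell=True, check=True)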
#Function to check if one of, or both the barcodeP5 and barcodeP7 fasta files are given by the user.
#
#
# import csv
# aggregateResults = csv.writer(open(os.path.join(config['Results'],"{wildcards.sample}/keyResults/{wildcards.sample}_aggregate"), "w"), delimiter='\t')
#
#
# rule checkOrDownload_BuscoLineage:
# input:
# config['buscoDatabase']
# output:
#
# rule trimAdapters:
# input:
# config['samplesTSV'],
@@ -30,7 +30,7 @@
# "../envs/pigz.yaml"
# shell:
# "(trim_galore -q 5 -j {threads} --dont_gzip --fastqc --length 65 -o {params.base} --paired {params.file}) &> {log}"
#
# rule multiqc:
# input:
# os.path.join(config['Results'],"{sample}/0_qualityControl/01_fastqc")
@@ -43,8 +43,8 @@
# "../envs/pigz.yaml"
# shell:
# "multiqc {input} -o {params.folder} -n {params.filename}"
#
#
# rule combine_illuminaReads_R1:
# input:
# R1=os.path.join(config['Results'],"{sample}/0_qualityControl/01_fastqc")
@@ -87,7 +87,6 @@
# # mv {input.contig} {output.contig}
# # mv {input.scaffold} {output.scaffold}
# """
rule busco5:
    input:
        assembly=lambda wildcards: samples.at[wildcards.sample, 'assembly_fasta'] \
@@ -108,7 +107,7 @@ rule busco5:
    conda:
        "../envs/busco_and_assembly.yaml"
    log:
-        os.path.join(config['Results'], "logs/1_busco5/{sample}_busco5.log")
+        os.path.join(config['Results'], "{sample}" + "/1_busco5/" + "{sample}" + "/logs/{sample}_busco5.log")
    priority:
        20
    shell:
@@ -230,6 +229,8 @@ rule merqury:
        # merylPath = os.path.join(workflow.basedir, "programs/meryl/build/bin:$PATH"),
        # merylPath = os.path.join(workflow.basedir, "programs/meryl-1.0/Linux-amd64/bin:$PATH"),
        outPath= os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury"),
+        symlink_fasta=os.path.join(config['Results'], "{sample}" + "/{sample}.fasta"),
+        symlink_merylDB=directory(os.path.join(config['Results'], "{sample}" + "/combinedRead1Read2.21.meryl"))
        # export= os.path.join(workflow.basedir, "programs/merqury-1.1")
    threads:
        workflow.cores * 0.5
@@ -237,9 +238,7 @@
        report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.qv"), caption="../report/merqury/qvScore.rst", category="QV Estimate", subcategory="{sample}"),
        report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.{sample}.spectra-cn.st.png"), caption="../report/merqury/spectra-cn_stacked.rst", category="Copy Number Spectra Plots", subcategory="{sample}"),
        report(os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/" + "{sample}" + "_merq.{sample}.spectra-cn.fl.png"), caption="../report/merqury/spectra-cn_filled.rst", category="Copy Number Spectra Plots", subcategory="{sample}"),
-        os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/combinedRead1Read2.21.hist"),
-        symlink_fasta=os.path.join(config['Results'], "{sample}" + "/{sample}.fasta"),
-        symlink_merylDB=directory(os.path.join(config['Results'], "{sample}" + "/combinedRead1Read2.21.meryl"))
+        os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/combinedRead1Read2.21.hist")
    log:
        os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/logs/{sample}_merqury.log")
    priority:
@@ -248,11 +247,11 @@ rule merqury:
        "../envs/merylMerq_2.yaml"
    shell:
        """
-        ln -s {input.assembly} {output.symlink_fasta}
-        ln -s {input.merylDB} {output.symlink_merylDB}
+        ln -s {input.assembly} {params.symlink_fasta}
+        ln -s {input.merylDB} {params.symlink_merylDB}
        cd {params.outPath}
        export OMP_NUM_THREADS={threads}
-        (merqury.sh {output.symlink_merylDB} {output.symlink_fasta} {params.outFile}) &> {log}
+        (merqury.sh {params.symlink_merylDB} {params.symlink_fasta} {params.outFile}) &> {log}
        """
        # bash {params.script} {input.merylDB} {params.symlink} {params.outFile}
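The substance of the two merqury hunks above: the symlink paths move from output: to params:, so Snakemake no longer tracks them. A plausible reading (inferred, not stated in the commit message) is that declaring the symlinks as outputs made Snakemake delete or pre-check files that exist only as conveniences for merqury.sh. A minimal sketch of the resulting rule shape, with placeholder paths:

rule merqury_sketch:
    input:
        assembly="results/{sample}/assembly.fasta",
        merylDB="results/{sample}/reads.meryl",
    params:
        # symlinks are plain params now: created in the shell block,
        # invisible to Snakemake's output tracking
        symlink_fasta="results/{sample}/{sample}.fasta",
        symlink_merylDB="results/{sample}/combinedRead1Read2.21.meryl",
        outPath="results/{sample}/2_QVstats_merylAndMerqury",
        outFile="{sample}_merq",
    output:
        "results/{sample}/2_QVstats_merylAndMerqury/{sample}_merq.qv",
    shell:
        """
        ln -s {input.assembly} {params.symlink_fasta}
        ln -s {input.merylDB} {params.symlink_merylDB}
        cd {params.outPath}
        (merqury.sh {params.symlink_merylDB} {params.symlink_fasta} {params.outFile}) &> merqury.log
        """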
@@ -278,7 +277,7 @@ rule genomescope2:
        summary=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_summary.txt"), caption="../report/genomescope/summary.rst", category="Genomescope Profile", subcategory="{sample}"),
        logPlot=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_log_plot.png"), caption="../report/genomescope/logplot.rst", category="Genomescope Profile", subcategory="{sample}"),
        linearPlot=report(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_linear_plot.png"), caption="../report/genomescope/linearplot.rst", category="Genomescope Profile", subcategory="{sample}"),
-        estimatedSize=temp(os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_sizeEst.txt"))
+        estimatedSize=os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}" + "_sizeEst.txt")
    conda:
        "../envs/genomescope.yaml"
    shell:
@@ -300,8 +299,8 @@ rule assemblyStats:
        # report(os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), caption="../report/assemblystats/{sample}_scaffold_rst.rst", category="Assembly Statistics", subcategory="{sample}"),
        scaffStats=report(os.path.join(config['Results'],"{sample}" + "/4_assemblyStats/{sample}_scaffold_stats.tsv"), caption="../report/assemblystats/stats.rst", category="Assembly Statistics", subcategory="{sample}"),
        contStats=report(os.path.join(config['Results'],"{sample}" + "/4_assemblyStats/{sample}_contig_stats.tsv"), caption="../report/assemblystats/stats.rst", category="Assembly Statistics", subcategory="{sample}"),
-        scaff_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_scaffold_rst.rst")),
-        contig_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_contig_rst.rst"))
+        #scaff_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_scaffold_rst.rst")),
+        # contig_rst = temp(os.path.join(workflow.basedir, "report/assemblystats/{sample}_contig_rst.rst"))
    params:
        script = os.path.join(workflow.basedir, "scripts/stats.py"),
        path = os.path.join(config['Results'], "{sample}" + "/4_assemblyStats/"),
@@ -310,8 +309,9 @@
            if wildcards.sample in samples.index else ''
    shell:
        """
-        python {params.script} {input.assembly} {input.estGenome} {params.filename} {output.scaff_rst} {output.contig_rst} {params.given_size} {output.scaffStats} {output.contStats}
+        python {params.script} {input.assembly} {input.estGenome} {params.filename} {params.given_size} {output.scaffStats} {output.contStats}
        """
+        # python {params.script} {input.assembly} {input.estGenome} {params.filename} {output.scaff_rst} {output.contig_rst} {params.given_size} {output.scaffStats} {output.contStats}
        # estGenomeSize=$( grep 'Genome Haploid Length' {input.estGenome} | awk {{'print $6'}} | sed 's/,//g' )
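With the two rst outputs dropped from the command above, the positional arguments of stats.py shift down by two; the hunks at the bottom of this diff renumber sys.argv to match. For orientation, the resulting calling convention, inferred from the shell line above and the argv hunks below (the variable names here are descriptive only):

import sys

# sys.argv[1]  assembly FASTA                          ({input.assembly})
# sys.argv[2]  genomescope summary                     ({input.estGenome})
# sys.argv[3]  output filename stem                    ({params.filename})
# sys.argv[4]  user-supplied size estimate, 0 = use
#              the genomescope estimate                ({params.given_size})
# sys.argv[5]  scaffold stats TSV, written by stats.py ({output.scaffStats})
# sys.argv[6]  contig stats TSV, written by stats.py   ({output.contStats})

est_size = int(float(sys.argv[4]))
scaffold_out, contig_out = sys.argv[5], sys.argv[6]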
@@ -362,14 +362,14 @@ rule saveConfiguration_and_getKeyValues:
    params:
        # sampleSheet= config['samplesTSV'],
        # config=os.path.join(workflow.basedir, "configuration/config.yaml"),
-        resultsPath=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results")
+        resultsPath=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results"),
+        keyValues=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/results_withoutRowNames.tsv"),
+        rowNames=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/rowNames.tsv"),
        # masteraggregate=os.path.join(config['Results'],"{sample}"+ "/keyResults/aggregate.tsv")
    output:
        # newSampleSheet=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedSampleSheet.tsv"),
        # newConfigFile=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedConfig.yaml"),
        # merqLog=os.path.join(config['Results'],"logs/2_merylAndMerq/" + "{sample}" + "_merqury.spectra-cn.log"),
-        keyValues=temp(os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/aggregate.tsv")),
-        rowNames=temp(os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/aggregate_rowNames.tsv")),
        keyValuesWithRowNames=os.path.join(config['Results'],"{sample}"+ "/5_Key_Results/{sample}_aggregatedResults.tsv"),
        gscopeSum=os.path.join(config['Results'], "{sample}" +"/5_Key_Results/" + "{sample}" + "_gScope_summary.txt"),
        gscopeLog=os.path.join(config['Results'], "{sample}" +"/5_Key_Results/" + "{sample}" + "_gScope_log_plot.png"),
@@ -398,27 +398,29 @@
        cp {input.spectraFlat} {output.spectraFlat}
        cp {input.scaffStats} {output.scaffStats}
        cp {input.contStats} {output.contStats}
-        echo "{wildcards.sample}" >> {output.keyValues}
-        echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {output.keyValues}
-        echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(awk {{'print $4'}} {input.qv})" >> {output.keyValues}
-        echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'N100' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG100' {input.scaffStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {output.keyValues}
-        echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {output.keyValues}
-        echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {output.keyValues}
-        echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {output.keyValues}
-        dos2unix {output.keyValues}
-        echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nScaffold_N100_length\nScaffold_NG100_length\nContig_N50_length\nContig_NG50_length\ntotal_num_bases\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps" > {output.rowNames}
-        paste -d'\t' {output.rowNames} {output.keyValues} > {output.keyValuesWithRowNames}
+        echo "{wildcards.sample}" >> {params.keyValues}
+        echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {params.keyValues}
+        echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues}
+        echo "$(awk {{'print $4'}} {input.qv})" >> {params.keyValues}
+        echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues}
+        echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues}
+        echo "$(grep 'N100' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'NG100' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues}
+        echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues}
+        echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues}
+        echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues}
+        echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues}
+        dos2unix {params.keyValues}
+        echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nScaffold_N100_length\nScaffold_NG100_length\nContig_N50_length\nContig_NG50_length\ntotal_num_bases\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps" > {params.rowNames}
+        paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames}
+        printf "Evaluation complete for {wildcards.sample}, please find these results in {params.resultsPath} folder"
+        rm {params.rowNames} {params.keyValues}
        """
#
# cp {input.checkLog} {output.merqLog}
#echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} | sed 's/,//g')" >> {output.keyValues} put this to remove commas
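Two things changed in this rule: the intermediate keyValues/rowNames files moved from temp() outputs to params with an explicit rm at the end (manual cleanup standing in for Snakemake's temp handling), and the aggregate table is still assembled by pasting a fixed label column against the harvested values. A toy restatement of that paste step, with abridged labels and invented values:

# label column + value column -> two-column TSV, as `paste -d'\t'` does above
rows = ["Assembly", "size_estimate", "max_heterozygosity", "qv_score"]
values = ["speciesX", "1230000000", "1.2%", "35.1"]

with open("speciesX_aggregatedResults.tsv", "w") as out:
    for name, val in zip(rows, values):
        print(name, val, sep="\t", file=out)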
The remaining hunks apply to the workflow's stats script (scripts/stats.py, per params.script above), renumbering its positional arguments:
@@ -15,7 +15,7 @@ with open(sys.argv[2]) as f:
#
# size_bp = estSizeFile.readlines()
-estSize=int(float(sys.argv[6]))
+estSize=int(float(sys.argv[4]))
if estSize == 0:
    size_bp = ff[0][:-1]
@@ -286,7 +286,7 @@ if __name__ == "__main__":
    # contig = csv.writer(open(naming + "_contig_stats.tsv", "w"), delimiter='\t')
    # for key, val in contig_stats.items():
    #
-    with open(sys.argv[7], 'w') as outputfile:
+    with open(sys.argv[5], 'w') as outputfile:
    # # print('#' + libraryName, file=outputfile)
    # # print("Total Reads Processed (Paired): " + total_processed + " ( 100 %)", file=outputfile)
    # # print("Discarded reads (Paired): " + discarded + " ( "+str(discarded_perc)+"%)", file=outputfile)
@@ -297,7 +297,7 @@ if __name__ == "__main__":
        print("", file=outputfile)
        print(df_scaffold_L_LG.to_string(index=False), file=outputfile)
-    with open(sys.argv[8], 'w') as outputfile2:
+    with open(sys.argv[6], 'w') as outputfile2:
    # # print('#' + libraryName, file=outputfile)
    # # print("Total Reads Processed (Paired): " + total_processed + " ( 100 %)", file=outputfile)
    # # print("Discarded reads (Paired): " + discarded + " ( "+str(discarded_perc)+"%)", file=outputfile)
@@ -308,21 +308,21 @@ if __name__ == "__main__":
        print("", file=outputfile2)
        print(df_contig_L_LG.to_string(index=False), file=outputfile2)
-    with open(sys.argv[4], 'w') as outputRst:
-        print(tabulate(df_scaffold_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
-        print("", file=outputRst)
-        print(tabulate(df_scaffold_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
-        print("", file=outputRst)
-        print(tabulate(df_scaffold_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
-        print("", file=outputRst)
-    with open(sys.argv[5], 'w') as outputRst2:
-        print(tabulate(df_contig_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
-        print("", file=outputRst2)
-        print(tabulate(df_contig_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
-        print("", file=outputRst2)
-        print(tabulate(df_contig_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
-        print("", file=outputRst2)
+    # with open(sys.argv[4], 'w') as outputRst:
+    #     print(tabulate(df_scaffold_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
+    #     print("", file=outputRst)
+    #     print(tabulate(df_scaffold_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
+    #     print("", file=outputRst)
+    #     print(tabulate(df_scaffold_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst)
+    #     print("", file=outputRst)
+    #
+    # with open(sys.argv[5], 'w') as outputRst2:
+    #     print(tabulate(df_contig_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
+    #     print("", file=outputRst2)
+    #     print(tabulate(df_contig_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
+    #     print("", file=outputRst2)
+    #     print(tabulate(df_contig_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2)
+    #     print("", file=outputRst2)
    # list_of_dfs=[df_scaffold_top,df_scaffold_N_NG,df_scaffold_L_LG]
    # for df in list_of_dfs:
    #     with open('all_dfs.tsv','a') as f:
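For reference, the block disabled above rendered the same DataFrames as reStructuredText tables via tabulate. The pattern, shown here with a stand-in DataFrame, is simply:

import pandas as pd
from tabulate import tabulate

# render a DataFrame as an reST grid table, one table per call
df = pd.DataFrame({"metric": ["N50", "N90"], "length": [1500000, 200000]})
with open("scaffold_rst.rst", "w") as out:
    print(tabulate(df, headers="keys", tablefmt="rst", showindex=False), file=out)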