Skip to content
Snippets Groups Projects
Commit 687b9caa authored by james94's avatar james94
Browse files

dictionary misnamed

parent efbfdbc0
No related branches found
No related tags found
No related merge requests found
args_o=os.path.join(workflow.basedir, "buscoLineage")
args_l=config['busco5Lineage']
checkLineagePath=args_o + "/" + args_l + "_odb10"
checkLineageDled=args_o + "/" + args_l
outDir=args_o
if os.path.isdir(args_l) is True:
# subprocess.call("ln -sf %s %s"%(args.l, args.l), shell=True)
buscoDataBaseName=os.path.basename(args_l)
buscoDataBaseName=buscoDataBaseName[:-6]
# print("Lineage path given, basename is:", buscoDataBaseName)
elif os.path.isdir(args_l) is False and os.path.isdir(checkLineagePath) is True:
# subprocess.call("ln -sf %s %s"%(checkLineagePath, checkLineageDled), shell=True)
buscoDataBaseName=os.path.basename(checkLineagePath)
buscoDataBaseName=buscoDataBaseName[:-6]
# print("Database already in buscoLineage directory, basename is:", buscoDataBaseName)
else:
# print("Database will be downloaded")
url = "https://busco-data.ezlab.org/v5/data/lineages/"
html = urlopen(url).read()
soup = BeautifulSoup(html, features="html.parser")
# kill all script and style elements
for script in soup(["script", "style"]):
script.extract() # rip it out
# get text
text = soup.get_text()
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
linID=None
#identify the lineage file
for line in text.splitlines():
if line.startswith(args_l):
# print(line)
linID=line.split(" ")[0]
# print(linID)
break
if not linID==None:
linLink=url+linID
# print(linLink)
print('Downloading Busco Database:', args_l)
subprocess.run("wget -q -P %s %s"%(outDir, linLink), shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
subprocess.run("tar -xvf %s*.tar.gz -C %s"%(args_o + "/" + args_l, outDir), shell=True, check=True,stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
subprocess.run("rm %s_*.tar.gz"%(checkLineageDled), shell=True, check=True)
buscoDataBaseName=args_l
print('Downloading Busco Database:', args_l, " - COMPLETE")
else:
raise ValueError("Error - could not identify lineage please check busco site for a correct prefix")
def merylDB(wildcards):
return samples.loc[(wildcards.asmID), "merylDB"]
localrules: symlinkUnzippedHiC_R1, symlinkUnzippedHiC_R2, symlinkUnzippedFasta_PRI, symlinkUnzippedFasta_ALT, moveBuscoOutputs, saveConfiguration_and_getKeyValues_kmer, saveConfiguration_and_getKeyValues, aggregateAllAssemblies, makeReport, pretextMaps2md, addFullTable, aggregateReport
def HiC_R1_gzipped(wildcards):
return yesGzip_HiC_R1.loc[(wildcards.asmID), "HiC_R1"]
rule unzipHiC_R1:
input:
assembly=HiC_R1_gzipped,
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.fastq")),
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.R1.pigzUnzip.log")
conda:
os.path.join(workflow.basedir, "envs/pigz.yaml")
threads:
resource['unzipHiC_R1']['threads']
resources:
mem_mb=resource['unzipHiC_R1']['mem_mb'],
time=resource['unzipHiC_R1']['time'],
shell:
"""
pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log}
"""
def HiC_R1_unzipped(wildcards):
return noGzip_HiC_R1.loc[(wildcards.asmID), "HiC_R1"]
rule symlinkUnzippedHiC_R1:
input:
assembly=HiC_R1_unzipped,
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.fastq")),
container:
None
shell:
"""
ln -s {input} {output}
"""
def HiC_R2_gzipped(wildcards):
return yesGzip_HiC_R2.loc[(wildcards.asmID), "HiC_R2"]
rule unzipHiC_R2:
input:
assembly=HiC_R2_gzipped,
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.fastq")),
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.R2.pigzUnzip.log")
conda:
os.path.join(workflow.basedir, "envs/pigz.yaml")
threads:
resource['unzipHiC_R2']['threads']
resources:
mem_mb=resource['unzipHiC_R2']['mem_mb'],
time=resource['unzipHiC_R2']['time']
shell:
"""
pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log}
"""
def HiC_R2_unzipped(wildcards):
return noGzip_HiC_R2.loc[(wildcards.asmID), "HiC_R2"]
rule symlinkUnzippedHiC_R2:
input:
assembly=HiC_R2_unzipped,
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.fastq")),
container:
None
shell:
"""
ln -s {input} {output}
"""
rule pretext_index_PRI_asm:
input:
assemblyPRI=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"),
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123")),
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb")),
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann")),
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64")),
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac")),
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai"))
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.PRI.indexing.log")
conda:
os.path.join(workflow.basedir, "envs/pretext.yaml")
threads:
resource['pretext_index_PRI_asm']['threads']
resources:
mem_mb=resource['pretext_index_PRI_asm']['mem_mb'],
time=resource['pretext_index_PRI_asm']['time']
shell:
"""
(bwa-mem2 index {input.assemblyPRI}) &> {log}
(samtools faidx {input.assemblyPRI}) &>> {log}
"""
rule pretext_fastq2bam_R1:
input:
HiC_R1=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.fastq"),
assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"),
indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ]
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.bam"))
conda:
os.path.join(workflow.basedir, "envs/pretext.yaml")
threads:
resource['pretext_fastq2bam_R1']['threads']
resources:
mem_mb=resource['pretext_fastq2bam_R1']['mem_mb'],
time=resource['pretext_fastq2bam_R1']['time']
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.fastq2bam.R1.log")
shell:
"""
(bwa-mem2 mem -t {threads} -B8 {input.assembly} {input.HiC_R1} | samtools view -Sb - > {output}) &> {log}
"""
rule pretext_fastq2bam_R2:
input:
HiC_R2=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.fastq"),
assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"),
indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ]
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.bam"))
conda:
os.path.join(workflow.basedir, "envs/pretext.yaml")
threads:
resource['pretext_fastq2bam_R2']['threads']
resources:
mem_mb=resource['pretext_fastq2bam_R2']['mem_mb'],
time=resource['pretext_fastq2bam_R2']['time']
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.fastq2bam.R2.log")
shell:
"""
(bwa-mem2 mem -t {threads} -B8 {input.assembly} {input.HiC_R2} | samtools view -Sb - > {output}) &> {log}
"""
rule pretext_filter_5primeEnd_R1:
input:
HiC_R1_bam=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.bam"),
assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"),
indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ]
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.FILTERED.bam"))
params:
script=os.path.join(workflow.basedir, "scripts/filter_five_end.pl")
conda:
os.path.join(workflow.basedir, "envs/pretext.yaml")
threads:
resource['pretext_filter_5primeEnd_R1']['threads']
resources:
mem_mb=resource['pretext_filter_5primeEnd_R1']['mem_mb'],
time=resource['pretext_filter_5primeEnd_R1']['time']
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.filter5end.R1.log")
shell:
"""
(samtools view -h {input.HiC_R1_bam}| perl {params.script} | samtools view -@{threads} -Sb - > {output}) &> {log}
"""
rule pretext_filter_5primeEnd_R2:
input:
HiC_R2_bam=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.bam"),
assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"),
indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"),
os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ]
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.FILTERED.bam"))
params:
script=os.path.join(workflow.basedir, "scripts/filter_five_end.pl")
conda:
os.path.join(workflow.basedir, "envs/pretext.yaml")
threads:
resource['pretext_filter_5primeEnd_R2']['threads']
resources:
mem_mb=resource['pretext_filter_5primeEnd_R2']['mem_mb'],
time=resource['pretext_filter_5primeEnd_R2']['time']
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.filter5end.R2.log")
shell:
"""
(samtools view -h {input.HiC_R2_bam}| perl {params.script} | samtools view -@{threads} -Sb - > {output}) &> {log}
"""
rule pretext_filtered_combine:
input:
R1=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.FILTERED.bam"),
R2=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.FILTERED.bam")
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.bam"))
params:
script=os.path.join(workflow.basedir, "scripts/two_read_bam_combiner.pl")
conda:
os.path.join(workflow.basedir, "envs/pretext.yaml")
threads:
resource['pretext_filtered_combine']['threads']
resources:
mem_mb=resource['pretext_filtered_combine']['mem_mb'],
time=resource['pretext_filtered_combine']['time']
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.log")
shell:
"""
(perl {params.script} {input.R1} {input.R2} | samtools view -@{threads} -Sb > {output}) &>{log}
"""
rule pretext_map:
input:
HiC_alignment=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.bam")
output:
pretextFile=temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.pretext"))
conda:
os.path.join(workflow.basedir, "envs/pretext.yaml")
threads:
resource['pretext_map']['threads']
resources:
mem_mb=resource['pretext_map']['mem_mb'],
time=resource['pretext_map']['time']
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.PretextMap.log")
shell:
"""
(samtools view -h {input.HiC_alignment} | PretextMap -o {output.pretextFile} --sortby length --mapq 10) &> {log}
"""
rule pretext_snapshot:
input:
pretextFile=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.pretext")
output:
pretextSnapshotFULL=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED_FullMap.png"),
params:
outDirectory=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/")
conda:
os.path.join(workflow.basedir, "envs/pretext.yaml")
threads:
resource['pretext_snapshot']['threads']
resources:
mem_mb=resource['pretext_snapshot']['mem_mb'],
time=resource['pretext_snapshot']['time']
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.PretextSnapshot.log")
shell:
"""
(PretextSnapshot -m {input.pretextFile} -o {params.outDirectory}) &> {log}
"""
#######################################################################################################################################
def PRI_asm_gzipped(wildcards):
return yesGzip_PRI.loc[(wildcards.asmID), "PRI_asm"]
rule unzipFasta_PRI:
input:
assembly=PRI_asm_gzipped,
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta")),
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.PRI.pigzUnzip.log")
conda:
os.path.join(workflow.basedir, "envs/pigz.yaml")
threads:
resource['unzipFasta_PRI']['threads']
resources:
mem_mb=resource['unzipFasta_PRI']['mem_mb'],
time=resource['unzipFasta_PRI']['time']
shell:
"""
pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log}
"""
def PRI_asm_unzipped(wildcards):
return noGzip_PRI.loc[(wildcards.asmID), "PRI_asm"]
rule symlinkUnzippedFasta_PRI:
input:
assembly=PRI_asm_unzipped,
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta")),
container:
None
shell:
"""
ln -s {input} {output}
"""
################
def ALT_asm_gzipped(wildcards):
return yesGzip_ALT.loc[(wildcards.asmID), "ALT_asm"]
rule unzipFasta_ALT:
input:
assembly=ALT_asm_gzipped,
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.ALT.fasta")),
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.ALT.pigzUnzip.log")
conda:
os.path.join(workflow.basedir, "envs/pigz.yaml")
threads:
resource['unzipFasta_ALT']['threads']
resources:
mem_mb=resource['unzipFasta_ALT']['mem_mb'],
time=resource['unzipFasta_ALT']['time']
shell:
"""
pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log}
"""
def ALT_asm_unzipped(wildcards):
return noGzip_ALT.loc[(wildcards.asmID), "ALT_asm"]
rule symlinkUnzippedFasta_ALT:
input:
assembly=ALT_asm_unzipped,
output:
temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.ALT.fasta")),
container:
None
shell:
"""
ln -s {input} {output}
"""
####################
def altFile(wildcards):
if samples.loc[(wildcards.asmID), "ALT_present"] == True:
return os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.ALT.fasta")
else:
return os.path.join(workflow.basedir, "scripts/ALT_missing.fasta")
rule merqury:
input:
assemblyPRI=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"),
assemblyALT=altFile,
merylDB_provided=merylDB
params:
outFile= "{asmID}" + "_merqOutput",
outPath= os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT"),
symlink_merylDB=directory(os.path.join(config['Results'], "1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.meryl"))
threads:
resource['merqury']['threads']
resources:
mem_mb=resource['merqury']['mem_mb'],
time=resource['merqury']['time']
output:
os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"),
os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.completeness.stats"),
os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"),
os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"),
os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.st.png"),
os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.fl.png"),
os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.hist")
log:
os.path.join(config['Results'],"1_evaluation/{asmID}/logs/{asmID}_merqury.log")
priority:
3
conda:
os.path.join(workflow.basedir, "envs/merylMerq_2.yaml")
shell:
"""
cd {params.outPath}
export OMP_NUM_THREADS={threads}
ln -s {input.merylDB_provided} {params.symlink_merylDB}
(merqury.sh {params.symlink_merylDB} {input.assemblyPRI} {input.assemblyALT} {params.outFile}) &> {log}
"""
#######################################################################################################################################
rule busco5:
input:
assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"),
lineage = os.path.join(workflow.basedir, "buscoLineage/" + buscoDataBaseName + "_odb10"),
params:
assemblyName = "{asmID}",
chngDir = os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO")
threads:
resource['busco5']['threads']
resources:
mem_mb=resource['busco5']['mem_mb'],
time=resource['busco5']['time']
output:
os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"),
conda:
os.path.join(workflow.basedir, "envs/busco_and_assembly.yaml")
log:
os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}_busco5.log")
priority:
20
shell:
"""
cd {params.chngDir}
(busco -m genome --offline --in {input.assembly} -o {params.assemblyName} -l {input.lineage} -c {threads} -f --limit 5) &> {log}
"""
rule moveBuscoOutputs:
input:
buscoResult=os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"),
params:
rmDir= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}"),
mvRunBusco= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/run_" + buscoDataBaseName + "_odb10"),
mvRunBuscoDest= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO"),
logs= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/logs")
output:
file = os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"),
container:
None
shell:
"""
mv -t {params.mvRunBuscoDest} {params.logs}
mv -t {params.mvRunBuscoDest} {params.mvRunBusco}
mv {input.buscoResult} {output.file}
rm -r {params.rmDir}
"""
rule genomescope2:
input:
hist=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.hist"),
params:
outFolder=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/"),
name="{asmID}_k{kmer}",
kmer="{kmer}",
cpHist=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/merylDB_providedFor_{asmID}_10000.hist")
output:
summary=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"),
logPlot=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"),
linearPlot=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"),
estimatedSize=temp(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_sizeEst.txt"))
conda:
os.path.join(workflow.basedir, "envs/genomescope.yaml")
log:
os.path.join(config['Results'],"1_evaluation/{asmID}/logs/{asmID}_k{kmer}_gscopelog.txt")
threads:
resource['genomescope2']['threads']
resources:
mem_mb=resource['genomescope2']['mem_mb'],
time=resource['genomescope2']['time']
shell:
"""
head -n 10000 {input.hist} > {params.cpHist}
(genomescope2 -i {params.cpHist} -o {params.outFolder} -k {params.kmer} -n {params.name}) &> {log}
grep "Genome Haploid Length" {output.summary} | awk {{'print $6'}} | sed 's/,//g'> {output.estimatedSize}
"""
rule assemblyStats:
input:
assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"),
estGenome=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_sizeEst.txt"), asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]),
output:
scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_scaffold_stats.tsv"),
contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_contig_stats.tsv")
params:
script = os.path.join(workflow.basedir, "scripts/stats.py"),
path = os.path.join(config['Results'], "1_evaluation/{asmID}/02_assemblyStats/"),
filename="{asmID}",
given_size=lambda wildcards: expand("{genomeSize}", genomeSize=dictSamples[wildcards.asmID][4])
conda:
os.path.join(workflow.basedir, "envs/python_scripts.yaml")
threads:
resource['assemblyStats']['threads']
resources:
mem_mb=resource['assemblyStats']['mem_mb'],
time=resource['assemblyStats']['time']
shell:
"""
python {params.script} {input.assembly} {input.estGenome} {params.filename} {params.given_size} {output.scaffStats} {output.contStats}
"""
rule saveConfiguration_and_getKeyValues_kmer:
input:
gscopeSum=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"),
gscopeLog=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"),
gscopeLin=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"),
params:
keyValues=os.path.join(config['Results'],"1_evaluation/{asmID}/01E_keyResults/results_withoutRowNames.tsv")
output:
gscopeSum=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_summary.txt"),
gscopeLog=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_log_plot.png"),
gscopeLin=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_linear_plot.png")
container:
None
shell:
"""
cp {input.gscopeSum} {output.gscopeSum}
cp {input.gscopeLog} {output.gscopeLog}
cp {input.gscopeLin} {output.gscopeLin}
"""
#
rule saveConfiguration_and_getKeyValues:
input:
gscopeSum=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"), asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]),
gscopeLog=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"), asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]),
gscopeLin=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"), asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]),
busco=os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"),
qv=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"),
completeness=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.completeness.stats"),
spectraStacked_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"),
spectraFlat_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"),
spectraStacked_both=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.st.png"),
spectraFlat_both=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.fl.png"),
scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_scaffold_stats.tsv"),
contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_contig_stats.tsv"),
params:
resultsPath=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults"),
keyValues2=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/results_withoutRowNames_2.tsv"),
keyValues=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/results_withoutRowNames.tsv"),
rowNames=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/rowNames.tsv")
output:
keyValuesWithRowNames=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"),
busco=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"),
buscoScores=temp(os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/only_buscoScores_{asmID}.txt")),
qv=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.qv"),
completeness=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.completeness.stats"),
spectraStacked_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"),
spectraFlat_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"),
spectraStacked_both=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.st.png"),
spectraFlat_both=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.fl.png"),
scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_scaffold_stats.tsv"),
contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_contig_stats.tsv"),
container:
None
shell:
"""
head -n 15 {input.busco} | tail -n 7 | sed "s/^['\t']*//" > {output.buscoScores}
cp {input.busco} {output.busco}
cp {input.completeness} {output.completeness}
cp {input.qv} {output.qv}
cp {input.spectraStacked_PRI} {output.spectraStacked_PRI}
cp {input.spectraFlat_PRI} {output.spectraFlat_PRI}
cp {input.spectraFlat_both} {output.spectraFlat_both}
cp {input.spectraStacked_both} {output.spectraStacked_both}
cp {input.scaffStats} {output.scaffStats}
cp {input.contStats} {output.contStats}
echo "{wildcards.asmID}" >> {params.keyValues}
echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues}
echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}})" >> {params.keyValues}
echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues}
echo "$(grep 'gc_content' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues}
echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues}
echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues}
echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues}
echo "$(grep 'longest (bp)' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues}
echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues}
echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues}
echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues}
echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues}
echo "$(grep 'longest (bp)' {input.contStats} | awk {{'print $3'}})" >> {params.keyValues}
echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues}
echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues}
echo "$(grep 'N95' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues}
echo "$(grep 'NG95' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues}
echo "$(awk {{'print $4'}} {input.qv})" | head -n 1 >> {params.keyValues}
echo "$(awk {{'print $5'}} {input.completeness})" | head -n 1 >> {params.keyValues}
echo "$(grep 'C:' {input.busco} | awk -F'[:\\[,]' {{'print $2'}})" >> {params.keyValues}
echo "$(grep 'C:' {input.busco} | awk -F'[:\\[,]' {{'print $4'}})" >> {params.keyValues}
echo -e "ASM_ID\nBases\nEst_Size\nHet_%\nGC_%\nScaff\nCont\nGaps\nLongest_Scaff\nScaff_N50\nScaff_NG50\nScaff_N95\nScaff_NG95\nLongest_Cont\nCont_N50\nCont_NG50\nCont_N95\nCont_NG95\nQV\nCompleteness\nComp_BUSCOs_%\nComp_Single_BUSCOs_%" > {params.rowNames}
paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames}
rm {params.rowNames} {params.keyValues}
"""
rule aggregateAllAssemblies:
input:
allResults=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(dictSamples.keys())),
sampleSheet= config['samplesTSV'],
config=os.path.join(workflow.basedir, "configuration/config.yaml")
output:
results=os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"),
newSampleSheet=os.path.join(config['Results'],"1_evaluation/finalResults/savedSampleSheet.tsv"),
newConfigFile=os.path.join(config['Results'],"1_evaluation/finalResults/savedConfig.yaml")
container:
None
shell:
"""
cp {input.sampleSheet} {output.newSampleSheet}
cp {input.config} {output.newConfigFile}
echo -e "ASM_ID\nBases_Mb\nEst_Size_Mb\nHet_%\nGC_%\nScaff\nCont\nGaps\nLongest_Scaff_Mb\nScaff_N50_Mb\nScaff_NG50_Mb\nScaff_N95_Mb\nScaff_NG95_Mb\nLongest_Cont\nCont_N50_Mb\nCont_NG50_Mb\nCont_N95_Mb\nCont_NG95_Mb\nQV\nCompleteness\nComp_BUSCOs_%\nComp_Single_BUSCOs_%" | \
paste -d'\t' - {input.allResults} | \
awk -F'\t' '{{s="";for (i=1;i<=NF;i+=2) {{s=s?s FS $i:$i}} print s}}' | \
column -t > {output.results}
sed -i 's/,//g' {output.results}
"""
rule makeReport:
input:
os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"),
lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_log_plot.png"),asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]),
lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_linear_plot.png"),asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]),
os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.qv"),
os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.completeness.stats"),
os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/only_buscoScores_{asmID}.txt"),
os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"),
os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.fl.png")
conda:
os.path.join(workflow.basedir, "envs/python_scripts.yaml")
params:
"{asmID}",
"{kmer}",
config['busco5Lineage']
output:
os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_markdownForReport.md")
script:
os.path.join(workflow.basedir, "scripts/makePDF_indivMD.py")
rule pretextMaps2md:
input:
PretextMap=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED_FullMap.png")
output:
pretextCP2keyResults=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}.pretext_hiC_FullMap.png"),
IndividualPretextMD=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_FullPRETEXTMarkdown.md")
params:
assemblyName='{asmID}'
conda:
os.path.join(workflow.basedir, "envs/python_scripts.yaml")
script:
os.path.join(workflow.basedir, "scripts/pretextMapsToMarkdown.py")
rule addFullTable:
input:
results=os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv")
output:
os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown.md")
conda:
os.path.join(workflow.basedir, "envs/python_scripts.yaml")
script:
os.path.join(workflow.basedir, "scripts/addFullTableForReport.py")
rule aggregateReport:
input:
indivMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_markdownForReport.md"), asmID=key, kmer=value4) for key, [value1, value2, value3, value4, value5, value6, value7, value8, value9, value10, value11, value12, value13, value14, value15] in dictSamples.items()],
landingPage=os.path.join(workflow.basedir, "scripts/reportLandingPage.md"),
landingPageTABLE=os.path.join(workflow.basedir, "scripts/reportTABLELandingPage.md"),
endTableMD=os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown.md"),
IndividualPretextMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_FullPRETEXTMarkdown.md"), asmID=list(testDictPRETEXT.keys()))]
output:
FullMarkdown=os.path.join(config['Results'],"1_evaluation/finalResults/FullMarkdown.md"),
endTableMD=os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown_wPreamble.md")
container:
None
shell:
"""
cat {input.landingPage} {input.indivMD} {input.IndividualPretextMD} >> {output.FullMarkdown}
cat {input.landingPageTABLE} {input.endTableMD} >> {output.endTableMD}
"""
rule makePDF:
input:
md_report=os.path.join(config['Results'],"1_evaluation/finalResults/FullMarkdown.md"),
md_comparison_table=os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown_wPreamble.md")
output:
pdf_report=os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf"),
pdf_comparison_table=os.path.join(config['Results'],"1_evaluation/finalResults/FULL_TABLE_PDF.pdf")
log:
os.path.join(config['Results'], "1_evaluation/logs/MAKEPDF_pandoc.log")
conda:
os.path.join(workflow.basedir, "envs/python_scripts.yaml")
threads:
resource['makePDF']['threads']
resources:
mem_mb=resource['makePDF']['mem_mb'],
time=resource['makePDF']['time']
shell:
"""
(pandoc -o {output.pdf_report} {input.md_report} --pdf-engine=tectonic) &>> {log}
(pandoc -o {output.pdf_comparison_table} {input.md_comparison_table} --pdf-engine=tectonic) &>> {log}
"""
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment