diff --git a/rules/evaluate.smk b/rules/evaluate.smk new file mode 100644 index 0000000000000000000000000000000000000000..18f8313d2d0d77fbb8f60b899f9fb8b5972c1efb --- /dev/null +++ b/rules/evaluate.smk @@ -0,0 +1,774 @@ +args_o=os.path.join(workflow.basedir, "buscoLineage") +args_l=config['busco5Lineage'] +checkLineagePath=args_o + "/" + args_l + "_odb10" +checkLineageDled=args_o + "/" + args_l +outDir=args_o + + +if os.path.isdir(args_l) is True: +# subprocess.call("ln -sf %s %s"%(args.l, args.l), shell=True) + buscoDataBaseName=os.path.basename(args_l) + buscoDataBaseName=buscoDataBaseName[:-6] +# print("Lineage path given, basename is:", buscoDataBaseName) +elif os.path.isdir(args_l) is False and os.path.isdir(checkLineagePath) is True: +# subprocess.call("ln -sf %s %s"%(checkLineagePath, checkLineageDled), shell=True) + buscoDataBaseName=os.path.basename(checkLineagePath) + buscoDataBaseName=buscoDataBaseName[:-6] +# print("Database already in buscoLineage directory, basename is:", buscoDataBaseName) +else: +# print("Database will be downloaded") + url = "https://busco-data.ezlab.org/v5/data/lineages/" + html = urlopen(url).read() + soup = BeautifulSoup(html, features="html.parser") +# kill all script and style elements + for script in soup(["script", "style"]): + script.extract() # rip it out +# get text + text = soup.get_text() +# break into lines and remove leading and trailing space on each + lines = (line.strip() for line in text.splitlines()) + linID=None +#identify the lineage file + for line in text.splitlines(): + if line.startswith(args_l): +# print(line) + linID=line.split(" ")[0] +# print(linID) + break + if not linID==None: + linLink=url+linID +# print(linLink) + print('Downloading Busco Database:', args_l) + subprocess.run("wget -q -P %s %s"%(outDir, linLink), shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + subprocess.run("tar -xvf %s*.tar.gz -C %s"%(args_o + "/" + args_l, outDir), shell=True, check=True,stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + subprocess.run("rm %s_*.tar.gz"%(checkLineageDled), shell=True, check=True) + buscoDataBaseName=args_l + print('Downloading Busco Database:', args_l, " - COMPLETE") + else: + raise ValueError("Error - could not identify lineage please check busco site for a correct prefix") + + +def merylDB(wildcards): + return samples.loc[(wildcards.asmID), "merylDB"] + + +localrules: symlinkUnzippedHiC_R1, symlinkUnzippedHiC_R2, symlinkUnzippedFasta_PRI, symlinkUnzippedFasta_ALT, moveBuscoOutputs, saveConfiguration_and_getKeyValues_kmer, saveConfiguration_and_getKeyValues, aggregateAllAssemblies, makeReport, pretextMaps2md, addFullTable, aggregateReport + +def HiC_R1_gzipped(wildcards): + return yesGzip_HiC_R1.loc[(wildcards.asmID), "HiC_R1"] + +rule unzipHiC_R1: + input: + assembly=HiC_R1_gzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.fastq")), + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.R1.pigzUnzip.log") + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + threads: + resource['unzipHiC_R1']['threads'] + resources: + mem_mb=resource['unzipHiC_R1']['mem_mb'], + time=resource['unzipHiC_R1']['time'], + shell: + """ + pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log} + """ + +def HiC_R1_unzipped(wildcards): + return noGzip_HiC_R1.loc[(wildcards.asmID), "HiC_R1"] + +rule symlinkUnzippedHiC_R1: + input: + assembly=HiC_R1_unzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.fastq")), + container: + None + shell: + """ + ln -s {input} {output} + """ + + +def HiC_R2_gzipped(wildcards): + return yesGzip_HiC_R2.loc[(wildcards.asmID), "HiC_R2"] + +rule unzipHiC_R2: + input: + assembly=HiC_R2_gzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.fastq")), + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.R2.pigzUnzip.log") + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + threads: + resource['unzipHiC_R2']['threads'] + resources: + mem_mb=resource['unzipHiC_R2']['mem_mb'], + time=resource['unzipHiC_R2']['time'] + shell: + """ + pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log} + """ + +def HiC_R2_unzipped(wildcards): + return noGzip_HiC_R2.loc[(wildcards.asmID), "HiC_R2"] + +rule symlinkUnzippedHiC_R2: + input: + assembly=HiC_R2_unzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.fastq")), + container: + None + shell: + """ + ln -s {input} {output} + """ + + + + + +rule pretext_index_PRI_asm: + input: + assemblyPRI=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123")), + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb")), + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann")), + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64")), + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac")), + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai")) + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.PRI.indexing.log") + conda: + os.path.join(workflow.basedir, "envs/pretext.yaml") + threads: + resource['pretext_index_PRI_asm']['threads'] + resources: + mem_mb=resource['pretext_index_PRI_asm']['mem_mb'], + time=resource['pretext_index_PRI_asm']['time'] + shell: + """ + (bwa-mem2 index {input.assemblyPRI}) &> {log} + (samtools faidx {input.assemblyPRI}) &>> {log} + """ + +rule pretext_fastq2bam_R1: + input: + HiC_R1=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.fastq"), + assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ] + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.bam")) + conda: + os.path.join(workflow.basedir, "envs/pretext.yaml") + threads: + resource['pretext_fastq2bam_R1']['threads'] + resources: + mem_mb=resource['pretext_fastq2bam_R1']['mem_mb'], + time=resource['pretext_fastq2bam_R1']['time'] + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.fastq2bam.R1.log") + shell: + """ + (bwa-mem2 mem -t {threads} -B8 {input.assembly} {input.HiC_R1} | samtools view -Sb - > {output}) &> {log} + """ + +rule pretext_fastq2bam_R2: + input: + HiC_R2=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.fastq"), + assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ] + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.bam")) + conda: + os.path.join(workflow.basedir, "envs/pretext.yaml") + threads: + resource['pretext_fastq2bam_R2']['threads'] + resources: + mem_mb=resource['pretext_fastq2bam_R2']['mem_mb'], + time=resource['pretext_fastq2bam_R2']['time'] + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.fastq2bam.R2.log") + shell: + """ + (bwa-mem2 mem -t {threads} -B8 {input.assembly} {input.HiC_R2} | samtools view -Sb - > {output}) &> {log} + """ + +rule pretext_filter_5primeEnd_R1: + input: + HiC_R1_bam=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.bam"), + assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ] + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.FILTERED.bam")) + params: + script=os.path.join(workflow.basedir, "scripts/filter_five_end.pl") + conda: + os.path.join(workflow.basedir, "envs/pretext.yaml") + threads: + resource['pretext_filter_5primeEnd_R1']['threads'] + resources: + mem_mb=resource['pretext_filter_5primeEnd_R1']['mem_mb'], + time=resource['pretext_filter_5primeEnd_R1']['time'] + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.filter5end.R1.log") + shell: + """ + (samtools view -h {input.HiC_R1_bam}| perl {params.script} | samtools view -@{threads} -Sb - > {output}) &> {log} + """ + +rule pretext_filter_5primeEnd_R2: + input: + HiC_R2_bam=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.bam"), + assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ] + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.FILTERED.bam")) + params: + script=os.path.join(workflow.basedir, "scripts/filter_five_end.pl") + conda: + os.path.join(workflow.basedir, "envs/pretext.yaml") + threads: + resource['pretext_filter_5primeEnd_R2']['threads'] + resources: + mem_mb=resource['pretext_filter_5primeEnd_R2']['mem_mb'], + time=resource['pretext_filter_5primeEnd_R2']['time'] + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.filter5end.R2.log") + shell: + """ + (samtools view -h {input.HiC_R2_bam}| perl {params.script} | samtools view -@{threads} -Sb - > {output}) &> {log} + """ + + +rule pretext_filtered_combine: + input: + R1=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.FILTERED.bam"), + R2=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.FILTERED.bam") + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.bam")) + params: + script=os.path.join(workflow.basedir, "scripts/two_read_bam_combiner.pl") + conda: + os.path.join(workflow.basedir, "envs/pretext.yaml") + threads: + resource['pretext_filtered_combine']['threads'] + resources: + mem_mb=resource['pretext_filtered_combine']['mem_mb'], + time=resource['pretext_filtered_combine']['time'] + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.log") + shell: + """ + (perl {params.script} {input.R1} {input.R2} | samtools view -@{threads} -Sb > {output}) &>{log} + """ + +rule pretext_map: + input: + HiC_alignment=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.bam") + output: + pretextFile=temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.pretext")) + conda: + os.path.join(workflow.basedir, "envs/pretext.yaml") + threads: + resource['pretext_map']['threads'] + resources: + mem_mb=resource['pretext_map']['mem_mb'], + time=resource['pretext_map']['time'] + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.PretextMap.log") + shell: + """ + (samtools view -h {input.HiC_alignment} | PretextMap -o {output.pretextFile} --sortby length --mapq 10) &> {log} + """ + +rule pretext_snapshot: + input: + pretextFile=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.pretext") + output: + pretextSnapshotFULL=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED_FullMap.png"), + params: + outDirectory=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/") + conda: + os.path.join(workflow.basedir, "envs/pretext.yaml") + threads: + resource['pretext_snapshot']['threads'] + resources: + mem_mb=resource['pretext_snapshot']['mem_mb'], + time=resource['pretext_snapshot']['time'] + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.PretextSnapshot.log") + shell: + """ + (PretextSnapshot -m {input.pretextFile} -o {params.outDirectory}) &> {log} + """ + +####################################################################################################################################### + +def PRI_asm_gzipped(wildcards): + return yesGzip_PRI.loc[(wildcards.asmID), "PRI_asm"] + + +rule unzipFasta_PRI: + input: + assembly=PRI_asm_gzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta")), + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.PRI.pigzUnzip.log") + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + threads: + resource['unzipFasta_PRI']['threads'] + resources: + mem_mb=resource['unzipFasta_PRI']['mem_mb'], + time=resource['unzipFasta_PRI']['time'] + shell: + """ + pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log} + """ + +def PRI_asm_unzipped(wildcards): + return noGzip_PRI.loc[(wildcards.asmID), "PRI_asm"] + +rule symlinkUnzippedFasta_PRI: + input: + assembly=PRI_asm_unzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta")), + container: + None + shell: + """ + ln -s {input} {output} + """ + +################ + +def ALT_asm_gzipped(wildcards): + return yesGzip_ALT.loc[(wildcards.asmID), "ALT_asm"] + +rule unzipFasta_ALT: + input: + assembly=ALT_asm_gzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.ALT.fasta")), + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.ALT.pigzUnzip.log") + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + threads: + resource['unzipFasta_ALT']['threads'] + resources: + mem_mb=resource['unzipFasta_ALT']['mem_mb'], + time=resource['unzipFasta_ALT']['time'] + shell: + """ + pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log} + """ + + +def ALT_asm_unzipped(wildcards): + return noGzip_ALT.loc[(wildcards.asmID), "ALT_asm"] + +rule symlinkUnzippedFasta_ALT: + input: + assembly=ALT_asm_unzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.ALT.fasta")), + container: + None + shell: + """ + ln -s {input} {output} + """ + + +#################### + + +def altFile(wildcards): + if samples.loc[(wildcards.asmID), "ALT_present"] == True: + return os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.ALT.fasta") + else: + return os.path.join(workflow.basedir, "scripts/ALT_missing.fasta") + + +rule merqury: + input: + assemblyPRI=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + assemblyALT=altFile, + merylDB_provided=merylDB + params: + outFile= "{asmID}" + "_merqOutput", + outPath= os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT"), + symlink_merylDB=directory(os.path.join(config['Results'], "1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.meryl")) + threads: + resource['merqury']['threads'] + resources: + mem_mb=resource['merqury']['mem_mb'], + time=resource['merqury']['time'] + output: + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.completeness.stats"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.st.png"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.fl.png"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.hist") + log: + os.path.join(config['Results'],"1_evaluation/{asmID}/logs/{asmID}_merqury.log") + priority: + 3 + conda: + os.path.join(workflow.basedir, "envs/merylMerq_2.yaml") + shell: + """ + cd {params.outPath} + export OMP_NUM_THREADS={threads} + ln -s {input.merylDB_provided} {params.symlink_merylDB} + (merqury.sh {params.symlink_merylDB} {input.assemblyPRI} {input.assemblyALT} {params.outFile}) &> {log} + """ + + +####################################################################################################################################### + + + +rule busco5: + input: + assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + lineage = os.path.join(workflow.basedir, "buscoLineage/" + buscoDataBaseName + "_odb10"), + params: + assemblyName = "{asmID}", + chngDir = os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO") + threads: + resource['busco5']['threads'] + resources: + mem_mb=resource['busco5']['mem_mb'], + time=resource['busco5']['time'] + output: + os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), + conda: + os.path.join(workflow.basedir, "envs/busco_and_assembly.yaml") + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}_busco5.log") + priority: + 20 + shell: + """ + cd {params.chngDir} + (busco -m genome --offline --in {input.assembly} -o {params.assemblyName} -l {input.lineage} -c {threads} -f --limit 5) &> {log} + """ + + +rule moveBuscoOutputs: + input: + buscoResult=os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), + params: + rmDir= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}"), + mvRunBusco= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/run_" + buscoDataBaseName + "_odb10"), + mvRunBuscoDest= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO"), + logs= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/logs") + output: + file = os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), + container: + None + shell: + """ + mv -t {params.mvRunBuscoDest} {params.logs} + mv -t {params.mvRunBuscoDest} {params.mvRunBusco} + mv {input.buscoResult} {output.file} + rm -r {params.rmDir} + """ + + + + + +rule genomescope2: + input: + hist=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.hist"), + params: + outFolder=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/"), + name="{asmID}_k{kmer}", + kmer="{kmer}", + cpHist=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/merylDB_providedFor_{asmID}_10000.hist") + output: + summary=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"), + logPlot=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"), + linearPlot=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"), + estimatedSize=temp(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_sizeEst.txt")) + conda: + os.path.join(workflow.basedir, "envs/genomescope.yaml") + log: + os.path.join(config['Results'],"1_evaluation/{asmID}/logs/{asmID}_k{kmer}_gscopelog.txt") + threads: + resource['genomescope2']['threads'] + resources: + mem_mb=resource['genomescope2']['mem_mb'], + time=resource['genomescope2']['time'] + shell: + """ + head -n 10000 {input.hist} > {params.cpHist} + (genomescope2 -i {params.cpHist} -o {params.outFolder} -k {params.kmer} -n {params.name}) &> {log} + grep "Genome Haploid Length" {output.summary} | awk {{'print $6'}} | sed 's/,//g'> {output.estimatedSize} + """ + +rule assemblyStats: + input: + assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + estGenome=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_sizeEst.txt"), asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]), + output: + scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_scaffold_stats.tsv"), + contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_contig_stats.tsv") + params: + script = os.path.join(workflow.basedir, "scripts/stats.py"), + path = os.path.join(config['Results'], "1_evaluation/{asmID}/02_assemblyStats/"), + filename="{asmID}", + given_size=lambda wildcards: expand("{genomeSize}", genomeSize=dictSamples[wildcards.asmID][4]) + conda: + os.path.join(workflow.basedir, "envs/python_scripts.yaml") + threads: + resource['assemblyStats']['threads'] + resources: + mem_mb=resource['assemblyStats']['mem_mb'], + time=resource['assemblyStats']['time'] + shell: + """ + python {params.script} {input.assembly} {input.estGenome} {params.filename} {params.given_size} {output.scaffStats} {output.contStats} + """ + + + + +rule saveConfiguration_and_getKeyValues_kmer: + input: + gscopeSum=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"), + gscopeLog=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"), + gscopeLin=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"), + params: + keyValues=os.path.join(config['Results'],"1_evaluation/{asmID}/01E_keyResults/results_withoutRowNames.tsv") + output: + gscopeSum=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_summary.txt"), + gscopeLog=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_log_plot.png"), + gscopeLin=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_linear_plot.png") + container: + None + shell: + """ + cp {input.gscopeSum} {output.gscopeSum} + cp {input.gscopeLog} {output.gscopeLog} + cp {input.gscopeLin} {output.gscopeLin} + """ +# + + + +rule saveConfiguration_and_getKeyValues: + input: + gscopeSum=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"), asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]), + gscopeLog=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"), asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]), + gscopeLin=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"), asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]), + busco=os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), + qv=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"), + completeness=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.completeness.stats"), + spectraStacked_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"), + spectraFlat_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), + spectraStacked_both=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.st.png"), + spectraFlat_both=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.fl.png"), + scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_scaffold_stats.tsv"), + contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_contig_stats.tsv"), + params: + resultsPath=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults"), + keyValues2=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/results_withoutRowNames_2.tsv"), + keyValues=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/results_withoutRowNames.tsv"), + rowNames=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/rowNames.tsv") + output: + keyValuesWithRowNames=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), + busco=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), + buscoScores=temp(os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/only_buscoScores_{asmID}.txt")), + qv=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.qv"), + completeness=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.completeness.stats"), + spectraStacked_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"), + spectraFlat_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), + spectraStacked_both=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.st.png"), + spectraFlat_both=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.fl.png"), + scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_scaffold_stats.tsv"), + contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_contig_stats.tsv"), + container: + None + shell: + """ + head -n 15 {input.busco} | tail -n 7 | sed "s/^['\t']*//" > {output.buscoScores} + cp {input.busco} {output.busco} + cp {input.completeness} {output.completeness} + cp {input.qv} {output.qv} + cp {input.spectraStacked_PRI} {output.spectraStacked_PRI} + cp {input.spectraFlat_PRI} {output.spectraFlat_PRI} + cp {input.spectraFlat_both} {output.spectraFlat_both} + cp {input.spectraStacked_both} {output.spectraStacked_both} + cp {input.scaffStats} {output.scaffStats} + cp {input.contStats} {output.contStats} + echo "{wildcards.asmID}" >> {params.keyValues} + echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} + echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}})" >> {params.keyValues} + echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'gc_content' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} + echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'longest (bp)' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} + echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'longest (bp)' {input.contStats} | awk {{'print $3'}})" >> {params.keyValues} + echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'N95' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'NG95' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues} + echo "$(awk {{'print $4'}} {input.qv})" | head -n 1 >> {params.keyValues} + echo "$(awk {{'print $5'}} {input.completeness})" | head -n 1 >> {params.keyValues} + echo "$(grep 'C:' {input.busco} | awk -F'[:\\[,]' {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'C:' {input.busco} | awk -F'[:\\[,]' {{'print $4'}})" >> {params.keyValues} + echo -e "ASM_ID\nBases\nEst_Size\nHet_%\nGC_%\nScaff\nCont\nGaps\nLongest_Scaff\nScaff_N50\nScaff_NG50\nScaff_N95\nScaff_NG95\nLongest_Cont\nCont_N50\nCont_NG50\nCont_N95\nCont_NG95\nQV\nCompleteness\nComp_BUSCOs_%\nComp_Single_BUSCOs_%" > {params.rowNames} + paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames} + rm {params.rowNames} {params.keyValues} + """ + +rule aggregateAllAssemblies: + input: + allResults=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(dictSamples.keys())), + sampleSheet= config['samplesTSV'], + config=os.path.join(workflow.basedir, "configuration/config.yaml") + output: + results=os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), + newSampleSheet=os.path.join(config['Results'],"1_evaluation/finalResults/savedSampleSheet.tsv"), + newConfigFile=os.path.join(config['Results'],"1_evaluation/finalResults/savedConfig.yaml") + container: + None + shell: + """ + cp {input.sampleSheet} {output.newSampleSheet} + cp {input.config} {output.newConfigFile} + echo -e "ASM_ID\nBases_Mb\nEst_Size_Mb\nHet_%\nGC_%\nScaff\nCont\nGaps\nLongest_Scaff_Mb\nScaff_N50_Mb\nScaff_NG50_Mb\nScaff_N95_Mb\nScaff_NG95_Mb\nLongest_Cont\nCont_N50_Mb\nCont_NG50_Mb\nCont_N95_Mb\nCont_NG95_Mb\nQV\nCompleteness\nComp_BUSCOs_%\nComp_Single_BUSCOs_%" | \ + paste -d'\t' - {input.allResults} | \ + awk -F'\t' '{{s="";for (i=1;i<=NF;i+=2) {{s=s?s FS $i:$i}} print s}}' | \ + column -t > {output.results} + sed -i 's/,//g' {output.results} + """ + +rule makeReport: + input: + os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), + lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_log_plot.png"),asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]), + lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_linear_plot.png"),asmID=wildcards.asmID, kmer=dictSamples[wildcards.asmID][3]), + os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.qv"), + os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.completeness.stats"), + os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/only_buscoScores_{asmID}.txt"), + os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), + os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.fl.png") + conda: + os.path.join(workflow.basedir, "envs/python_scripts.yaml") + params: + "{asmID}", + "{kmer}", + config['busco5Lineage'] + output: + os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_markdownForReport.md") + script: + os.path.join(workflow.basedir, "scripts/makePDF_indivMD.py") + + + +rule pretextMaps2md: + input: + PretextMap=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED_FullMap.png") + output: + pretextCP2keyResults=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}.pretext_hiC_FullMap.png"), + IndividualPretextMD=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_FullPRETEXTMarkdown.md") + params: + assemblyName='{asmID}' + conda: + os.path.join(workflow.basedir, "envs/python_scripts.yaml") + script: + os.path.join(workflow.basedir, "scripts/pretextMapsToMarkdown.py") + +rule addFullTable: + input: + results=os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv") + output: + os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown.md") + conda: + os.path.join(workflow.basedir, "envs/python_scripts.yaml") + script: + os.path.join(workflow.basedir, "scripts/addFullTableForReport.py") + +rule aggregateReport: + input: + indivMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_markdownForReport.md"), asmID=key, kmer=value4) for key, [value1, value2, value3, value4, value5, value6, value7, value8, value9, value10, value11, value12, value13, value14, value15] in dictSamples.items()], + landingPage=os.path.join(workflow.basedir, "scripts/reportLandingPage.md"), + landingPageTABLE=os.path.join(workflow.basedir, "scripts/reportTABLELandingPage.md"), + endTableMD=os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown.md"), + IndividualPretextMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_FullPRETEXTMarkdown.md"), asmID=list(testDictPRETEXT.keys()))] + output: + FullMarkdown=os.path.join(config['Results'],"1_evaluation/finalResults/FullMarkdown.md"), + endTableMD=os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown_wPreamble.md") + container: + None + shell: + """ + cat {input.landingPage} {input.indivMD} {input.IndividualPretextMD} >> {output.FullMarkdown} + cat {input.landingPageTABLE} {input.endTableMD} >> {output.endTableMD} + """ + + + +rule makePDF: + input: + md_report=os.path.join(config['Results'],"1_evaluation/finalResults/FullMarkdown.md"), + md_comparison_table=os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown_wPreamble.md") + output: + pdf_report=os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf"), + pdf_comparison_table=os.path.join(config['Results'],"1_evaluation/finalResults/FULL_TABLE_PDF.pdf") + log: + os.path.join(config['Results'], "1_evaluation/logs/MAKEPDF_pandoc.log") + conda: + os.path.join(workflow.basedir, "envs/python_scripts.yaml") + threads: + resource['makePDF']['threads'] + resources: + mem_mb=resource['makePDF']['mem_mb'], + time=resource['makePDF']['time'] + shell: + """ + (pandoc -o {output.pdf_report} {input.md_report} --pdf-engine=tectonic) &>> {log} + (pandoc -o {output.pdf_comparison_table} {input.md_comparison_table} --pdf-engine=tectonic) &>> {log} + """