diff --git a/Snakefile b/Snakefile index 7ed7d9f05d55526518594587f8d339ee91c583c2..397ee4cbc662e61897642c803e7c38c5e5f92d59 100644 --- a/Snakefile +++ b/Snakefile @@ -36,7 +36,7 @@ def to_be_smrtTrimmed(userAnswer): samples = pd.read_csv(config['samplesTSV'], dtype=str, index_col=False, delim_whitespace=True, skip_blank_lines=True) if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'trimAdapters', 'fastQC']).issubset(samples.columns): - whichRule = "rules/illumina.smk" + whichRule = "rules/illumina_05_11_21.smk" samples=samples.reset_index() samples['readCounter'] = samples.groupby(['sample']).cumcount()+1 samples['readCounter'] = samples['readCounter'].astype(str) @@ -103,7 +103,7 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri ruleAll=[expand(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/{sample}_illuminaDb.{trim10x}.{trimAdapters}.{kmer}.meryl"), sample=key, kmer=value1, trim10x=value2, trimAdapters=value3) for key, [value1, value2, value3] in testDict.items()] elif set(['sample', 'hifi_reads', 'meryl_kmer_size' ,'trimSMRTbell', 'fastQC']).issubset(samples.columns): - whichRule = "rules/hifi.smk" + whichRule = "rules/hifi_05_11_21.smk" samples=samples.reset_index() samples['readCounter'] = samples.groupby(['sample']).cumcount()+1 samples['readCounter'] = samples['readCounter'].astype(str) @@ -171,10 +171,10 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size' ,'trimSMRTbell', 'fastQC']). ruleAllQCFiles=[] if samples['fastQC'].str.contains('True').any(): - ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifi_reads/05_multiqc/{sample}.multiqcReport.html"), sample=key, smrtornot=value1) for key, [value1, value2] in testDictQC.items()] - ruleAll=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifi_reads/03_merylDb/complete_hifi_{sample}_dB.{kmer}.meryl"), sample=key, kmer=value1, smrtornot=value2) for key, [value1, value2] in testDict.items()] + ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/{sample}.{smrtornot}.multiqcReport.html"), sample=key, smrtornot=value1) for key, [value1, value2] in testDictQC.items()] + ruleAll=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/03_merylDb/complete_hifi_{sample}_dB.{smrtornot}.{kmer}.meryl"), sample=key, kmer=value1, smrtornot=value2) for key, [value1, value2] in testDict.items()] elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize']).issubset(samples.columns): - whichRule = "rules/run.smk" + whichRule = "rules/run_05_11_21.smk" samples['genomeSize'] = samples['genomeSize'].fillna(0) samples["genomeSize"] = pd.to_numeric(samples["genomeSize"]) diff --git a/installGEP.yaml b/installGEP.yaml index 4f1f395ed78224b335e351e419531fb6ffb9f2b1..55edd6f1d557f89b018c09ebf69bdf2ca328f2e8 100644 --- a/installGEP.yaml +++ b/installGEP.yaml @@ -9,3 +9,4 @@ dependencies: - tabulate=0.8.7 - beautifulsoup4=4.9 - mamba=0.15.2 + - pandoc=2.16.1 diff --git a/rules/hifi_05_11_21.smk b/rules/hifi_05_11_21.smk index 83bd0aee4e3673384b178f35d3d6d00224a2bb76..2648a562c8590bc90ae552c5a2ef17a1bb2d3670 100644 --- a/rules/hifi_05_11_21.smk +++ b/rules/hifi_05_11_21.smk @@ -33,7 +33,7 @@ rule unzipHifi: input: fastq=hifi_gzipped, output: - temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.{smrtornot}.fastq")), + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.{smrtornot}.fastq"), log: os.path.join(config['Results'], "0_buildDatabases/{sample}/logs/hifiReads/{readCounter}.{smrtornot}_pigzUnzip.log") conda: @@ -47,7 +47,7 @@ rule symlinkUnzippedHifi: input: fastq=hifi_notgzipped, output: - temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.{smrtornot}.fastq")), + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.{smrtornot}.fastq"), log: os.path.join(config['Results'], "0_buildDatabases/{sample}/logs/hifiReads/{readCounter}.{smrtornot}_pigzUnzip.log") shell: @@ -98,7 +98,7 @@ rule symlinkfornotSmartTrimmed: input: os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.notsmrtTrimmed.fastq"), output: - outputFile=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.notsmrtTrimmed.fastq")) + outputFile=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.notsmrtTrimmed.fastq") shell: """ ln -s {input} {output.outputFile} @@ -110,25 +110,28 @@ rule fastqc_hifi_SMRT_removed: params: folder2out=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/") output: - os.path.join(config['Results'],"0_buildDatabases/{sample}/hifi_reads/05_multiqc/{readCounter}.{smrtornot}_fastqc.html") + os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/{readCounter}.{smrtornot}_fastqc.html") threads: 1 log: - os.path.join(config['Results'], "0_buildDatabases/{sample}/hifi_reads/logs/{readCounter}.{smrtornot}.FastQC.log") + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/{readCounter}.{smrtornot}.FastQC.log") conda: "../envs/pigz.yaml" shell: - "(fastqc {input} -o {params.folder2out} -t {threads}) &> {log}" + """ + mkdir {params.folder2out} + (fastqc {input} -o {params.folder2out} -t {threads}) &> {log} + """ rule multiqc_hifi_SMRTremoved: input: - lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifi_reads/05_multiqc/{readCounter}.{smrtornot}_fastqc.html"), sample=wildcards.sample, readCounter=d[wildcards.sample], smrtornot=testDict[wildcards.sample][1]) + lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/{readCounter}.{smrtornot}_fastqc.html"), sample=wildcards.sample, readCounter=d[wildcards.sample], smrtornot=testDict[wildcards.sample][1]) params: - folder2qc=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifi_reads/05_multiqc/"), + folder2qc=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/"), filename="{sample}.{smrtornot}.multiqcReport.html" output: - os.path.join(config['Results'],"0_buildDatabases/{sample}/hifi_reads/05_multiqc/{sample}.{smrtornot}.multiqcReport.html") + os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/{sample}.{smrtornot}.multiqcReport.html") log: - os.path.join(config['Results'], "0_buildDatabases/{sample}/hifi_reads/logs/{sample}.{smrtornot}.multiqc.log") + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/{sample}.{smrtornot}.multiqc.log") conda: "../envs/pigz.yaml" shell: @@ -146,18 +149,17 @@ rule meryl_hifi_trimmed_count: kmer = "{kmer}" # path= os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/") threads: - workflow.cores + workflow.cores * 0.25 output: - directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifi_reads/03_merylDb/" + "{readCounter}" + "_hifi_dB.{smrtornot}.{kmer}.meryl")), + directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/03_merylDb/" + "{readCounter}" + "_hifi_dB.{smrtornot}.{kmer}.meryl")), log: - os.path.join(config['Results'], "0_buildDatabases/{sample}/hifi_reads/logs/{readCounter}_hifi_{kmer}.{smrtornot}.log") + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/{readCounter}_hifi_{kmer}.{smrtornot}.log") priority: 10 conda: "../envs/merylMerq_2.yaml" shell: """ - export OMP_NUM_THREADS={threads} (meryl count k={params.kmer} threads={threads} {input.reads} output {output}) &> {log} """ @@ -165,24 +167,23 @@ rule meryl_hifi_build: input: # reads= rules.trimSMRTbell.output.outputFile # lambda wildcards: expand(config['Results'],"{group}/alignment/{sample}.bam", group=wildcards.group, sample=GROUPS[wildcards.group] - lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifi_reads/03_merylDb/{readCounter}_hifi_dB.{smrtornot}.{kmer}.meryl"), sample=wildcards.sample, readCounter=d[wildcards.sample], kmer=testDict[wildcards.sample][0], smrtornot=testDict[wildcards.sample][1]) + lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/03_merylDb/{readCounter}_hifi_dB.{smrtornot}.{kmer}.meryl/"), sample=wildcards.sample, readCounter=d[wildcards.sample], kmer=testDict[wildcards.sample][0], smrtornot=testDict[wildcards.sample][1]) # reads=os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/" + "{readCounter}" + "_hifi_dB.21.meryl") params: kmer = "{kmer}" threads: - workflow.cores + workflow.cores * 0.5 output: - directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifi_reads/03_merylDb/complete_hifi_{sample}_dB.{smrtornot}.{kmer}")), + directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/03_merylDb/complete_hifi_{sample}_dB.{smrtornot}.{kmer}.meryl")), log: - os.path.join(config['Results'], "0_buildDatabases/{sample}/hifi_reads/logs/{sample}_hifi_{smrtornot}.{kmer}.log") + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/{sample}_hifi_{smrtornot}.{kmer}.log") priority: 10 conda: "../envs/merylMerq_2.yaml" shell: """ - export OMP_NUM_THREADS={threads} - (meryl union-sum k={params.kmer} threads={threads} {input} output {output}) &> {log} + (meryl union-sum {input} output {output}) &> {log} """ # # diff --git a/rules/illumina_05_11_21.smk b/rules/illumina_05_11_21.smk index de12cc7bcb186884e1df91d49204678d5b274c21..b8729f913e9f11f4a6eedef4766532d832495813 100644 --- a/rules/illumina_05_11_21.smk +++ b/rules/illumina_05_11_21.smk @@ -40,7 +40,7 @@ rule unzipFasta_R1: input: assembly=R1_gzipped, output: - temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R1.fastq")), + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R1.fastq"), log: os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.{trimAdapters}_R1_pigzUnzip.log") conda: @@ -64,7 +64,7 @@ rule unzipFasta_R2: input: assembly=R2_gzipped, output: - temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R2.fastq")), + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R2.fastq"), log: os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.{trimAdapters}_R2_pigzUnzip.log") conda: @@ -88,10 +88,10 @@ rule symlinkUnzippedFasta_R2: rule trim10xbarcodes: input: read1=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.10xTrimmed.{trimAdapters}_R1.fastq"), - read2=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.10xTrimmed.{trimAdapters}_R2.fastq") +# read2=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.10xTrimmed.{trimAdapters}_R2.fastq") output: read1=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.{trimAdapters}_Read1.fastq")), - read2=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.{trimAdapters}_Read2.fastq")) +# read2=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.{trimAdapters}_Read2.fastq")) threads: workflow.cores * 0.125 log: @@ -101,12 +101,23 @@ rule trim10xbarcodes: shell: """ (trimmomatic SE -threads {threads} {input.read1} {output.read1} HEADCROP:23) &> {log} - ln -s {input.read2} {output.read2} """ +# ln -s {input.read2} {output.read2} - - +rule trim10xbarcodesR2: + input: + read2=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.10xTrimmed.{trimAdapters}_R2.fastq") + output: + read2=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.{trimAdapters}_Read2.fastq")) + threads: + 1 + conda: + "../envs/pigz.yaml" + shell: + """ + ln -s {input.read2} {output.read2} + """ @@ -274,6 +285,5 @@ rule meryl_hifi_build: "../envs/merylMerq_2.yaml" shell: """ - export OMP_NUM_THREADS={threads} - (meryl union-sum k={params.kmer} threads={threads} {input} output {output}) &> {log} + (meryl union-sum {input} output {output}) &> {log} """ diff --git a/rules/run_05_11_21.smk b/rules/run_05_11_21.smk index c2c847181aa2124dbd736cf5d7db10a7aaef5c47..e4bcc42d98ba5edecde6aef89e0fa8083b70de22 100644 --- a/rules/run_05_11_21.smk +++ b/rules/run_05_11_21.smk @@ -87,7 +87,7 @@ rule busco5: assemblyName = "{asmID}", chngDir = os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO") threads: - workflow.cores * 0.5 + workflow.cores * 0.125 output: # report(os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/short_summary.specific." + config['busco5Lineage'] + "_odb10." + "{sample}" + ".txt"), caption="../report/busco.rst", category="Benchmark Universal Single Copy Orthologs", subcategory="{fastq}") os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), @@ -149,7 +149,7 @@ rule merqury: # symlink_fasta=os.path.join(config['Results'], "01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}.fasta"), symlink_merylDB=directory(os.path.join(config['Results'], "1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.meryl")) threads: - workflow.cores * 0.5 + workflow.cores * 0.125 output: os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"), os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.completeness.stats"), @@ -529,7 +529,7 @@ rule makeReport: os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.completeness.stats"), os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/only_buscoScores_{asmID}.txt"), os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), - os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.fl.png") + os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png") params: "{asmID}", "{kmer}",