diff --git a/SUBMIT_CONFIG/default/config.yaml b/SUBMIT_CONFIG/default/config.yaml index d2f3f62f3edf0e834066716157fc758fd8bef020..b30148a9f21ffa3cf02367c6592e911217891935 100644 --- a/SUBMIT_CONFIG/default/config.yaml +++ b/SUBMIT_CONFIG/default/config.yaml @@ -2,8 +2,7 @@ cores: 64 dry-run: False use-conda: True latency-wait: 60 -rerun-incomplete: False -printshellcmds: True +rerun-incomplete: True +printshellcmds: False keep-going: False -#shadow-prefix: /srv/public/user/james94/tempfileshere -#use-singularity: True +#use-singularity: False diff --git a/SUBMIT_CONFIG/slurm/cluster.yaml b/SUBMIT_CONFIG/slurm/cluster.yaml index 22b094e5d7929e8f9fbb0a5d18a29e985b2dab9b..3098c7c861d3e624bcdaf70add41c16cc0f2b8a1 100644 --- a/SUBMIT_CONFIG/slurm/cluster.yaml +++ b/SUBMIT_CONFIG/slurm/cluster.yaml @@ -1,143 +1,143 @@ -__default__: - jobname: 'GEP.{rule}' - partition: begendiv,main - # nCPUs: "{threads}" - qos: 'standard' - +# __default__: +# jobname: 'GEP.{rule}' +# partition: begendiv,main +# # nCPUs: "{threads}" +# qos: 'standard' +# #### PRETEXT #### unzipHiC_R1: jobname: GEP.{rule}.{wildcards.asmID} - memory: 500 - time: "00:30:05" + mem_mb: 500 + time: "01:30:05" threads: 4 -symlinkUnzippedHiC_R1: - jobname: GEP.{rule}.{wildcards.asmID} - memory: 10 - time: "00:05:00" - threads: 1 +# symlinkUnzippedHiC_R1: +# jobname: GEP.{rule}.{wildcards.asmID} +# mem_mb: 10 +# time: "00:05:00" +# threads: 1 unzipHiC_R2: jobname: GEP.{rule}.{wildcards.asmID} - memory: 500 - time: "00:30:05" + mem_mb: 500 + time: "01:30:05" threads: 4 -symlinkUnzippedHiC_R2: - jobname: GEP.{rule}.{wildcards.asmID} - memory: 10 - time: "00:05:00" - threads: 1 +# symlinkUnzippedHiC_R2: +# jobname: GEP.{rule}.{wildcards.asmID} +# mem_mb: 10 +# time: "00:05:00" +# threads: 1 pretext_index_PRI_asm: jobname: GEP.{rule}.{wildcards.asmID} - memory: 72000 + mem_mb: 72000 time: "02:30:00" threads: 1 pretext_fastq2bam_R1: jobname: GEP.{rule}.{wildcards.asmID} - memory: 32000 + mem_mb: 32000 time: "08:00:00" 
threads: 12 pretext_fastq2bam_R2: jobname: GEP.{rule}.{wildcards.asmID} - memory: 32000 + mem_mb: 32000 time: "08:00:00" threads: 12 pretext_filter_5primeEnd_R1: jobname: GEP.{rule}.{wildcards.asmID} - memory: 200 + mem_mb: 200 time: "08:00:00" threads: 4 pretext_filter_5primeEnd_R2: jobname: GEP.{rule}.{wildcards.asmID} - memory: 200 + mem_mb: 200 time: "08:00:00" threads: 4 pretext_filtered_combine: jobname: GEP.{rule}.{wildcards.asmID} - memory: 9500 + mem_mb: 9500 time: "08:30:00" threads: 12 pretext_map: jobname: GEP.{rule}.{wildcards.asmID} - memory: 32000 + mem_mb: 32000 time: "08:30:00" threads: 12 pretext_snapshot: jobname: GEP.{rule}.{wildcards.asmID} - memory: 500 + mem_mb: 500 time: "00:10:00" threads: 1 -pretextMaps2md: - jobname: GEP.{rule}.{wildcards.asmID} - memory: 500 - time: "00:10:00" - threads: 1 +# pretextMaps2md: +# jobname: GEP.{rule}.{wildcards.asmID} +# mem_mb: 500 +# time: "00:10:00" +# threads: 1 unzipFasta_PRI: jobname: GEP.{rule}.{wildcards.asmID} - memory: 500 - time: "00:05:00" - threads: 1 + mem_mb: 500 + time: "00:15:00" + threads: 4 -symlinkUnzippedFasta_PRI: - jobname: GEP.{rule}.{wildcards.asmID} - memory: 500 - time: "00:05:00" - threads: 1 +# symlinkUnzippedFasta_PRI: +# jobname: GEP.{rule}.{wildcards.asmID} +# mem_mb: 500 +# time: "00:15:00" +# threads: 1 unzipFasta_ALT: jobname: GEP.{rule}.{wildcards.asmID} - memory: 500 - time: "00:05:00" - threads: 1 + mem_mb: 500 + time: "00:15:00" + threads: 4 -symlinkUnzippedFasta_ALT: - jobname: GEP.{rule}.{wildcards.asmID} - memory: 500 - time: "00:05:05" - threads: 1 +# symlinkUnzippedFasta_ALT: +# jobname: GEP.{rule}.{wildcards.asmID} +# mem_mb: 500 +# time: "00:05:05" +# threads: 1 merqury: jobname: GEP.{rule}.{wildcards.asmID} - memory: 40000 + mem_mb: 40000 time: "08:00:00" threads: 8 busco5: jobname: GEP.{rule}.{wildcards.asmID} - memory: 150000 + mem_mb: 150000 time: "1-00:00:00" threads: 16 -moveBuscoOutputs: - jobname: GEP.{rule}.{wildcards.asmID} - memory: 500 - time: 
"00:05:05" - threads: 1 +# moveBuscoOutputs: +# jobname: GEP.{rule}.{wildcards.asmID} +# mem_mb: 500 +# time: "00:05:05" +# threads: 1 @@ -148,163 +148,163 @@ moveBuscoOutputs: genomescope2: jobname: GEP.{rule}.{wildcards.asmID}.{wildcards.kmer} - memory: 500 - time: "00:05:00" + mem_mb: 500 + time: "00:15:00" threads: 1 assemblyStats: jobname: GEP.{rule}.{wildcards.asmID} - memory: 500 - time: "00:05:00" - threads: 1 - -saveConfiguration_and_getKeyValues_kmer: - jobname: GEP.{rule}.{wildcards.asmID}.{wildcards.kmer} - memory: 500 - time: "00:05:00" - threads: 1 - - -saveConfiguration_and_getKeyValues: - jobname: GEP.{rule}.{wildcards.asmID} - memory: 500 - time: "00:05:05" - threads: 1 - - -aggregateAllAssemblies: - jobname: GEP.{rule} - memory: 500 - time: "00:05:05" - threads: 1 - -makeReport: - jobname: GEP.{rule}.{wildcards.asmID}.{wildcards.kmer} - memory: 500 - time: "00:05:05" - threads: 1 - - -addFullTable: - jobname: GEP.{rule} - memory: 500 - time: "00:05:05" + mem_mb: 500 + time: "00:15:00" threads: 1 -aggregateReport: - jobname: GEP.{rule} - memory: 500 - time: "00:05:05" - threads: 1 +# saveConfiguration_and_getKeyValues_kmer: +# jobname: GEP.{rule}.{wildcards.asmID}.{wildcards.kmer} +# mem_mb: 500 +# time: "00:05:00" +# threads: 1 +# +# +# saveConfiguration_and_getKeyValues: +# jobname: GEP.{rule}.{wildcards.asmID} +# mem_mb: 500 +# time: "00:05:05" +# threads: 1 +# +# +# aggregateAllAssemblies: +# jobname: GEP.{rule} +# mem_mb: 500 +# time: "00:05:05" +# threads: 1 +# +# makeReport: +# jobname: GEP.{rule}.{wildcards.asmID}.{wildcards.kmer} +# mem_mb: 500 +# time: "00:05:05" +# threads: 1 +# +# +# addFullTable: +# jobname: GEP.{rule} +# mem_mb: 500 +# time: "00:05:05" +# threads: 1 +# +# aggregateReport: +# jobname: GEP.{rule} +# mem_mb: 500 +# time: "00:05:05" +# threads: 1 makePDF: jobname: GEP.{rule} - memory: 1000 + mem_mb: 1000 time: "00:10:05" threads: 1 ###### Rules for illuminadb building ########## -unzipFasta_R1: - jobname: 
GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trim10x}.{wildcards.trimAdapters} - memory: 96000 - time: "00:30:05" - threads: 2 - - -unzipFasta_R2: +unzipFastq_R1: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trim10x}.{wildcards.trimAdapters} - memory: 96000 - time: "00:30:05" + mem_mb: 96000 + time: "01:45:05" threads: 4 -symlinkUnzippedFasta_R1: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trim10x}.{wildcards.trimAdapters} - memory: 500 - time: "00:01:35" - threads: 1 -symlinkUnzippedFasta_R2: +unzipFastq_R2: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trim10x}.{wildcards.trimAdapters} - memory: 500 - time: "00:01:35" - threads: 1 - - - - -symLink_trim10xbarcodes_notrimAdapt: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter} - memory: 500 - time: "00:01:35" - threads: 1 - -symlinks_no10xOrAdaptTrim: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter} - memory: 500 - time: "00:01:35" - threads: 1 - - -symlinks_no10xwithAdaptTrim: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter} - memory: 500 - time: "00:01:35" - threads: 1 - + mem_mb: 96000 + time: "01:45:05" + threads: 4 -symlink_trim10xbarcodesR2: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trimAdapters} - memory: 500 - time: "00:01:35" - threads: 1 +# symlinkUnzippedFastq_R1: +# jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trim10x}.{wildcards.trimAdapters} +# mem_mb: 500 +# time: "00:01:35" +# threads: 1 +# +# symlinkUnzippedFastq_R2: +# jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trim10x}.{wildcards.trimAdapters} +# mem_mb: 500 +# time: "00:01:35" +# threads: 1 + + + + +# symLink_trim10xbarcodes_notrimAdapt: +# jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter} +# mem_mb: 500 +# time: "00:01:35" +# threads: 1 +# +# symlinks_no10xOrAdaptTrim: +# jobname: 
GEP.{rule}.{wildcards.sample}.{wildcards.readCounter} +# mem_mb: 500 +# time: "00:01:35" +# threads: 1 +# +# +# symlinks_no10xwithAdaptTrim: +# jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter} +# mem_mb: 500 +# time: "00:01:35" +# threads: 1 +# +# +# symlink_trim10xbarcodesR2: +# jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trimAdapters} +# mem_mb: 500 +# time: "00:01:35" +# threads: 1 trim10xbarcodes: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trimAdapters} - memory: 500 - time: "04:00:05" + mem_mb: 500 + time: "02:30:05" threads: 4 trimAdapters: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trim10x} - memory: 500 - time: "04:00:05" + mem_mb: 500 + time: "02:30:05" threads: 8 fastqc_Illumina: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trimAdapters}.{wildcards.trim10x} - memory: 500 + mem_mb: 1000 time: "04:00:05" threads: 4 -multiqc_hifi: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.trimAdapters}.{wildcards.trim10x} - memory: 500 - time: "01:00:05" - threads: 1 +# multiqc_hifi: +# jobname: GEP.{rule}.{wildcards.sample}.{wildcards.trimAdapters}.{wildcards.trim10x} +# mem_mb: 500 +# time: "01:00:05" +# threads: 1 meryl_R1: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trimAdapters}.{wildcards.trim10x}.{wildcards.kmer} - memory: 20000 - time: "01:00:05" + mem_mb: 20000 + time: "01:30:05" threads: 12 meryl_R2: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trimAdapters}.{wildcards.trim10x}.{wildcards.kmer} - memory: 20000 - time: "01:00:05" + mem_mb: 20000 + time: "01:30:05" threads: 12 meryl_illumina_build: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.trimAdapters}.{wildcards.trim10x}.{wildcards.kmer} - memory: 30000 - time: "00:45:05" + mem_mb: 30000 + time: "02:45:05" threads: 12 @@ -314,42 +314,42 @@ meryl_illumina_build: unzipHifi: jobname: 
GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.smrtornot} - memory: 128000 + mem_mb: 128000 time: "00:30:05" -symlinkUnzippedHifi: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.smrtornot} - memory: 500 - time: "00:05:05" +# symlinkUnzippedHifi: +# jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.smrtornot} +# mem_mb: 500 +# time: "00:05:05" -symlinkfornotSmartTrimmed: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter} - memory: 500 - time: "00:05:05" +# symlinkfornotSmartTrimmed: +# jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter} +# mem_mb: 500 +# time: "00:05:05" fastqc_hifi: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.smrtornot} - memory: 12000 - time: "00:45:05" + mem_mb: 12000 + time: "04:00:05" multiqc_hifi: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.smrtornot} - memory: 4000 - time: "00:15:05" + mem_mb: 4000 + time: "01:15:05" meryl_hifi_count: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.smrtornot}.{wildcards.kmer} - memory: 96000 + mem_mb: 96000 time: "02:30:05" meryl_hifi_build: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.smrtornot}.{wildcards.kmer} - memory: 96000 + mem_mb: 96000 time: "01:30:05" trimSMRTbell: jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter} - memory: 96000 + mem_mb: 96000 time: "01:30:05" diff --git a/Snakefile b/Snakefile index f2d423aa13a80f3fd1c7c4aa69ef28e71bfdfad1..93ac2d3cb27b03aa53c6f2d31533c41e7e0a9983 100644 --- a/Snakefile +++ b/Snakefile @@ -10,6 +10,7 @@ from urllib.request import urlopen from bs4 import BeautifulSoup import argparse, subprocess +container: "docker://gepdocker/gep_dockerimage:dev" configfile: "configuration/config.yaml" with open(config['resources'], 'r') as f: @@ -45,7 +46,7 @@ def to_be_smrtTrimmed(userAnswer): samples = pd.read_csv(config['samplesTSV'], dtype=str, index_col=False, 
delim_whitespace=True, skip_blank_lines=True) if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'trimAdapters', 'fastQC']).issubset(samples.columns): - whichRule = "rules/illumina_05_11_21.smk" + whichRule = "rules/build_illumina.smk" samples=samples.reset_index() samples['readCounter'] = samples.groupby(['sample']).cumcount()+1 samples['readCounter'] = samples['readCounter'].astype(str) @@ -85,34 +86,33 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri dkmerSize = {} samples['gzipped_R1']=samples['Library_R1'].apply(gzipped_or_not) samples['gzipped_R2']=samples['Library_R2'].apply(gzipped_or_not) -# print(samples) + noGzip_R1 = samples[samples['gzipped_R1'] == False] noGzip_R1=noGzip_R1.set_index(['sample','readCounter']) -# print(noGzip_R1) + noGzip_R2 = samples[samples['gzipped_R2'] == False] noGzip_R2=noGzip_R2.set_index(['sample','readCounter']) -# print(noGzip_R2) + yesGzip_R1 = samples[samples['gzipped_R1'] == True] yesGzip_R1=yesGzip_R1.set_index(['sample','readCounter']) -# print(yesGzip_R1) + yesGzip_R2 = samples[samples['gzipped_R2'] == True] yesGzip_R2=yesGzip_R2.set_index(['sample','readCounter']) -# print(yesGzip_R2) + samples=samples.set_index(['sample','readCounter']) -# print('this is testDict',testDict) -# print('this is testDict',testDictQC) + ruleAllQCFiles=[] if samples['fastQC'].str.contains('True').any(): ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/05_multiqc/{sample}.{trim10x}.{trimAdapters}.multiqcReport.html"), sample=key, trim10x=value1, trimAdapters=value2) for key, [value1, value2, value3] in testDictQC.items()] ruleAll=[expand(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/complete_{sample}_illuminaDb.{trim10x}.{trimAdapters}.{kmer}.meryl"), sample=key, kmer=value1, trim10x=value2, trimAdapters=value3) for key, [value1, value2, value3] in testDict.items()] elif set(['sample', 'hifi_reads', 
'meryl_kmer_size' ,'trimSMRTbell', 'fastQC']).issubset(samples.columns): - whichRule = "rules/hifi_05_11_21.smk" + whichRule = "rules/build_hifi.smk" samples=samples.reset_index() samples['readCounter'] = samples.groupby(['sample']).cumcount()+1 samples['readCounter'] = samples['readCounter'].astype(str) @@ -123,21 +123,20 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size' ,'trimSMRTbell', 'fastQC']). samples=samples.reset_index() testDict=samples[['sample','meryl_kmer_size', 'smrtornot']] testDict=testDict.set_index(['sample']).T.to_dict('list') -# print('test', testDict) + testDictQC=samples[['sample', 'smrtornot', 'fastQC']] testDictQC = testDictQC[testDictQC['fastQC'] == "True"] -# print(testDictQC) + testDictQC=testDictQC.drop_duplicates('sample', keep='first') testDictQC=testDictQC.set_index(['sample']).T.to_dict('list') -# print() -# print(testDictQC) + runQC = samples[samples['fastQC'] == "True"] drunQCtrim = {} for i in runQC['sample'].unique(): drunQCtrim[i] = [runQC['readCounter'][j] for j in runQC[runQC['sample']==i].index] -# print('this is runqc', drunQCtrim) + runQC=runQC.set_index(['sample','readCounter']) @@ -166,15 +165,15 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size' ,'trimSMRTbell', 'fastQC']). for i in samples['sample'].unique(): d[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index] samples['gzipped_hifi']=samples['hifi_reads'].apply(gzipped_or_not) -# print(samples) + noGzip_hifi = samples[samples['gzipped_hifi'] == False] noGzip_hifi=noGzip_hifi.set_index(['sample','readCounter']) -# print(noGzip_hifi) + yesGzip_hifi = samples[samples['gzipped_hifi'] == True] yesGzip_hifi=yesGzip_hifi.set_index(['sample','readCounter']) -# print(yesGzip_hifi) + samples=samples.set_index(['sample','readCounter']) ruleAllQCFiles=[] @@ -183,7 +182,7 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size' ,'trimSMRTbell', 'fastQC']). 
ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/{sample}.{smrtornot}.multiqcReport.html"), sample=key, smrtornot=value1) for key, [value1, value2] in testDictQC.items()] ruleAll=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/03_merylDb/complete_hifi_{sample}_dB.{smrtornot}.{kmer}.meryl"), sample=key, kmer=value1, smrtornot=value2) for key, [value1, value2] in testDict.items()] elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize', 'HiC_R1', 'HiC_R2']).issubset(samples.columns): - whichRule = "rules/run_05_11_21.smk" + whichRule = "rules/evaluate.smk" samples['genomeSize']=samples['genomeSize'].apply(genomeSize_auto_or_not) ### make new column for whether or not path/file given is gzipped (True or False) @@ -199,11 +198,11 @@ elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize', ' samples['merylDB_present']=samples['merylDB'].apply(FILE_present_or_not) testDictPRETEXT = samples[['ID', 'HiC_R1_present', 'HiC_R2_present']] -# + testDictPRETEXT = testDictPRETEXT[(testDictPRETEXT['HiC_R1_present'] == True) & (testDictPRETEXT['HiC_R2_present'] == True)] -# print(testDictPRETEXT) + testDictPRETEXT=testDictPRETEXT.set_index(['ID']).T.to_dict('list') -# print(testDictPRETEXT) + noGzip_HiC_R1 = samples[samples['gzipped_HiC_R1'] == False] @@ -236,14 +235,14 @@ elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize', ' yesGzip_ALT=yesGzip_ALT.set_index(['ID']) samples=samples.set_index(['ID']) -# print(samples) + testDict=samples.T.to_dict('list') -# print(testDict) + ruleAllQCFiles=[] ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(testDict.keys())), os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), 
os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf"),os.path.join(config['Results'],"1_evaluation/finalResults/FULL_TABLE_PDF.pdf") else: - raise ValueError('Sample Sheet for illumina meryl building not recognised. Please make sure you are using the correct sample sheet (prebuilt or non-prebuilt meryl db, etc)') + raise ValueError('Sample sheet not recognised. Please make sure you are using the correct sample sheet') diff --git a/envs/merylMerq_2.yaml b/envs/merylMerq_2.yaml index 1158ca4fba0de36d2bc1db58259ff6a6adffe155..5554c767bb794d20971caa793c3f11fbac6cd186 100644 --- a/envs/merylMerq_2.yaml +++ b/envs/merylMerq_2.yaml @@ -3,12 +3,14 @@ channels: - defaults - conda-forge dependencies: - - merqury 1.3.* - - bedtools >=2.29.2 - - meryl 1.3.* - - openjdk >=11.0.1 - - r-argparse >=2.0.1 - - r-base >=4 - - r-ggplot2 >=3.3.2 - - r-scales >=1.1.1 - - samtools >=1.10 + - merqury == 1.3 + - bedtools == 2.29.2 + - meryl == 1.3 + - cairo == 1.16 + - openjdk == 11.0.13 + - r-argparse == 2.1.5 + - r-base == 4.1.3 + - r-ggplot2 == 3.3.5 + - r-scales == 1.2.0 + - samtools == 1.15.1 + - xorg-libxrender == 0.9.10 diff --git a/rules/build_hifi.smk b/rules/build_hifi.smk new file mode 100644 index 0000000000000000000000000000000000000000..ff8a3fcf39c9fb1ce0b18fa4e52e7e1520e006e5 --- /dev/null +++ b/rules/build_hifi.smk @@ -0,0 +1,172 @@ + +localrules: symlinkUnzippedHifi, symlinkfornotSmartTrimmed, multiqc_hifi + + + +def fq_to_trimSMRTbell(wildcards): + return trimSMRTbell.loc[(wildcards.sample, wildcards.readCounter), "hifi_reads"] + + +def fq_to_notTrimSMRTbell(wildcards): + return notrimSMRTbell.loc[(wildcards.sample, wildcards.readCounter), "hifi_reads"] + + +def hifi_gzipped(wildcards): + return yesGzip_hifi.loc[(wildcards.sample, wildcards.readCounter), "hifi_reads"] + + +def hifi_notgzipped(wildcards): + return noGzip_hifi.loc[(wildcards.sample, wildcards.readCounter), "hifi_reads"] + + + +rule unzipHifi: + input: + fastq=hifi_gzipped, 
output: + temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.{smrtornot}.fastq")), + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/logs/hifiReads/{readCounter}.{smrtornot}_pigzUnzip.log") + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + threads: + resource['unzipHifi']['threads'] + resources: + mem_mb=resource['unzipHifi']['mem_mb'], + time=resource['unzipHifi']['time'], + shell: + """ + pigz -p {threads} -c -d -k {input.fastq} > {output} 2> {log} + """ + +rule symlinkUnzippedHifi: + input: + fastq=hifi_notgzipped, + output: + temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.{smrtornot}.fastq")), + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/logs/hifiReads/{readCounter}.{smrtornot}_pigzUnzip.log") + container: + None + shell: + """ + ln -s {input.fastq} {output} + echo "{input.fastq} not gzipped. Symlink created in place of expected decompressed file." 
> {log} + """ + +rule trimSMRTbell: + input: + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.smrtTrimmed.fastq"), + output: + outputFile=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.smrtTrimmed.fastq")) + threads: + resource['trimSMRTbell']['threads'] + resources: + mem_mb=resource['trimSMRTbell']['mem_mb'], + time=resource['trimSMRTbell']['time'], + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/logs/hifiReads/{readCounter}_trimSMRTbell.log") + priority: + 15 + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + shell: + """ + (cutadapt -j {threads} -o {output.outputFile} {input} -b ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT -b ATCTCTCTCTTTTCCTCCTCCTCCGTTGTTGTTGTTGAGAGAGAT --discard-trimmed) &> {log} + """ + +rule symlinkfornotSmartTrimmed: + input: + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.notsmrtTrimmed.fastq"), + output: + outputFile=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.notsmrtTrimmed.fastq")) + container: + None + shell: + """ + ln -s {input} {output.outputFile} + """ + +rule fastqc_hifi: + input: + os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.{smrtornot}.fastq") + params: + folder2out=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/") + output: + os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/{readCounter}.{smrtornot}_fastqc.html") + threads: + resource['fastqc_hifi']['threads'] + resources: + mem_mb=resource['fastqc_hifi']['mem_mb'], + time=resource['fastqc_hifi']['time'], + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/{readCounter}.{smrtornot}.FastQC.log") + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + shell: + """ + mkdir -p {params.folder2out} 
+ (fastqc {input} -o {params.folder2out} -t {threads}) &> {log} + """ + +rule multiqc_hifi: + input: + lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/{readCounter}.{smrtornot}_fastqc.html"), sample=wildcards.sample, readCounter=d[wildcards.sample], smrtornot=testDict[wildcards.sample][1]) + params: + folder2qc=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/"), + filename="{sample}.{smrtornot}.multiqcReport.html" + output: + os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/{sample}.{smrtornot}.multiqcReport.html") + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/{sample}.{smrtornot}.multiqc.log") + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + shell: + "(multiqc {params.folder2qc} -o {params.folder2qc} -n {params.filename}) &> {log}" + + +rule meryl_hifi_count: + input: + reads=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.{smrtornot}.fastq") + params: + kmer = "{kmer}" + threads: + resource['meryl_hifi_count']['threads'] + resources: + mem_mb=resource['meryl_hifi_count']['mem_mb'], + time=resource['meryl_hifi_count']['time'], + output: + temp(directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/03_merylDb/" + "{readCounter}" + "_hifi_dB.{smrtornot}.{kmer}.meryl"))), + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/{readCounter}_hifi_{kmer}.{smrtornot}.log") + priority: + 10 + conda: + os.path.join(workflow.basedir, "envs/merylMerq_2.yaml") + shell: + """ + (meryl count k={params.kmer} threads={threads} {input.reads} output {output}) &> {log} + """ + +rule meryl_hifi_build: + input: + lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/03_merylDb/{readCounter}_hifi_dB.{smrtornot}.{kmer}.meryl/"), sample=wildcards.sample, readCounter=d[wildcards.sample], 
kmer=testDict[wildcards.sample][0], smrtornot=testDict[wildcards.sample][1]) + params: + kmer = "{kmer}" + threads: + resource['meryl_hifi_build']['threads'] + resources: + mem_mb=resource['meryl_hifi_build']['mem_mb'], + time=resource['meryl_hifi_build']['time'], + output: + directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/03_merylDb/complete_hifi_{sample}_dB.{smrtornot}.{kmer}.meryl")), + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/{sample}_hifi_{smrtornot}.{kmer}.log") + priority: + 10 + conda: + os.path.join(workflow.basedir, "envs/merylMerq_2.yaml") + shell: + """ + (meryl union-sum {input} output {output}) &> {log} + """ diff --git a/rules/build_illumina.smk b/rules/build_illumina.smk new file mode 100644 index 0000000000000000000000000000000000000000..0b8367f2ab83ed84dc9d0437d4c04ac0ea063e20 --- /dev/null +++ b/rules/build_illumina.smk @@ -0,0 +1,307 @@ + + + + +localrules: symlinkUnzippedFastq_R1, symlinkUnzippedFastq_R2, symLink_trim10xbarcodes_notrimAdapt, symlinks_no10xwithAdaptTrim, symlinks_no10xOrAdaptTrim, symlink_trim10xbarcodesR2, multiqc_hifi + + + + +def R1_gzipped(wildcards): + return yesGzip_R1.loc[(wildcards.sample, wildcards.readCounter), "Library_R1"] + +def R1_notgzipped(wildcards): + return noGzip_R1.loc[(wildcards.sample, wildcards.readCounter), "Library_R1"] + +def R2_gzipped(wildcards): + return yesGzip_R2.loc[(wildcards.sample, wildcards.readCounter), "Library_R2"] + +def R2_notgzipped(wildcards): + return noGzip_R2.loc[(wildcards.sample, wildcards.readCounter), "Library_R2"] + +rule unzipFastq_R1: + input: + assembly=R1_gzipped, + output: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R1.fastq"), + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.{trimAdapters}_R1_pigzUnzip.log"), + conda: + os.path.join(workflow.basedir, 
"envs/pigz.yaml") + threads: + resource['unzipFastq_R1']['threads'] + resources: + mem_mb=resource['unzipFastq_R1']['mem_mb'], + time=resource['unzipFastq_R1']['time'] + shell: + """ + pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log} + """ + +rule symlinkUnzippedFastq_R1: + input: + assembly=R1_notgzipped, + output: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R1.fastq"), + container: + None + shell: + """ + ln -s {input} {output} + """ + +rule unzipFastq_R2: + input: + assembly=R2_gzipped, + output: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R2.fastq"), + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.{trimAdapters}_R2_pigzUnzip.log") + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + threads: + resource['unzipFastq_R2']['threads'] + resources: + mem_mb=resource['unzipFastq_R2']['mem_mb'], + time=resource['unzipFastq_R2']['time'] + shell: + """ + pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log} + """ + +rule symlinkUnzippedFastq_R2: + input: + assembly=R2_notgzipped, + output: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R2.fastq"), + container: + None + shell: + """ + ln -s {input} {output} + """ + + +rule trim10xbarcodes: + input: + read1=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.10xTrimmed.{trimAdapters}_R1.fastq"), + output: + read1=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.{trimAdapters}_Read1.fastq"), + threads: + resource['trim10xbarcodes']['threads'] + resources: + mem_mb=resource['trim10xbarcodes']['mem_mb'], + time=resource['trim10xbarcodes']['time'] + log: + 
os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.10xTrimmed.10BarcodeRemoval_Trimmomatic.{trimAdapters}.log") + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + shell: + """ + (trimmomatic SE -threads {threads} {input.read1} {output.read1} HEADCROP:23) &> {log} + """ + + +rule symlink_trim10xbarcodesR2: + input: + read2=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.10xTrimmed.{trimAdapters}_R2.fastq") + output: + read2=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.{trimAdapters}_Read2.fastq") + container: + None + shell: + """ + ln -s {input.read2} {output.read2} + """ + + + +rule symLink_trim10xbarcodes_notrimAdapt: + input: + read1=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.notAdaptTrimmed_Read1.fastq"), + read2=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.notAdaptTrimmed_Read2.fastq"), + output: + read1=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.notAdaptTrimmed_val_1.fq"), + read2=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.notAdaptTrimmed_val_2.fq") + container: + None + shell: + """ + ln -s {input.read1} {output.read1} + ln -s {input.read2} {output.read2} + """ + +rule symlinks_no10xOrAdaptTrim: + input: + read1=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.not10xTrimmed.notAdaptTrimmed_R1.fastq"), + read2=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.not10xTrimmed.notAdaptTrimmed_R2.fastq") + output: + 
read1=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.notAdaptTrimmed_val_1.fq"), + read2=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.notAdaptTrimmed_val_2.fq") + container: + None + shell: + """ + ln -s {input.read1} {output.read1} + ln -s {input.read2} {output.read2} + """ + +rule symlinks_no10xwithAdaptTrim: + input: + read1=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.not10xTrimmed.AdaptTrimmed_R1.fastq"), + read2=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.not10xTrimmed.AdaptTrimmed_R2.fastq") + output: + read1=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.AdaptTrimmed_Read1.fastq"), + read2=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.AdaptTrimmed_Read2.fastq") + container: + None + shell: + """ + ln -s {input.read1} {output.read1} + ln -s {input.read2} {output.read2} + """ + +rule trimAdapters: + input: + read1= os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.AdaptTrimmed_Read1.fastq"), + read2= os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.AdaptTrimmed_Read2.fastq"), + params: + outputDir=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/"), + r1_prefix="{readCounter}.{trim10x}.AdaptTrimmed", + output: + read1=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.AdaptTrimmed_val_1.fq")), + read2=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.AdaptTrimmed_val_2.fq")) + threads: + 
resource['trimAdapters']['threads'] + resources: + mem_mb=resource['trimAdapters']['mem_mb'], + time=resource['trimAdapters']['time'], + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.AdaptTrimmed_tGalore.log") + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + shell: + """ + (trim_galore -j {threads} --basename {params.r1_prefix} --dont_gzip --length 65 -o {params.outputDir} --paired {input.read1} {input.read2}) &> {log} + """ + +rule fastqc_Illumina: + input: + read1=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.{trimAdapters}_val_1.fq"), + read2=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.{trimAdapters}_val_2.fq") + params: + folder2out=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/04_fastqc") + output: + os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/04_fastqc/{readCounter}.{trim10x}.{trimAdapters}_val_1_fastqc.html"), + os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/04_fastqc/{readCounter}.{trim10x}.{trimAdapters}_val_2_fastqc.html") + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.{trimAdapters}_fastqc.log") + threads: + resource['fastqc_Illumina']['threads'] + resources: + mem_mb=resource['fastqc_Illumina']['mem_mb'], + time=resource['fastqc_Illumina']['time'], + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + shell: + "(fastqc {input} -o {params.folder2out} -t {threads}) &> {log}" + +rule multiqc_hifi: + input: + read1=lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/04_fastqc/{readCounter}.{trim10x}.{trimAdapters}_val_1_fastqc.html"), sample=wildcards.sample, readCounter=d[wildcards.sample], trim10x=testDict[wildcards.sample][1], 
trimAdapters=testDict[wildcards.sample][2]), + read2=lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/04_fastqc/{readCounter}.{trim10x}.{trimAdapters}_val_2_fastqc.html"), sample=wildcards.sample, readCounter=d[wildcards.sample], trim10x=testDict[wildcards.sample][1], trimAdapters=testDict[wildcards.sample][2]) + params: + folder2qc=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/04_fastqc/"), + folder2out=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/05_multiqc/"), + filename="{sample}.{trim10x}.{trimAdapters}.multiqcReport.html" + output: + os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/05_multiqc/{sample}.{trim10x}.{trimAdapters}.multiqcReport.html") + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{sample}.{trim10x}.{trimAdapters}_multiqc.log") + conda: + os.path.join(workflow.basedir, "envs/pigz.yaml") + shell: + "(multiqc {params.folder2qc} -o {params.folder2out} -n {params.filename}) &> {log}" + + + +rule meryl_R1: + input: + read1= os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.{trimAdapters}_val_1.fq") + params: + kmer = "{kmer}", + threads: + resource['meryl_R1']['threads'] + resources: + mem_mb=resource['meryl_R1']['mem_mb'], + time=resource['meryl_R1']['time'], + output: + directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/{readCounter}.{trim10x}.{trimAdapters}_R1.{kmer}.meryl")) + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.{trimAdapters}_meryl_R1.{kmer}.log") + priority: + 10 + conda: + os.path.join(workflow.basedir, "envs/merylMerq_2.yaml") + shell: + """ + export OMP_NUM_THREADS={threads} + (meryl count k={params.kmer} threads={threads} {input.read1} output {output}) &> {log} + """ + +rule meryl_R2: + input: + read2= 
os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.{trimAdapters}_val_2.fq") + params: + kmer = "{kmer}", + threads: + resource['meryl_R2']['threads'] + resources: + mem_mb=resource['meryl_R2']['mem_mb'], + time=resource['meryl_R2']['time'], + output: + directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/{readCounter}.{trim10x}.{trimAdapters}_R2.{kmer}.meryl")) + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.{trimAdapters}_meryl_R2.{kmer}.log") + priority: + 10 + conda: + os.path.join(workflow.basedir, "envs/merylMerq_2.yaml") + shell: + """ + export OMP_NUM_THREADS={threads} + (meryl count k={params.kmer} threads={threads} {input.read2} output {output}) &> {log} + """ + + +rule meryl_illumina_build: + input: + removeReads1=lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.{trimAdapters}_val_1.fq"), sample=wildcards.sample, readCounter=d[wildcards.sample], trim10x=testDict[wildcards.sample][1], trimAdapters=testDict[wildcards.sample][2]), + removeReads2=lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.{trimAdapters}_val_2.fq"), sample=wildcards.sample, readCounter=d[wildcards.sample], trim10x=testDict[wildcards.sample][1], trimAdapters=testDict[wildcards.sample][2]), + read1=lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/03_merylDb/{readCounter}.{trim10x}.{trimAdapters}_R1.{kmer}.meryl"), sample=wildcards.sample, readCounter=d[wildcards.sample], trim10x=testDict[wildcards.sample][1], kmer=testDict[wildcards.sample][0], trimAdapters=testDict[wildcards.sample][2]), + read2=lambda wildcards: 
expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/03_merylDb/{readCounter}.{trim10x}.{trimAdapters}_R2.{kmer}.meryl"), sample=wildcards.sample, readCounter=d[wildcards.sample], trim10x=testDict[wildcards.sample][1], kmer=testDict[wildcards.sample][0], trimAdapters=testDict[wildcards.sample][2]) + params: + removeReadDIR_trimmed=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/"), + removeReadDIR_unzipped=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/"), + kmer = "{kmer}", + path= os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/") + threads: + resource['meryl_illumina_build']['threads'] + resources: + mem_mb=resource['meryl_illumina_build']['mem_mb'], + time=resource['meryl_illumina_build']['time'], + output: + directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/complete_{sample}_illuminaDb.{trim10x}.{trimAdapters}.{kmer}.meryl")), + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{sample}_hifi_{kmer}.{trim10x}.{trimAdapters}.log") + priority: + 10 + conda: + os.path.join(workflow.basedir, "envs/merylMerq_2.yaml") + shell: + """ + export OMP_NUM_THREADS={threads} + (meryl union-sum {input.read1} {input.read2} output {output}) &> {log} + rm -r {params.removeReadDIR_trimmed} + rm -r {params.removeReadDIR_unzipped} + """ diff --git a/scripts/addFullTableForReport.py b/scripts/addFullTableForReport.py new file mode 100644 index 0000000000000000000000000000000000000000..bb37771dee48daceac6c57ffe8d3a99ae9634ac4 --- /dev/null +++ b/scripts/addFullTableForReport.py @@ -0,0 +1,23 @@ +import pandas as pd +from tabulate import tabulate +samples=pd.read_csv(snakemake.input.results, dtype=str, index_col=0, delim_whitespace=True, skip_blank_lines=True) +# samples=samples.reset_index 
+turn2FloatAndMb=['Bases_Mb','Est_Size_Mb','Longest_Scaff_Mb','Scaff_N50_Mb','Scaff_NG50_Mb','Scaff_N95_Mb','Scaff_NG95_Mb','Longest_Cont','Cont_N50_Mb','Cont_NG50_Mb','Cont_N95_Mb','Cont_NG95_Mb'] +roundDecimals=['Comp_BUSCOs_%','Comp_Single_BUSCOs_%','Het_%','GC_%','QV','Completeness','Bases_Mb','Est_Size_Mb','Longest_Scaff_Mb','Scaff_N50_Mb','Scaff_NG50_Mb','Scaff_N95_Mb','Scaff_NG95_Mb','Longest_Cont','Cont_N50_Mb','Cont_NG50_Mb','Cont_N95_Mb','Cont_NG95_Mb'] +print('this is samples',samples) +sampleTransposed=samples.T +print('this is sampleTransposed',sampleTransposed) +for header in turn2FloatAndMb: + sampleTransposed[header]=sampleTransposed[header].astype(float).div(1000000) +for i in range(0,4): + sampleTransposed[roundDecimals[i]]=sampleTransposed[roundDecimals[i]].str.replace('%','') +for roundHeader in roundDecimals: + sampleTransposed[roundHeader]=sampleTransposed[roundHeader].astype(float).round(2) +with open(snakemake.output[0], "w") as out: + print("", file=out) + print("\\blandscape", file=out) + print("", file=out) + print("\\tiny", file=out) + print(tabulate(sampleTransposed.rename_axis('ASM_ID'), headers='keys',tablefmt="pipe", showindex=True), file=out) + print("\\elandscape", file=out) + print("", file=out) diff --git a/scripts/filter_five_end.pl b/scripts/filter_five_end.pl index 5580dbccb933a9033c6d1c2557294cfe96653e43..79d320536c29824816c1098a7798db4a4ff6904a 100644 --- a/scripts/filter_five_end.pl +++ b/scripts/filter_five_end.pl @@ -10,6 +10,7 @@ my @mid; my @all; my $counter = 0; + while (<STDIN>){ chomp; if (/^@/){ diff --git a/scripts/pretextMapsToMarkdown.py b/scripts/pretextMapsToMarkdown.py new file mode 100644 index 0000000000000000000000000000000000000000..bd87669d1752a36ed51cb5916bc72fa62069366f --- /dev/null +++ b/scripts/pretextMapsToMarkdown.py @@ -0,0 +1,6 @@ +import shutil +shutil.copyfile(snakemake.input.PretextMap, snakemake.output.pretextCP2keyResults) +with open(snakemake.output.IndividualPretextMD, "w") as out: + 
print("", file=out) + print("###",snakemake.params.assemblyName," HiC Heatmap", file=out) + print("{ height=30% }", file=out) diff --git a/testData/SpeciesX_R1.fastq.gz b/testData/SpeciesX_R1.fastq.gz deleted file mode 100644 index e0dd486d6d3578f3b40a0679442756789188fe17..0000000000000000000000000000000000000000 Binary files a/testData/SpeciesX_R1.fastq.gz and /dev/null differ diff --git a/testData/SpeciesX_R2.fastq.gz b/testData/SpeciesX_R2.fastq.gz deleted file mode 100644 index 322633dc499765eae1eb3527eac030b16d9b2ee9..0000000000000000000000000000000000000000 Binary files a/testData/SpeciesX_R2.fastq.gz and /dev/null differ diff --git a/testData/SpeciesX_assembly.fasta b/testData/SpeciesX_assembly.fasta deleted file mode 100644 index bd4f6b790c1cb53b7e00575864590cec5bc966e0..0000000000000000000000000000000000000000 Binary files a/testData/SpeciesX_assembly.fasta and /dev/null differ diff --git a/testData/SpeciesY_assembly.fasta b/testData/SpeciesY_assembly.fasta deleted file mode 100644 index 1f3e767df8299ed07263433840f26f618ea77351..0000000000000000000000000000000000000000 Binary files a/testData/SpeciesY_assembly.fasta and /dev/null differ diff --git a/testData/SpeciesY_library1_R1.fastq.gz b/testData/SpeciesY_library1_R1.fastq.gz deleted file mode 100644 index f7d0b1f47e183daf56c87decdaae2ed5440cc0ce..0000000000000000000000000000000000000000 Binary files a/testData/SpeciesY_library1_R1.fastq.gz and /dev/null differ diff --git a/testData/SpeciesY_library1_R2.fastq.gz b/testData/SpeciesY_library1_R2.fastq.gz deleted file mode 100644 index b2c97d819d680c651bf78756244c457bc6134960..0000000000000000000000000000000000000000 Binary files a/testData/SpeciesY_library1_R2.fastq.gz and /dev/null differ diff --git a/testData/SpeciesY_library2_R1.fastq.gz b/testData/SpeciesY_library2_R1.fastq.gz deleted file mode 100644 index b604d5c3dff9f7318cfae296125813571da957f0..0000000000000000000000000000000000000000 Binary files a/testData/SpeciesY_library2_R1.fastq.gz and 
/dev/null differ diff --git a/testData/SpeciesY_library2_R2.fastq.gz b/testData/SpeciesY_library2_R2.fastq.gz deleted file mode 100644 index 7342fb3ff544c0647681bdc407d7fcd24a8d6985..0000000000000000000000000000000000000000 Binary files a/testData/SpeciesY_library2_R2.fastq.gz and /dev/null differ