diff --git a/SUBMIT_CONFIG/local/config.yaml b/SUBMIT_CONFIG/local/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0c1ee42ee2ba9dff711c2ca1a3f4ee0e2882c50 --- /dev/null +++ b/SUBMIT_CONFIG/local/config.yaml @@ -0,0 +1,10 @@ +cores: 64 +dry-run: False +use-conda: True +latency-wait: 60 +rerun-incomplete: True +printshellcmds: False +keep-going: False + +#shadow-prefix: /srv/public/user/<user>/tempfileshere +#use-singularity: True diff --git a/SUBMIT_CONFIG/slurm/config.yaml b/SUBMIT_CONFIG/slurm/config.yaml index f2969ebcde976e68c4db9e6d80ca2a797d3942a9..b2545ee6c42e7ca50464b6aece0d3e1b192112e8 100644 --- a/SUBMIT_CONFIG/slurm/config.yaml +++ b/SUBMIT_CONFIG/slurm/config.yaml @@ -1,21 +1,23 @@ - -# cluster-config: "cluster.yaml" +### SBATCH JOB THAT WILL BE SUBMITTED FOR EACH RULE IN THE PIPELINE ### cluster: mkdir -p slurm_logs/{rule} && sbatch - --partition=begendiv,main - --qos=standard + --partition={resources.partition} + --qos={resources.qos} --cpus-per-task={threads} - --mem={resources.memory} + --mem={resources.mem_mb} --job-name=GEP.{rule}.{wildcards}.%j - --output=slurm_logs/{rule}/{rule}.{wildcards}.%j.out + --output=slurm_logs/{rule}/{wildcards}.%j.out + --error=slurm_logs/{rule}/{wildcards}.%j.err --time={resources.time} -#User Defines below parameters + + +### DEFAULT RESOURCES THAT WILL BE USED IN THE ABOVE SBATCH COMMAND IF NOT DEFINED IN '../../../GEP/configuration/define_resources.yaml' ### default-resources: - - partition=begendiv + - partition=begendiv,main - qos=standard # - mem_mb=100000 # - time="3-00:00:00" @@ -23,14 +25,16 @@ default-resources: +### SNAKEMAKE ARGUMENTS ### + # restart-times: 3] # max-jobs-per-second: 100 -max-status-checks-per-second: 10 -latency-wait: 60 -jobs: 10 +#max-status-checks-per-second: 10 +cores: 96 +jobs: 100 +use-conda: True keep-going: False rerun-incomplete: True -printshellcmds: True +printshellcmds: False scheduler: greedy -use-conda: True -cores: 24 +latency-wait: 60 diff --git a/Snakefile b/Snakefile index b4c551258342e059707742728f1da26b9c800bca..f2d423aa13a80f3fd1c7c4aa69ef28e71bfdfad1 100644 --- a/Snakefile +++ b/Snakefile @@ -85,31 +85,31 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri dkmerSize = {} samples['gzipped_R1']=samples['Library_R1'].apply(gzipped_or_not) samples['gzipped_R2']=samples['Library_R2'].apply(gzipped_or_not) - print(samples) +# print(samples) noGzip_R1 = samples[samples['gzipped_R1'] == False] noGzip_R1=noGzip_R1.set_index(['sample','readCounter']) - print(noGzip_R1) +# print(noGzip_R1) noGzip_R2 = samples[samples['gzipped_R2'] == False] noGzip_R2=noGzip_R2.set_index(['sample','readCounter']) - print(noGzip_R2) +# print(noGzip_R2) yesGzip_R1 = samples[samples['gzipped_R1'] == True] yesGzip_R1=yesGzip_R1.set_index(['sample','readCounter']) - print(yesGzip_R1) +# print(yesGzip_R1) yesGzip_R2 = samples[samples['gzipped_R2'] == True] yesGzip_R2=yesGzip_R2.set_index(['sample','readCounter']) - print(yesGzip_R2) +# print(yesGzip_R2) samples=samples.set_index(['sample','readCounter']) - print('this is testDict',testDict) - print('this is testDict',testDictQC) +# print('this is testDict',testDict) +# print('this is testDict',testDictQC) ruleAllQCFiles=[] if samples['fastQC'].str.contains('True').any(): ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/05_multiqc/{sample}.{trim10x}.{trimAdapters}.multiqcReport.html"), sample=key, trim10x=value1, trimAdapters=value2) for key, [value1, value2, value3] in testDictQC.items()] - ruleAll=[expand(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/{sample}_illuminaDb.{trim10x}.{trimAdapters}.{kmer}.meryl"), sample=key, kmer=value1, trim10x=value2, trimAdapters=value3) for key, [value1, value2, value3] in testDict.items()] + ruleAll=[expand(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/complete_{sample}_illuminaDb.{trim10x}.{trimAdapters}.{kmer}.meryl"), sample=key, kmer=value1, trim10x=value2, trimAdapters=value3) for key, [value1, value2, value3] in testDict.items()] elif set(['sample', 'hifi_reads', 'meryl_kmer_size' ,'trimSMRTbell', 'fastQC']).issubset(samples.columns): whichRule = "rules/hifi_05_11_21.smk" @@ -123,21 +123,21 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size' ,'trimSMRTbell', 'fastQC']). samples=samples.reset_index() testDict=samples[['sample','meryl_kmer_size', 'smrtornot']] testDict=testDict.set_index(['sample']).T.to_dict('list') - print('test', testDict) +# print('test', testDict) testDictQC=samples[['sample', 'smrtornot', 'fastQC']] testDictQC = testDictQC[testDictQC['fastQC'] == "True"] - print(testDictQC) +# print(testDictQC) testDictQC=testDictQC.drop_duplicates('sample', keep='first') testDictQC=testDictQC.set_index(['sample']).T.to_dict('list') - print() - print(testDictQC) +# print() +# print(testDictQC) runQC = samples[samples['fastQC'] == "True"] drunQCtrim = {} for i in runQC['sample'].unique(): drunQCtrim[i] = [runQC['readCounter'][j] for j in runQC[runQC['sample']==i].index] - print('this is runqc', drunQCtrim) +# print('this is runqc', drunQCtrim) runQC=runQC.set_index(['sample','readCounter']) @@ -166,15 +166,15 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size' ,'trimSMRTbell', 'fastQC']). for i in samples['sample'].unique(): d[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index] samples['gzipped_hifi']=samples['hifi_reads'].apply(gzipped_or_not) - print(samples) +# print(samples) noGzip_hifi = samples[samples['gzipped_hifi'] == False] noGzip_hifi=noGzip_hifi.set_index(['sample','readCounter']) - print(noGzip_hifi) +# print(noGzip_hifi) yesGzip_hifi = samples[samples['gzipped_hifi'] == True] yesGzip_hifi=yesGzip_hifi.set_index(['sample','readCounter']) - print(yesGzip_hifi) +# print(yesGzip_hifi) samples=samples.set_index(['sample','readCounter']) ruleAllQCFiles=[] @@ -185,8 +185,6 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size' ,'trimSMRTbell', 'fastQC']). elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize', 'HiC_R1', 'HiC_R2']).issubset(samples.columns): whichRule = "rules/run_05_11_21.smk" samples['genomeSize']=samples['genomeSize'].apply(genomeSize_auto_or_not) -# samples['genomeSize'] = samples['genomeSize'].fillna(0) -# samples["genomeSize"] = pd.to_numeric(samples["genomeSize"]) ### make new column for whether or not path/file given is gzipped (True or False) samples['gzipped_PRI']=samples['PRI_asm'].apply(gzipped_or_not) @@ -203,19 +201,10 @@ elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize', ' testDictPRETEXT = samples[['ID', 'HiC_R1_present', 'HiC_R2_present']] # testDictPRETEXT = testDictPRETEXT[(testDictPRETEXT['HiC_R1_present'] == True) & (testDictPRETEXT['HiC_R2_present'] == True)] - print(testDictPRETEXT) -# testDictPRETEXT = testDictPRETEXT[(testDictPRETEXT['HiC_R1_present'] == "True") & (testDictPRETEXT['HiC_R2_present'] == "True")] +# print(testDictPRETEXT) testDictPRETEXT=testDictPRETEXT.set_index(['ID']).T.to_dict('list') - print(testDictPRETEXT) +# print(testDictPRETEXT) - # print(testDictPRETEXT) - # - # testDictREADS = samples[['ID', 'merylDB_present','merylDB_kmer']] - # testDictYESREADS = testDictREADS[(testDictREADS['merylDB_present'] == True)] - # testDictYESREADS=testDictYESREADS.set_index(['ID']).T.to_dict('list') - # - # testDictNOREADS = testDictREADS[(testDictREADS['merylDB_present'] == False)] - # testDictNOREADS=testDictNOREADS.set_index(['ID']).T.to_dict('list') noGzip_HiC_R1 = samples[samples['gzipped_HiC_R1'] == False] noGzip_HiC_R1 = noGzip_HiC_R1.set_index(['ID']) @@ -247,12 +236,11 @@ elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize', ' yesGzip_ALT=yesGzip_ALT.set_index(['ID']) samples=samples.set_index(['ID']) - print(samples) +# print(samples) testDict=samples.T.to_dict('list') - print(testDict) +# print(testDict) ruleAllQCFiles=[] -# ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(testDict.keys())), os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf")#expand(os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/markdownForReport.md"), asmID=list(testDict.keys())) - ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(testDict.keys())), os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf"),os.path.join(config['Results'],"1_evaluation/finalResults/FULL_TABLE_PDF.pdf")#expand(os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/markdownForReport.md"), asmID=list(testDict.keys())) + ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(testDict.keys())), os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf"),os.path.join(config['Results'],"1_evaluation/finalResults/FULL_TABLE_PDF.pdf") else: raise ValueError('Sample Sheet for illumina meryl building not recognised. Please make sure you are using the correct sample sheet (prebuilt or non-prebuilt meryl db, etc)') diff --git a/configuration/config.yaml b/configuration/config.yaml index a963a531c05cd88cad0e4b4f659b12100e5b9c85..bc5bf016e1a7cae7c3a99c580080b599c225b737 100644 --- a/configuration/config.yaml +++ b/configuration/config.yaml @@ -1,11 +1,16 @@ -Results: # e.g. "/srv/public/users/james94/insecta_results_05_11_2021" +Results: +#e.g. Results: "/srv/public/users/james94/insecta_results_05_11_2021" -samplesTSV: # e.g. "/srv/public/users/james94/GEP/configuration/buildPRI.tsv" +samplesTSV: +# e.g. samplesTSV: "/srv/public/users/james94/GEP/configuration/buildPRI.tsv" -busco5Lineage: # e.g. "insecta" +busco5Lineage: +# e.g. busco5Lineage: "insecta" -buscoMode: "genome" + + +resources: "configuration/define_resources.yaml" diff --git a/configuration/define_resources.yaml b/configuration/define_resources.yaml index 3098c7c861d3e624bcdaf70add41c16cc0f2b8a1..a91b8df56db23cdc73663ab5d6ba4dee36f8e433 100644 --- a/configuration/define_resources.yaml +++ b/configuration/define_resources.yaml @@ -9,7 +9,6 @@ #### PRETEXT #### unzipHiC_R1: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 500 time: "01:30:05" threads: 4 @@ -21,7 +20,6 @@ unzipHiC_R1: # threads: 1 unzipHiC_R2: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 500 time: "01:30:05" threads: 4 @@ -35,51 +33,43 @@ unzipHiC_R2: pretext_index_PRI_asm: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 72000 time: "02:30:00" threads: 1 pretext_fastq2bam_R1: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 32000 time: "08:00:00" threads: 12 pretext_fastq2bam_R2: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 32000 time: "08:00:00" threads: 12 pretext_filter_5primeEnd_R1: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 200 time: "08:00:00" threads: 4 pretext_filter_5primeEnd_R2: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 200 time: "08:00:00" threads: 4 pretext_filtered_combine: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 9500 time: "08:30:00" threads: 12 pretext_map: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 32000 time: "08:30:00" threads: 12 pretext_snapshot: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 500 time: "00:10:00" threads: 1 @@ -92,7 +82,6 @@ pretext_snapshot: unzipFasta_PRI: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 500 time: "00:15:00" threads: 4 @@ -105,7 +94,6 @@ unzipFasta_PRI: unzipFasta_ALT: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 500 time: "00:15:00" threads: 4 @@ -120,13 +108,11 @@ unzipFasta_ALT: merqury: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 40000 time: "08:00:00" threads: 8 busco5: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 150000 time: "1-00:00:00" threads: 16 @@ -147,14 +133,12 @@ busco5: genomescope2: - jobname: GEP.{rule}.{wildcards.asmID}.{wildcards.kmer} mem_mb: 500 time: "00:15:00" threads: 1 assemblyStats: - jobname: GEP.{rule}.{wildcards.asmID} mem_mb: 500 time: "00:15:00" threads: 1 @@ -200,7 +184,6 @@ assemblyStats: makePDF: - jobname: GEP.{rule} mem_mb: 1000 time: "00:10:05" threads: 1 @@ -208,14 +191,12 @@ makePDF: ###### Rules for illuminadb building ########## unzipFastq_R1: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trim10x}.{wildcards.trimAdapters} mem_mb: 96000 time: "01:45:05" threads: 4 unzipFastq_R2: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trim10x}.{wildcards.trimAdapters} mem_mb: 96000 time: "01:45:05" threads: 4 @@ -263,7 +244,6 @@ unzipFastq_R2: trim10xbarcodes: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trimAdapters} mem_mb: 500 time: "02:30:05" threads: 4 @@ -271,14 +251,12 @@ trim10xbarcodes: trimAdapters: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trim10x} mem_mb: 500 time: "02:30:05" threads: 8 fastqc_Illumina: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trimAdapters}.{wildcards.trim10x} mem_mb: 1000 time: "04:00:05" threads: 4 @@ -290,19 +268,16 @@ fastqc_Illumina: # threads: 1 meryl_R1: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trimAdapters}.{wildcards.trim10x}.{wildcards.kmer} mem_mb: 20000 time: "01:30:05" threads: 12 meryl_R2: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.trimAdapters}.{wildcards.trim10x}.{wildcards.kmer} mem_mb: 20000 time: "01:30:05" threads: 12 meryl_illumina_build: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.trimAdapters}.{wildcards.trim10x}.{wildcards.kmer} mem_mb: 30000 time: "02:45:05" threads: 12 @@ -313,9 +288,9 @@ meryl_illumina_build: ###### HIFI BUILD ####### unzipHifi: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.smrtornot} mem_mb: 128000 - time: "00:30:05" + time: "01:30:05" + threads: 4 # symlinkUnzippedHifi: # jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.smrtornot} @@ -329,27 +304,27 @@ unzipHifi: fastqc_hifi: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.smrtornot} mem_mb: 12000 time: "04:00:05" + threads: 2 multiqc_hifi: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.smrtornot} mem_mb: 4000 time: "01:15:05" + threads: 2 meryl_hifi_count: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter}.{wildcards.smrtornot}.{wildcards.kmer} mem_mb: 96000 time: "02:30:05" + threads: 12 meryl_hifi_build: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.smrtornot}.{wildcards.kmer} mem_mb: 96000 time: "01:30:05" + threads: 12 trimSMRTbell: - jobname: GEP.{rule}.{wildcards.sample}.{wildcards.readCounter} mem_mb: 96000 time: "01:30:05" + threads: 4 diff --git a/configuration/exampleSampleSheets/runEval_example.tsv b/configuration/exampleSampleSheets/runEval_example.tsv index 6ae8666b7ac5b4310aa583c0aa5f19d96192a376..3d6f7a5c0187e764b3aa6b850512f78400fe97e2 100644 --- a/configuration/exampleSampleSheets/runEval_example.tsv +++ b/configuration/exampleSampleSheets/runEval_example.tsv @@ -1,4 +1,4 @@ -ID PRI_asm ALT_asm merylDB merylDB_kmer genomeSize -speciesX_illumina /<pathto>/speciesX_assembly.fasta None /<pathTo>/speciesX_illumina_database.21.meryl 21 -speciesX_HiFi /<pathto>/speciesX_assembly.fasta None /<pathTo>/speciesX_hifi_database.31.meryl 31 -speciesY_illumina /<pathto>/speciesY_assembly_PrimaryHaplotype.fasta /<pathto>/speciesY_assembly_AlternateHaplotype.fasta /<pathTo>/speciesy_illumina_database.21.meryl 21 1050000 +ID PRI_asm ALT_asm merylDB merylDB_kmer genomeSize HiC_R1 HiC_R2 +AssemblyX_illuDB /<pathto>/speciesX_assembly.fasta None /<pathTo>/speciesX_illumina_database.21.meryl 21 auto None None +AssemblyX_HiFiDB /<pathto>/speciesX_assembly.fasta None /<pathTo>/speciesX_hifi_database.31.meryl 31 auto None None +AssemblyY_illuDB /<pathto>/speciesY_assembly_PrimaryHaplotype.fasta /<pathto>/speciesY_assembly_AlternateHaplotype.fasta /<pathTo>/speciesy_illumina_database.21.meryl 21 1050000 None None diff --git a/envs/pretext.yaml b/envs/pretext.yaml index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..571f4324d12f2592b3b48e1469c9262af673bc73 100644 --- a/envs/pretext.yaml +++ b/envs/pretext.yaml @@ -0,0 +1,9 @@ +channels: + - anaconda + - bioconda + - conda-forge +dependencies: + - pretextmap == 0.1.9 + - pretextsnapshot == 0.0.4 + - bwa-mem2 == 2.2.1 + - samtools == 1.14 diff --git a/installGEP.yaml b/installGEP.yaml index b6e00231f674f3dc7a6d763c150c033902ff379b..a09bff16cbc5d3bbae8d2db8143f27a545132f3e 100644 --- a/installGEP.yaml +++ b/installGEP.yaml @@ -5,7 +5,7 @@ channels: - anaconda dependencies: - snakemake==6.6.1 - - python>=3.6.7 + - python>=3.9.10 - tabulate=0.8.7 - beautifulsoup4=4.9 - mamba=0.15.2 diff --git a/rules/hifi_05_11_21.smk b/rules/hifi_05_11_21.smk index 932b989b66c6a6cfb6f5e5a20314516f46e635bd..44ff26aa8b389b0f3bcd926871195b6f434e12dc 100644 --- a/rules/hifi_05_11_21.smk +++ b/rules/hifi_05_11_21.smk @@ -1,17 +1,4 @@ -# def getSamplesInGroup(wildcards): -# return [join(wildcards.sample,s+".intersect.bed") for s in samplesInGroups[wildcards.sample]] -# -# print(getSamplesInGroup) -# - -# def get_fastq(wildcards): -# """Get fastq files of given sample-unit.""" -# fastqs = samples.loc[(wildcards.sample, wildcards.readCounter), ["hifi_reads"]].dropna() -# return {"r1": fastqs.hifi_reads} - -# def fq_to_trimSMRTbell(wildcards): -# return samples.loc[(wildcards.sample, wildcards.readCounter), "hifi_reads"] localrules: symlinkUnzippedHifi, symlinkfornotSmartTrimmed, multiqc_hifi @@ -22,11 +9,12 @@ def fq_to_trimSMRTbell(wildcards): def fq_to_notTrimSMRTbell(wildcards): return notrimSMRTbell.loc[(wildcards.sample, wildcards.readCounter), "hifi_reads"] -# def fq_to_trimSMRTbell(wildcards): -# return trimSMRTbell.loc[(wildcards.sample, wildcards.readCounter, wildcards.kmer ), "hifi_reads"] + + def hifi_gzipped(wildcards): return yesGzip_hifi.loc[(wildcards.sample, wildcards.readCounter), "hifi_reads"] + def hifi_notgzipped(wildcards): return noGzip_hifi.loc[(wildcards.sample, wildcards.readCounter), "hifi_reads"] @@ -44,7 +32,7 @@ rule unzipHifi: threads: resource['unzipHifi']['threads'] resources: - memory=resource['unzipHifi']['memory'], + mem_mb=resource['unzipHifi']['mem_mb'], time=resource['unzipHifi']['time'], shell: """ @@ -65,36 +53,16 @@ rule symlinkUnzippedHifi: ln -s {input.fastq} {output} echo "{input.fastq} no gzipped. Symlink created in place of expected decompressed file." > {log} """ -# def get_readNames(wildcards): -# """Get fastq files of given sample-unit.""" -# readNames1 = samples.loc[(wildcards.sample), ["readNames"]].dropna() -# return {"readNames": readNames1.readNames} - -# def fq_dict_from_sample(wildcards): -# return { -# "fq2": samples_table.loc[wildcards.sample, "fastq2"], -# "fq1": samples_table.loc[wildcards.sample, "fastq1"] -# }00_preProcessing/{sample}/00A_Trimming/10xTrimmed/ + rule trimSMRTbell: input: -# file=lambda wildcards: samples.at[wildcards.sample, 'hifi_reads'] \ -# if wildcards.sample in samples.index else '', - # lambda wildcards: samples.hifi_reads[samples.index == wildcards.sample], -# file=unpack(get_fastq), os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.smrtTrimmed.fastq"), - params: - # names=unpack(get_readCounter), - # base=os.path.join(config['Results'],"{sample}/0_qualityControl/01_fastqc"), - # readName=lambda wildcards: samples.at[wildcards.readCounter, 'test'] \ - # if wildcards.readCounter in samples.index else '', output: -# directory(os.path.join(config['Results'],"{sample}/0_preProcessing/01_trimHifi/")), -# outputFile=os.path.join(config['Results'],"{sample}/0_preProcessing/01_trimHifi/{readCounter}_smrtTrimmed.fastq") outputFile=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.smrtTrimmed.fastq")) threads: resource['trimSMRTbell']['threads'] resources: - memory=resource['trimSMRTbell']['memory'], + mem_mb=resource['trimSMRTbell']['mem_mb'], time=resource['trimSMRTbell']['time'], log: os.path.join(config['Results'], "0_buildDatabases/{sample}/logs/hifiReads/{readCounter}_trimSMRTbell.log") @@ -129,7 +97,7 @@ rule fastqc_hifi: threads: resource['trimSMRTbell']['threads'] resources: - memory=resource['fastqc_hifi']['memory'], + mem_mb=resource['fastqc_hifi']['mem_mb'], time=resource['fastqc_hifi']['time'], log: os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/{readCounter}.{smrtornot}.FastQC.log") @@ -161,18 +129,13 @@ rule multiqc_hifi: rule meryl_hifi_count: input: -# reads= rules.trimSMRTbell.output.outputFile reads=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.{smrtornot}.fastq") - # "/srv/public/users/james94/GEP/configuration/Results_Test/mTamTet1/0_preProcessing/01_trimHifi/hifi1_smrtTrimmed.fastq", - # "/srv/public/users/james94/GEP/configuration/Results_Test/mTamTet1/0_preProcessing/01_trimHifi/hifi2_smrtTrimmed.fastq", - # "/srv/public/users/james94/GEP/configuration/Results_Test/knufia/0_preProcessing/01_trimHifi/hifi2_1_smrtTrimmed.fastq" params: kmer = "{kmer}" -# path= os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/") threads: resource['meryl_hifi_count']['threads'] resources: - memory=resource['meryl_hifi_count']['memory'], + mem_mb=resource['meryl_hifi_count']['mem_mb'], time=resource['meryl_hifi_count']['time'], output: temp(directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/03_merylDb/" + "{readCounter}" + "_hifi_dB.{smrtornot}.{kmer}.meryl"))), @@ -189,16 +152,13 @@ rule meryl_hifi_count: rule meryl_hifi_build: input: -# reads= rules.trimSMRTbell.output.outputFile -# lambda wildcards: expand(config['Results'],"{group}/alignment/{sample}.bam", group=wildcards.group, sample=GROUPS[wildcards.group] lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/03_merylDb/{readCounter}_hifi_dB.{smrtornot}.{kmer}.meryl/"), sample=wildcards.sample, readCounter=d[wildcards.sample], kmer=testDict[wildcards.sample][0], smrtornot=testDict[wildcards.sample][1]) -# reads=os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/" + "{readCounter}" + "_hifi_dB.21.meryl") params: kmer = "{kmer}" threads: resource['meryl_hifi_build']['threads'] resources: - memory=resource['meryl_hifi_build']['memory'], + mem_mb=resource['meryl_hifi_build']['mem_mb'], time=resource['meryl_hifi_build']['time'], output: directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/03_merylDb/complete_hifi_{sample}_dB.{smrtornot}.{kmer}.meryl")), diff --git a/rules/illumina_05_11_21.smk b/rules/illumina_05_11_21.smk index 4c58a1f27503368095d656dc75a1d9acc70b5443..f305f4141c39ee7e78a29183ccdce150bc141d36 100644 --- a/rules/illumina_05_11_21.smk +++ b/rules/illumina_05_11_21.smk @@ -1,31 +1,5 @@ -# def fq1_from_sample_trim10x(wildcards): -# return trim10x_table.loc[(wildcards.sample, wildcards.readCounter), "Library_R1"] -# -# print("this is 10xtrim", trim10x_table) -# -# def fq2_from_sample_trim10x(wildcards): -# return trim10x_table.loc[(wildcards.sample, wildcards.readCounter), 'Library_R2'] -# -# def fq1_from_sample_notrim10xwithAdapt(wildcards): -# return notrim10xwithAdapt.loc[(wildcards.sample, wildcards.readCounter), "Library_R1"] -# -# print("this is notrim10xwithAdapt", notrim10xwithAdapt) -# -# def fq2_from_sample_notrim10xwithAdapt(wildcards): -# return notrim10xwithAdapt.loc[(wildcards.sample, wildcards.readCounter), 'Library_R2'] -# -# -# def fq1_from_sample_notrim10xOrAdapt(wildcards): -# return notrim10xorAdapt.loc[(wildcards.sample, wildcards.readCounter), "Library_R1"] -# -# print("this is notrim10xorAdapt", notrim10xorAdapt) -# def fq2_from_sample_notrim10xOrAdapt(wildcards): -# return notrim10xorAdapt.loc[(wildcards.sample, wildcards.readCounter), 'Library_R2'] -# -# - -localrules: symlinkUnzippedFastq_R1, symlinkUnzippedFastq_R2, symLink_trim10xbarcodes_notrimAdapt, symlinks_no10xOrAdaptTrim, symlink_trim10xbarcodesR2, multiqc_hifi +localrules: symlinkUnzippedFastq_R1, symlinkUnzippedFastq_R2, symLink_trim10xbarcodes_notrimAdapt, symlinks_no10xwithAdaptTrim, symlinks_no10xOrAdaptTrim, symlink_trim10xbarcodesR2, multiqc_hifi @@ -46,7 +20,7 @@ rule unzipFastq_R1: input: assembly=R1_gzipped, output: - temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R1.fastq")), + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R1.fastq"), log: os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.{trimAdapters}_R1_pigzUnzip.log"), conda: @@ -54,7 +28,7 @@ rule unzipFastq_R1: threads: resource['unzipFastq_R1']['threads'] resources: - memory=resource['unzipFastq_R1']['memory'], + mem_mb=resource['unzipFastq_R1']['mem_mb'], time=resource['unzipFastq_R1']['time'] shell: """ @@ -65,9 +39,7 @@ rule symlinkUnzippedFastq_R1: input: assembly=R1_notgzipped, output: - temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R1.fastq")), - # threads: - # 1 + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R1.fastq"), shell: """ ln -s {input} {output} @@ -85,7 +57,7 @@ rule unzipFastq_R2: threads: resource['unzipFastq_R2']['threads'] resources: - memory=resource['unzipFastq_R2']['memory'], + mem_mb=resource['unzipFastq_R2']['mem_mb'], time=resource['unzipFastq_R2']['time'] shell: """ @@ -96,9 +68,7 @@ rule symlinkUnzippedFastq_R2: input: assembly=R2_notgzipped, output: - temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R2.fastq")), - # threads: - # 1 + os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R2.fastq"), shell: """ ln -s {input} {output} @@ -108,14 +78,12 @@ rule symlinkUnzippedFastq_R2: rule trim10xbarcodes: input: read1=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.10xTrimmed.{trimAdapters}_R1.fastq"), -# read2=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.10xTrimmed.{trimAdapters}_R2.fastq") output: - read1=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.{trimAdapters}_Read1.fastq")), -# read2=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.{trimAdapters}_Read2.fastq")) + read1=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.{trimAdapters}_Read1.fastq"), threads: resource['trim10xbarcodes']['threads'] resources: - memory=resource['trim10xbarcodes']['memory'], + mem_mb=resource['trim10xbarcodes']['mem_mb'], time=resource['trim10xbarcodes']['time'] log: os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.10xTrimmed.10BarcodeRemoval_Trimmomatic.{trimAdapters}.log") @@ -125,16 +93,13 @@ rule trim10xbarcodes: """ (trimmomatic SE -threads {threads} {input.read1} {output.read1} HEADCROP:23) &> {log} """ -# ln -s {input.read2} {output.read2} rule symlink_trim10xbarcodesR2: input: read2=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.10xTrimmed.{trimAdapters}_R2.fastq") output: - read2=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.{trimAdapters}_Read2.fastq")) - # threads: - # 1 + read2=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.{trimAdapters}_Read2.fastq") conda: "../envs/pigz.yaml" shell: @@ -149,10 +114,9 @@ rule symLink_trim10xbarcodes_notrimAdapt: read1=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.notAdaptTrimmed_Read1.fastq"), read2=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.notAdaptTrimmed_Read2.fastq"), output: - read1=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.notAdaptTrimmed_val_1.fq")), - read2=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.notAdaptTrimmed_val_2.fq")) - # threads: - # 1 + read1=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.notAdaptTrimmed_val_1.fq"), + read2=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.10xTrimmed.notAdaptTrimmed_val_2.fq") + shell: """ ln -s {input.read1} {output.read1} @@ -164,10 +128,8 @@ rule symlinks_no10xOrAdaptTrim: read1=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.not10xTrimmed.notAdaptTrimmed_R1.fastq"), read2=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.not10xTrimmed.notAdaptTrimmed_R2.fastq") output: - read1=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.notAdaptTrimmed_val_1.fq")), - read2=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.notAdaptTrimmed_val_2.fq")) - # threads: - # 1 + read1=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.notAdaptTrimmed_val_1.fq"), + read2=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.notAdaptTrimmed_val_2.fq") shell: """ ln -s {input.read1} {output.read1} @@ -179,10 +141,8 @@ rule symlinks_no10xwithAdaptTrim: read1=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.not10xTrimmed.AdaptTrimmed_R1.fastq"), read2=os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.not10xTrimmed.AdaptTrimmed_R2.fastq") output: - read1=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.AdaptTrimmed_Read1.fastq")), - read2=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.AdaptTrimmed_Read2.fastq")) - # threads: - # 1 + read1=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.AdaptTrimmed_Read1.fastq"), + read2=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.not10xTrimmed.AdaptTrimmed_Read2.fastq") shell: """ ln -s {input.read1} {output.read1} @@ -196,14 +156,13 @@ rule trimAdapters: params: outputDir=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/"), r1_prefix="{readCounter}.{trim10x}.AdaptTrimmed", -# r2_rename=os.path.join(config['Results'],"{sample}/0_preProcessing/01_trim/{read1Prefix}_val_2.fq") output: read1=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.AdaptTrimmed_val_1.fq")), read2=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.AdaptTrimmed_val_2.fq")) threads: resource['trimAdapters']['threads'] resources: - memory=resource['trimAdapters']['memory'], + mem_mb=resource['trimAdapters']['mem_mb'], time=resource['trimAdapters']['time'], log: os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.AdaptTrimmed_tGalore.log") @@ -228,7 +187,7 @@ rule fastqc_Illumina: threads: resource['fastqc_Illumina']['threads'] resources: - memory=resource['fastqc_Illumina']['memory'], + mem_mb=resource['fastqc_Illumina']['mem_mb'], time=resource['fastqc_Illumina']['time'], conda: "../envs/pigz.yaml" @@ -247,13 +206,11 @@ rule multiqc_hifi: os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/05_multiqc/{sample}.{trim10x}.{trimAdapters}.multiqcReport.html") log: os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{sample}.{trim10x}.{trimAdapters}_multiqc.log") - # threads: - # 1 conda: "../envs/pigz.yaml" shell: "(multiqc {params.folder2qc} -o {params.folder2out} -n {params.filename}) &> {log}" -# rule multiqc: + rule meryl_R1: @@ -261,11 +218,10 @@ rule meryl_R1: read1= os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.{trimAdapters}_val_1.fq") params: kmer = "{kmer}", -# path= os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/") threads: resource['meryl_R1']['threads'] resources: - memory=resource['meryl_R1']['memory'], + mem_mb=resource['meryl_R1']['mem_mb'], time=resource['meryl_R1']['time'], output: temp(directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/{readCounter}.{trim10x}.{trimAdapters}_R1.{kmer}.meryl"))) @@ -286,11 +242,10 @@ rule meryl_R2: read2= os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.{trimAdapters}_val_2.fq") params: kmer = "{kmer}", -# path= os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/") threads: resource['meryl_R2']['threads'] resources: - memory=resource['meryl_R2']['memory'], + mem_mb=resource['meryl_R2']['mem_mb'], time=resource['meryl_R2']['time'], output: temp(directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/{readCounter}.{trim10x}.{trimAdapters}_R2.{kmer}.meryl"))) @@ -309,21 +264,22 @@ rule meryl_R2: rule meryl_illumina_build: input: -# reads= rules.trimSMRTbell.output.outputFile{readCounter}.{trim10x}.{trimAdapters} -# lambda wildcards: expand(config['Results'],"{group}/alignment/{sample}.bam", group=wildcards.group, sample=GROUPS[wildcards.group] + removeReads1=lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.{trimAdapters}_val_1.fq"), sample=wildcards.sample, readCounter=d[wildcards.sample], trim10x=testDict[wildcards.sample][1], trimAdapters=testDict[wildcards.sample][2]), + removeReads2=lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/{readCounter}.{trim10x}.{trimAdapters}_val_2.fq"), sample=wildcards.sample, readCounter=d[wildcards.sample], trim10x=testDict[wildcards.sample][1], trimAdapters=testDict[wildcards.sample][2]), read1=lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/03_merylDb/{readCounter}.{trim10x}.{trimAdapters}_R1.{kmer}.meryl"), sample=wildcards.sample, readCounter=d[wildcards.sample], trim10x=testDict[wildcards.sample][1], kmer=testDict[wildcards.sample][0], trimAdapters=testDict[wildcards.sample][2]), read2=lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/03_merylDb/{readCounter}.{trim10x}.{trimAdapters}_R2.{kmer}.meryl"), sample=wildcards.sample, readCounter=d[wildcards.sample], trim10x=testDict[wildcards.sample][1], kmer=testDict[wildcards.sample][0], trimAdapters=testDict[wildcards.sample][2]) -# reads=os.path.join(config['Results'], "{sample}" +"/2_QVstats_merylAndMerqury/" + "{readCounter}" + "_hifi_dB.21.meryl") params: + removeReadDIR_trimmed=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/02_trimReads/"), + removeReadDIR_unzipped=os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/"), kmer = "{kmer}", path= os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/") threads: resource['meryl_illumina_build']['threads'] resources: - memory=resource['meryl_illumina_build']['memory'], + mem_mb=resource['meryl_illumina_build']['mem_mb'], time=resource['meryl_illumina_build']['time'], output: - directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/{sample}_illuminaDb.{trim10x}.{trimAdapters}.{kmer}.meryl")), + directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/complete_{sample}_illuminaDb.{trim10x}.{trimAdapters}.{kmer}.meryl")), log: os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{sample}_hifi_{kmer}.{trim10x}.{trimAdapters}.log") priority: @@ -333,5 +289,7 @@ rule meryl_illumina_build: shell: """ export OMP_NUM_THREADS={threads} - (meryl union-sum {input} output {output}) &> {log} + (meryl union-sum {input.read1} {input.read2} output {output}) &> {log} + rm -r {params.removeReadDIR_trimmed} + rm -r {params.removeReadDIR_unzipped} """ diff --git a/rules/run_05_11_21.smk b/rules/run_05_11_21.smk index a91a08c512d17ddc25004bee8c6d39c7fcde6831..1122edf5f6f79062950793120a21c93ae4c0fe09 100644 --- a/rules/run_05_11_21.smk +++ b/rules/run_05_11_21.smk @@ -50,22 +50,6 @@ def merylDB(wildcards): return samples.loc[(wildcards.asmID), "merylDB"] - - # print(fastas) - # print(fastas) -# def ALT_asm(wildcards): -# return samples.loc[(wildcards.ALT_asm), "ALT_asm"] -# -# def PRI_asm_gzipped(wildcards): -# fastas=samples.loc[(wildcards.asmID), "PRI_asm"] -# newList=[] -# for entry in fastas: -# if entry.endswith(".gz"): -# print(entry) -# print(newList) -# return newList - - localrules: symlinkUnzippedHiC_R1, symlinkUnzippedHiC_R2, symlinkUnzippedFasta_PRI, symlinkUnzippedFasta_ALT, moveBuscoOutputs, saveConfiguration_and_getKeyValues_kmer, saveConfiguration_and_getKeyValues, aggregateAllAssemblies, makeReport, pretextMaps2md, addFullTable, aggregateReport def HiC_R1_gzipped(wildcards): @@ -83,7 +67,7 @@ rule unzipHiC_R1: threads: resource['unzipHiC_R1']['threads'] resources: - memory=resource['unzipHiC_R1']['memory'], + mem_mb=resource['unzipHiC_R1']['mem_mb'], time=resource['unzipHiC_R1']['time'], shell: """ @@ -121,7 +105,7 @@ rule unzipHiC_R2: threads: resource['unzipHiC_R2']['threads'] resources: - memory=resource['unzipHiC_R2']['memory'], + mem_mb=resource['unzipHiC_R2']['mem_mb'], time=resource['unzipHiC_R2']['time'] shell: """ @@ -164,7 +148,7 @@ rule pretext_index_PRI_asm: threads: resource['pretext_index_PRI_asm']['threads'] resources: - memory=resource['pretext_index_PRI_asm']['memory'], + mem_mb=resource['pretext_index_PRI_asm']['mem_mb'], time=resource['pretext_index_PRI_asm']['time'] shell: """ @@ -189,7 +173,7 @@ rule pretext_fastq2bam_R1: threads: resource['pretext_fastq2bam_R1']['threads'] resources: - memory=resource['pretext_fastq2bam_R1']['memory'], + mem_mb=resource['pretext_fastq2bam_R1']['mem_mb'], time=resource['pretext_fastq2bam_R1']['time'] log: os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.fastq2bam.R1.log") @@ -215,7 +199,7 @@ rule pretext_fastq2bam_R2: threads: resource['pretext_fastq2bam_R2']['threads'] resources: - memory=resource['pretext_fastq2bam_R2']['memory'], + mem_mb=resource['pretext_fastq2bam_R2']['mem_mb'], time=resource['pretext_fastq2bam_R2']['time'] log: os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.fastq2bam.R2.log") @@ -243,7 +227,7 @@ rule pretext_filter_5primeEnd_R1: threads: resource['pretext_filter_5primeEnd_R1']['threads'] resources: - memory=resource['pretext_filter_5primeEnd_R1']['memory'], + mem_mb=resource['pretext_filter_5primeEnd_R1']['mem_mb'], time=resource['pretext_filter_5primeEnd_R1']['time'] log: os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.filter5end.R1.log") @@ -271,7 +255,7 @@ rule pretext_filter_5primeEnd_R2: threads: resource['pretext_filter_5primeEnd_R2']['threads'] resources: - memory=resource['pretext_filter_5primeEnd_R2']['memory'], + mem_mb=resource['pretext_filter_5primeEnd_R2']['mem_mb'], time=resource['pretext_filter_5primeEnd_R2']['time'] log: os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.filter5end.R2.log") @@ -294,7 +278,7 @@ rule pretext_filtered_combine: threads: resource['pretext_filtered_combine']['threads'] resources: - memory=resource['pretext_filtered_combine']['memory'], + mem_mb=resource['pretext_filtered_combine']['mem_mb'], time=resource['pretext_filtered_combine']['time'] log: os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.log") @@ -313,7 +297,7 @@ rule pretext_map: threads: resource['pretext_map']['threads'] resources: - memory=resource['pretext_map']['memory'], + mem_mb=resource['pretext_map']['mem_mb'], time=resource['pretext_map']['time'] log: os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.PretextMap.log") @@ -327,7 +311,6 @@ rule pretext_snapshot: pretextFile=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.pretext") output: pretextSnapshotFULL=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED_FullMap.png"), -# pretextCP2keyResults=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}.pretext_hiC_FullMap.png") params: outDirectory=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/") conda: @@ -335,7 +318,7 @@ rule pretext_snapshot: threads: resource['pretext_snapshot']['threads'] resources: - memory=resource['pretext_snapshot']['memory'], + mem_mb=resource['pretext_snapshot']['mem_mb'], time=resource['pretext_snapshot']['time'] log: os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.PretextSnapshot.log") @@ -362,7 +345,7 @@ rule unzipFasta_PRI: threads: resource['unzipFasta_PRI']['threads'] resources: - memory=resource['unzipFasta_PRI']['memory'], + mem_mb=resource['unzipFasta_PRI']['mem_mb'], time=resource['unzipFasta_PRI']['time'] shell: """ @@ -401,7 +384,7 @@ rule unzipFasta_ALT: threads: resource['unzipFasta_ALT']['threads'] resources: - memory=resource['unzipFasta_ALT']['memory'], + mem_mb=resource['unzipFasta_ALT']['mem_mb'], time=resource['unzipFasta_ALT']['time'] shell: """ @@ -439,16 +422,14 @@ rule merqury: assemblyPRI=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), assemblyALT=altFile, merylDB=merylDB, - # checkunzip=os.path.join(config['Results'], "01_evaluation/{asmID}/logs/{asmID}_pigzUnzip.log") params: outFile= "{asmID}" + "_merqOutput", outPath= os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT"), - # symlink_fasta=os.path.join(config['Results'], "01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}.fasta"), symlink_merylDB=directory(os.path.join(config['Results'], "1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.meryl")) threads: resource['merqury']['threads'] resources: - memory=resource['merqury']['memory'], + mem_mb=resource['merqury']['mem_mb'], time=resource['merqury']['time'] output: os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"), @@ -481,22 +462,16 @@ rule busco5: input: assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), lineage = os.path.join(workflow.basedir, "buscoLineage/" + buscoDataBaseName + "_odb10"), - # checkunzip=os.path.join(config['Results'], "01_evaluation/{asmID}/logs/{asmID}_pigzUnzip.log") params: - mode = config['buscoMode'], assemblyName = "{asmID}", chngDir = os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO") threads: resource['busco5']['threads'] resources: - memory=resource['busco5']['memory'], + mem_mb=resource['busco5']['mem_mb'], time=resource['busco5']['time'] output: -# report(os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/short_summary.specific." + config['busco5Lineage'] + "_odb10." + "{sample}" + ".txt"), caption="../report/busco.rst", category="Benchmark Universal Single Copy Orthologs", subcategory="{fastq}") os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), - # symlink = os.path.join(config['Results'], "{fastq}" + "/{sample}.fasta") -# mvRunBusco= directory(os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/run_" + config['busco5Lineage'] + "_odb10")), -# blastDB= directory(os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/blast_db")) conda: "../envs/busco_and_assembly.yaml" log: @@ -506,24 +481,18 @@ rule busco5: shell: """ cd {params.chngDir} - (busco -m {params.mode} --offline --in {input.assembly} -o {params.assemblyName} -l {input.lineage} -c {threads} -f --limit 5) &> {log} + (busco -m genome --offline --in {input.assembly} -o {params.assemblyName} -l {input.lineage} -c {threads} -f --limit 5) &> {log} """ -# ln -s {input.assembly} {output.symlink} rule moveBuscoOutputs: input: buscoResult=os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), -# mvRunBusco= os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/run_" + config['busco5Lineage'] + "_odb10"), -# blastDB= os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/blast_db") params: rmDir= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}"), -# logs = os.path.join(config['Results'], "{sample}" + "/busco5/" + "{sample}" + "/logs/*"), -# mvLogs= os.path.join(config['Results'], "{sample}" + "/busco5/logs/"), mvRunBusco= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/run_" + buscoDataBaseName + "_odb10"), mvRunBuscoDest= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO"), logs= os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/{asmID}/logs") -# blastDB= os.path.join(config['Results'], "{sample}" + "/busco5/blast_db") output: file = os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), # threads: @@ -548,12 +517,10 @@ rule genomescope2: name="{asmID}_k{kmer}", kmer="{kmer}", cpHist=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/merylDB_providedFor_{asmID}_10000.hist") -# findGscope=os.path.join(workflow.basedir) output: summary=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"), logPlot=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"), linearPlot=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"), -# estimatedSize21=os.path.join(config['Results'], "{sample}" +"/3_genomescopeProfile/" + "{sample}_k21" + "_sizeEst.txt"), estimatedSize=temp(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_sizeEst.txt")) conda: "../envs/genomescope.yaml" @@ -562,7 +529,7 @@ rule genomescope2: threads: resource['genomescope2']['threads'] resources: - memory=resource['genomescope2']['memory'], + mem_mb=resource['genomescope2']['mem_mb'], time=resource['genomescope2']['time'] shell: """ @@ -570,13 +537,7 @@ rule genomescope2: (genomescope2 -i {params.cpHist} -o {params.outFolder} -k {params.kmer} -n {params.name}) &> {log} grep "Genome Haploid Length" {output.summary} | awk {{'print $6'}} | sed 's/,//g'> {output.estimatedSize} """ - # expand -t 1 {input.hist} > {params.cpHist} - # """ - # head -n 10000 {input.hist} > {params.cpHist} - # export genomescope2script=$(find {params.findGscope} -name genomescope2) - # Rscript $genomescope2script -i {params.cpHist} -o {params.outFolder} -k 21 -n {wildcards.sample} - # grep "Genome Haploid Length" {output.summary} | awk {{'print $6'}} | sed 's/,//g'> {output.estimatedSize} - # """ + rule assemblyStats: input: assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), @@ -592,7 +553,7 @@ rule assemblyStats: threads: resource['assemblyStats']['threads'] resources: - memory=resource['assemblyStats']['memory'], + mem_mb=resource['assemblyStats']['mem_mb'], time=resource['assemblyStats']['time'] shell: """ @@ -604,24 +565,17 @@ rule assemblyStats: rule saveConfiguration_and_getKeyValues_kmer: input: -# checkLog=os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/logs/" + "{sample}" + "_merq.spectra-cn.log"), gscopeSum=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"), gscopeLog=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"), gscopeLin=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"), params: keyValues=os.path.join(config['Results'],"1_evaluation/{asmID}/01E_keyResults/results_withoutRowNames.tsv") output: - # newSampleSheet=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedSampleSheet.tsv"), - # newConfigFile=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedConfig.yaml"), -# merqLog=os.path.join(config['Results'],"logs/2_merylAndMerq/" + "{sample}" + "_merqury.spectra-cn.log"), gscopeSum=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_summary.txt"), gscopeLog=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_log_plot.png"), gscopeLin=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_linear_plot.png") # threads: # 1 -# os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), -# multiqc=os.path.join(config['Results'],"{sample}/5_31_Key_Results/{sample}_multiqc_report.html") - # aggregateTsv=os.path.join(config['Results'],"{sample}"+ "/individual_aggregated_results/aggregate.tsv") conda: "../envs/dos2unix.yaml" shell: @@ -633,105 +587,12 @@ rule saveConfiguration_and_getKeyValues_kmer: # -# rule saveConfiguration_and_getKeyValues: -# input: -# # checkLog=os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/logs/" + "{sample}" + "_merq.spectra-cn.log"), gscopeSum=os.path.join(config['Results'], "01_evaluation/{asmID}/01E_keyResults/{asmID}_k{kmer}_summary.txt"), -# gscopeSum=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"), asmID=wildcards.asmID, kmer=testDict[wildcards.asmID][3]), -# gscopeLog=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"), asmID=wildcards.asmID, kmer=testDict[wildcards.asmID][3]), -# gscopeLin=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"), asmID=wildcards.asmID, kmer=testDict[wildcards.asmID][3]), -# # keyValues=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/results_withoutRowNames.tsv"), -# busco=os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), -# qv=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"), -# completeness=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.completeness.stats"), -# spectraStacked_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"), -# spectraFlat_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), -# spectraStacked_both=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.st.png"), -# spectraFlat_both=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.fl.png"), -# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_merqury.log"), -# # os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), -# scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_scaffold_stats.tsv"), -# contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_contig_stats.tsv"), -# pretextSnapshotFULL=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED_FullMap.png") -# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), -# # multiqc=os.path.join(config['Results'],"{sample}/0_qualityControl/02_multiqc/{sample}_multiqc_report.html") -# params: -# # sampleSheet= config['samplesTSV'], -# # config=os.path.join(workflow.basedir, "configuration/config.yaml"), -# resultsPath=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults"), -# keyValues2=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/results_withoutRowNames_2.tsv"), -# keyValues=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/results_withoutRowNames.tsv"), -# rowNames=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/rowNames.tsv") -# # masteraggregate=os.path.join(config['Results'],"{sample}"+ "/keyResults/aggregate.tsv") -# output: -# keyValuesWithRowNames=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), -# busco=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), -# buscoScores=temp(os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/only_buscoScores_{asmID}.txt")), -# qv=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.qv"), -# completeness=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.completeness.stats"), -# spectraStacked_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"), -# spectraFlat_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), -# spectraStacked_both=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.st.png"), -# spectraFlat_both=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.fl.png"), -# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_merqury.log"), -# # os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), -# scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_scaffold_stats.tsv"), -# contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_contig_stats.tsv"), -# pretextCP2keyResults=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}.pretext_hiC_FullMap.png") -# threads: -# 1 -# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), -# # multiqc=os.path.join(config['Results'],"{sample}/5_31_Key_Results/{sample}_multiqc_report.html") -# # aggregateTsv=os.path.join(config['Results'],"{sample}"+ "/individual_aggregated_results/aggregate.tsv") -# conda: -# "../envs/dos2unix.yaml" -# shell: -# """ -# head -n 15 {input.busco} | tail -n 7 | sed "s/^['\t']*//" > {output.buscoScores} -# cp {input.pretextSnapshotFULL} {output.pretextCP2keyResults} -# cp {input.busco} {output.busco} -# cp {input.completeness} {output.completeness} -# cp {input.qv} {output.qv} -# cp {input.spectraStacked_PRI} {output.spectraStacked_PRI} -# cp {input.spectraFlat_PRI} {output.spectraFlat_PRI} -# cp {input.spectraFlat_both} {output.spectraFlat_both} -# cp {input.spectraStacked_both} {output.spectraStacked_both} -# cp {input.scaffStats} {output.scaffStats} -# cp {input.contStats} {output.contStats} -# echo "{wildcards.asmID}" >> {params.keyValues} -# echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} -# echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {params.keyValues} -# echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues} -# echo "$(grep 'gc_content' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} -# echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'longest (bp)' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} -# echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} -# echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} -# echo "$(grep 'longest (bp)' {input.contStats} | awk {{'print $3'}})" >> {params.keyValues} -# echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues} -# echo "$(grep 'N95' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'NG95' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues} -# echo "$(awk {{'print $4'}} {input.qv})" | head -n 1 >> {params.keyValues} -# echo "$(awk {{'print $5'}} {input.completeness})" | head -n 1 >> {params.keyValues} -# echo "$(grep 'C:' {input.busco} | awk -F'[:\[,]' {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'C:' {input.busco} | awk -F'[:\[,]' {{'print $4'}})" >> {params.keyValues} -# dos2unix {params.keyValues} -# echo -e "Assembly\ntotal_num_bases\nsize_estimate\nmax_heterozygosity\nGC_Content\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps\nLongest_scaffold\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nLongest_contig\nContig_N50_length\nContig_NG50_length\nContig_N95_length\nContig_NG95_length\nqv_score\nkmer_completeness\nComplete_Buscos\nComplete_single_buscos" > {params.rowNames} -# paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames} -# rm {params.rowNames} {params.keyValues} -# """ rule saveConfiguration_and_getKeyValues: input: -# checkLog=os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/logs/" + "{sample}" + "_merq.spectra-cn.log"), gscopeSum=os.path.join(config['Results'], "01_evaluation/{asmID}/01E_keyResults/{asmID}_k{kmer}_summary.txt"), gscopeSum=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"), asmID=wildcards.asmID, kmer=testDict[wildcards.asmID][3]), gscopeLog=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"), asmID=wildcards.asmID, kmer=testDict[wildcards.asmID][3]), gscopeLin=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"), asmID=wildcards.asmID, kmer=testDict[wildcards.asmID][3]), -# keyValues=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/results_withoutRowNames.tsv"), busco=os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), qv=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"), completeness=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.completeness.stats"), @@ -739,20 +600,13 @@ rule saveConfiguration_and_getKeyValues: spectraFlat_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), spectraStacked_both=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.st.png"), spectraFlat_both=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.fl.png"), -# os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_merqury.log"), -# os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_scaffold_stats.tsv"), contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_contig_stats.tsv"), -# os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), -# multiqc=os.path.join(config['Results'],"{sample}/0_qualityControl/02_multiqc/{sample}_multiqc_report.html") params: - # sampleSheet= config['samplesTSV'], - # config=os.path.join(workflow.basedir, "configuration/config.yaml"), resultsPath=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults"), keyValues2=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/results_withoutRowNames_2.tsv"), keyValues=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/results_withoutRowNames.tsv"), rowNames=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/rowNames.tsv") - # masteraggregate=os.path.join(config['Results'],"{sample}"+ "/keyResults/aggregate.tsv") output: keyValuesWithRowNames=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), busco=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), @@ -763,15 +617,10 @@ rule saveConfiguration_and_getKeyValues: spectraFlat_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), spectraStacked_both=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.st.png"), spectraFlat_both=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.fl.png"), -# os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_merqury.log"), -# os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_scaffold_stats.tsv"), contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_contig_stats.tsv"), # threads: # 1 -# os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), -# multiqc=os.path.join(config['Results'],"{sample}/5_31_Key_Results/{sample}_multiqc_report.html") - # aggregateTsv=os.path.join(config['Results'],"{sample}"+ "/individual_aggregated_results/aggregate.tsv") conda: "../envs/dos2unix.yaml" shell: @@ -814,67 +663,6 @@ rule saveConfiguration_and_getKeyValues: rm {params.rowNames} {params.keyValues} """ - #rm {params.rowNames} {params.keyValues} # """ - # head -n 15 {input.busco} | tail -n 7 | sed "s/^['\t']*//" > {output.buscoScores} - # cp {input.busco} {output.busco} - # cp {input.completeness} {output.completeness} - # cp {input.qv} {output.qv} - # cp {input.spectraStacked_PRI} {output.spectraStacked_PRI} - # cp {input.spectraFlat_PRI} {output.spectraFlat_PRI} - # cp {input.spectraFlat_both} {output.spectraFlat_both} - # cp {input.spectraStacked_both} {output.spectraStacked_both} - # cp {input.scaffStats} {output.scaffStats} - # cp {input.contStats} {output.contStats} - # echo "{wildcards.asmID}" >> {params.keyValues} - # echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}} | sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} | sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues} - # echo "$(grep 'gc_content' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} - # echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}} | sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'longest (bp)' {input.scaffStats} | awk {{'print $3'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'longest (bp)' {input.contStats} | awk {{'print $3'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'N95' {input.contStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(grep 'NG95' {input.contStats} | awk {{'print $4'}}| sed 's/,//g')" >> {params.keyValues} - # echo "$(awk {{'print $4'}} {input.qv})" | head -n 1 >> {params.keyValues} - # echo "$(awk {{'print $5'}} {input.completeness})" | head -n 1 >> {params.keyValues} - # echo "$(grep 'C:' {input.busco} | awk -F'[:\[,]' {{'print $2'}})" >> {params.keyValues} - # echo "$(grep 'C:' {input.busco} | awk -F'[:\[,]' {{'print $4'}})" >> {params.keyValues} - # dos2unix {params.keyValues} - # echo -e "Assembly\ntotal_num_bases\nsize_estimate\nmax_heterozygosity\nGC_Content\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps\nLongest_scaffold\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nLongest_contig\nContig_N50_length\nContig_NG50_length\nContig_N95_length\nContig_NG95_length\nqv_score\nkmer_completeness\nComplete_Buscos\nComplete_single_buscos" > {params.rowNames} - # paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames} - # rm {params.rowNames} {params.keyValues} - # """ - -# rule aggregateAllAssemb| sed 's/,//g'lies: -# input: -# allResults=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(testDict.keys())), -# sampleSheet= config['samplesTSV'], -# config=os.path.join(workflow.basedir, "configuration/config.yaml") -# output: -# results=os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), -# # rows=os.path.join(config['Results'],"allAssemblies_keyResults/aggregate_rows.tsv"), -# newSampleSheet=os.path.join(config['Results'],"1_evaluation/finalResults/savedSampleSheet.tsv"), -# newConfigFile=os.path.join(config['Results'],"1_evaluation/finalResults/savedConfig.yaml") -# threads: -# 1 -# shell: -# """ -# cp {input.sampleSheet} {output.newSampleSheet} -# cp {input.config} {output.newConfigFile} -# echo -e "Assembly\ntotal_num_bases\nsize_estimate\nmax_heterozygosity\nGC_Content\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps\nLongest_scaffold\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nLongest_contig\nContig_N50_length\nContig_NG50_length\nContig_N95_length\nContig_NG95_length\nqv_score\nkmer_completeness\nComplete_Buscos\nComplete_single_buscos" | \ -# paste -d'\t' - {input.allResults} | \ -# awk -F'\t' '{{s="";for (i=1;i<=NF;i+=2) {{s=s?s FS $i:$i}} print s}}' | \ -# column -t > {output.results} -# printf "All assembly evaluation results aggregated successfully. Please find these results at {output.results}" -# """ rule aggregateAllAssemblies: input: @@ -923,50 +711,6 @@ rule makeReport: -# rule makeReportLandingPage1: -# input: -# [expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_markdownForReport.md"), asmID=key, kmer=value4) for key, [value1, value2, value3, value4, value5, value6, value7] in testDict.items()], -# landingPage=os.path.join(workflow.basedir, "scripts/reportLandingPage.md") -# output: -# temp(os.path.join(config['Results'],"1_evaluation/evaluationRun_DAG.png")) -# shell: -# """ -# snakemake --dag | dot -Tpng > {output[0]} -# """ -# -# rule makeReportLandingPage2: -# input: -# os.path.join(config['Results'],"1_evaluation/evaluationRun_DAG.png"), -# landingPage=os.path.join(workflow.basedir, "scripts/reportLandingPage.md") -# output: -# temp(os.path.join(config['Results'],"1_evaluation/landingPageWith_DAG.md")) -# shell: -# """ -# awk '{{sub("PLACEHOLDER","{input[0]}")}}1' {input.landingPage} > {output[0]} -# """ - -# def fq2_from_sample_notrim10xOrAdapt(wildcards): -# return notrim10xorAdapt.loc[(wildcards.sample, wildcards.readCounter), 'Library_R2'] -# -# rule pretextMaps2md: -# input: -# [expand(os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}.pretext_hiC_FullMap.png"), asmID=list(testDictPRETEXT.keys()))] -# output: -# os.path.join(config['Results'],"1_evaluation/finalResults/FullPRETEXTMarkdown.md") -# params: -# assemblyName='{wildcards.asmID}' -# threads: -# 1 -# run: -# with open(output[0], "w") as out: -# for i in range(0,len(input)): -# print("", file=out) -# print(params.assemblyName, file=out) -# print("", file=out) -# print("", file=out) -# print("", file=out) - - rule pretextMaps2md: input: PretextMap=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED_FullMap.png") @@ -989,22 +733,6 @@ rule pretextMaps2md: # print("", file=out) -# testDictPRETEXT -# def IndividualPretextMD(wildcards): -# if samples.loc[(wildcards.asmID), "ALT_present"] == True: -# return os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.ALT.fasta") -# else: -# return os.path.join(workflow.basedir, "scripts/ALT_missing.fasta") - -# rule aggregatePretextMaps2md: -# input: -# IndividualPretextMD=expand(os.path.join(config['Results'],"1_evaluation/finalResults/{asmID}_FullPRETEXTMarkdown.md"), asmID=wildcards.asmID) -# output: -# os.path.join(config['Results'],"1_evaluation/finalResults/FullPRETEXTMarkdown.md") -# shell: -# """ -# cat {input.IndividualPretextMD} > {output} -# """ rule addFullTable: input: results=os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv") @@ -1046,7 +774,7 @@ rule addFullTable: rule aggregateReport: input: - indivMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_markdownForReport.md"), asmID=key, kmer=value4) for key, [value1, value2, value3, value4, value5, value6, value7, value8, value9, value10, value11, value12, value13, value14] in testDict.items()], + indivMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_markdownForReport.md"), asmID=key, kmer=value4) for key, [value1, value2, value3, value4, value5, value6, value7, value8, value9, value10, value11, value12, value13, value14, value15] in testDict.items()], landingPage=os.path.join(workflow.basedir, "scripts/reportLandingPage.md"), landingPageTABLE=os.path.join(workflow.basedir, "scripts/reportTABLELandingPage.md"), endTableMD=os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown.md"), @@ -1076,15 +804,10 @@ rule makePDF: threads: resource['makePDF']['threads'] resources: - memory=resource['makePDF']['memory'], + mem_mb=resource['makePDF']['mem_mb'], time=resource['makePDF']['time'] shell: """ (pandoc -o {output.pdf_report} {input.md_report} --pdf-engine=tectonic) &>> {log} (pandoc -o {output.pdf_comparison_table} {input.md_comparison_table} --pdf-engine=tectonic) &>> {log} """ - - # echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nN50_length\nNG50_length\nN95_length\nNG95_length\nN100_length\nNG100_length\ntotal_num_bases\ntotal_num_scaffolds" > {output.rows} - # paste -d'\t' {output.rows} {input.allResults} > {output.results} - # awk '{print $1"\t"$3"\t"$5}' {output.results} | column -t {output.results} -#