From 5d44efe6e61e3b11db6b2d8c8d46f2530507269c Mon Sep 17 00:00:00 2001 From: james94 <james94@mi.fu-berlin.de> Date: Sat, 6 Aug 2022 12:45:43 +0200 Subject: [PATCH] comments --- Snakefile | 82 ++++++++++++------------------------------------------- 1 file changed, 18 insertions(+), 64 deletions(-) diff --git a/Snakefile b/Snakefile index 3c70c40..3502a10 100644 --- a/Snakefile +++ b/Snakefile @@ -13,28 +13,34 @@ import argparse, subprocess container: "docker://gepdocker/gep_dockerimage:latest" configfile: "configuration/config.yaml" +### LOAD IN RESOURCES CONFIG ### with open(config['resources'], 'r') as f: resource = yaml.safe_load(f) +### GET BASENAME FOR READS WILDCARDS ### def getBasename4Reads(path): base=os.path.basename(path) return os.path.splitext(base)[0] - +### CHECK IF INPUT IS GZIPPED ### def gzipped_or_not(path): trueORfalse=path.endswith('.gz') return trueORfalse +### CHECK IF HIC AND/OR ALT_ASM FILES ARE GIVEN, OR VALUE IS NONE ### def FILE_present_or_not(path): if path == 'None': return False return True + +### CHECK IF GENOME_SIZE IS PROVIDED, OR VALUE IS AUTO ### def genomeSize_auto_or_not(given_size): if given_size == 'auto': return 0 return given_size + def addUniqueLibraryID(library): string2Add=library + "_1" @@ -51,18 +57,10 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri samples['readCounter'] = samples.groupby(['sample']).cumcount()+1 samples['readCounter'] = samples['readCounter'].astype(str) samples['readCounter'] = samples['sample'] + "_LibraryPair" + samples['readCounter'] - # samples=samples.reset_index() - # trim10x_table = samples[samples['trim10X'] == "True"] - # trim10x_table=trim10x_table.set_index(['sample','readCounter']) - # notrim10xwithAdapt= samples[(samples['trim10X'] == "False") & (samples['trimAdapters'] == "True") ] - # notrim10xwithAdapt=notrim10xwithAdapt.set_index(['sample','readCounter']) - # notrim10xorAdapt= samples[(samples['trim10X'] == "False") & (samples['trimAdapters'] == "False") ] - # notrim10xorAdapt=notrim10xorAdapt.set_index(['sample','readCounter']) - # d10xtrim = {} + samples['10xtrimorNot'] = np.where(samples['trim10X'] == "True", "10xTrimmed", "not10xTrimmed") samples['AdpaterTrimorNot'] = np.where(samples['trimAdapters'] == "True", "AdaptTrimmed", "notAdaptTrimmed") - # for i in samples['sample'].unique(): - # d10xtrim[i] = [samples['10xtrimorNot'][j] for j in samples[samples['sample']==i].index] + dictSamples=samples[['sample','meryl_kmer_size', '10xtrimorNot','AdpaterTrimorNot']] dictSamples=dictSamples.set_index(['sample']).T.to_dict('list') @@ -72,14 +70,7 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri testDictQC=testDictQC.set_index(['sample']).T.to_dict('list') trimAdapters = samples[samples['trimAdapters'] == "True"] - # dAdaptertrim = {} - # for i in trimAdapters['sample'].unique(): - # dAdaptertrim[i] = [trimAdapters['readCounter'][j] for j in trimAdapters[trimAdapters['sample']==i].index] - # - # runQC = samples[samples['fastQC'] == "True"] - # drunQCtrim = {} - # for i in runQC['sample'].unique(): - # drunQCtrim[i] = [runQC['readCounter'][j] for j in runQC[runQC['sample']==i].index] + dictReadCounter = {} for i in samples['sample'].unique(): dictReadCounter[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index] @@ -118,9 +109,7 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).i samples['readCounter'] = samples['readCounter'].astype(str) samples['readCounter'] = samples['sample'] + "_Library" + samples['readCounter'] samples['smrtornot'] = np.where(samples['trimSMRTbell'] == "True", "smrtTrimmed", "notsmrtTrimmed") - # samplesDict=samples.set_index('sample').T.to_dict('list') - # samples=samples.reset_index() dictSamples=samples[['sample','meryl_kmer_size', 'smrtornot']] dictSamples=dictSamples.drop_duplicates('sample', keep='first') dictSamples=dictSamples.set_index(['sample']).T.to_dict('list') @@ -129,42 +118,10 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).i testDictQC=samples[['sample', 'smrtornot', 'fastQC']] testDictQC = testDictQC[testDictQC['fastQC'] == "True"] - # testDictQC=testDictQC.drop_duplicates('sample', keep='first') + testDictQC=testDictQC.set_index(['sample']).T.to_dict('list') - # runQC = samples[samples['fastQC'] == "True"] - # - # drunQCtrim = {} - # for i in runQC['sample'].unique(): - # drunQCtrim[i] = [runQC['readCounter'][j] for j in runQC[runQC['sample']==i].index] - # - # runQC=runQC.set_index(['sample','readCounter']) - # - # - # trimSMRTbell=samples.loc[samples['trimSMRTbell'] == "True"] - # dtrimSMRTbell = {} - # for i in trimSMRTbell['sample'].unique(): - # dtrimSMRTbell[i] = [trimSMRTbell['readCounter'][j] for j in trimSMRTbell[trimSMRTbell['sample']==i].index] - # trimSMRTbell=trimSMRTbell.set_index(['sample','readCounter']) - # notrimSMRTbell = samples[samples['trimSMRTbell'] == "False"] - # notrimSMRTbell=notrimSMRTbell.set_index(['sample','readCounter']) - - - # dsmrtornot = {} - # dfsmrtornot=samples.drop_duplicates('smrtornot', keep='first') - # - # for i in samples['sample'].unique(): - # dsmrtornot[i] = [dfsmrtornot['smrtornot'][j] for j in dfsmrtornot[dfsmrtornot['sample']==i].index] - - # for i in samples['sample'].unique(): - # dsmrtornot[i] = [samples['smrtornot'][j] for j in samples[samples['sample']==i].index] - # - # - # dkmerSize = {} - # dkmerDups=samples.drop_duplicates('meryl_kmer_size', keep='first') - # for i in samples['sample'].unique(): - # dkmerSize[i] = [dkmerDups['meryl_kmer_size'][j] for j in dkmerDups[dkmerDups['sample']==i].index] dictReadCounter = {} for i in samples['sample'].unique(): dictReadCounter[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index] @@ -244,11 +201,13 @@ elif set(['ID', 'ASM_LEVEL', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'g dictSamples=samples.T.to_dict('list') ruleAllQCFiles=[] - ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(dictSamples.keys())),\ - os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), \ + ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/KEY_RESULTS/{asmID}_aggregatedResults.tsv"), asmID=list(dictSamples.keys())),\ + os.path.join(config['Results'],"1_evaluation/finalResults/TABLE_OF_RESULTS.tsv"), \ os.path.join(config['Results'],"1_evaluation/finalResults/FULL_REPORT.pdf") +### DOWNLOAD BUSCO LINEAGE (IF IT DOESN'T ALREADY EXIST) #### + args_o=os.path.join(workflow.basedir, "buscoLineage") args_l=config['busco5Lineage'] checkLineagePath=args_o + "/" + args_l + "_odb10" @@ -298,11 +257,6 @@ elif set(['ID', 'ASM_LEVEL', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'g else: raise ValueError("Error - could not identify lineage please check busco site for a correct prefix") - - # os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf"), \ - # os.path.join(config['Results'],"1_evaluation/finalResults/FULL_TABLE_PDF.pdf"), \ - # os.path.join(config['Results'],"1_evaluation/finalResults/FullTableCOLOUREDheatmap.pdf"), \ - # os.path.join(config['Results'],"1_evaluation/finalResults/FullTableGRADIENTheatmap.pdf") else: raise ValueError('Sample Sheet for not recognised. Please make sure you are using the correct sample sheet') @@ -314,13 +268,13 @@ else: if "Results" not in config: config["Results"] = "results" -report: "report/workflow.rst" + include: whichRule -inputs = ruleAllQCFiles, ruleAll +final_target_outputs = ruleAllQCFiles, ruleAll rule all: input: - inputs + final_target_outputs -- GitLab