Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Genome_Evaluation_Pipeline
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
cmazzoni
Genome_Evaluation_Pipeline
Commits
5d44efe6
Commit
5d44efe6
authored
2 years ago
by
james94
Browse files
Options
Downloads
Patches
Plain Diff
comments
parent
d57f9d47
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Snakefile
+18
-64
18 additions, 64 deletions
Snakefile
with
18 additions
and
64 deletions
Snakefile
+
18
−
64
View file @
5d44efe6
...
@@ -13,28 +13,34 @@ import argparse, subprocess
...
@@ -13,28 +13,34 @@ import argparse, subprocess
container: "docker://gepdocker/gep_dockerimage:latest"
container: "docker://gepdocker/gep_dockerimage:latest"
configfile: "configuration/config.yaml"
configfile: "configuration/config.yaml"
### LOAD IN RESOURCES CONFIG ###
with open(config['resources'], 'r') as f:
with open(config['resources'], 'r') as f:
resource = yaml.safe_load(f)
resource = yaml.safe_load(f)
### GET BASENAME FOR READS WILDCARDS ###
def getBasename4Reads(path):
def getBasename4Reads(path):
base=os.path.basename(path)
base=os.path.basename(path)
return os.path.splitext(base)[0]
return os.path.splitext(base)[0]
### CHECK IF INPUT IS GZIPPED ###
def gzipped_or_not(path):
def gzipped_or_not(path):
trueORfalse=path.endswith('.gz')
trueORfalse=path.endswith('.gz')
return trueORfalse
return trueORfalse
### CHECK IF HIC AND/OR ALT_ASM FILES ARE GIVEN, OR VALUE IS NONE ###
def FILE_present_or_not(path):
def FILE_present_or_not(path):
if path == 'None':
if path == 'None':
return False
return False
return True
return True
### CHECK IF GENOME_SIZE IS PROVIDED, OR VALUE IS AUTO ###
def genomeSize_auto_or_not(given_size):
def genomeSize_auto_or_not(given_size):
if given_size == 'auto':
if given_size == 'auto':
return 0
return 0
return given_size
return given_size
def addUniqueLibraryID(library):
def addUniqueLibraryID(library):
string2Add=library + "_1"
string2Add=library + "_1"
...
@@ -51,18 +57,10 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri
...
@@ -51,18 +57,10 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri
samples['readCounter'] = samples.groupby(['sample']).cumcount()+1
samples['readCounter'] = samples.groupby(['sample']).cumcount()+1
samples['readCounter'] = samples['readCounter'].astype(str)
samples['readCounter'] = samples['readCounter'].astype(str)
samples['readCounter'] = samples['sample'] + "_LibraryPair" + samples['readCounter']
samples['readCounter'] = samples['sample'] + "_LibraryPair" + samples['readCounter']
# samples=samples.reset_index()
# trim10x_table = samples[samples['trim10X'] == "True"]
# trim10x_table=trim10x_table.set_index(['sample','readCounter'])
# notrim10xwithAdapt= samples[(samples['trim10X'] == "False") & (samples['trimAdapters'] == "True") ]
# notrim10xwithAdapt=notrim10xwithAdapt.set_index(['sample','readCounter'])
# notrim10xorAdapt= samples[(samples['trim10X'] == "False") & (samples['trimAdapters'] == "False") ]
# notrim10xorAdapt=notrim10xorAdapt.set_index(['sample','readCounter'])
# d10xtrim = {}
samples['10xtrimorNot'] = np.where(samples['trim10X'] == "True", "10xTrimmed", "not10xTrimmed")
samples['10xtrimorNot'] = np.where(samples['trim10X'] == "True", "10xTrimmed", "not10xTrimmed")
samples['AdpaterTrimorNot'] = np.where(samples['trimAdapters'] == "True", "AdaptTrimmed", "notAdaptTrimmed")
samples['AdpaterTrimorNot'] = np.where(samples['trimAdapters'] == "True", "AdaptTrimmed", "notAdaptTrimmed")
# for i in samples['sample'].unique():
# d10xtrim[i] = [samples['10xtrimorNot'][j] for j in samples[samples['sample']==i].index]
dictSamples=samples[['sample','meryl_kmer_size', '10xtrimorNot','AdpaterTrimorNot']]
dictSamples=samples[['sample','meryl_kmer_size', '10xtrimorNot','AdpaterTrimorNot']]
dictSamples=dictSamples.set_index(['sample']).T.to_dict('list')
dictSamples=dictSamples.set_index(['sample']).T.to_dict('list')
...
@@ -72,14 +70,7 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri
...
@@ -72,14 +70,7 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri
testDictQC=testDictQC.set_index(['sample']).T.to_dict('list')
testDictQC=testDictQC.set_index(['sample']).T.to_dict('list')
trimAdapters = samples[samples['trimAdapters'] == "True"]
trimAdapters = samples[samples['trimAdapters'] == "True"]
# dAdaptertrim = {}
# for i in trimAdapters['sample'].unique():
# dAdaptertrim[i] = [trimAdapters['readCounter'][j] for j in trimAdapters[trimAdapters['sample']==i].index]
#
# runQC = samples[samples['fastQC'] == "True"]
# drunQCtrim = {}
# for i in runQC['sample'].unique():
# drunQCtrim[i] = [runQC['readCounter'][j] for j in runQC[runQC['sample']==i].index]
dictReadCounter = {}
dictReadCounter = {}
for i in samples['sample'].unique():
for i in samples['sample'].unique():
dictReadCounter[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index]
dictReadCounter[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index]
...
@@ -118,9 +109,7 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).i
...
@@ -118,9 +109,7 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).i
samples['readCounter'] = samples['readCounter'].astype(str)
samples['readCounter'] = samples['readCounter'].astype(str)
samples['readCounter'] = samples['sample'] + "_Library" + samples['readCounter']
samples['readCounter'] = samples['sample'] + "_Library" + samples['readCounter']
samples['smrtornot'] = np.where(samples['trimSMRTbell'] == "True", "smrtTrimmed", "notsmrtTrimmed")
samples['smrtornot'] = np.where(samples['trimSMRTbell'] == "True", "smrtTrimmed", "notsmrtTrimmed")
# samplesDict=samples.set_index('sample').T.to_dict('list')
# samples=samples.reset_index()
dictSamples=samples[['sample','meryl_kmer_size', 'smrtornot']]
dictSamples=samples[['sample','meryl_kmer_size', 'smrtornot']]
dictSamples=dictSamples.drop_duplicates('sample', keep='first')
dictSamples=dictSamples.drop_duplicates('sample', keep='first')
dictSamples=dictSamples.set_index(['sample']).T.to_dict('list')
dictSamples=dictSamples.set_index(['sample']).T.to_dict('list')
...
@@ -129,42 +118,10 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).i
...
@@ -129,42 +118,10 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).i
testDictQC=samples[['sample', 'smrtornot', 'fastQC']]
testDictQC=samples[['sample', 'smrtornot', 'fastQC']]
testDictQC = testDictQC[testDictQC['fastQC'] == "True"]
testDictQC = testDictQC[testDictQC['fastQC'] == "True"]
# testDictQC=testDictQC.drop_duplicates('sample', keep='first')
testDictQC=testDictQC.set_index(['sample']).T.to_dict('list')
testDictQC=testDictQC.set_index(['sample']).T.to_dict('list')
# runQC = samples[samples['fastQC'] == "True"]
#
# drunQCtrim = {}
# for i in runQC['sample'].unique():
# drunQCtrim[i] = [runQC['readCounter'][j] for j in runQC[runQC['sample']==i].index]
#
# runQC=runQC.set_index(['sample','readCounter'])
#
#
# trimSMRTbell=samples.loc[samples['trimSMRTbell'] == "True"]
# dtrimSMRTbell = {}
# for i in trimSMRTbell['sample'].unique():
# dtrimSMRTbell[i] = [trimSMRTbell['readCounter'][j] for j in trimSMRTbell[trimSMRTbell['sample']==i].index]
# trimSMRTbell=trimSMRTbell.set_index(['sample','readCounter'])
# notrimSMRTbell = samples[samples['trimSMRTbell'] == "False"]
# notrimSMRTbell=notrimSMRTbell.set_index(['sample','readCounter'])
# dsmrtornot = {}
# dfsmrtornot=samples.drop_duplicates('smrtornot', keep='first')
#
# for i in samples['sample'].unique():
# dsmrtornot[i] = [dfsmrtornot['smrtornot'][j] for j in dfsmrtornot[dfsmrtornot['sample']==i].index]
# for i in samples['sample'].unique():
# dsmrtornot[i] = [samples['smrtornot'][j] for j in samples[samples['sample']==i].index]
#
#
# dkmerSize = {}
# dkmerDups=samples.drop_duplicates('meryl_kmer_size', keep='first')
# for i in samples['sample'].unique():
# dkmerSize[i] = [dkmerDups['meryl_kmer_size'][j] for j in dkmerDups[dkmerDups['sample']==i].index]
dictReadCounter = {}
dictReadCounter = {}
for i in samples['sample'].unique():
for i in samples['sample'].unique():
dictReadCounter[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index]
dictReadCounter[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index]
...
@@ -244,11 +201,13 @@ elif set(['ID', 'ASM_LEVEL', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'g
...
@@ -244,11 +201,13 @@ elif set(['ID', 'ASM_LEVEL', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'g
dictSamples=samples.T.to_dict('list')
dictSamples=samples.T.to_dict('list')
ruleAllQCFiles=[]
ruleAllQCFiles=[]
ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/
06_keyResults
/{asmID}_aggregatedResults.tsv"), asmID=list(dictSamples.keys())),\
ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/
KEY_RESULTS
/{asmID}_aggregatedResults.tsv"), asmID=list(dictSamples.keys())),\
os.path.join(config['Results'],"1_evaluation/finalResults/
Combined_Results_FULLTABLE
.tsv"), \
os.path.join(config['Results'],"1_evaluation/finalResults/
TABLE_OF_RESULTS
.tsv"), \
os.path.join(config['Results'],"1_evaluation/finalResults/FULL_REPORT.pdf")
os.path.join(config['Results'],"1_evaluation/finalResults/FULL_REPORT.pdf")
### DOWNLOAD BUSCO LINEAGE (IF IT DOESN'T ALREADY EXIST) ####
args_o=os.path.join(workflow.basedir, "buscoLineage")
args_o=os.path.join(workflow.basedir, "buscoLineage")
args_l=config['busco5Lineage']
args_l=config['busco5Lineage']
checkLineagePath=args_o + "/" + args_l + "_odb10"
checkLineagePath=args_o + "/" + args_l + "_odb10"
...
@@ -298,11 +257,6 @@ elif set(['ID', 'ASM_LEVEL', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'g
...
@@ -298,11 +257,6 @@ elif set(['ID', 'ASM_LEVEL', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'g
else:
else:
raise ValueError("Error - could not identify lineage please check busco site for a correct prefix")
raise ValueError("Error - could not identify lineage please check busco site for a correct prefix")
# os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf"), \
# os.path.join(config['Results'],"1_evaluation/finalResults/FULL_TABLE_PDF.pdf"), \
# os.path.join(config['Results'],"1_evaluation/finalResults/FullTableCOLOUREDheatmap.pdf"), \
# os.path.join(config['Results'],"1_evaluation/finalResults/FullTableGRADIENTheatmap.pdf")
else:
else:
raise ValueError('Sample Sheet for not recognised. Please make sure you are using the correct sample sheet')
raise ValueError('Sample Sheet for not recognised. Please make sure you are using the correct sample sheet')
...
@@ -314,13 +268,13 @@ else:
...
@@ -314,13 +268,13 @@ else:
if "Results" not in config:
if "Results" not in config:
config["Results"] = "results"
config["Results"] = "results"
report: "report/workflow.rst"
include: whichRule
include: whichRule
inputs = ruleAllQCFiles, ruleAll
f
in
al_target_out
puts = ruleAllQCFiles, ruleAll
rule all:
rule all:
input:
input:
inputs
f
in
al_target_out
puts
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment