Skip to content
Snippets Groups Projects
Commit 3e1cf26f authored by james94's avatar james94
Browse files

update Snakefile

parent f6eefe83
No related branches found
No related tags found
No related merge requests found
...@@ -47,22 +47,22 @@ def to_be_smrtTrimmed(userAnswer): ...@@ -47,22 +47,22 @@ def to_be_smrtTrimmed(userAnswer):
samples = pd.read_csv(config['samplesTSV'], dtype=str, index_col=False, delim_whitespace=True, skip_blank_lines=True) samples = pd.read_csv(config['samplesTSV'], dtype=str, index_col=False, delim_whitespace=True, skip_blank_lines=True)
if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'trimAdapters', 'fastQC']).issubset(samples.columns): if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'trimAdapters', 'fastQC']).issubset(samples.columns):
whichRule = "rules/build_illumina.smk" whichRule = "rules/build_illumina.smk"
samples=samples.reset_index() # samples=samples.reset_index()
samples['readCounter'] = samples.groupby(['sample']).cumcount()+1 samples['readCounter'] = samples.groupby(['sample']).cumcount()+1
samples['readCounter'] = samples['readCounter'].astype(str) samples['readCounter'] = samples['readCounter'].astype(str)
samples['readCounter'] = samples['sample'] + "_LibraryPair" + samples['readCounter'] samples['readCounter'] = samples['sample'] + "_LibraryPair" + samples['readCounter']
samples=samples.reset_index() # samples=samples.reset_index()
trim10x_table = samples[samples['trim10X'] == "True"] # trim10x_table = samples[samples['trim10X'] == "True"]
trim10x_table=trim10x_table.set_index(['sample','readCounter']) # trim10x_table=trim10x_table.set_index(['sample','readCounter'])
notrim10xwithAdapt= samples[(samples['trim10X'] == "False") & (samples['trimAdapters'] == "True") ] # notrim10xwithAdapt= samples[(samples['trim10X'] == "False") & (samples['trimAdapters'] == "True") ]
notrim10xwithAdapt=notrim10xwithAdapt.set_index(['sample','readCounter']) # notrim10xwithAdapt=notrim10xwithAdapt.set_index(['sample','readCounter'])
notrim10xorAdapt= samples[(samples['trim10X'] == "False") & (samples['trimAdapters'] == "False") ] # notrim10xorAdapt= samples[(samples['trim10X'] == "False") & (samples['trimAdapters'] == "False") ]
notrim10xorAdapt=notrim10xorAdapt.set_index(['sample','readCounter']) # notrim10xorAdapt=notrim10xorAdapt.set_index(['sample','readCounter'])
d10xtrim = {} # d10xtrim = {}
samples['10xtrimorNot'] = np.where(samples['trim10X'] == "True", "10xTrimmed", "not10xTrimmed") samples['10xtrimorNot'] = np.where(samples['trim10X'] == "True", "10xTrimmed", "not10xTrimmed")
samples['AdpaterTrimorNot'] = np.where(samples['trimAdapters'] == "True", "AdaptTrimmed", "notAdaptTrimmed") samples['AdpaterTrimorNot'] = np.where(samples['trimAdapters'] == "True", "AdaptTrimmed", "notAdaptTrimmed")
for i in samples['sample'].unique(): # for i in samples['sample'].unique():
d10xtrim[i] = [samples['10xtrimorNot'][j] for j in samples[samples['sample']==i].index] # d10xtrim[i] = [samples['10xtrimorNot'][j] for j in samples[samples['sample']==i].index]
dictSamples=samples[['sample','meryl_kmer_size', '10xtrimorNot','AdpaterTrimorNot']] dictSamples=samples[['sample','meryl_kmer_size', '10xtrimorNot','AdpaterTrimorNot']]
dictSamples=dictSamples.set_index(['sample']).T.to_dict('list') dictSamples=dictSamples.set_index(['sample']).T.to_dict('list')
...@@ -72,18 +72,18 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri ...@@ -72,18 +72,18 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri
testDictQC=testDictQC.set_index(['sample']).T.to_dict('list') testDictQC=testDictQC.set_index(['sample']).T.to_dict('list')
trimAdapters = samples[samples['trimAdapters'] == "True"] trimAdapters = samples[samples['trimAdapters'] == "True"]
dAdaptertrim = {} # dAdaptertrim = {}
for i in trimAdapters['sample'].unique(): # for i in trimAdapters['sample'].unique():
dAdaptertrim[i] = [trimAdapters['readCounter'][j] for j in trimAdapters[trimAdapters['sample']==i].index] # dAdaptertrim[i] = [trimAdapters['readCounter'][j] for j in trimAdapters[trimAdapters['sample']==i].index]
#
runQC = samples[samples['fastQC'] == "True"] # runQC = samples[samples['fastQC'] == "True"]
drunQCtrim = {} # drunQCtrim = {}
for i in runQC['sample'].unique(): # for i in runQC['sample'].unique():
drunQCtrim[i] = [runQC['readCounter'][j] for j in runQC[runQC['sample']==i].index] # drunQCtrim[i] = [runQC['readCounter'][j] for j in runQC[runQC['sample']==i].index]
dictReadCounter = {} dictReadCounter = {}
for i in samples['sample'].unique(): for i in samples['sample'].unique():
dictReadCounter[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index] dictReadCounter[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index]
dkmerSize = {} # dkmerSize = {}
samples['gzipped_R1']=samples['Library_R1'].apply(gzipped_or_not) samples['gzipped_R1']=samples['Library_R1'].apply(gzipped_or_not)
samples['gzipped_R2']=samples['Library_R2'].apply(gzipped_or_not) samples['gzipped_R2']=samples['Library_R2'].apply(gzipped_or_not)
...@@ -108,59 +108,63 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri ...@@ -108,59 +108,63 @@ if set(['sample', 'Library_R1', 'Library_R2', 'meryl_kmer_size', 'trim10X', 'tri
ruleAllQCFiles=[] ruleAllQCFiles=[]
if samples['fastQC'].str.contains('True').any(): if samples['fastQC'].str.contains('True').any():
ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/05_multiqc/{sample}.{trim10x}.{trimAdapters}.multiqcReport.html"), sample=key, trim10x=value1, trimAdapters=value2) for key, [value1, value2, value3] in testDictQC.items()] ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/illuminaReads/QC/multiqc/{sample}.multiqcReport.html"), sample=key) for key, [value1, value2, value3] in testDictQC.items()]
ruleAll=[expand(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/03_merylDb/complete_{sample}_illuminaDb.{trim10x}.{trimAdapters}.{kmer}.meryl"), sample=key, kmer=value1, trim10x=value2, trimAdapters=value3) for key, [value1, value2, value3] in dictSamples.items()] ruleAll=[expand(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/merylDb/complete_illumina.{sample}.{kmer}.meryl"), sample=key, kmer=value1) for key, [value1, value2, value3] in dictSamples.items()]
elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).issubset(samples.columns): elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).issubset(samples.columns):
whichRule = "rules/build_hifi.smk" whichRule = "rules/build_hifi.smk"
samples=samples.reset_index() # samples=samples.reset_index()
samples['readCounter'] = samples.groupby(['sample']).cumcount()+1 samples['readCounter'] = samples.groupby(['sample']).cumcount()+1
samples['readCounter'] = samples['readCounter'].astype(str) samples['readCounter'] = samples['readCounter'].astype(str)
samples['readCounter'] = samples['sample'] + "_Library" + samples['readCounter'] samples['readCounter'] = samples['sample'] + "_Library" + samples['readCounter']
samples['smrtornot'] = np.where(samples['trimSMRTbell'] == "True", "smrtTrimmed", "notsmrtTrimmed") samples['smrtornot'] = np.where(samples['trimSMRTbell'] == "True", "smrtTrimmed", "notsmrtTrimmed")
samplesDict=samples.set_index('sample').T.to_dict('list') # samplesDict=samples.set_index('sample').T.to_dict('list')
samples=samples.reset_index() # samples=samples.reset_index()
dictSamples=samples[['sample','meryl_kmer_size', 'smrtornot']] dictSamples=samples[['sample','meryl_kmer_size', 'smrtornot']]
dictSamples=dictSamples.drop_duplicates('sample', keep='first')
dictSamples=dictSamples.set_index(['sample']).T.to_dict('list') dictSamples=dictSamples.set_index(['sample']).T.to_dict('list')
testDictQC=samples[['sample', 'smrtornot', 'fastQC']] testDictQC=samples[['sample', 'smrtornot', 'fastQC']]
testDictQC = testDictQC[testDictQC['fastQC'] == "True"] testDictQC = testDictQC[testDictQC['fastQC'] == "True"]
testDictQC=testDictQC.drop_duplicates('sample', keep='first') # testDictQC=testDictQC.drop_duplicates('sample', keep='first')
testDictQC=testDictQC.set_index(['sample']).T.to_dict('list') testDictQC=testDictQC.set_index(['sample']).T.to_dict('list')
runQC = samples[samples['fastQC'] == "True"] # runQC = samples[samples['fastQC'] == "True"]
#
drunQCtrim = {} # drunQCtrim = {}
for i in runQC['sample'].unique(): # for i in runQC['sample'].unique():
drunQCtrim[i] = [runQC['readCounter'][j] for j in runQC[runQC['sample']==i].index] # drunQCtrim[i] = [runQC['readCounter'][j] for j in runQC[runQC['sample']==i].index]
#
runQC=runQC.set_index(['sample','readCounter']) # runQC=runQC.set_index(['sample','readCounter'])
#
#
trimSMRTbell=samples.loc[samples['trimSMRTbell'] == "True"] # trimSMRTbell=samples.loc[samples['trimSMRTbell'] == "True"]
dtrimSMRTbell = {} # dtrimSMRTbell = {}
for i in trimSMRTbell['sample'].unique(): # for i in trimSMRTbell['sample'].unique():
dtrimSMRTbell[i] = [trimSMRTbell['readCounter'][j] for j in trimSMRTbell[trimSMRTbell['sample']==i].index] # dtrimSMRTbell[i] = [trimSMRTbell['readCounter'][j] for j in trimSMRTbell[trimSMRTbell['sample']==i].index]
trimSMRTbell=trimSMRTbell.set_index(['sample','readCounter']) # trimSMRTbell=trimSMRTbell.set_index(['sample','readCounter'])
notrimSMRTbell = samples[samples['trimSMRTbell'] == "False"] # notrimSMRTbell = samples[samples['trimSMRTbell'] == "False"]
notrimSMRTbell=notrimSMRTbell.set_index(['sample','readCounter']) # notrimSMRTbell=notrimSMRTbell.set_index(['sample','readCounter'])
dsmrtornot = {} # dsmrtornot = {}
dfsmrtornot=samples.drop_duplicates('smrtornot', keep='first') # dfsmrtornot=samples.drop_duplicates('smrtornot', keep='first')
#
for i in samples['sample'].unique(): # for i in samples['sample'].unique():
dsmrtornot[i] = [dfsmrtornot['smrtornot'][j] for j in dfsmrtornot[dfsmrtornot['sample']==i].index] # dsmrtornot[i] = [dfsmrtornot['smrtornot'][j] for j in dfsmrtornot[dfsmrtornot['sample']==i].index]
# for i in samples['sample'].unique():
# dsmrtornot[i] = [samples['smrtornot'][j] for j in samples[samples['sample']==i].index]
dkmerSize = {} #
dkmerDups=samples.drop_duplicates('meryl_kmer_size', keep='first') #
for i in samples['sample'].unique():
dkmerSize[i] = [dkmerDups['meryl_kmer_size'][j] for j in dkmerDups[dkmerDups['sample']==i].index] # dkmerSize = {}
# dkmerDups=samples.drop_duplicates('meryl_kmer_size', keep='first')
# for i in samples['sample'].unique():
# dkmerSize[i] = [dkmerDups['meryl_kmer_size'][j] for j in dkmerDups[dkmerDups['sample']==i].index]
dictReadCounter = {} dictReadCounter = {}
for i in samples['sample'].unique(): for i in samples['sample'].unique():
dictReadCounter[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index] dictReadCounter[i] = [samples['readCounter'][j] for j in samples[samples['sample']==i].index]
...@@ -179,9 +183,10 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).i ...@@ -179,9 +183,10 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).i
ruleAllQCFiles=[] ruleAllQCFiles=[]
if samples['fastQC'].str.contains('True').any(): if samples['fastQC'].str.contains('True').any():
ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/{sample}.{smrtornot}.multiqcReport.html"), sample=key, smrtornot=value1) for key, [value1, value2] in testDictQC.items()] ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/multiqc/{sample}.multiqcReport.html"), sample=key) for key, [value1, value2] in testDictQC.items()]
ruleAll=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/03_merylDb/complete_hifi_{sample}_dB.{smrtornot}.{kmer}.meryl"), sample=key, kmer=value1, smrtornot=value2) for key, [value1, value2] in dictSamples.items()] ruleAll=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/merylDb/complete_hifi.{sample}.{kmer}.meryl"), sample=key, kmer=value1) for key, [value1, value2] in dictSamples.items()]
elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize', 'HiC_R1', 'HiC_R2']).issubset(samples.columns): elif set(['ID', 'ASM_LEVEL', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize', 'HiC_R1', 'HiC_R2']).issubset(samples.columns):
whichRule = "rules/evaluate.smk" whichRule = "rules/evaluate.smk"
samples['genomeSize']=samples['genomeSize'].apply(genomeSize_auto_or_not) samples['genomeSize']=samples['genomeSize'].apply(genomeSize_auto_or_not)
...@@ -239,8 +244,65 @@ elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize', ' ...@@ -239,8 +244,65 @@ elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize', '
dictSamples=samples.T.to_dict('list') dictSamples=samples.T.to_dict('list')
ruleAllQCFiles=[] ruleAllQCFiles=[]
ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(dictSamples.keys())), os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf"),os.path.join(config['Results'],"1_evaluation/finalResults/FULL_TABLE_PDF.pdf") ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(dictSamples.keys())),\
os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), \
os.path.join(config['Results'],"1_evaluation/finalResults/FULL_REPORT.pdf")
# --- Resolve the BUSCO v5 lineage database ---------------------------------
# config['busco5Lineage'] may be:
#   1. a full path to an already-unpacked lineage directory,
#   2. a lineage name already cached under <workflow.basedir>/buscoLineage, or
#   3. a lineage prefix to be downloaded from the BUSCO data site.
# Result: `buscoDataBaseName` holds the lineage name without the "_odb10" suffix.
args_o = os.path.join(workflow.basedir, "buscoLineage")
args_l = config['busco5Lineage']
checkLineagePath = args_o + "/" + args_l + "_odb10"
checkLineageDled = args_o + "/" + args_l
outDir = args_o

if os.path.isdir(args_l):
    # Case 1: user supplied a path to an unpacked lineage directory.
    buscoDataBaseName = os.path.basename(args_l)
    buscoDataBaseName = buscoDataBaseName[:-6]  # strip trailing "_odb10"
elif os.path.isdir(checkLineagePath):
    # Case 2: lineage already cached in the local buscoLineage directory.
    buscoDataBaseName = os.path.basename(checkLineagePath)
    buscoDataBaseName = buscoDataBaseName[:-6]  # strip trailing "_odb10"
else:
    # Case 3: scrape the BUSCO lineage index page and download the tarball.
    url = "https://busco-data.ezlab.org/v5/data/lineages/"
    html = urlopen(url).read()
    soup = BeautifulSoup(html, features="html.parser")
    # Drop script/style elements so get_text() returns only the index listing.
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    # Find the first index entry whose filename starts with the requested prefix.
    linID = None
    for line in text.splitlines():
        if line.startswith(args_l):
            linID = line.split(" ")[0]
            break
    if linID is not None:
        linLink = url + linID
        print('Downloading Busco Database:', args_l)
        # NOTE(review): shell=True with interpolated paths builds shell strings
        # from config values; trusted input here, but a list-based
        # subprocess.run([...]) would be safer against odd path characters.
        subprocess.run("wget -q -P %s %s"%(outDir, linLink), shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        subprocess.run("tar -xvf %s*.tar.gz -C %s"%(args_o + "/" + args_l, outDir), shell=True, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        subprocess.run("rm %s_*.tar.gz"%(checkLineageDled), shell=True, check=True)
        buscoDataBaseName = args_l
        print('Downloading Busco Database:', args_l, " - COMPLETE")
    else:
        raise ValueError("Error - could not identify lineage please check busco site for a correct prefix")
else: else:
raise ValueError('Sample Sheet for not recognised. Please make sure you are using the correct sample sheet') raise ValueError('Sample Sheet for not recognised. Please make sure you are using the correct sample sheet')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment