diff --git a/Snakefile b/Snakefile index 397ee4cbc662e61897642c803e7c38c5e5f92d59..10eafcee9f5ce04f7113f76222399b6ddd455d18 100644 --- a/Snakefile +++ b/Snakefile @@ -21,10 +21,16 @@ def gzipped_or_not(path): trueORfalse=path.endswith('.gz') return trueORfalse -def ALT_present_or_not(path): +def FILE_present_or_not(path): if path == 'None': return False return True + +def genomeSize_auto_or_not(given_size): + if given_size == 'auto': + return 0 + return given_size + def addUniqueLibraryID(library): string2Add=library + "_1" @@ -173,26 +179,60 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size' ,'trimSMRTbell', 'fastQC']). if samples['fastQC'].str.contains('True').any(): ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/{sample}.{smrtornot}.multiqcReport.html"), sample=key, smrtornot=value1) for key, [value1, value2] in testDictQC.items()] ruleAll=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/03_merylDb/complete_hifi_{sample}_dB.{smrtornot}.{kmer}.meryl"), sample=key, kmer=value1, smrtornot=value2) for key, [value1, value2] in testDict.items()] -elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize']).issubset(samples.columns): +elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize', 'HiC_R1', 'HiC_R2']).issubset(samples.columns): whichRule = "rules/run_05_11_21.smk" - samples['genomeSize'] = samples['genomeSize'].fillna(0) - samples["genomeSize"] = pd.to_numeric(samples["genomeSize"]) - + samples['genomeSize']=samples['genomeSize'].apply(genomeSize_auto_or_not) +# samples['genomeSize'] = samples['genomeSize'].fillna(0) +# samples["genomeSize"] = pd.to_numeric(samples["genomeSize"]) +### make new column for whether or not path/file given is gzipped (True or False) samples['gzipped_PRI']=samples['PRI_asm'].apply(gzipped_or_not) samples['gzipped_ALT']=samples['ALT_asm'].apply(gzipped_or_not) - samples['ALT_present']=samples['ALT_asm'].apply(ALT_present_or_not) - print(samples) + samples['gzipped_HiC_R1']=samples['HiC_R1'].apply(gzipped_or_not) + samples['gzipped_HiC_R2']=samples['HiC_R2'].apply(gzipped_or_not) + +### new column for whether or not alt asm is provided or not + samples['ALT_present']=samples['ALT_asm'].apply(FILE_present_or_not) + samples['HiC_R1_present']=samples['HiC_R1'].apply(FILE_present_or_not) + samples['HiC_R2_present']=samples['HiC_R2'].apply(FILE_present_or_not) + samples['merylDB_present']=samples['merylDB'].apply(FILE_present_or_not) + + testDictPRETEXT = samples[['ID', 'HiC_R1_present', 'HiC_R2_present']] +# + testDictPRETEXT = testDictPRETEXT[(testDictPRETEXT['HiC_R1_present'] == True) & (testDictPRETEXT['HiC_R2_present'] == True)] + print(testDictPRETEXT) +# testDictPRETEXT = testDictPRETEXT[(testDictPRETEXT['HiC_R1_present'] == "True") & (testDictPRETEXT['HiC_R2_present'] == "True")] + testDictPRETEXT=testDictPRETEXT.set_index(['ID']).T.to_dict('list') + print(testDictPRETEXT) + + # print(testDictPRETEXT) + # + # testDictREADS = samples[['ID', 'merylDB_present','merylDB_kmer']] + # testDictYESREADS = testDictREADS[(testDictREADS['merylDB_present'] == True)] + # testDictYESREADS=testDictYESREADS.set_index(['ID']).T.to_dict('list') + # + # testDictNOREADS = testDictREADS[(testDictREADS['merylDB_present'] == False)] + # testDictNOREADS=testDictNOREADS.set_index(['ID']).T.to_dict('list') + + noGzip_HiC_R1 = samples[samples['gzipped_HiC_R1'] == False] + noGzip_HiC_R1 = noGzip_HiC_R1.set_index(['ID']) + noGzip_HiC_R2 = samples[samples['gzipped_HiC_R2'] == False] + noGzip_HiC_R2 = noGzip_HiC_R2.set_index(['ID']) + + yesGzip_HiC_R1 = samples[samples['gzipped_HiC_R1'] == True] + yesGzip_HiC_R1 = yesGzip_HiC_R1.set_index(['ID']) + yesGzip_HiC_R2 = samples[samples['gzipped_HiC_R2'] == True] + yesGzip_HiC_R2 = yesGzip_HiC_R2.set_index(['ID']) noGzip_PRI = samples[samples['gzipped_PRI'] == False] noGzip_PRI=noGzip_PRI.set_index(['ID']) yesGzip_PRI = samples[samples['gzipped_PRI'] == True] - yesGzip_PRI=yesGzip_PRI.set_index(['ID']) + yesGzip_PRI = yesGzip_PRI.set_index(['ID']) noGzip_ALT = samples[samples['gzipped_ALT'] == False] noGzip_ALT = noGzip_ALT[noGzip_ALT['ALT_present'] == True] - noGzip_ALT=noGzip_ALT.set_index(['ID']) + noGzip_ALT = noGzip_ALT.set_index(['ID']) noGzip_ALTDict=noGzip_ALT.T.to_dict('list') no_ALT = samples[samples['gzipped_ALT'] == False] @@ -202,60 +242,21 @@ elif set(['ID', 'PRI_asm', 'ALT_asm', 'merylDB', 'merylDB_kmer', 'genomeSize']). yesGzip_ALT = samples[samples['gzipped_ALT'] == True] yesGzip_ALT=yesGzip_ALT.set_index(['ID']) + samples=samples.set_index(['ID']) + print(samples) testDict=samples.T.to_dict('list') + print(testDict) ruleAllQCFiles=[] - ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(testDict.keys())), os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf")#expand(os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/markdownForReport.md"), asmID=list(testDict.keys())) +# ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(testDict.keys())), os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf")#expand(os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/markdownForReport.md"), asmID=list(testDict.keys())) + ruleAll=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(testDict.keys())), os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf"),os.path.join(config['Results'],"1_evaluation/finalResults/FULL_TABLE_PDF.pdf")#expand(os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/markdownForReport.md"), asmID=list(testDict.keys())) + else: raise ValueError('Sample Sheet for illumina meryl building not recognised. Please make sure you are using the correct sample sheet (prebuilt or non-prebuilt meryl db, etc)') -args_o=os.path.join(workflow.basedir, "buscoLineage") -args_l=config['busco5Lineage'] -checkLineagePath=args_o + "/" + args_l + "_odb10" -checkLineageDled=args_o + "/" + args_l -outDir=args_o -if os.path.isdir(args_l) is True: -# subprocess.call("ln -sf %s %s"%(args.l, args.l), shell=True) - buscoDataBaseName=os.path.basename(args_l) - buscoDataBaseName=buscoDataBaseName[:-6] -# print("Lineage path given, basename is:", buscoDataBaseName) -elif os.path.isdir(args_l) is False and os.path.isdir(checkLineagePath) is True: -# subprocess.call("ln -sf %s %s"%(checkLineagePath, checkLineageDled), shell=True) - buscoDataBaseName=os.path.basename(checkLineagePath) - buscoDataBaseName=buscoDataBaseName[:-6] -# print("Database already in buscoLineage directory, basename is:", buscoDataBaseName) -else: -# print("Database will be downloaded") - url = "https://busco-data.ezlab.org/v5/data/lineages/" - html = urlopen(url).read() - soup = BeautifulSoup(html, features="html.parser") -# kill all script and style elements - for script in soup(["script", "style"]): - script.extract() # rip it out -# get text - text = soup.get_text() -# break into lines and remove leading and trailing space on each - lines = (line.strip() for line in text.splitlines()) - linID=None -#identify the lineage file - for line in text.splitlines(): - if line.startswith(args_l): -# print(line) - linID=line.split(" ")[0] -# print(linID) - break - if not linID==None: - linLink=url+linID -# print(linLink) - subprocess.run("wget -q -P %s %s"%(outDir, linLink), shell=True, check=True) - subprocess.run("tar -xvf %s*.tar.gz -C %s"%(args_o + "/" + args_l, outDir), shell=True, check=True) - subprocess.run("rm %s_*.tar.gz"%(checkLineageDled), shell=True, check=True) - buscoDataBaseName=args_l - else: - raise ValueError("Error - could not identify lineage please check busco site for a correct prefix") if "Results" not in config: diff --git a/envs/busco_and_assembly.yaml b/envs/busco_and_assembly.yaml index a91593577935a6b947651e009de1237d7df035f2..ad19962297b1f5fd066833023ab16aaff0447e1f 100644 --- a/envs/busco_and_assembly.yaml +++ b/envs/busco_and_assembly.yaml @@ -17,4 +17,4 @@ dependencies: - r-ggplot2 >=2.2.1 - sepp >=4.3.10 - wget - - busco >= 5.1.3 + - busco == 5.3.0 diff --git a/envs/pretext.yaml b/envs/pretext.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/installGEP.yaml b/installGEP.yaml index 2415ffe27842a81c549dfe69fec1af9f02b05ff1..b6e00231f674f3dc7a6d763c150c033902ff379b 100644 --- a/installGEP.yaml +++ b/installGEP.yaml @@ -4,9 +4,10 @@ channels: - bioconda - anaconda dependencies: - - snakemake=6.6.1 - - python=3.9.1 + - snakemake==6.6.1 + - python>=3.6.7 - tabulate=0.8.7 - beautifulsoup4=4.9 - mamba=0.15.2 - - pandoc=2.2.1 + - pandoc=2.15.* + - tectonic diff --git a/rules/hifi_05_11_21.smk b/rules/hifi_05_11_21.smk index 64b27dc00b21a08dce228f9011a37fe172ad6dad..5aa4f544b01e6d87e7e84a99b90c506bb93a1a63 100644 --- a/rules/hifi_05_11_21.smk +++ b/rules/hifi_05_11_21.smk @@ -33,7 +33,7 @@ rule unzipHifi: input: fastq=hifi_gzipped, output: - os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.{smrtornot}.fastq"), + temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.{smrtornot}.fastq")), log: os.path.join(config['Results'], "0_buildDatabases/{sample}/logs/hifiReads/{readCounter}.{smrtornot}_pigzUnzip.log") conda: @@ -49,7 +49,7 @@ rule symlinkUnzippedHifi: input: fastq=hifi_notgzipped, output: - os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.{smrtornot}.fastq"), + temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.{smrtornot}.fastq")), log: os.path.join(config['Results'], "0_buildDatabases/{sample}/logs/hifiReads/{readCounter}.{smrtornot}_pigzUnzip.log") threads: @@ -102,7 +102,7 @@ rule symlinkfornotSmartTrimmed: input: os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/01_unzipFastqs/{readCounter}.notsmrtTrimmed.fastq"), output: - outputFile=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.notsmrtTrimmed.fastq") + outputFile=temp(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.notsmrtTrimmed.fastq")) threads: 1 shell: @@ -110,7 +110,7 @@ rule symlinkfornotSmartTrimmed: ln -s {input} {output.outputFile} """ -rule fastqc_hifi_SMRT_removed: +rule fastqc_hifi: input: os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.{smrtornot}.fastq") params: @@ -129,7 +129,7 @@ rule fastqc_hifi_SMRT_removed: (fastqc {input} -o {params.folder2out} -t {threads}) &> {log} """ -rule multiqc_hifi_SMRTremoved: +rule multiqc_hifi: input: lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/05_multiqc/{readCounter}.{smrtornot}_fastqc.html"), sample=wildcards.sample, readCounter=d[wildcards.sample], smrtornot=testDict[wildcards.sample][1]) params: @@ -147,7 +147,7 @@ rule multiqc_hifi_SMRTremoved: "(multiqc {params.folder2qc} -o {params.folder2qc} -n {params.filename}) &> {log}" -rule meryl_hifi_trimmed_count: +rule meryl_hifi_count: input: # reads= rules.trimSMRTbell.output.outputFile reads=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/02_trimReads/{readCounter}.{smrtornot}.fastq") @@ -160,7 +160,7 @@ rule meryl_hifi_trimmed_count: threads: 16 output: - directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/03_merylDb/" + "{readCounter}" + "_hifi_dB.{smrtornot}.{kmer}.meryl")), + temp(directory(os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/03_merylDb/" + "{readCounter}" + "_hifi_dB.{smrtornot}.{kmer}.meryl"))), log: os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/{readCounter}_hifi_{kmer}.{smrtornot}.log") priority: diff --git a/rules/illumina_05_11_21.smk b/rules/illumina_05_11_21.smk index 83fb00ba7be6c583d05705d78a58334e168808aa..33a8a9a0036128dd9b48d1d6ebabf3fe405e1e20 100644 --- a/rules/illumina_05_11_21.smk +++ b/rules/illumina_05_11_21.smk @@ -36,13 +36,13 @@ def R2_gzipped(wildcards): def R2_notgzipped(wildcards): return noGzip_R2.loc[(wildcards.sample, wildcards.readCounter), "Library_R2"] -rule unzipFasta_R1: +rule unzipFastq_R1: input: assembly=R1_gzipped, output: os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R1.fastq"), log: - os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.{trimAdapters}_R1_pigzUnzip.log") + temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.{trimAdapters}_R1_pigzUnzip.log")), conda: "../envs/pigz.yaml" threads: @@ -52,7 +52,7 @@ rule unzipFasta_R1: pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log} """ -rule symlinkUnzippedFasta_R1: +rule symlinkUnzippedFastq_R1: input: assembly=R1_notgzipped, output: @@ -64,11 +64,11 @@ rule symlinkUnzippedFasta_R1: ln -s {input} {output} """ -rule unzipFasta_R2: +rule unzipFastq_R2: input: assembly=R2_gzipped, output: - os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R2.fastq"), + temp(os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/01_unzipFastqs/{readCounter}.{trim10x}.{trimAdapters}_R2.fastq")), log: os.path.join(config['Results'], "0_buildDatabases/{sample}/illuminaReads/logs/{readCounter}.{trim10x}.{trimAdapters}_R2_pigzUnzip.log") conda: @@ -80,7 +80,7 @@ rule unzipFasta_R2: pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log} """ -rule symlinkUnzippedFasta_R2: +rule symlinkUnzippedFastq_R2: input: assembly=R2_notgzipped, output: diff --git a/rules/run_05_11_21.smk b/rules/run_05_11_21.smk index 52b80adec54dd6eea7fcbdf05dc1ad4308ae8020..32723f14723cd1198d5ba862239da31c2a88dda6 100644 --- a/rules/run_05_11_21.smk +++ b/rules/run_05_11_21.smk @@ -1,8 +1,55 @@ +args_o=os.path.join(workflow.basedir, "buscoLineage") +args_l=config['busco5Lineage'] +checkLineagePath=args_o + "/" + args_l + "_odb10" +checkLineageDled=args_o + "/" + args_l +outDir=args_o + + +if os.path.isdir(args_l) is True: +# subprocess.call("ln -sf %s %s"%(args.l, args.l), shell=True) + buscoDataBaseName=os.path.basename(args_l) + buscoDataBaseName=buscoDataBaseName[:-6] +# print("Lineage path given, basename is:", buscoDataBaseName) +elif os.path.isdir(args_l) is False and os.path.isdir(checkLineagePath) is True: +# subprocess.call("ln -sf %s %s"%(checkLineagePath, checkLineageDled), shell=True) + buscoDataBaseName=os.path.basename(checkLineagePath) + buscoDataBaseName=buscoDataBaseName[:-6] +# print("Database already in buscoLineage directory, basename is:", buscoDataBaseName) +else: +# print("Database will be downloaded") + url = "https://busco-data.ezlab.org/v5/data/lineages/" + html = urlopen(url).read() + soup = BeautifulSoup(html, features="html.parser") +# kill all script and style elements + for script in soup(["script", "style"]): + script.extract() # rip it out +# get text + text = soup.get_text() +# break into lines and remove leading and trailing space on each + lines = (line.strip() for line in text.splitlines()) + linID=None +#identify the lineage file + for line in text.splitlines(): + if line.startswith(args_l): +# print(line) + linID=line.split(" ")[0] +# print(linID) + break + if not linID==None: + linLink=url+linID +# print(linLink) + subprocess.run("wget -q -P %s %s"%(outDir, linLink), shell=True, check=True) + subprocess.run("tar -xvf %s*.tar.gz -C %s"%(args_o + "/" + args_l, outDir), shell=True, check=True) + subprocess.run("rm %s_*.tar.gz"%(checkLineageDled), shell=True, check=True) + buscoDataBaseName=args_l + else: + raise ValueError("Error - could not identify lineage please check busco site for a correct prefix") + + def merylDB(wildcards): return samples.loc[(wildcards.asmID), "merylDB"] -def PRI_asm_unzipped(wildcards): - return noGzip_PRI.loc[(wildcards.asmID), "PRI_asm"] + # print(fastas) # print(fastas) @@ -18,16 +65,260 @@ def PRI_asm_unzipped(wildcards): # print(newList) # return newList -def PRI_asm_gzipped(wildcards): - return yesGzip_PRI.loc[(wildcards.asmID), "PRI_asm"] -def ALT_asm_unzipped(wildcards): - return noGzip_ALT.loc[(wildcards.asmID), "ALT_asm"] -def ALT_asm_gzipped(wildcards): - return yesGzip_ALT.loc[(wildcards.asmID), "ALT_asm"] +def HiC_R1_gzipped(wildcards): + return yesGzip_HiC_R1.loc[(wildcards.asmID), "HiC_R1"] + +rule unzipHiC_R1: + input: + assembly=HiC_R1_gzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.fastq")), + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.R1.pigzUnzip.log") + conda: + "../envs/pigz.yaml" + threads: + 4 + shell: + """ + pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log} + """ + +def HiC_R1_unzipped(wildcards): + return noGzip_HiC_R1.loc[(wildcards.asmID), "HiC_R1"] + +rule symlinkUnzippedHiC_R1: + input: + assembly=HiC_R1_unzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.fastq")), + threads: + 1 + shell: + """ + ln -s {input} {output} + """ + + +def HiC_R2_gzipped(wildcards): + return yesGzip_HiC_R2.loc[(wildcards.asmID), "HiC_R2"] + +rule unzipHiC_R2: + input: + assembly=HiC_R2_gzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.fastq")), + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.R2.pigzUnzip.log") + conda: + "../envs/pigz.yaml" + threads: + 4 + shell: + """ + pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log} + """ + +def HiC_R2_unzipped(wildcards): + return noGzip_HiC_R2.loc[(wildcards.asmID), "HiC_R2"] + +rule symlinkUnzippedHiC_R2: + input: + assembly=HiC_R2_unzipped, + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.fastq")), + threads: + 1 + shell: + """ + ln -s {input} {output} + """ + + + + + +rule pretext_index_PRI_asm: + input: + assemblyPRI=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123")), + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb")), + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann")), + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64")), + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac")), + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai")) + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.PRI.indexing.log") + conda: + "../envs/pretext.yaml" + threads: 1 + shell: + """ + (bwa-mem2 index {input.assemblyPRI}) &> {log} + (samtools faidx {input.assemblyPRI}) &>> {log} + """ + +rule pretext_fastq2bam_R1: + input: + HiC_R1=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.fastq"), + assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ] + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.bam")) + conda: + "../envs/pretext.yaml" + threads: + 12 + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.fastq2bam.R1.log") + shell: + """ + (bwa-mem2 mem -t {threads} -B8 {input.assembly} {input.HiC_R1} | samtools view -Sb - > {output}) &> {log} + """ + +rule pretext_fastq2bam_R2: + input: + HiC_R2=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.fastq"), + assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ] + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.bam")) + conda: + "../envs/pretext.yaml" + threads: + 12 + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.fastq2bam.R2.log") + shell: + """ + (bwa-mem2 mem -t {threads} -B8 {input.assembly} {input.HiC_R2} | samtools view -Sb - > {output}) &> {log} + """ + +rule pretext_filter_5primeEnd_R1: + input: + HiC_R1_bam=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.bam"), + assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ] + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.FILTERED.bam")) + params: + script=os.path.join(workflow.basedir, "scripts/filter_five_end.pl") + conda: + "../envs/pretext.yaml" + threads: + 12 + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.filter5end.R1.log") + shell: + """ + (samtools view -h {input.HiC_R1_bam}| perl {params.script} | samtools view -@{threads} -Sb - > {output}) &> {log} + """ + +rule pretext_filter_5primeEnd_R2: + input: + HiC_R2_bam=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.bam"), + assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + indexes= [ os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.0123"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.amb"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.ann"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.bwt.2bit.64"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.pac"), + os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta.fai") ] + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.FILTERED.bam")) + params: + script=os.path.join(workflow.basedir, "scripts/filter_five_end.pl") + conda: + "../envs/pretext.yaml" + threads: + 12 + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.filter5end.R2.log") + shell: + """ + (samtools view -h {input.HiC_R2_bam}| perl {params.script} | samtools view -@{threads} -Sb - > {output}) &> {log} + """ + + +rule pretext_filtered_combine: + input: + R1=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R1.FILTERED.bam"), + R2=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.R2.FILTERED.bam") + output: + temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.bam")) + params: + script=os.path.join(workflow.basedir, "scripts/two_read_bam_combiner.pl") + conda: + "../envs/pretext.yaml" + threads: + 12 + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.log") + shell: + """ + (perl {params.script} {input.R1} {input.R2} | samtools view -@{threads} -Sb > {output}) &>{log} + """ + +rule pretext_map: + input: + HiC_alignment=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.bam") + output: + pretextFile=temp(os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.pretext")) + conda: + "../envs/pretext.yaml" + threads: + 12 + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.PretextMap.log") + shell: + """ + (samtools view -h {input.HiC_alignment} | PretextMap -o {output.pretextFile} --sortby length --mapq 10) &> {log} + """ + +rule pretext_snapshot: + input: + pretextFile=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED.pretext") + output: + pretextSnapshotFULL=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED_FullMap.png"), +# pretextCP2keyResults=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}.pretext_hiC_FullMap.png") + params: + outDirectory=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/") + conda: + "../envs/pretext.yaml" + threads: + 1 + log: + os.path.join(config['Results'], "1_evaluation/{asmID}/logs/{asmID}.HiC.COMBINED.FILTERED.PretextSnapshot.log") + shell: + """ + (PretextSnapshot -m {input.pretextFile} -o {params.outDirectory}) &> {log} + """ + +####################################################################################################################################### + +def PRI_asm_gzipped(wildcards): + return yesGzip_PRI.loc[(wildcards.asmID), "PRI_asm"] + rule unzipFasta_PRI: input: @@ -45,6 +336,9 @@ rule unzipFasta_PRI: pigz -c -d -k {input.assembly} > {output} 2> {log} """ +def PRI_asm_unzipped(wildcards): + return noGzip_PRI.loc[(wildcards.asmID), "PRI_asm"] + rule symlinkUnzippedFasta_PRI: input: assembly=PRI_asm_unzipped, @@ -57,6 +351,11 @@ rule symlinkUnzippedFasta_PRI: ln -s {input} {output} """ +################ + +def ALT_asm_gzipped(wildcards): + return yesGzip_ALT.loc[(wildcards.asmID), "ALT_asm"] + rule unzipFasta_ALT: input: assembly=ALT_asm_gzipped, @@ -67,12 +366,16 @@ rule unzipFasta_ALT: conda: "../envs/pigz.yaml" threads: - 1 + 4 shell: """ - pigz -c -d -k {input.assembly} > {output} 2> {log} + pigz -p {threads} -c -d -k {input.assembly} > {output} 2> {log} """ + +def ALT_asm_unzipped(wildcards): + return noGzip_ALT.loc[(wildcards.asmID), "ALT_asm"] + rule symlinkUnzippedFasta_ALT: input: assembly=ALT_asm_unzipped, @@ -85,6 +388,56 @@ rule symlinkUnzippedFasta_ALT: ln -s {input} {output} """ + +#################### + + +def altFile(wildcards): + if samples.loc[(wildcards.asmID), "ALT_present"] == True: + return os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.ALT.fasta") + else: + return os.path.join(workflow.basedir, "scripts/ALT_missing.fasta") + +rule merqury: + input: + assemblyPRI=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), + assemblyALT=altFile, + merylDB=merylDB, + # checkunzip=os.path.join(config['Results'], "01_evaluation/{asmID}/logs/{asmID}_pigzUnzip.log") + params: + outFile= "{asmID}" + "_merqOutput", + outPath= os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT"), + # symlink_fasta=os.path.join(config['Results'], "01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}.fasta"), + symlink_merylDB=directory(os.path.join(config['Results'], "1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.meryl")) + threads: + 12 + output: + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.completeness.stats"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.st.png"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.fl.png"), + os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.hist") + log: + os.path.join(config['Results'],"1_evaluation/{asmID}/logs/{asmID}_merqury.log") + priority: + 3 + conda: + "../envs/merylMerq_2.yaml" + shell: + """ + ln -s {input.merylDB} {params.symlink_merylDB} + cd {params.outPath} + export OMP_NUM_THREADS={threads} + (merqury.sh {params.symlink_merylDB} {input.assemblyPRI} {input.assemblyALT} {params.outFile}) &> {log} + """ + + +####################################################################################################################################### + + + rule busco5: input: assembly=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), @@ -111,7 +464,7 @@ rule busco5: shell: """ cd {params.chngDir} - (busco -m --offline {params.mode} --in {input.assembly} -o {params.assemblyName} -l {input.lineage} -c {threads} -f --limit 5) &> {log} + (busco -m {params.mode} --offline --in {input.assembly} -o {params.assemblyName} -l {input.lineage} -c {threads} -f --limit 5) &> {log} """ # ln -s {input.assembly} {output.symlink} @@ -141,90 +494,9 @@ rule moveBuscoOutputs: rm -r {params.rmDir} """ -def altFile(wildcards): - if samples.loc[(wildcards.asmID), "ALT_present"] == True: - return os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.ALT.fasta") - else: - return os.path.join(workflow.basedir, "scripts/ALT_missing.fasta") - -rule merqury: - input: - assemblyPRI=os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), - assemblyALT=altFile, - merylDB=merylDB, - # checkunzip=os.path.join(config['Results'], "01_evaluation/{asmID}/logs/{asmID}_pigzUnzip.log") - params: - outFile= "{asmID}" + "_merqOutput", - outPath= os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT"), - # symlink_fasta=os.path.join(config['Results'], "01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}.fasta"), - symlink_merylDB=directory(os.path.join(config['Results'], "1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.meryl")) - threads: - 12 - output: - os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"), - os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.completeness.stats"), - os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"), - os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), - os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.st.png"), - os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.fl.png"), - os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.hist") - log: - os.path.join(config['Results'],"1_evaluation/{asmID}/logs/{asmID}_merqury.log") - priority: - 3 - conda: - "../envs/merylMerq_2.yaml" - shell: - """ - ln -s {input.merylDB} {params.symlink_merylDB} - cd {params.outPath} - export OMP_NUM_THREADS={threads} - (merqury.sh {params.symlink_merylDB} {input.assemblyPRI} {input.assemblyALT} {params.outFile}) &> {log} - """ - -# rule merqury: -# input: -# assemblyPRI=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.PRI.fasta"), asmID=wildcards.asmID) if testDict[wildcards.asmID][3] == False, -# merylDB=merylDB, -# # checkunzip=os.path.join(config['Results'], "01_evaluation/{asmID}/logs/{asmID}_pigzUnzip.log") -# params: -# outFile= "{asmID}" + "_merqOutput", -# outPath= os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT"), -# # symlink_fasta=os.path.join(config['Results'], "01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}.fasta"), -# symlink_merylDB=directory(os.path.join(config['Results'], "1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.meryl")) -# threads: -# workflow.cores * 0.5 -# output: -# os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"), -# os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.completeness.stats"), -# os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"), -# os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), -# os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.st.png"), -# os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.fl.png"), -# os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/merylDB_providedFor_{asmID}.hist") -# log: -# os.path.join(config['Results'],"1_evaluation/{asmID}/logs/{asmID}_merqury.log") -# priority: -# 3 -# conda: -# "../envs/merylMerq_2.yaml" -# shell: -# """ -# ln -s {input.merylDB} {params.symlink_merylDB} -# cd {params.outPath} -# export OMP_NUM_THREADS={threads} -# (merqury.sh {params.symlink_merylDB} {input.assemblyPRI} {params.outFile}) &> {log} -# """# bash {params.script} {input.merylDB} {params.symlink} {params.outFile} - # """ - # export PATH={params.merylPath} - # export MERQURY={params.export} - # export OMP_NUM_THREADS={threads} - # cd {params.outPath} - # bash {params.script} {input.merylDB} {input.assembly} {params.outFile} - # """ rule genomescope2: input: @@ -288,24 +560,8 @@ rule saveConfiguration_and_getKeyValues_kmer: gscopeSum=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"), gscopeLog=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"), gscopeLin=os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"), -# busco=os.path.join(config['Results'], "01_evaluation/{asmID}/01A_BUSCOv5/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), -# qv=os.path.join(config['Results'],"01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}_merq.qv"), -# completeness=os.path.join(config['Results'],"01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}_merq.completeness.stats"), -# spectraStacked=os.path.join(config['Results'],"01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}_merq.{asmID}.spectra-cn.st.png"), -# spectraFlat=os.path.join(config['Results'],"01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}_merq.{asmID}.spectra-cn.fl.png"), -# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_merqury.log"), -# # os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), -# scaffStats=os.path.join(config['Results'],"01_evaluation/{asmID}/01D_assemblyStats/{asmID}_scaffold_stats.tsv"), -# contStats=os.path.join(config['Results'],"01_evaluation/{asmID}/01D_assemblyStats/{asmID}_contig_stats.tsv") -# os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), -# multiqc=os.path.join(config['Results'],"{sample}/0_qualityControl/02_multiqc/{sample}_multiqc_report.html") params: - # sampleSheet= config['samplesTSV'], - # config=os.path.join(workflow.basedir, "configuration/config.yaml"), -# resultsPath=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults"), - keyValues=os.path.join(config['Results'],"1_evaluation/{asmID}/01E_keyResults/results_withoutRowNames.tsv"), -# rowNames=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/rowNames.tsv") - # masteraggregate=os.path.join(config['Results'],"{sample}"+ "/keyResults/aggregate.tsv") + keyValues=os.path.join(config['Results'],"1_evaluation/{asmID}/01E_keyResults/results_withoutRowNames.tsv") output: # newSampleSheet=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedSampleSheet.tsv"), # newConfigFile=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedConfig.yaml"), @@ -329,6 +585,97 @@ rule saveConfiguration_and_getKeyValues_kmer: # +# rule saveConfiguration_and_getKeyValues: +# input: +# # checkLog=os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/logs/" + "{sample}" + "_merq.spectra-cn.log"), gscopeSum=os.path.join(config['Results'], "01_evaluation/{asmID}/01E_keyResults/{asmID}_k{kmer}_summary.txt"), +# gscopeSum=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_summary.txt"), asmID=wildcards.asmID, kmer=testDict[wildcards.asmID][3]), +# gscopeLog=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"), asmID=wildcards.asmID, kmer=testDict[wildcards.asmID][3]), +# gscopeLin=lambda wildcards: expand(os.path.join(config['Results'], "1_evaluation/{asmID}/03_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"), asmID=wildcards.asmID, kmer=testDict[wildcards.asmID][3]), +# # keyValues=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/results_withoutRowNames.tsv"), +# busco=os.path.join(config['Results'], "1_evaluation/{asmID}/05_BUSCO/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), +# qv=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.qv"), +# completeness=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.completeness.stats"), +# spectraStacked_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"), +# spectraFlat_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), +# spectraStacked_both=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.st.png"), +# spectraFlat_both=os.path.join(config['Results'],"1_evaluation/{asmID}/04_merquryQVandKAT/{asmID}_merqOutput.spectra-cn.fl.png"), +# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_merqury.log"), +# # os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), +# scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_scaffold_stats.tsv"), +# contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_contig_stats.tsv"), +# pretextSnapshotFULL=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED_FullMap.png") +# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), +# # multiqc=os.path.join(config['Results'],"{sample}/0_qualityControl/02_multiqc/{sample}_multiqc_report.html") +# params: +# # sampleSheet= config['samplesTSV'], +# # config=os.path.join(workflow.basedir, "configuration/config.yaml"), +# resultsPath=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults"), +# keyValues2=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/results_withoutRowNames_2.tsv"), +# keyValues=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/results_withoutRowNames.tsv"), +# rowNames=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/rowNames.tsv") +# # masteraggregate=os.path.join(config['Results'],"{sample}"+ "/keyResults/aggregate.tsv") +# output: +# keyValuesWithRowNames=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), +# busco=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), +# buscoScores=temp(os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/only_buscoScores_{asmID}.txt")), +# qv=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.qv"), +# completeness=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.completeness.stats"), +# spectraStacked_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.st.png"), +# spectraFlat_PRI=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.{asmID}.PRI.spectra-cn.fl.png"), +# spectraStacked_both=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.st.png"), +# spectraFlat_both=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_merqOutput.spectra-cn.fl.png"), +# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_merqury.log"), +# # os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), +# scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_scaffold_stats.tsv"), +# contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_contig_stats.tsv"), +# pretextCP2keyResults=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}.pretext_hiC_FullMap.png") +# threads: +# 1 +# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), +# # multiqc=os.path.join(config['Results'],"{sample}/5_31_Key_Results/{sample}_multiqc_report.html") +# # aggregateTsv=os.path.join(config['Results'],"{sample}"+ "/individual_aggregated_results/aggregate.tsv") +# conda: +# "../envs/dos2unix.yaml" +# shell: +# """ +# head -n 15 {input.busco} | tail -n 7 | sed "s/^['\t']*//" > {output.buscoScores} +# cp {input.pretextSnapshotFULL} {output.pretextCP2keyResults} +# cp {input.busco} {output.busco} +# cp {input.completeness} {output.completeness} +# cp {input.qv} {output.qv} +# cp {input.spectraStacked_PRI} {output.spectraStacked_PRI} +# cp {input.spectraFlat_PRI} {output.spectraFlat_PRI} +# cp {input.spectraFlat_both} {output.spectraFlat_both} +# cp {input.spectraStacked_both} {output.spectraStacked_both} +# cp {input.scaffStats} {output.scaffStats} +# cp {input.contStats} {output.contStats} +# echo "{wildcards.asmID}" >> {params.keyValues} +# echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} +# echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {params.keyValues} +# echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues} +# echo "$(grep 'gc_content' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} +# echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} +# echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} +# echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} +# echo "$(grep 'longest (bp)' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} +# echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} +# echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} +# echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} +# echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} +# echo "$(grep 'longest (bp)' {input.contStats} | awk {{'print $3'}})" >> {params.keyValues} +# echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} +# echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues} +# echo "$(grep 'N95' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} +# echo "$(grep 'NG95' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues} +# echo "$(awk {{'print $4'}} {input.qv})" | head -n 1 >> {params.keyValues} +# echo "$(awk {{'print $5'}} {input.completeness})" | head -n 1 >> {params.keyValues} +# echo "$(grep 'C:' {input.busco} | awk -F'[:\[,]' {{'print $2'}})" >> {params.keyValues} +# echo "$(grep 'C:' {input.busco} | awk -F'[:\[,]' {{'print $4'}})" >> {params.keyValues} +# dos2unix {params.keyValues} +# echo -e "Assembly\ntotal_num_bases\nsize_estimate\nmax_heterozygosity\nGC_Content\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps\nLongest_scaffold\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nLongest_contig\nContig_N50_length\nContig_NG50_length\nContig_N95_length\nContig_NG95_length\nqv_score\nkmer_completeness\nComplete_Buscos\nComplete_single_buscos" > {params.rowNames} +# paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames} +# rm {params.rowNames} {params.keyValues} +# """ rule saveConfiguration_and_getKeyValues: input: @@ -347,7 +694,7 @@ rule saveConfiguration_and_getKeyValues: # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_merqury.log"), # os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_scaffold_stats.tsv"), - contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_contig_stats.tsv") + contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/02_assemblyStats/{asmID}_contig_stats.tsv"), # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), # multiqc=os.path.join(config['Results'],"{sample}/0_qualityControl/02_multiqc/{sample}_multiqc_report.html") params: @@ -371,7 +718,7 @@ rule saveConfiguration_and_getKeyValues: # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_merqury.log"), # os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), scaffStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_scaffold_stats.tsv"), - contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_contig_stats.tsv") + contStats=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_contig_stats.tsv"), threads: 1 # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), @@ -393,7 +740,7 @@ rule saveConfiguration_and_getKeyValues: cp {input.contStats} {output.contStats} echo "{wildcards.asmID}" >> {params.keyValues} echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} - echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {params.keyValues} + echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}})" >> {params.keyValues} echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues} echo "$(grep 'gc_content' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} @@ -411,105 +758,76 @@ rule saveConfiguration_and_getKeyValues: echo "$(grep 'NG95' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues} echo "$(awk {{'print $4'}} {input.qv})" | head -n 1 >> {params.keyValues} echo "$(awk {{'print $5'}} {input.completeness})" | head -n 1 >> {params.keyValues} - echo "$(grep 'C:' {input.busco} | awk -F'[:\[,]' {{'print $2'}})" >> {params.keyValues} - echo "$(grep 'C:' {input.busco} | awk -F'[:\[,]' {{'print $4'}})" >> {params.keyValues} + echo "$(grep 'C:' {input.busco} | awk -F'[:\\[,]' {{'print $2'}})" >> {params.keyValues} + echo "$(grep 'C:' {input.busco} | awk -F'[:\\[,]' {{'print $4'}})" >> {params.keyValues} dos2unix {params.keyValues} - echo -e "Assembly\ntotal_num_bases\nsize_estimate\nmax_heterozygosity\nGC_Content\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps\nLongest_scaffold\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nLongest_contig\nContig_N50_length\nContig_NG50_length\nContig_N95_length\nContig_NG95_length\nqv_score\nkmer_completeness\nComplete_Buscos\nComplete_single_buscos" > {params.rowNames} + echo -e "ASM_ID\nBases\nEst_Size\nHet_%\nGC_%\nScaff\nCont\nGaps\nLongest_Scaff\nScaff_N50\nScaff_NG50\nScaff_N95\nScaff_NG95\nLongest_Cont\nCont_N50\nCont_NG50\nCont_N95\nCont_NG95\nQV\nCompleteness\nComp_BUSCOs_%\nComp_Single_BUSCOs_%" > {params.rowNames} paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames} - printf "Evaluation complete for {wildcards.asmID}, please find these results in {params.resultsPath} folder" rm {params.rowNames} {params.keyValues} """ + #rm {params.rowNames} {params.keyValues} # """ + # head -n 15 {input.busco} | tail -n 7 | sed "s/^['\t']*//" > {output.buscoScores} + # cp {input.busco} {output.busco} + # cp {input.completeness} {output.completeness} + # cp {input.qv} {output.qv} + # cp {input.spectraStacked_PRI} {output.spectraStacked_PRI} + # cp {input.spectraFlat_PRI} {output.spectraFlat_PRI} + # cp {input.spectraFlat_both} {output.spectraFlat_both} + # cp {input.spectraStacked_both} {output.spectraStacked_both} + # cp {input.scaffStats} {output.scaffStats} + # cp {input.contStats} {output.contStats} + # echo "{wildcards.asmID}" >> {params.keyValues} + # echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}} | sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} | sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues} + # echo "$(grep 'gc_content' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} + # echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}} | sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'longest (bp)' {input.scaffStats} | awk {{'print $3'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'longest (bp)' {input.contStats} | awk {{'print $3'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'N95' {input.contStats} | awk {{'print $2'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(grep 'NG95' {input.contStats} | awk {{'print $4'}}| sed 's/,//g')" >> {params.keyValues} + # echo "$(awk {{'print $4'}} {input.qv})" | head -n 1 >> {params.keyValues} + # echo "$(awk {{'print $5'}} {input.completeness})" | head -n 1 >> {params.keyValues} + # echo "$(grep 'C:' {input.busco} | awk -F'[:\[,]' {{'print $2'}})" >> {params.keyValues} + # echo "$(grep 'C:' {input.busco} | awk -F'[:\[,]' {{'print $4'}})" >> {params.keyValues} + # dos2unix {params.keyValues} + # echo -e "Assembly\ntotal_num_bases\nsize_estimate\nmax_heterozygosity\nGC_Content\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps\nLongest_scaffold\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nLongest_contig\nContig_N50_length\nContig_NG50_length\nContig_N95_length\nContig_NG95_length\nqv_score\nkmer_completeness\nComplete_Buscos\nComplete_single_buscos" > {params.rowNames} + # paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames} + # rm {params.rowNames} {params.keyValues} + # """ -# rule saveConfiguration_and_getKeyValues: +# rule aggregateAllAssemb| sed 's/,//g'lies: # input: -# # checkLog=os.path.join(config['Results'],"{sample}" + "/2_QVstats_merylAndMerqury/logs/" + "{sample}" + "_merq.spectra-cn.log"), -# gscopeSum=os.path.join(config['Results'], "01_evaluation/{asmID}/01C_genomescopeProfile/{asmID}_k{kmer}_summary.txt"), -# gscopeLog=os.path.join(config['Results'], "01_evaluation/{asmID}/01C_genomescopeProfile/{asmID}_k{kmer}_log_plot.png"), -# gscopeLin=os.path.join(config['Results'], "01_evaluation/{asmID}/01C_genomescopeProfile/{asmID}_k{kmer}_linear_plot.png"), -# busco=os.path.join(config['Results'], "01_evaluation/{asmID}/01A_BUSCOv5/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), -# qv=os.path.join(config['Results'],"01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}_merq.qv"), -# completeness=os.path.join(config['Results'],"01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}_merq.completeness.stats"), -# spectraStacked=os.path.join(config['Results'],"01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}_merq.{asmID}.spectra-cn.st.png"), -# spectraFlat=os.path.join(config['Results'],"01_evaluation/{asmID}/01B_QV-and-kmerMultiplicity/{asmID}_merq.{asmID}.spectra-cn.fl.png"), -# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_merqury.log"), -# # os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), -# scaffStats=os.path.join(config['Results'],"01_evaluation/{asmID}/01D_assemblyStats/{asmID}_scaffold_stats.tsv"), -# contStats=os.path.join(config['Results'],"01_evaluation/{asmID}/01D_assemblyStats/{asmID}_contig_stats.tsv") -# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), -# # multiqc=os.path.join(config['Results'],"{sample}/0_qualityControl/02_multiqc/{sample}_multiqc_report.html") -# params: -# # sampleSheet= config['samplesTSV'], -# # config=os.path.join(workflow.basedir, "configuration/config.yaml"), -# resultsPath=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults"), -# keyValues=os.path.join(config['Results'],"{01_evaluation/{asmID}/01E_keyResults/results_withoutRowNames.tsv"), -# rowNames=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/rowNames.tsv") -# # masteraggregate=os.path.join(config['Results'],"{sample}"+ "/keyResults/aggregate.tsv") +# allResults=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(testDict.keys())), +# sampleSheet= config['samplesTSV'], +# config=os.path.join(workflow.basedir, "configuration/config.yaml") # output: -# # newSampleSheet=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedSampleSheet.tsv"), -# # newConfigFile=os.path.join(config['Results'],"{sample}"+ "/keyResults/savedConfig.yaml"), -# # merqLog=os.path.join(config['Results'],"logs/2_merylAndMerq/" + "{sample}" + "_merqury.spectra-cn.log"), -# keyValuesWithRowNames=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/{asmID}_aggregatedResults.tsv"), -# gscopeSum=os.path.join(config['Results'], "01_evaluation/{asmID}/01E_keyResults/{asmID}_k{kmer}_summary.txt"), -# gscopeLog=os.path.join(config['Results'], "01_evaluation/{asmID}/01E_keyResults/{asmID}_k{kmer}_log_plot.png"), -# gscopeLin=os.path.join(config['Results'], "01_evaluation/{asmID}/01E_keyResults/{asmID}_k{kmer}_linear_plot.png"), -# busco=os.path.join(config['Results'], "01_evaluation/{asmID}/01E_keyResults/short_summary.specific." + buscoDataBaseName + "_odb10.{asmID}.txt"), -# qv=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/{asmID}_merq.qv"), -# completeness=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/{asmID}_merq.completeness.stats"), -# spectraStacked=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/{asmID}_merq.{asmID}.spectra-cn.st.png"), -# spectraFlat=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/{asmID}_merq.{asmID}.spectra-cn.fl.png"), -# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_merqury.log"), -# # os.path.join(config['Results'],"{sample}" + "/assemblyStats/{sample}_contig_stats.tsv"), -# scaffStats=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/{asmID}_scaffold_stats.tsv"), -# contStats=os.path.join(config['Results'],"01_evaluation/{asmID}/01E_keyResults/{asmID}_contig_stats.tsv") -# # os.path.join(config['Results'],"{sample}" + "/QVstats_merylAndMerqury/logs/" + "{sample}" + "_meryl.log"), -# # multiqc=os.path.join(config['Results'],"{sample}/5_31_Key_Results/{sample}_multiqc_report.html") -# # aggregateTsv=os.path.join(config['Results'],"{sample}"+ "/individual_aggregated_results/aggregate.tsv") -# conda: -# "../envs/dos2unix.yaml" +# results=os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv"), +# # rows=os.path.join(config['Results'],"allAssemblies_keyResults/aggregate_rows.tsv"), +# newSampleSheet=os.path.join(config['Results'],"1_evaluation/finalResults/savedSampleSheet.tsv"), +# newConfigFile=os.path.join(config['Results'],"1_evaluation/finalResults/savedConfig.yaml") +# threads: +# 1 # shell: # """ -# cp {input.gscopeSum} {output.gscopeSum} -# cp {input.gscopeLog} {output.gscopeLog} -# cp {input.gscopeLin} {output.gscopeLin} -# cp {input.busco} {output.busco} -# cp {input.qv} {output.qv} -# cp {input.spectraStacked} {output.spectraStacked} -# cp {input.spectraFlat} {output.spectraFlat} -# cp {input.scaffStats} {output.scaffStats} -# cp {input.contStats} {output.contStats} -# echo "{wildcards.sample}" >> {params.keyValues} -# echo "$(grep 'total_bps' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} -# echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} )" >> {params.keyValues} -# echo "$(grep 'Heterozygous (ab)' {input.gscopeSum} | awk {{'print $4'}})" >> {params.keyValues} -# echo "$(grep 'gc_content' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'sequence_count' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'sequence_count' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'number_of_gaps' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'longest (bp)' {input.scaffStats} | awk {{'print $3'}})" >> {params.keyValues} -# echo "$(grep 'N50' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'NG50' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} -# echo "$(grep 'N95' {input.scaffStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'NG95' {input.scaffStats} | awk {{'print $4'}})" >> {params.keyValues} -# echo "$(grep 'longest (bp)' {input.contStats} | awk {{'print $3'}})" >> {params.keyValues} -# echo "$(grep 'N50' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'NG50' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues} -# echo "$(grep 'N95' {input.contStats} | awk {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'NG95' {input.contStats} | awk {{'print $4'}})" >> {params.keyValues} -# echo "$(awk {{'print $4'}} {input.qv})" >> {params.keyValues} -# echo "$(awk {{'print $5'}} {input.completeness})" >> {params.keyValues} -# echo "$(grep 'C:'' {input.busco} | awk -F'[:\[,]' {{'print $2'}})" >> {params.keyValues} -# echo "$(grep 'C:'' {input.busco} | awk -F'[:\[,]' {{'print $4'}})" >> {params.keyValues} -# dos2unix {params.keyValues} -# echo -e "Assembly\ntotal_num_bases\nsize_estimate\nmax_heterozygosity\GC_Content\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps\nLongest_scaffold\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nLongest_contig\nContig_N50_length\nContig_NG50_length\nContig_N95_length\nContig_NG95_length\nqv_score\nkmer_completeness\nComplete_Buscos\nComplete_single_buscos" > {params.rowNames} -# paste -d'\t' {params.rowNames} {params.keyValues} > {output.keyValuesWithRowNames} -# printf "Evaluation complete for {wildcards.sample}, please find these results in {params.resultsPath} folder" -# rm {params.rowNames} {params.keyValues} +# cp {input.sampleSheet} {output.newSampleSheet} +# cp {input.config} {output.newConfigFile} +# echo -e "Assembly\ntotal_num_bases\nsize_estimate\nmax_heterozygosity\nGC_Content\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps\nLongest_scaffold\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nLongest_contig\nContig_N50_length\nContig_NG50_length\nContig_N95_length\nContig_NG95_length\nqv_score\nkmer_completeness\nComplete_Buscos\nComplete_single_buscos" | \ +# paste -d'\t' - {input.allResults} | \ +# awk -F'\t' '{{s="";for (i=1;i<=NF;i+=2) {{s=s?s FS $i:$i}} print s}}' | \ +# column -t > {output.results} +# printf "All assembly evaluation results aggregated successfully. Please find these results at {output.results}" # """ -# -# cp {input.checkLog} {output.merqLog} -#echo "$(grep 'Genome Haploid Length' {input.gscopeSum} | awk {{'print $6'}} | sed 's/,//g')" >> {output.keyValues} put this to remove commas rule aggregateAllAssemblies: input: allResults=expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_aggregatedResults.tsv"), asmID=list(testDict.keys())), @@ -526,14 +844,13 @@ rule aggregateAllAssemblies: """ cp {input.sampleSheet} {output.newSampleSheet} cp {input.config} {output.newConfigFile} - echo -e "Assembly\ntotal_num_bases\nsize_estimate\nmax_heterozygosity\nGC_Content\ntotal_num_scaffolds\ntotal_num_contigs\nnumber_of_gaps\nLongest_scaffold\nScaffold_N50_length\nScaffold_NG50_length\nScaffold_N95_length\nScaffold_NG95_length\nLongest_contig\nContig_N50_length\nContig_NG50_length\nContig_N95_length\nContig_NG95_length\nqv_score\nkmer_completeness\nComplete_Buscos\nComplete_single_buscos" | \ + echo -e "ASM_ID\nBases_Mb\nEst_Size_Mb\nHet_%\nGC_%\nScaff\nCont\nGaps\nLongest_Scaff_Mb\nScaff_N50_Mb\nScaff_NG50_Mb\nScaff_N95_Mb\nScaff_NG95_Mb\nLongest_Cont\nCont_N50_Mb\nCont_NG50_Mb\nCont_N95_Mb\nCont_NG95_Mb\nQV\nCompleteness\nComp_BUSCOs_%\nComp_Single_BUSCOs_%" | \ paste -d'\t' - {input.allResults} | \ awk -F'\t' '{{s="";for (i=1;i<=NF;i+=2) {{s=s?s FS $i:$i}} print s}}' | \ column -t > {output.results} - printf "All assembly evaluation results aggregated successfully. Please find these results at {output.results}" + sed -i 's/,//g' {output.results} """ - - +# echo -e "Assembly ID\n#Bases (Mb)\nEst. Size (Mb)\nHet. (%)\nGC (%)\n#Scaff\n#Cont\n#Gaps\nLongest Scaff (Mb)\nScaff N50 (Mb)\nScaff NG50 (Mb)\nScaff N95 (Mb)\nScaff NG95 (Mb)\nLongest Cont\nCont N50 (Mb)\nCont NG50 (Mb)\nCont N95 (Mb)\nCont NG95 (Mb)\nQV\nCompleteness\nComplete BUSCOs\nComplete Single BUSCOs" | \ rule makeReport: input: @@ -580,6 +897,66 @@ rule makeReport: # awk '{{sub("PLACEHOLDER","{input[0]}")}}1' {input.landingPage} > {output[0]} # """ +# def fq2_from_sample_notrim10xOrAdapt(wildcards): +# return notrim10xorAdapt.loc[(wildcards.sample, wildcards.readCounter), 'Library_R2'] +# +# rule pretextMaps2md: +# input: +# [expand(os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}.pretext_hiC_FullMap.png"), asmID=list(testDictPRETEXT.keys()))] +# output: +# os.path.join(config['Results'],"1_evaluation/finalResults/FullPRETEXTMarkdown.md") +# params: +# assemblyName='{wildcards.asmID}' +# threads: +# 1 +# run: +# with open(output[0], "w") as out: +# for i in range(0,len(input)): +# print("", file=out) +# print(params.assemblyName, file=out) +# print("", file=out) +# print("", file=out) +# print("", file=out) + + +rule pretextMaps2md: + input: + PretextMap=os.path.join(config['Results'], "1_evaluation/{asmID}/07_pretext/{asmID}.HiC.COMBINED.FILTERED_FullMap.png") +# PretextMap=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}.pretext_hiC_FullMap.png") + output: + pretextCP2keyResults=os.path.join(config['Results'], "1_evaluation/{asmID}/06_keyResults/{asmID}.pretext_hiC_FullMap.png"), + IndividualPretextMD=os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_FullPRETEXTMarkdown.md") + params: + assemblyName='{asmID}' + threads: + 1 + run: + import shutil + shutil.copyfile(input.PretextMap, output.pretextCP2keyResults) + with open(output.IndividualPretextMD, "w") as out: + print("", file=out) + print("###",params.assemblyName," HiC Heatmap", file=out) + print("{ height=30% }", file=out) +# print("{ height=35% }", file=out) +# print("", file=out) + + +# testDictPRETEXT +# def IndividualPretextMD(wildcards): +# if samples.loc[(wildcards.asmID), "ALT_present"] == True: +# return os.path.join(config['Results'], "1_evaluation/{asmID}/01_unzipFastas/{asmID}.ALT.fasta") +# else: +# return os.path.join(workflow.basedir, "scripts/ALT_missing.fasta") + +# rule aggregatePretextMaps2md: +# input: +# IndividualPretextMD=expand(os.path.join(config['Results'],"1_evaluation/finalResults/{asmID}_FullPRETEXTMarkdown.md"), asmID=wildcards.asmID) +# output: +# os.path.join(config['Results'],"1_evaluation/finalResults/FullPRETEXTMarkdown.md") +# shell: +# """ +# cat {input.IndividualPretextMD} > {output} +# """ rule addFullTable: input: results=os.path.join(config['Results'],"1_evaluation/finalResults/Combined_Results_FULLTABLE.tsv") @@ -591,43 +968,69 @@ rule addFullTable: import pandas as pd from tabulate import tabulate samples=pd.read_csv(input.results, dtype=str, index_col=0, delim_whitespace=True, skip_blank_lines=True) + # samples=samples.reset_index + turn2FloatAndMb=['Bases_Mb','Est_Size_Mb','Longest_Scaff_Mb','Scaff_N50_Mb','Scaff_NG50_Mb','Scaff_N95_Mb','Scaff_NG95_Mb','Longest_Cont','Cont_N50_Mb','Cont_NG50_Mb','Cont_N95_Mb','Cont_NG95_Mb'] + roundDecimals=['Comp_BUSCOs_%','Comp_Single_BUSCOs_%','Het_%','GC_%','QV','Completeness','Bases_Mb','Est_Size_Mb','Longest_Scaff_Mb','Scaff_N50_Mb','Scaff_NG50_Mb','Scaff_N95_Mb','Scaff_NG95_Mb','Longest_Cont','Cont_N50_Mb','Cont_NG50_Mb','Cont_N95_Mb','Cont_NG95_Mb'] + print('this is samples',samples) + sampleTransposed=samples.T + print('this is sampleTransposed',sampleTransposed) + for header in turn2FloatAndMb: + sampleTransposed[header]=sampleTransposed[header].astype(float).div(1000000) + for i in range(0,4): + sampleTransposed[roundDecimals[i]]=sampleTransposed[roundDecimals[i]].str.replace('%','') + for roundHeader in roundDecimals: + sampleTransposed[roundHeader]=sampleTransposed[roundHeader].astype(float).round(2) with open(output[0], "w") as out: print("", file=out) - print("\\onecolumn", file=out) + #print("\\newpage", file=out) + # print("\\onecolumn", file=out) + print("\\blandscape", file=out) print("", file=out) print("\\tiny", file=out) - print(tabulate(samples, headers='keys',tablefmt="pipe", showindex=True), file=out) + print(tabulate(sampleTransposed.rename_axis('ASM_ID'), headers='keys',tablefmt="pipe", showindex=True), file=out) + print("\\elandscape", file=out) print("", file=out) + # print("", file=out) + # print("\\endpage", file=out) + # print("\\twocolumn", file=out) + # print("", file=out) + # print("\\small", file=out) rule aggregateReport: input: - indivMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_markdownForReport.md"), asmID=key, kmer=value4) for key, [value1, value2, value3, value4, value5, value6, value7, value8] in testDict.items()], + indivMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_k{kmer}_markdownForReport.md"), asmID=key, kmer=value4) for key, [value1, value2, value3, value4, value5, value6, value7, value8, value9, value10, value11, value12, value13, value14] in testDict.items()], landingPage=os.path.join(workflow.basedir, "scripts/reportLandingPage.md"), - endTableMD=os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown.md") + landingPageTABLE=os.path.join(workflow.basedir, "scripts/reportTABLELandingPage.md"), + endTableMD=os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown.md"), + IndividualPretextMD=[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/06_keyResults/{asmID}_FullPRETEXTMarkdown.md"), asmID=list(testDictPRETEXT.keys()))] output: - os.path.join(config['Results'],"1_evaluation/finalResults/FullMarkdown.md") + FullMarkdown=os.path.join(config['Results'],"1_evaluation/finalResults/FullMarkdown.md"), + endTableMD=os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown_wPreamble.md") threads: 1 shell: """ - cat {input.landingPage} {input.indivMD} {input.endTableMD} >> {output} + cat {input.landingPage} {input.indivMD} {input.IndividualPretextMD} >> {output.FullMarkdown} + cat {input.landingPageTABLE} {input.endTableMD} >> {output.endTableMD} """ - - - rule makePDF: input: - os.path.join(config['Results'],"1_evaluation/finalResults/FullMarkdown.md") + md_report=os.path.join(config['Results'],"1_evaluation/finalResults/FullMarkdown.md"), + md_comparison_table=os.path.join(config['Results'],"1_evaluation/finalResults/FullTableMarkdown_wPreamble.md") output: - os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf") + pdf_report=os.path.join(config['Results'],"1_evaluation/finalResults/FULL_Report_PDF.pdf"), + pdf_comparison_table=os.path.join(config['Results'],"1_evaluation/finalResults/FULL_TABLE_PDF.pdf") + log: + os.path.join(config['Results'], "1_evaluation/logs/MAKEPDF_pandoc.log") threads: 1 shell: """ - pandoc -o {output} {input} + (pandoc -o {output.pdf_report} {input.md_report} --pdf-engine=tectonic) &>> {log} + (pandoc -o {output.pdf_comparison_table} {input.md_comparison_table} --pdf-engine=tectonic) &>> {log} """ # echo -e "Assembly\nsize_estimate\nmax_heterozygosity\nqv_score\nN50_length\nNG50_length\nN95_length\nNG95_length\nN100_length\nNG100_length\ntotal_num_bases\ntotal_num_scaffolds" > {output.rows} diff --git a/scripts/digest.py b/scripts/digest.py new file mode 100644 index 0000000000000000000000000000000000000000..ea32be0e626b25afca1c026e46af6f2b8b6cbfdc --- /dev/null +++ b/scripts/digest.py @@ -0,0 +1,26 @@ +from Bio.Seq import Seq +from Bio import SeqIO +from Bio.Restriction import * + +rb= +rb = DpnII + HinfI + DdeI + MseI +dpnii_dig=[] +for record in SeqIO.parse("GCA_905146935.1_idScaPyra1.1_genomic.fna", "fasta"): + my_seq = Seq(str(record.seq)) + dpnii_dig.append(DpnII.catalyse(my_seq)) + +test_1=list(sum(dpnii_dig, ())) + + +hinfi_dig=[] +for i in range(len(test_1)): + newSeq= test_1[i] + hinfi_dig.append(HinfI.catalyse(my_seq)) +print(hinfi_dig) + +test_2=list(sum(hinfi_dig, ())) + +ddei_dig=[] +for i in range(len(test_1)): + newSeq= test_1[i] + hinfi_dig.append(HinfI.catalyse(my_seq)) diff --git a/scripts/filter_five_end.pl b/scripts/filter_five_end.pl new file mode 100644 index 0000000000000000000000000000000000000000..5580dbccb933a9033c6d1c2557294cfe96653e43 --- /dev/null +++ b/scripts/filter_five_end.pl @@ -0,0 +1,109 @@ +#!/usr/bin/perl +use strict; +use warnings; + +my $prev_id = ""; +my @five; +my @three; +my @unmap; +my @mid; +my @all; +my $counter = 0; + +while (<STDIN>){ + chomp; + if (/^@/){ + print $_."\n"; + next; + } + my ($id, $flag, $chr_from, $loc_from, $mapq, $cigar, $d1, $d2, $d3, $read, $read_qual, @rest) = split /\t/; + my $bin = reverse(dec2bin($flag)); + my @binary = split(//,$bin); + if ($prev_id ne $id && $prev_id ne ""){ + if ($counter == 1){ + if (@five == 1){ + print $five[0]."\n"; + } + else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); + } + } + elsif ($counter == 2 && @five == 1){ + print $five[0]."\n"; + } + else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); + } + + $counter = 0; + undef @unmap; + undef @five; + undef @three; + undef @mid; + undef @all; + } + + $counter++; + $prev_id = $id; + push @all,$_; + if ($binary[2]==1){ + push @unmap,$_; + } + elsif ($binary[4]==0 && $cigar =~ m/^[0-9]*M/ || $binary[4]==1 && $cigar =~ m/.*M$/){ + push @five, $_; + } + elsif ($binary[4]==1 && $cigar =~ m/^[0-9]*M/ || $binary[4]==0 && $cigar =~ m/.*M$/){ + push @three, $_; + } + elsif ($cigar =~ m/^[0-9]*[HS].*M.*[HS]$/){ + push @mid, $_; + } +} + +if ($counter == 1){ + if (@five == 1){ + print $five[0]."\n"; + } + else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); + } +} +elsif ($counter == 2 && @five == 1){ + print $five[0]."\n"; +} +else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); +} + +sub dec2bin { + my $str = unpack("B32", pack("N", shift)); + return $str; +} + +sub bin2dec { + return unpack("N", pack("B32", substr("0" x 32 . shift, -32))); +} diff --git a/scripts/makePDF_indivMD.py b/scripts/makePDF_indivMD.py index fd3735c1239189b0d0de9b5404a2dc3ead5360ef..c3eaf54ae9885e528da419f600a2c700bf94df5e 100644 --- a/scripts/makePDF_indivMD.py +++ b/scripts/makePDF_indivMD.py @@ -53,6 +53,7 @@ with open(snakemake.output[0], 'w') as outFile: # print("---", "title: 'GEP Quick-View'", "author: James Sullivan", "date: 11/09/2021", "geometry: margin=2cm", "classoption: table", "documentclass: extarticle", "urlcolor: blue", "colorlinks: true", "header-includes: |", " \\rowcolors{2}{gray!10}{gray!25}" , " ```{=latex}", "---", sep="\n") print("", file=outFile) print("\\twocolumn", file=outFile) + print("\\Large", file=outFile) print("# Assembly:",params_assemblyID, file=outFile) print("", file=outFile) print(" - db built with kmer size ", params_merylKmer, "bp", file=outFile) @@ -62,35 +63,37 @@ with open(snakemake.output[0], 'w') as outFile: print(tabulate(key_stats_table, headers='keys',tablefmt="pipe", showindex=False), file=outFile) print("", file=outFile) print("### Genomescope2 Profile (Linear)", file=outFile) - print("{ width=32% }", file=outFile) + print("{ width=38% }", file=outFile) print("", file=outFile) print("### Genomescope2 Profile (Log)", file=outFile) - print("{ width=32% }", file=outFile) + print("{ width=38% }", file=outFile) print("\\", file=outFile) print("\\pagebreak", file=outFile) print("\\", file=outFile) print("", file=outFile) - print("\\tiny", file=outFile) + print("\\large", file=outFile) print("### QV Score", file=outFile) print(tabulate(QV_score_table, headers='keys',tablefmt="pipe", showindex=True), file=outFile) print("", file=outFile) - print("\\tiny", file=outFile) + print("\\large", file=outFile) print("### Kmer Completeness", file=outFile) print(tabulate(kmer_completeness_table, headers='keys',tablefmt="pipe", showindex=True), file=outFile) print("", file=outFile) + print("\\Large", file=outFile) print("### BUSCOv5 (database: ", params_buscoDB, ")", file=outFile) print("```", file=outFile) for line in lines: print(line, file=outFile) print("```", file=outFile) print("\\", file=outFile) + print("\\large", file=outFile) print("", file=outFile) print("### K-mer Multiplicity PRI Only (Flattened)", file=outFile) - print("{ width=38% }", file=outFile) + print("{ width=44% }", file=outFile) print("\\", file=outFile) print("", file=outFile) print("### K-mer Multiplicity PRI (Stacked)", file=outFile) - print("{ width=38% }", file=outFile) + print("{ width=44% }", file=outFile) print("\\", file=outFile) print("\\pagebreak", file=outFile) diff --git a/scripts/reportLandingPage.md b/scripts/reportLandingPage.md index d3ab3bf4fbdee5dac253cf6a5037028cbe0984eb..d46692de7eff19aad5b6f6565473c8d4c99df106 100644 --- a/scripts/reportLandingPage.md +++ b/scripts/reportLandingPage.md @@ -1,9 +1,8 @@ --- -geometry: a4paper, margin=1.3cm +geometry: a3paper, margin=1.3cm classoption: - table - twocolumn -documentclass: article urlcolor: blue colorlinks: true header-includes: @@ -15,8 +14,6 @@ header-includes: \let\longtable\supertabular \let\endlongtable\endsupertabular \let\endhead\relax - \newcommand{\blandscape}{\begin{landscape}} - \newcommand{\elandscape}{\end{landscape}} ``` output: pdf_document --- diff --git a/scripts/reportTABLELandingPage.md b/scripts/reportTABLELandingPage.md new file mode 100644 index 0000000000000000000000000000000000000000..882d4052e3152bdaa57e571e25b33334d12a0622 --- /dev/null +++ b/scripts/reportTABLELandingPage.md @@ -0,0 +1,19 @@ +--- +geometry: a3paper, margin=1.3cm +classoption: + - table + - onecolumn +urlcolor: blue +colorlinks: true +header-includes: +- | + ```{=latex} + \rowcolors{2}{gray!10}{gray!25} + \usepackage{longtable} + \let\endhead\relax + \usepackage{pdflscape} + \newcommand{\blandscape}{\begin{landscape}} + \newcommand{\elandscape}{\end{landscape}} + ``` +output: pdf_document +--- diff --git a/scripts/stats.py b/scripts/stats.py index a374b702b34e0b3a08b515ede0344ba32663b7cc..497b6e80794c2edebac968292b563397e779de37 100644 --- a/scripts/stats.py +++ b/scripts/stats.py @@ -183,7 +183,7 @@ if __name__ == "__main__": contig_lens, scaffold_lens, gc_cont = read_genome(infilename) # contig_stats = calculate_stats(contig_lens, gc_cont) scaffold_stats = calculate_stats(scaffold_lens, gc_cont) - rounded_gc = round(gc_cont, 4) + rounded_gc = round(gc_cont, 2) # print("this is the rounded_gc: ", rounded_gc) contig_stats = calculate_stats(contig_lens, gc_cont) gaps=contig_stats.get('sequence_count') - scaffold_stats.get('sequence_count') diff --git a/scripts/two_read_bam_combiner.pl b/scripts/two_read_bam_combiner.pl new file mode 100644 index 0000000000000000000000000000000000000000..96e6149c93a46905e897a7bb0a346aa3990dbbcf --- /dev/null +++ b/scripts/two_read_bam_combiner.pl @@ -0,0 +1,124 @@ +#!/usr/bin/perl + +use strict; + +MAIN : { + + my ($read1_bam, $read2_bam) = @ARGV; + + if ((not defined $read1_bam) || + (not defined $read2_bam)) { + die ("Usage: ./two_read_bam_combiner.pl <read 1 bam> <read 2 bam>\n"); + } + + open(FILE1,"samtools view -h $read1_bam |"); + open(FILE2,"samtools view -h $read2_bam |"); + + my $line1 = <FILE1>; + my $line2 = <FILE2>; + + my $counter = 0; + my $new_counter = 0; + + while (defined $line1) { + ## the line directly below this needed to be modified slightly from the genotyping pipeline ## + if ($line1 =~ /^(\@)SQ/){ + if ($line1 ne $line2){print $line1;print $line2; die ("inconsistent bam header.");} + else{ + print $line1; + } + $line1 = <FILE1>; + $line2 = <FILE2>; + next; + } + + $counter++; + if ($counter == ($new_counter + 1000000)) { + print STDERR $counter . "\n"; + $new_counter = $counter; + } + + chomp $line1; + chomp $line2; + + my ($id1, $flag1, $chr_from1, $loc_from1, $mapq1, $cigar1, $d1_1, $d2_1, $d3_1, $read1, $read_qual1, @rest1) = split(/\t/,$line1); + my ($id2, $flag2, $chr_from2, $loc_from2, $mapq2, $cigar2, $d1_2, $d2_2, $d3_2, $read2, $read_qual2, @rest2) = split(/\t/,$line2); + + if ($id1 ne $id2) { + die ("The id's of the two files do not match up at line number $counter\n"); + } + + my $bin1 = reverse(dec2bin($flag1)); + my $bin2 = reverse(dec2bin($flag2)); + + my @binary1 = split(//,$bin1); + my @binary2 = split(//,$bin2); + + my $trouble = 0; + if (($binary1[2] == 1) || ($mapq1 < 10)) { + $trouble = 1; + } + if (($binary2[2]== 1) || ($mapq2 < 10)) { + $trouble = 1; + } + + my $proper_pair1; + my $proper_pair2; + my $dist1; + my $dist2; + + if (($binary1[2] == 0) && ($binary2[2] == 0)) { + $proper_pair1 = 1; + $proper_pair2 = 1; + if ($chr_from1 eq $chr_from2) { + my $dist = abs($loc_from1 - $loc_from2); + if ($loc_from1 >= $loc_from2) { + $dist1 = -1*$dist; + $dist2 = $dist; + } else { + $dist1 = $dist; + $dist2 = -1*$dist; + } + } else { + $dist1 = 0; + $dist2 = 0; + } + } else { + $proper_pair1 = 0; + $proper_pair2 = 0; + $dist1 = 0; + $dist2 = 0; + } + + my $new_bin1 = join("","000000000000000000000",$binary1[10],$binary1[9],"0","0","1",$binary2[4],$binary1[4],$binary2[2],$binary1[2],$proper_pair1,"1"); + my $new_bin2 = join("","000000000000000000000",$binary2[10],$binary2[9],"0","1","0",$binary1[4],$binary2[4],$binary1[2],$binary2[2],$proper_pair2,"1"); + + my $new_flag1 = bin2dec($new_bin1); + my $new_flag2 = bin2dec($new_bin2); + + unless ($trouble == 1) { + + print(join("\t",$id1,$new_flag1,$chr_from1,$loc_from1,$mapq1,$cigar1,$chr_from2,$loc_from2,$dist1,$read1,$read_qual1,@rest1) . "\n"); + print(join("\t",$id2,$new_flag2,$chr_from2,$loc_from2,$mapq2,$cigar2,$chr_from1,$loc_from1,$dist2,$read2,$read_qual2,@rest2) . "\n"); + + } + + $line1 = <FILE1>; + $line2 = <FILE2>; + + } + +} + + + +sub dec2bin { + + my $str = unpack("B32", pack("N", shift)); + return $str; + +} + +sub bin2dec { + return unpack("N", pack("B32", substr("0" x 32 . shift, -32))); +}