diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a8bfcdc9dd2b28e1f2e56a6929005a93ef0c3e5c --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +.DS_Store + +#ignoring the downloaded database +buscoLineage/ +buscoLineage + +#ignoring the new logs +slurm_logs/ + +�#igoring changes in .snakemake (environments) +.snakemake/ + +#ignoring pycache +__pycache__ diff --git a/Snakefile b/Snakefile index e284fb9b9f2ef483d4215cfba409e5a0c7be1538..df54f01c904ec4a1e71154b41d81f3ad45220077 100644 --- a/Snakefile +++ b/Snakefile @@ -10,9 +10,6 @@ from urllib.request import urlopen from bs4 import BeautifulSoup import argparse, subprocess -#temporary -from make_erga_assembly_report import create_yaml_file - container: "docker://gepdocker/gep_dockerimage:latest" configfile: "configuration/config.yaml" @@ -207,20 +204,46 @@ elif set(['ID', 'ASM_LEVEL', 'busco_lineage', 'PRI_asm', 'ALT_asm', 'merylDB', ' if len(samples["merylDB_basename"].unique()) != len(samples["merylDB"].unique()): print ("WARNING: please check that each meryl database is named differently") - + + last_asm1_path = "" + last_asm2_path = "" + hic1_path = "" + hic2_path = "" + hic_data = "" if config['EAR']: - if len(samples["busco_lineage"].unique()) != 1: # when EAR runs, only one busco lineage should be given + if len(samples["busco_lineage"].unique()) != 1: # when the ear report is computed, only one busco lineage should be given print ("check tsv file, multiple busco lineages inserted, all rows should point to the same busco lineage") - if len(samples["merylDB"].unique()) != 1: # when EAR runs, only one merylDB should be given + if len(samples["merylDB"].unique()) != 1: # when the ear report is computed, only one merylDB should be given print ("check tsv file, multiple merylDB inserted, all rows should point to the same database") + hic1 = [x for x in samples["HiC_R1"].unique() if x != 'None'] + hic2 = [x for x in samples["HiC_R2"].unique() if x != 'None'] + + if (len(hic1) != 0 and len(hic2) != 0): + hic_data = "true" + + if (len(hic1) != 1 or len(hic2) != 1): # when the ear report is computed, only one HiC pair should be given + print ("check tsv file, multiple HiC pairs inserted, all rows should point to the same pair") + + hic1_path = hic1[0] + hic2_path = hic2[0] + else: + hic_data = "false" + hic1_path = "None" + hic2_path = "None" + + # Extracting the last two assemblies to extimate the coverage in the EAR report + last_asm1_path = [x for x in samples["PRI_asm"].unique() if x != 'None'][-1] + if len([x for x in samples["ALT_asm"].unique() if x != 'None']) == 0: + last_asm2_path = "false" + else: + last_asm2_path = [x for x in samples["ALT_asm"].unique() if x != 'None'][-1] ruleAllQCFiles=[] ruleAll=os.path.join(config['Results'],"1_evaluation/finalResults/GEP_FINAL_REPORT.pdf"), \ os.path.join(config['Results'],"1_evaluation/finalResults/EAR_report.yaml") - else: ruleAllQCFiles=[] ruleAll=os.path.join(config['Results'],"1_evaluation/finalResults/GEP_FINAL_REPORT.pdf") @@ -285,9 +308,6 @@ else: - - - if "Results" not in config: config["Results"] = "results" diff --git a/configuration/config.yaml b/configuration/config.yaml index c068eaf74be477346c792e908d4c6dd9ea4e8677..958571b3c89f2879d9a91b75a7b766f701419e0c 100644 --- a/configuration/config.yaml +++ b/configuration/config.yaml @@ -1,11 +1,11 @@ ### PATH TO WHERE YOU WANT YOUR RESULTS FOR THIS RUN STORED - WILL BE AUTOMATICALLY CREATED IF IT DOESN'T EXIST ### -Results: "/scratch/galeov97/begendiv/pilot/results5" +Results: "/scratch/galeov97/begendiv/pilot/results" ### FULL PATH TO YOUR SAMPLESHEET ### -samplesTSV: "/scratch/galeov97/begendiv/pilot/runEvaltest_nohic.tsv" +samplesTSV: "/scratch/galeov97/begendiv/pilot/runEvaltest_test2.tsv" ### PATH TO DEFINE RESOURCES FILE - DO NOT CHANGE UNLESS YOU WISH TO USE IN A DIFFERENT LOCATION #### diff --git a/configuration/define_resources.yaml b/configuration/define_resources.yaml index 367d7896270d5350b6e4bfe49e32e52cf92008c7..6b83074738d5f7b3c59d8fb290e6f036621edada 100644 --- a/configuration/define_resources.yaml +++ b/configuration/define_resources.yaml @@ -100,6 +100,11 @@ gfastatsResults: time: "02:00:00" threads: 8 +compute_hic_coverage_ear: + mem_mb: 40000 + time: "01:00:00" + threads: 16 + ###### Rules for illuminadb building ########## unzipFastq_R1_illumina: diff --git a/envs/AUXILIARY_PYTHON_SCRIPTS.yaml b/envs/AUXILIARY_PYTHON_SCRIPTS.yaml index 01def04d5d68653961710063dffd8885547951db..74aa8e2192c78ee17f9fa681f3489ca655c0e3c5 100644 --- a/envs/AUXILIARY_PYTHON_SCRIPTS.yaml +++ b/envs/AUXILIARY_PYTHON_SCRIPTS.yaml @@ -11,4 +11,5 @@ dependencies: - pandas - numpy - ghostscript - - python=3.9.* \ No newline at end of file + - python=3.9.* + - seqkit \ No newline at end of file diff --git a/rules/evaluate.smk b/rules/evaluate.smk index 5b52808727fb6611a95f060397f8f758394abdd4..0482e589707a4778c20bdd153612b72ec2bc19a8 100644 --- a/rules/evaluate.smk +++ b/rules/evaluate.smk @@ -982,7 +982,6 @@ rule Reports_md: cat {input.landingPage} {input.indivMD} {input.IndividualPretextMD} >> {output.FullMarkdown} """ - rule Reports_pdf: input: md_report=os.path.join(config['Results'],"1_evaluation/finalResults/individual_reports/ALL_individual_REPORTS.md") @@ -997,7 +996,6 @@ rule Reports_pdf: (pandoc -o {output.pdf_report} {input.md_report} --pdf-engine=tectonic) &>> {log} """ - rule ConcatAll_pdfs: input: pdf_report=os.path.join(config['Results'],"1_evaluation/finalResults/individual_reports/ALL_individual_REPORTS.pdf"), @@ -1014,12 +1012,52 @@ rule ConcatAll_pdfs: (gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile={output.pdf_report} {input.pdf_report} {input.gradient} {input.coloured}) &>> {log} """ +rule compute_hic_coverage_ear: + output: + os.path.join(config['Results'], "1_evaluation/finalResults/hic_coverage_estimate.txt") + params: + asm1_path = last_asm1_path, + asm2_path = last_asm2_path, + hic1_path = hic1_path, + hic2_path = hic2_path + conda: + os.path.join(workflow.basedir, "envs/AUXILIARY_PYTHON_SCRIPTS.yaml") + log: + os.path.join(config['Results'],"1_evaluation/logs/hic_coverage_estimate_seqkit.log") + threads: + resource['compute_hic_coverage_ear']['threads'] + resources: + mem_mb=resource['gfastatsResults']['mem_mb'], + time=resource['gfastatsResults']['time'] + shell: + """ + # Get bases + HiC_BASES=$(seqkit stats --threads {threads} {params.hic1_path} {params.hic2_path} | awk 'NR > 1 {{gsub(",", "", $5); sum += $5}} END {{print sum}}') + + ASM_BASES=$(seqkit stats --threads {threads} {params.asm1_path} | awk 'NR > 1 {{gsub(",", "", $5); sum += $5}} END {{print sum}}') + + # Check if asm2 is provided + if {params.asm2_path}; then + ASM2_BASES=$(seqkit stats --threads {threads} {params.asm2_path} | awk 'NR > 1 {{gsub(",", "", $5); sum += $5}} END {{print sum}}') + # check for the largest asm + if ((ASM_BASES < ASM2_BASES)); then + ASM_BASES=$ASM2_BASES + fi + fi + + # Get coverage + HIC_COV=$(awk "BEGIN {{printf \\"%.2f\\", $HiC_BASES / $ASM_BASES}}") + # write a file with this value only + echo $HIC_COV > {output} + """ + rule make_ear: input: expand(os.path.join(config['Results'], "1_evaluation/kmer_profiling/{name}/genomescope/results_summary.txt"), name = merylDB_basename()), [expand(os.path.join(config['Results'], "1_evaluation/{asmID}/KEY_RESULTS/short_summary.specific.{asmID}_asm1.txt"), asmID = key) for key, values in dictSamples.items()], [expand(os.path.join(config['Results'],"1_evaluation/{asmID}/QV.KMER-COMPLETENESS.CN-SPECTRA/{asmID}_merqOutput.qv"), asmID = key) for key, values in dictSamples.items()], - [expand(os.path.join(config['Results'],"1_evaluation/{asmID}/ASSEMBLY_STATISTICS/asm1/{asmID}_gfastats.txt"), asmID = key) for key, values in dictSamples.items()] + [expand(os.path.join(config['Results'],"1_evaluation/{asmID}/ASSEMBLY_STATISTICS/asm1/{asmID}_gfastats.txt"), asmID = key) for key, values in dictSamples.items()], + hic_coverage_file = os.path.join(config['Results'], "1_evaluation/finalResults/hic_coverage_estimate.txt") output: os.path.join(config['Results'],"1_evaluation/finalResults/EAR_report.yaml") conda: @@ -1030,10 +1068,17 @@ rule make_ear: busco = samples["busco_lineage"].unique()[0], results_folder = config['Results'], script = os.path.join(workflow.basedir, "scripts/report/make_erga_assembly_report.py"), - smudgeplot = config['smudgeplot'] + smudgeplot = config['smudgeplot'], + hic_data = hic_data log: os.path.join(config['Results'], "1_evaluation/logs/ear_report.log") shell: """ - python {params.script} {output} {params.samples_tsv_path} {params.merylDB} {params.busco} {params.results_folder} {params.smudgeplot} + if {params.hic_data}; then + hic_coverage=$(cat {input.hic_coverage_file}) + else + hic_coverage=0 + fi + + python {params.script} {output} {params.samples_tsv_path} {params.merylDB} {params.busco} {params.results_folder} {params.smudgeplot} $hic_coverage """ diff --git a/scripts/report/make_erga_assembly_report.py b/scripts/report/make_erga_assembly_report.py index f684f7b10fb0f81c725ac3cb52ab29f9379580ef..d40ce6c343a9e07d543333b84317933da1f4835a 100644 --- a/scripts/report/make_erga_assembly_report.py +++ b/scripts/report/make_erga_assembly_report.py @@ -2,7 +2,7 @@ import os import sys import pandas as pd -def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot): +def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage): genomescope_version = "2.0" smudgeplot_version = "0.2.5" @@ -14,6 +14,7 @@ def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseN # GENERAL INFORMATION ToLID: #<Insert ToLID> Species: #<Insert species name> +Sex: <Insert species sex> # for example: XX, XY, ZZ, ZW, unknown, NA... Submitter: #<Insert submitter full name> Affiliation: #<Insert affiliation> Tag: #<Insert tag> # valid tags are ERGA-Pilot, ERGA-BGE, ERGA-Satellite @@ -21,8 +22,13 @@ Tag: #<Insert tag> # valid tags are ERGA-Pilot, ERGA-BGE, ERGA-Satellite # SEQUENCING DATA DATA: - - #<Insert data type>: #<insert data coverage> # if coverage is not available, leave it empty - + - hic: ''' + if hic_coverage != 0: + yaml_content += f'''{hic_coverage} # this is an estimate of the coverage of the Hi-C data +''' + else: yaml_content += f''':#<insert data coverage> # if coverage is not available, leave it empty +''' + yaml_content += f''' # GENOME PROFILING DATA PROFILING: GenomeScope: @@ -77,4 +83,5 @@ if __name__ == "__main__": buscoDataBaseName = sys.argv[4] results_folder = sys.argv[5] smudgeplot = True if sys.argv[6] == "True" else False - create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot) + hic_coverage = sys.argv[7] + create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage)