Skip to content
Snippets Groups Projects
Commit 7fcdf241 authored by Galeone's avatar Galeone
Browse files

adding hic coverage

parent cbfb41ec
Branches
No related tags found
No related merge requests found
.DS_Store
#ignoring the downloaded database
buscoLineage/
buscoLineage
#ignoring the new logs
slurm_logs/
#igoring changes in .snakemake (environments)
.snakemake/
#ignoring pycache
__pycache__
......@@ -10,9 +10,6 @@ from urllib.request import urlopen
from bs4 import BeautifulSoup
import argparse, subprocess
#temporary
from make_erga_assembly_report import create_yaml_file
container: "docker://gepdocker/gep_dockerimage:latest"
configfile: "configuration/config.yaml"
......@@ -207,20 +204,46 @@ elif set(['ID', 'ASM_LEVEL', 'busco_lineage', 'PRI_asm', 'ALT_asm', 'merylDB', '
if len(samples["merylDB_basename"].unique()) != len(samples["merylDB"].unique()):
print ("WARNING: please check that each meryl database is named differently")
last_asm1_path = ""
last_asm2_path = ""
hic1_path = ""
hic2_path = ""
hic_data = ""
if config['EAR']:
if len(samples["busco_lineage"].unique()) != 1: # when EAR runs, only one busco lineage should be given
if len(samples["busco_lineage"].unique()) != 1: # when the ear report is computed, only one busco lineage should be given
print ("check tsv file, multiple busco lineages inserted, all rows should point to the same busco lineage")
if len(samples["merylDB"].unique()) != 1: # when EAR runs, only one merylDB should be given
if len(samples["merylDB"].unique()) != 1: # when the ear report is computed, only one merylDB should be given
print ("check tsv file, multiple merylDB inserted, all rows should point to the same database")
hic1 = [x for x in samples["HiC_R1"].unique() if x != 'None']
hic2 = [x for x in samples["HiC_R2"].unique() if x != 'None']
if (len(hic1) != 0 and len(hic2) != 0):
hic_data = "true"
if (len(hic1) != 1 or len(hic2) != 1): # when the ear report is computed, only one HiC pair should be given
print ("check tsv file, multiple HiC pairs inserted, all rows should point to the same pair")
hic1_path = hic1[0]
hic2_path = hic2[0]
else:
hic_data = "false"
hic1_path = "None"
hic2_path = "None"
# Extracting the last two assemblies to extimate the coverage in the EAR report
last_asm1_path = [x for x in samples["PRI_asm"].unique() if x != 'None'][-1]
if len([x for x in samples["ALT_asm"].unique() if x != 'None']) == 0:
last_asm2_path = "false"
else:
last_asm2_path = [x for x in samples["ALT_asm"].unique() if x != 'None'][-1]
ruleAllQCFiles=[]
ruleAll=os.path.join(config['Results'],"1_evaluation/finalResults/GEP_FINAL_REPORT.pdf"), \
os.path.join(config['Results'],"1_evaluation/finalResults/EAR_report.yaml")
else:
ruleAllQCFiles=[]
ruleAll=os.path.join(config['Results'],"1_evaluation/finalResults/GEP_FINAL_REPORT.pdf")
......@@ -285,9 +308,6 @@ else:
if "Results" not in config:
config["Results"] = "results"
......
### PATH TO WHERE YOU WANT YOUR RESULTS FOR THIS RUN STORED - WILL BE AUTOMATICALLY CREATED IF IT DOESN'T EXIST ###
Results: "/scratch/galeov97/begendiv/pilot/results5"
Results: "/scratch/galeov97/begendiv/pilot/results"
### FULL PATH TO YOUR SAMPLESHEET ###
samplesTSV: "/scratch/galeov97/begendiv/pilot/runEvaltest_nohic.tsv"
samplesTSV: "/scratch/galeov97/begendiv/pilot/runEvaltest_test2.tsv"
### PATH TO DEFINE RESOURCES FILE - DO NOT CHANGE UNLESS YOU WISH TO USE IN A DIFFERENT LOCATION ####
......
......@@ -100,6 +100,11 @@ gfastatsResults:
time: "02:00:00"
threads: 8
compute_hic_coverage_ear:
mem_mb: 40000
time: "01:00:00"
threads: 16
###### Rules for illuminadb building ##########
unzipFastq_R1_illumina:
......
......@@ -11,4 +11,5 @@ dependencies:
- pandas
- numpy
- ghostscript
- python=3.9.*
\ No newline at end of file
- python=3.9.*
- seqkit
\ No newline at end of file
......@@ -982,7 +982,6 @@ rule Reports_md:
cat {input.landingPage} {input.indivMD} {input.IndividualPretextMD} >> {output.FullMarkdown}
"""
rule Reports_pdf:
input:
md_report=os.path.join(config['Results'],"1_evaluation/finalResults/individual_reports/ALL_individual_REPORTS.md")
......@@ -997,7 +996,6 @@ rule Reports_pdf:
(pandoc -o {output.pdf_report} {input.md_report} --pdf-engine=tectonic) &>> {log}
"""
rule ConcatAll_pdfs:
input:
pdf_report=os.path.join(config['Results'],"1_evaluation/finalResults/individual_reports/ALL_individual_REPORTS.pdf"),
......@@ -1014,12 +1012,52 @@ rule ConcatAll_pdfs:
(gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile={output.pdf_report} {input.pdf_report} {input.gradient} {input.coloured}) &>> {log}
"""
rule compute_hic_coverage_ear:
output:
os.path.join(config['Results'], "1_evaluation/finalResults/hic_coverage_estimate.txt")
params:
asm1_path = last_asm1_path,
asm2_path = last_asm2_path,
hic1_path = hic1_path,
hic2_path = hic2_path
conda:
os.path.join(workflow.basedir, "envs/AUXILIARY_PYTHON_SCRIPTS.yaml")
log:
os.path.join(config['Results'],"1_evaluation/logs/hic_coverage_estimate_seqkit.log")
threads:
resource['compute_hic_coverage_ear']['threads']
resources:
mem_mb=resource['gfastatsResults']['mem_mb'],
time=resource['gfastatsResults']['time']
shell:
"""
# Get bases
HiC_BASES=$(seqkit stats --threads {threads} {params.hic1_path} {params.hic2_path} | awk 'NR > 1 {{gsub(",", "", $5); sum += $5}} END {{print sum}}')
ASM_BASES=$(seqkit stats --threads {threads} {params.asm1_path} | awk 'NR > 1 {{gsub(",", "", $5); sum += $5}} END {{print sum}}')
# Check if asm2 is provided
if {params.asm2_path}; then
ASM2_BASES=$(seqkit stats --threads {threads} {params.asm2_path} | awk 'NR > 1 {{gsub(",", "", $5); sum += $5}} END {{print sum}}')
# check for the largest asm
if ((ASM_BASES < ASM2_BASES)); then
ASM_BASES=$ASM2_BASES
fi
fi
# Get coverage
HIC_COV=$(awk "BEGIN {{printf \\"%.2f\\", $HiC_BASES / $ASM_BASES}}")
# write a file with this value only
echo $HIC_COV > {output}
"""
rule make_ear:
input:
expand(os.path.join(config['Results'], "1_evaluation/kmer_profiling/{name}/genomescope/results_summary.txt"), name = merylDB_basename()),
[expand(os.path.join(config['Results'], "1_evaluation/{asmID}/KEY_RESULTS/short_summary.specific.{asmID}_asm1.txt"), asmID = key) for key, values in dictSamples.items()],
[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/QV.KMER-COMPLETENESS.CN-SPECTRA/{asmID}_merqOutput.qv"), asmID = key) for key, values in dictSamples.items()],
[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/ASSEMBLY_STATISTICS/asm1/{asmID}_gfastats.txt"), asmID = key) for key, values in dictSamples.items()]
[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/ASSEMBLY_STATISTICS/asm1/{asmID}_gfastats.txt"), asmID = key) for key, values in dictSamples.items()],
hic_coverage_file = os.path.join(config['Results'], "1_evaluation/finalResults/hic_coverage_estimate.txt")
output:
os.path.join(config['Results'],"1_evaluation/finalResults/EAR_report.yaml")
conda:
......@@ -1030,10 +1068,17 @@ rule make_ear:
busco = samples["busco_lineage"].unique()[0],
results_folder = config['Results'],
script = os.path.join(workflow.basedir, "scripts/report/make_erga_assembly_report.py"),
smudgeplot = config['smudgeplot']
smudgeplot = config['smudgeplot'],
hic_data = hic_data
log:
os.path.join(config['Results'], "1_evaluation/logs/ear_report.log")
shell:
"""
python {params.script} {output} {params.samples_tsv_path} {params.merylDB} {params.busco} {params.results_folder} {params.smudgeplot}
if {params.hic_data}; then
hic_coverage=$(cat {input.hic_coverage_file})
else
hic_coverage=0
fi
python {params.script} {output} {params.samples_tsv_path} {params.merylDB} {params.busco} {params.results_folder} {params.smudgeplot} $hic_coverage
"""
......@@ -2,7 +2,7 @@ import os
import sys
import pandas as pd
def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot):
def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage):
genomescope_version = "2.0"
smudgeplot_version = "0.2.5"
......@@ -14,6 +14,7 @@ def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseN
# GENERAL INFORMATION
ToLID: #<Insert ToLID>
Species: #<Insert species name>
Sex: <Insert species sex> # for example: XX, XY, ZZ, ZW, unknown, NA...
Submitter: #<Insert submitter full name>
Affiliation: #<Insert affiliation>
Tag: #<Insert tag> # valid tags are ERGA-Pilot, ERGA-BGE, ERGA-Satellite
......@@ -21,8 +22,13 @@ Tag: #<Insert tag> # valid tags are ERGA-Pilot, ERGA-BGE, ERGA-Satellite
# SEQUENCING DATA
DATA:
- #<Insert data type>: #<insert data coverage> # if coverage is not available, leave it empty
- hic: '''
if hic_coverage != 0:
yaml_content += f'''{hic_coverage} # this is an estimate of the coverage of the Hi-C data
'''
else: yaml_content += f''':#<insert data coverage> # if coverage is not available, leave it empty
'''
yaml_content += f'''
# GENOME PROFILING DATA
PROFILING:
GenomeScope:
......@@ -77,4 +83,5 @@ if __name__ == "__main__":
buscoDataBaseName = sys.argv[4]
results_folder = sys.argv[5]
smudgeplot = True if sys.argv[6] == "True" else False
create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot)
hic_coverage = sys.argv[7]
create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment