adding hic coverage

7fcdf241 · Galeone · cbfb41ec · 7fcdf241 · 7fcdf241 · 7fcdf241
Commit 7fcdf241 authored 1 year ago by Galeone
--- a/.gitignore
+++ b/.gitignore
+.DS_Store
+
+#ignoring the downloaded database
+buscoLineage/
+buscoLineage
+
+#ignoring the new logs 
+slurm_logs/
+
+#igoring changes in .snakemake (environments)
+.snakemake/
+
+#ignoring pycache 
+__pycache__
--- a/Snakefile
+++ b/Snakefile
@@ -10,9 +10,6 @@ from urllib.request import urlopen
 from bs4 import BeautifulSoup
 import argparse, subprocess

-#temporary
-from make_erga_assembly_report import create_yaml_file
-
 container: "docker://gepdocker/gep_dockerimage:latest"
 configfile: "configuration/config.yaml"

@@ -207,20 +204,46 @@ elif set(['ID',	'ASM_LEVEL', 'busco_lineage', 'PRI_asm', 'ALT_asm',	'merylDB',	'
 	if len(samples["merylDB_basename"].unique()) != len(samples["merylDB"].unique()):
 		print ("WARNING: please check that each meryl database is named differently") 
 	
-
+	
+	last_asm1_path = ""
+	last_asm2_path = ""
+	hic1_path = ""
+	hic2_path = ""
+	hic_data = ""
 	if config['EAR']:

-		if len(samples["busco_lineage"].unique()) != 1:  # when EAR runs, only one busco lineage should be given
+		if len(samples["busco_lineage"].unique()) != 1:  # when the ear report is computed, only one busco lineage should be given
 			print ("check tsv file, multiple busco lineages inserted, all rows should point to the same busco lineage")

-		if len(samples["merylDB"].unique()) != 1: # when EAR runs, only one merylDB should be given
+		if len(samples["merylDB"].unique()) != 1: # when the ear report is computed, only one merylDB should be given
 			print ("check tsv file, multiple merylDB inserted, all rows should point to the same database")

+		hic1 = [x for x in samples["HiC_R1"].unique() if x != 'None']
+		hic2 = [x for x in samples["HiC_R2"].unique() if x != 'None']
+		
+		if (len(hic1) != 0 and len(hic2) != 0):
+			hic_data = "true"
+
+			if (len(hic1) != 1 or len(hic2) != 1): # when the ear report is computed, only one HiC pair should be given
+				print ("check tsv file, multiple HiC pairs inserted, all rows should point to the same pair")
+			
+				hic1_path = hic1[0]
+				hic2_path = hic2[0]
+		else:
+			hic_data = "false"
+			hic1_path = "None"
+			hic2_path = "None"
+
+		# Extracting the last two assemblies to extimate the coverage in the EAR report
+		last_asm1_path = [x for x in samples["PRI_asm"].unique() if x != 'None'][-1]
+		if  len([x for x in samples["ALT_asm"].unique() if x != 'None']) == 0:
+			last_asm2_path = "false"
+		else:
+			last_asm2_path = [x for x in samples["ALT_asm"].unique() if x != 'None'][-1]

 		ruleAllQCFiles=[]
 		ruleAll=os.path.join(config['Results'],"1_evaluation/finalResults/GEP_FINAL_REPORT.pdf"), \
 				os.path.join(config['Results'],"1_evaluation/finalResults/EAR_report.yaml")	
-
 	else:
 		ruleAllQCFiles=[]
 		ruleAll=os.path.join(config['Results'],"1_evaluation/finalResults/GEP_FINAL_REPORT.pdf")
@@ -285,9 +308,6 @@ else:



-
-
-
 if "Results" not in config:
 	config["Results"] = "results"


--- a/configuration/config.yaml
+++ b/configuration/config.yaml

 ### PATH TO WHERE YOU WANT YOUR RESULTS FOR THIS RUN STORED - WILL BE AUTOMATICALLY CREATED IF IT DOESN'T EXIST  ###

-Results: "/scratch/galeov97/begendiv/pilot/results5"
+Results: "/scratch/galeov97/begendiv/pilot/results"

 ### FULL PATH TO YOUR SAMPLESHEET    ###

-samplesTSV: "/scratch/galeov97/begendiv/pilot/runEvaltest_nohic.tsv"
+samplesTSV: "/scratch/galeov97/begendiv/pilot/runEvaltest_test2.tsv"

 ### PATH TO DEFINE RESOURCES FILE - DO NOT CHANGE UNLESS YOU WISH TO USE IN A DIFFERENT LOCATION  ####


--- a/configuration/define_resources.yaml
+++ b/configuration/define_resources.yaml
@@ -100,6 +100,11 @@ gfastatsResults:
    time:       "02:00:00"
    threads:    8

+compute_hic_coverage_ear:
+    mem_mb:     40000
+    time:       "01:00:00"
+    threads:    16
+
 ###### Rules for illuminadb building ##########

 unzipFastq_R1_illumina:

--- a/envs/AUXILIARY_PYTHON_SCRIPTS.yaml
+++ b/envs/AUXILIARY_PYTHON_SCRIPTS.yaml
@@ -11,4 +11,5 @@ dependencies:
 - pandas
 - numpy
 - ghostscript
- - python=3.9.*
\ No newline at end of file
+ - python=3.9.*
+ - seqkit
\ No newline at end of file
--- a/rules/evaluate.smk
+++ b/rules/evaluate.smk
@@ -982,7 +982,6 @@ rule Reports_md:
 		cat {input.landingPage} {input.indivMD} {input.IndividualPretextMD} >> {output.FullMarkdown}
 		"""

-
 rule Reports_pdf:
 	input:
 		md_report=os.path.join(config['Results'],"1_evaluation/finalResults/individual_reports/ALL_individual_REPORTS.md")
@@ -997,7 +996,6 @@ rule Reports_pdf:
 		(pandoc -o {output.pdf_report} {input.md_report} --pdf-engine=tectonic) &>> {log}
 		"""

-
 rule ConcatAll_pdfs:
 	input:
 		pdf_report=os.path.join(config['Results'],"1_evaluation/finalResults/individual_reports/ALL_individual_REPORTS.pdf"),
@@ -1014,12 +1012,52 @@ rule ConcatAll_pdfs:
 		(gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile={output.pdf_report} {input.pdf_report} {input.gradient} {input.coloured}) &>> {log}
 		"""

+rule compute_hic_coverage_ear:
+	output:
+		os.path.join(config['Results'], "1_evaluation/finalResults/hic_coverage_estimate.txt")
+	params:
+		asm1_path = last_asm1_path,
+		asm2_path = last_asm2_path,
+		hic1_path = hic1_path,
+		hic2_path = hic2_path
+	conda:	
+		os.path.join(workflow.basedir, "envs/AUXILIARY_PYTHON_SCRIPTS.yaml")	
+	log:
+		os.path.join(config['Results'],"1_evaluation/logs/hic_coverage_estimate_seqkit.log")
+	threads:
+		resource['compute_hic_coverage_ear']['threads']
+	resources:
+		mem_mb=resource['gfastatsResults']['mem_mb'],
+		time=resource['gfastatsResults']['time']
+	shell:
+		"""
+		# Get bases	
+		HiC_BASES=$(seqkit stats --threads {threads} {params.hic1_path} {params.hic2_path} | awk 'NR > 1 {{gsub(",", "", $5); sum += $5}} END {{print sum}}')
+		
+		ASM_BASES=$(seqkit stats --threads {threads} {params.asm1_path} | awk 'NR > 1 {{gsub(",", "", $5); sum += $5}} END {{print sum}}')
+
+		# Check if asm2 is provided
+		if {params.asm2_path}; then
+			ASM2_BASES=$(seqkit stats --threads {threads} {params.asm2_path} | awk 'NR > 1 {{gsub(",", "", $5); sum += $5}} END {{print sum}}')
+			# check for the largest asm
+			if ((ASM_BASES < ASM2_BASES)); then
+				ASM_BASES=$ASM2_BASES
+			fi
+		fi
+
+		# Get coverage
+		HIC_COV=$(awk "BEGIN {{printf \\"%.2f\\", $HiC_BASES / $ASM_BASES}}")
+		# write a file with this value only
+		echo $HIC_COV > {output}
+		"""
+
 rule make_ear:
 	input:
 		expand(os.path.join(config['Results'], "1_evaluation/kmer_profiling/{name}/genomescope/results_summary.txt"), name = merylDB_basename()),
 		[expand(os.path.join(config['Results'], "1_evaluation/{asmID}/KEY_RESULTS/short_summary.specific.{asmID}_asm1.txt"), asmID = key) for key, values in dictSamples.items()],
 		[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/QV.KMER-COMPLETENESS.CN-SPECTRA/{asmID}_merqOutput.qv"), asmID = key) for key, values in dictSamples.items()],
-		[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/ASSEMBLY_STATISTICS/asm1/{asmID}_gfastats.txt"), asmID = key) for key, values in dictSamples.items()]
+		[expand(os.path.join(config['Results'],"1_evaluation/{asmID}/ASSEMBLY_STATISTICS/asm1/{asmID}_gfastats.txt"), asmID = key) for key, values in dictSamples.items()],
+		hic_coverage_file = os.path.join(config['Results'], "1_evaluation/finalResults/hic_coverage_estimate.txt")
 	output:
 		os.path.join(config['Results'],"1_evaluation/finalResults/EAR_report.yaml")
 	conda:
@@ -1030,10 +1068,17 @@ rule make_ear:
 		busco = samples["busco_lineage"].unique()[0],
 		results_folder = config['Results'],
 		script = os.path.join(workflow.basedir, "scripts/report/make_erga_assembly_report.py"),
-		smudgeplot =  config['smudgeplot']
+		smudgeplot =  config['smudgeplot'],
+		hic_data = hic_data
 	log:
 		os.path.join(config['Results'], "1_evaluation/logs/ear_report.log")
 	shell:
 		"""
-		python {params.script} {output} {params.samples_tsv_path} {params.merylDB} {params.busco} {params.results_folder} {params.smudgeplot}
+		if {params.hic_data}; then
+			hic_coverage=$(cat {input.hic_coverage_file})
+		else
+			hic_coverage=0
+		fi
+		
+		python {params.script} {output} {params.samples_tsv_path} {params.merylDB} {params.busco} {params.results_folder} {params.smudgeplot} $hic_coverage
 		"""
--- a/scripts/report/make_erga_assembly_report.py
+++ b/scripts/report/make_erga_assembly_report.py
@@ -2,7 +2,7 @@ import os
 import sys
 import pandas as pd

-def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot):
+def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage):
 	genomescope_version = "2.0"
 	smudgeplot_version = "0.2.5"

@@ -14,6 +14,7 @@ def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseN
 # GENERAL INFORMATION
 ToLID: #<Insert ToLID>
 Species: #<Insert species name>
+Sex: <Insert species sex> # for example: XX, XY, ZZ, ZW, unknown, NA...
 Submitter: #<Insert submitter full name>
 Affiliation: #<Insert affiliation>
 Tag: #<Insert tag> # valid tags are ERGA-Pilot, ERGA-BGE, ERGA-Satellite
@@ -21,8 +22,13 @@ Tag: #<Insert tag> # valid tags are ERGA-Pilot, ERGA-BGE, ERGA-Satellite

 # SEQUENCING DATA
 DATA:
-	- #<Insert data type>: #<insert data coverage> # if coverage is not available, leave it empty
-
+	- hic: ''' 
+	if hic_coverage != 0:
+		yaml_content += f'''{hic_coverage} # this is an estimate of the coverage of the Hi-C data
+'''
+	else: yaml_content += f''':#<insert data coverage> # if coverage is not available, leave it empty
+'''
+	yaml_content += f'''
 # GENOME PROFILING DATA
 PROFILING:
 	GenomeScope:
@@ -77,4 +83,5 @@ if __name__ == "__main__":
 		buscoDataBaseName = sys.argv[4]
 		results_folder = sys.argv[5]
 		smudgeplot = True if sys.argv[6] == "True" else False
-		create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot)
+		hic_coverage = sys.argv[7]
+		create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage)