From 43b4c8be9cfe0b549500a0ab23d2e530b6e56e6d Mon Sep 17 00:00:00 2001
From: valegale <valentina_galeone@outlook.it>
Date: Wed, 12 Feb 2025 02:57:28 +0100
Subject: [PATCH] adding longQC rule

---
 .gitignore                          |  1 +
 Snakefile                           |  5 +++-
 TODO                                |  3 ++-
 configuration/define_resources.yaml |  5 ++++
 envs/LONGQC.yaml                    | 13 ++++++++++
 rules/build_hifi.smk                | 37 ++++++++++++++++++++++++++++-
 6 files changed, 61 insertions(+), 3 deletions(-)
 create mode 100644 envs/LONGQC.yaml

diff --git a/.gitignore b/.gitignore
index a8bfcdc..5c567d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 buscoLineage/
 buscoLineage
 
+tools/LongQC/
 #ignoring the new logs 
 slurm_logs/
 
diff --git a/Snakefile b/Snakefile
index 288ad8d..8facab1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -138,8 +138,11 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).i
 
 
 	samples=samples.set_index(['sample','readCounter'])
-	ruleAllQCFiles=[]
+	longqc_dir = "tools/LongQC"
 
+	
+	ruleAllQCFiles=[]
+	### TODO: turn this column into a generic QC switch (running both NanoPlot and LongQC)
 	if samples['fastQC'].str.contains('True').any():
 		ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/multiqc/{sample}.multiqcReport.html"), sample=key) for key, [value1, value2] in testDictQC.items()]
 	ruleAll=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/merylDb/complete_hifi.{sample}.{kmer}.meryl"), sample=key, kmer=value1) for key, [value1, value2] in dictSamples.items()]
diff --git a/TODO b/TODO
index 23abf39..95f1314 100644
--- a/TODO
+++ b/TODO
@@ -1,4 +1,5 @@
 - add nanoplot DONE
 - remove cutadapter
 - change hifi to long reads or something similar
-- add longqc (optional)
+- add longqc DONE
+- add a column for pacbio/ont
diff --git a/configuration/define_resources.yaml b/configuration/define_resources.yaml
index b70c4dd..4383973 100644
--- a/configuration/define_resources.yaml
+++ b/configuration/define_resources.yaml
@@ -167,6 +167,11 @@ nanoplot_hifi:
     time:       "12:00:00"
     threads:    2
 
+longQC_hifi:  # looked up as resource['longQC_hifi'] by rule longQC in rules/build_hifi.smk
+    mem_mb:     12000
+    time:       "12:00:00"
+    threads:    4
+
 multiQC_hifi:
     mem_mb:     4000
     time:       "06:00:00"
diff --git a/envs/LONGQC.yaml b/envs/LONGQC.yaml
new file mode 100644
index 0000000..170b1d9
--- /dev/null
+++ b/envs/LONGQC.yaml
@@ -0,0 +1,13 @@
+channels:  # conda environment for rule longQC (referenced there as envs/LONGQC.yaml)
+ - anaconda
+ - conda-forge
+ - bioconda
+dependencies:  # every package below is pinned to an exact version
+ - matplotlib=3.10.0
+ - scikit-learn=1.6.1
+ - pandas=2.2.3
+ - jinja2=3.1.5
+ - h5py=3.12.1
+ - pysam=0.23.0
+ - edlib=1.2.3
+ - python-edlib=1.3.9.post1
\ No newline at end of file
diff --git a/rules/build_hifi.smk b/rules/build_hifi.smk
index c05f060..2b1bd5f 100644
--- a/rules/build_hifi.smk
+++ b/rules/build_hifi.smk
@@ -113,7 +113,8 @@ rule fastQC_hifi:
 rule multiQC_hifi:
 	input:
 		lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/fastqc/{readCounter}.{smrtornot}_fastqc.html"), sample=wildcards.sample, readCounter=dictReadCounter[wildcards.sample], smrtornot=dictSamples[wildcards.sample][1]),
-		os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/nanoplot/NanoPlot-report.html")
+		os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/nanoplot/NanoPlot-report.html"),
+		os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/longQC/results/web_summary.html")
 	params:
 		folder2qc=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/fastqc/"),
 		folder2qc2=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/nanoplot/"),
@@ -199,3 +200,37 @@ rule NanoPlot:
 		"""
 		(NanoPlot -t {threads} -o {params.folder2out} --fastq {input} --plots kde dot) &> {log}
 		"""
+
+rule longQC:  # runs LongQC sampleqc on the trimmed reads; multiQC_hifi requires the resulting web_summary.html
+	input:
+		fastq = lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/temp_trimReads/{readCounter}.{smrtornot}.fastq"), sample=wildcards.sample, readCounter=dictReadCounter[wildcards.sample], smrtornot=dictSamples[wildcards.sample][1]),
+		path_lonqqc = os.path.join(longqc_dir,"longQC.py")  # created by rule install_longqc; the shell command runs this path
+	params:
+		folder2out=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/longQC/")
+	output:
+		os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/longQC/results/web_summary.html")
+	threads:
+		resource['longQC_hifi']['threads']  # forwarded to LongQC through -p below
+	resources:
+		mem_mb=resource['longQC_hifi']['mem_mb'],
+		time=resource['longQC_hifi']['time'],
+	log:
+		os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/longQC.log")
+	conda:
+		os.path.join(workflow.basedir, "envs/LONGQC.yaml")
+	shell:
+		"""
+		## TODO making either pacbio or nanopore for -x
+		(python {input.path_lonqqc} sampleqc -x ont-rapid -p {threads} -o {params.folder2out}/results {input.fastq}) &> {log}
+		"""
+
+rule install_longqc:  # one-off setup: clone LongQC under tools/ and build minimap2-coverage inside the clone
+    output:
+        os.path.join(longqc_dir,"longQC.py")  # longqc_dir is "tools/LongQC" in the Snakefile; the shell paths below must match it
+    shell:
+        """
+        # git clone refuses a non-empty target, so clear any partial checkout left by a failed run
+        mkdir -p tools && rm -rf tools/LongQC
+        git clone https://github.com/yfukasawa/LongQC.git tools/LongQC
+        make -C tools/LongQC/minimap2-coverage
+        """
\ No newline at end of file
-- 
GitLab