From 43b4c8be9cfe0b549500a0ab23d2e530b6e56e6d Mon Sep 17 00:00:00 2001 From: valegale <valentina_galeone@outlook.it> Date: Wed, 12 Feb 2025 02:57:28 +0100 Subject: [PATCH] adding longQC rule --- .gitignore | 1 + Snakefile | 5 +++- TODO | 3 ++- configuration/define_resources.yaml | 5 ++++ envs/LONGQC.yaml | 13 ++++++++++ rules/build_hifi.smk | 37 ++++++++++++++++++++++++++++- 6 files changed, 61 insertions(+), 3 deletions(-) create mode 100644 envs/LONGQC.yaml diff --git a/.gitignore b/.gitignore index a8bfcdc..5c567d6 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ buscoLineage/ buscoLineage +tools/LongQC/ #ignoring the new logs slurm_logs/ diff --git a/Snakefile b/Snakefile index 288ad8d..8facab1 100644 --- a/Snakefile +++ b/Snakefile @@ -138,8 +138,11 @@ elif set(['sample', 'hifi_reads', 'meryl_kmer_size','trimSMRTbell', 'fastQC']).i samples=samples.set_index(['sample','readCounter']) - ruleAllQCFiles=[] + longqc_dir = "tools/LongQC" + + ruleAllQCFiles=[] + ### TODO change this column in a generic QC analysis. 
(both nanoplot and longqc running) if samples['fastQC'].str.contains('True').any(): ruleAllQCFiles=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/multiqc/{sample}.multiqcReport.html"), sample=key) for key, [value1, value2] in testDictQC.items()] ruleAll=[expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/merylDb/complete_hifi.{sample}.{kmer}.meryl"), sample=key, kmer=value1) for key, [value1, value2] in dictSamples.items()] diff --git a/TODO b/TODO index 23abf39..95f1314 100644 --- a/TODO +++ b/TODO @@ -1,4 +1,5 @@ - add nanoplot DONE - remove cutadapter - change hifi to long reads or something similar -- add longqc (optional) +- add longqc DONE +- add a column for pacbio/ont diff --git a/configuration/define_resources.yaml b/configuration/define_resources.yaml index b70c4dd..4383973 100644 --- a/configuration/define_resources.yaml +++ b/configuration/define_resources.yaml @@ -167,6 +167,11 @@ nanoplot_hifi: time: "12:00:00" threads: 2 +longQC_hifi: + mem_mb: 12000 + time: "12:00:00" + threads: 4 + multiQC_hifi: mem_mb: 4000 time: "06:00:00" diff --git a/envs/LONGQC.yaml b/envs/LONGQC.yaml new file mode 100644 index 0000000..170b1d9 --- /dev/null +++ b/envs/LONGQC.yaml @@ -0,0 +1,13 @@ +channels: + - anaconda + - conda-forge + - bioconda +dependencies: + - matplotlib=3.10.0 + - scikit-learn=1.6.1 + - pandas=2.2.3 + - jinja2=3.1.5 + - h5py=3.12.1 + - pysam=0.23.0 + - edlib=1.2.3 + - python-edlib=1.3.9.post1 \ No newline at end of file diff --git a/rules/build_hifi.smk b/rules/build_hifi.smk index c05f060..2b1bd5f 100644 --- a/rules/build_hifi.smk +++ b/rules/build_hifi.smk @@ -113,7 +113,8 @@ rule fastQC_hifi: rule multiQC_hifi: input: lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/fastqc/{readCounter}.{smrtornot}_fastqc.html"), sample=wildcards.sample, readCounter=dictReadCounter[wildcards.sample], smrtornot=dictSamples[wildcards.sample][1]), - 
os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/nanoplot/NanoPlot-report.html") + os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/nanoplot/NanoPlot-report.html"), + os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/longQC/results/web_summary.html") params: folder2qc=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/fastqc/"), folder2qc2=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/nanoplot/"), @@ -199,3 +200,37 @@ rule NanoPlot: """ (NanoPlot -t {threads} -o {params.folder2out} --fastq {input} --plots kde dot) &> {log} """ + +rule longQC: + input: + fastq = lambda wildcards: expand(os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/temp_trimReads/{readCounter}.{smrtornot}.fastq"), sample=wildcards.sample, readCounter=dictReadCounter[wildcards.sample], smrtornot=dictSamples[wildcards.sample][1]), + path_lonqqc = os.path.join(longqc_dir,"longQC.py") + params: + folder2out=os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/longQC/") + output: + os.path.join(config['Results'],"0_buildDatabases/{sample}/hifiReads/QC/longQC/results/web_summary.html") + threads: + resource['longQC_hifi']['threads'] + resources: + mem_mb=resource['longQC_hifi']['mem_mb'], + time=resource['longQC_hifi']['time'], + log: + os.path.join(config['Results'], "0_buildDatabases/{sample}/hifiReads/logs/longQC.log") + conda: + os.path.join(workflow.basedir, "envs/LONGQC.yaml") + shell: + """ + ## TODO making either pacbio or nanopore for -x + (python tools/LongQC/longQC.py sampleqc -x ont-rapid -o {params.folder2out}/results {input.fastq}) &> {log} + """ + +rule install_longqc: + output: + os.path.join(longqc_dir,"longQC.py") + shell: + """ + mkdir -p tools + cd tools + git clone https://github.com/yfukasawa/LongQC.git + cd LongQC/minimap2-coverage && make + """ \ No newline at end of file -- GitLab