From 0f3a63110feda19dc3997afb4826446781f32c70 Mon Sep 17 00:00:00 2001
From: ddepanis <ddepanis@mi.fu-berlin.de>
Date: Sat, 16 Mar 2024 12:11:41 +0000
Subject: [PATCH] Add new script for EAR

---
 scripts/report/make_erga_assembly_report.py | 95 +++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 scripts/report/make_erga_assembly_report.py

diff --git a/scripts/report/make_erga_assembly_report.py b/scripts/report/make_erga_assembly_report.py
new file mode 100644
index 0000000..2a692b5
--- /dev/null
+++ b/scripts/report/make_erga_assembly_report.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import pandas as pd
+
+def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage, kmer_coverage):
+    """Write a pre-filled ERGA Assembly Report (EAR) yaml template to output_file_path.
+
+    samples_tsv_path is a whitespace-delimited table with the columns ID,
+    ASM_LEVEL and ALT_asm. Each row adds one ASSEMBLIES entry ("Pre-curation" if
+    ASM_LEVEL is "scaff", else "Curated") with a hap1 block, plus a hap2 block
+    unless ALT_asm is missing or "None". Smudgeplot is listed only when smudgeplot
+    is truthy. hic_coverage of 0, "0", "" or None leaves a placeholder instead.
+    """
+    genomescope_version = "2.0"
+    smudgeplot_version = "0.2.5"
+    # Command-line values are strings and "0" != 0 is always true, so test numerically.
+    try:
+        has_hic_coverage = float(hic_coverage) != 0
+    except (TypeError, ValueError):
+        has_hic_coverage = bool(hic_coverage)
+
+    chunks = ['''# This is the yaml file for generating the ERGA Assembly Report (EAR) using the make_EAR.py script (https://github.com/ERGA-consortium/EARs)
+# Please complete the required information pointed as #<Insert ...>
+# The file [example]rCarCar2_EAR.yaml contains an example of a completed yaml file (https://github.com/ERGA-consortium/EARs)
+
+# GENERAL INFORMATION
+ToLID: #<Insert ToLID>
+Species: #<Insert species name>
+Sex: <Insert species sex> # for example: XX, XY, ZZ, ZW, unknown, NA...
+Submitter: #<Insert submitter full name>
+Affiliation: #<Insert affiliation>
+Tag: #<Insert tag> # valid tags are ERGA-Pilot, ERGA-BGE, ERGA-Satellite
+
+# SEQUENCING DATA
+DATA:
+    - hic: ''']
+    if has_hic_coverage:
+        chunks.append(f"{hic_coverage} # this is an estimate of the coverage of the Hi-C data\n")
+    else:
+        # A bare comment keeps the value empty; the former ":#<...>" was parsed as a string.
+        chunks.append("#<insert data coverage> # if coverage is not available, leave it empty\n")
+    chunks.append(f'''    - hifi/illumina #<select data type>: {kmer_coverage} # this is an estimate of the coverage based on the kmer coverage computed by genomescope2
+
+# GENOME PROFILING DATA
+PROFILING:
+    GenomeScope:
+        version: {genomescope_version}
+        results_folder: {results_folder}/1_evaluation/kmer_profiling/{merylDB}/genomescope/
+''')
+    if smudgeplot:
+        chunks.append(f'''    Smudgeplot:
+        version: {smudgeplot_version}
+        results_folder: {results_folder}/1_evaluation/kmer_profiling/{merylDB}/smudgeplot/
+''')
+    chunks.append("ASSEMBLIES:\n")
+    # sep=r'\s+' replaces delim_whitespace=True, which pandas deprecated in 2.2.
+    samples = pd.read_csv(samples_tsv_path, dtype=str, index_col=False, sep=r'\s+', skip_blank_lines=True).set_index(['ID'])
+
+    for asmID, row in samples.iterrows():
+        assembly_level = "Pre-curation" if row['ASM_LEVEL'] == "scaff" else "Curated"
+        eval_dir = f"{results_folder}/1_evaluation/{asmID}"
+        chunks.append(f'''  {assembly_level}:
+    pipeline: [<Insert ToolA_v1.2.3|ParamX|ParamY>, <Insert ToolB_v2.3.4>] # valid input is empty or between brackets ToolName followed by _v followed by versionNumber followed by | followed by keyToolParameter
+''')
+        # (yaml key, results sub-folder, extra naming hint); hap2 only with an alternate assembly.
+        haplotypes = [("hap1", "asm1", ", valid types are hap1, pri, collapsed")]
+        if (row['ALT_asm'] != "None") and (not pd.isna(row['ALT_asm'])):
+            haplotypes.append(("hap2", "asm2", ""))
+        for hap, asm, naming_hint in haplotypes:
+            chunks.append(f'''    {hap}: # this name was autocompleted by GEP, check the correct naming{naming_hint}
+      gfastats--nstar-report_txt: {eval_dir}/ASSEMBLY_STATISTICS/{asm}/{asmID}_gfastats.txt
+      busco_short_summary_txt: {eval_dir}/BUSCOs/{asmID}/{asm}/short_summary.specific.{buscoDataBaseName}_odb10.{asmID}.txt
+      merqury_folder: {eval_dir}/QV.KMER-COMPLETENESS.CN-SPECTRA
+''')
+            if assembly_level == "Curated":
+                chunks.append(f'''      hic_FullMap_png: {eval_dir}/HiC_MAPS/{asmID}.{asm}.HiC.COMBINED.FILTERED_FullMap.png
+      hic_FullMap_link: # Upload the file {eval_dir}/HiC_MAPS/{asmID}.{asm}.HiC.COMBINED.FILTERED.pretext to the web and insert the link here
+      blobplot_cont_png: #<Insert blobplot contamination .png file full path>
+''')
+        chunks.append("\n")
+    with open(output_file_path, 'w', encoding="utf-8") as file:
+        file.write("".join(chunks))
+
+if __name__ == "__main__":
+    # Eight positional arguments are required; print a usage line instead of
+    # failing with an IndexError when some are missing (extras stay ignored).
+    if len(sys.argv) < 9:
+        sys.exit(f"usage: {sys.argv[0]} OUTPUT_YAML SAMPLES_TSV MERYL_DB BUSCO_DB RESULTS_FOLDER SMUDGEPLOT HIC_COVERAGE KMER_COVERAGE")
+    (output_file_path, samples_tsv_path, merylDB, buscoDataBaseName,
+     results_folder, smudgeplot_arg, hic_coverage, kmer_coverage) = sys.argv[1:9]
+    # Only the exact string "True" enables the Smudgeplot section.
+    smudgeplot = smudgeplot_arg == "True"
+    create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage, kmer_coverage)
-- 
GitLab