#!/usr/bin/env python3
"""Write a pre-filled YAML template for the ERGA Assembly Report (EAR).

The generated file is meant to be completed by hand (placeholders look like
``#<Insert ...>``) and then given to the make_EAR.py script from
https://github.com/ERGA-consortium/EARs.
"""

import os
import sys
import pandas as pd

# NOTE(review): the YAML indentation in the templates below was reconstructed
# from the key hierarchy (2 spaces per level) -- confirm against a known-good
# EAR yaml before relying on the exact whitespace.
_YAML_HEADER = '''# This is the yaml file for generating the ERGA Assembly Report (EAR) using the make_EAR.py script (https://github.com/ERGA-consortium/EARs)
# Please complete the required information pointed as #<Insert ...>
# The file [example]rCarCar2_EAR.yaml contains an example of a completed yaml file (https://github.com/ERGA-consortium/EARs)

# GENERAL INFORMATION
ToLID: #<Insert ToLID>
Species: #<Insert species name>
Sex: <Insert species sex> # for example: XX, XY, ZZ, ZW, unknown, NA...
Submitter: #<Insert submitter full name>
Affiliation: #<Insert affiliation>
Tag: #<Insert tag> # valid tags are ERGA-Pilot, ERGA-BGE, ERGA-Satellite

# SEQUENCING DATA
DATA:
  - hic: '''

_USAGE = (
    "Usage: {prog} <output_yaml> <samples_tsv> <merylDB> <buscoDataBaseName> "
    "<results_folder> <smudgeplot: True|False> <hic_coverage> <kmer_coverage>"
)


def _coverage_is_known(coverage):
    """Return True unless *coverage* is empty or numerically zero.

    Values arrive as strings when the script is run from the command line,
    so "0" / "0.0" must be recognised as "no coverage available" as well as
    the integer 0. Non-numeric text (e.g. "30x") counts as a known coverage.
    """
    text = str(coverage).strip()
    if not text:
        return False
    try:
        return float(text) != 0
    except ValueError:
        return True


def _haplotype_entry(results_folder, asm_id, busco_db, hap_name, asm_dir, note, curated):
    """Build the YAML lines for one haplotype (hap1/hap2) of an assembly.

    *asm_dir* is the per-haplotype sub-folder name used in the result paths
    ("asm1" or "asm2"); *note* is the trailing comment placed after the
    haplotype key. Hi-C map and blobplot entries are added only when
    *curated* is true.
    """
    eval_dir = f"{results_folder}/1_evaluation/{asm_id}"
    entry = f'''    {hap_name}: # {note}
      gfastats--nstar-report_txt: {eval_dir}/ASSEMBLY_STATISTICS/{asm_dir}/{asm_id}_gfastats.txt
      busco_short_summary_txt: {eval_dir}/BUSCOs/{asm_id}/{asm_dir}/short_summary.specific.{busco_db}_odb10.{asm_id}.txt
      merqury_folder: {eval_dir}/QV.KMER-COMPLETENESS.CN-SPECTRA
'''
    if curated:
        hic_prefix = f"{eval_dir}/HiC_MAPS/{asm_id}.{asm_dir}.HiC.COMBINED.FILTERED"
        entry += f'''      hic_FullMap_png: {hic_prefix}_FullMap.png
      hic_FullMap_link: # Upload the file {hic_prefix}.pretext to the web and insert the link here
      blobplot_cont_png: #<Insert blobplot contamination .png file full path>
'''
    return entry


def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage, kmer_coverage):
    """Write the EAR YAML template to *output_file_path*.

    Args:
        output_file_path: Path of the YAML file to create (overwritten).
        samples_tsv_path: Whitespace-delimited sample sheet; the columns
            ``ID``, ``ASM_LEVEL`` and ``ALT_asm`` are read.
        merylDB: Name of the meryl database folder used in the k-mer
            profiling result paths.
        buscoDataBaseName: BUSCO lineage name; ``_odb10`` is appended when
            building the short-summary file name.
        results_folder: Root folder under which ``1_evaluation`` lives.
        smudgeplot: When true, a Smudgeplot section is emitted.
        hic_coverage: Hi-C coverage estimate. Zero (as a number or as the
            strings "0"/"0.0") or an empty value emits a placeholder instead.
        kmer_coverage: K-mer based coverage estimate, written verbatim.

    Raises:
        KeyError: If the sample sheet lacks one of the required columns.
    """
    genomescope_version = "2.0"
    smudgeplot_version = "0.2.5"
    profiling_dir = f"{results_folder}/1_evaluation/kmer_profiling/{merylDB}"

    parts = [_YAML_HEADER]

    if _coverage_is_known(hic_coverage):
        parts.append(f"{hic_coverage} # this is an estimate of the coverage of the Hi-C data\n")
    else:
        # No leading ':' here: the header already ends with "- hic: ".
        parts.append("#<insert data coverage> # if coverage is not available, leave it empty\n")

    parts.append(f'''  - hifi/illumina #<select data type>: {kmer_coverage} # this is an estimate of the coverage based on the kmer coverage computed by genomescope2

# GENOME PROFILING DATA
PROFILING:
  GenomeScope:
    version: {genomescope_version}
    results_folder: {profiling_dir}/genomescope/
''')

    if smudgeplot:
        parts.append(f'''  Smudgeplot:
    version: {smudgeplot_version}
    results_folder: {profiling_dir}/smudgeplot/
''')

    parts.append("ASSEMBLIES:\n")

    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True and splits on the same runs of whitespace.
    samples = pd.read_csv(samples_tsv_path, dtype=str, index_col=False, sep=r"\s+", skip_blank_lines=True)
    samples = samples.set_index(['ID'])

    for asm_id, row in samples.iterrows():
        curated = row['ASM_LEVEL'] != "scaff"
        assembly_level = "Curated" if curated else "Pre-curation"

        parts.append(f'''  {assembly_level}:
    pipeline: [<Insert ToolA_v1.2.3|ParamX|ParamY>, <Insert ToolB_v2.3.4>] # valid input is empty or between brackets ToolName followed by _v followed by versionNumber followed by | followed by keyToolParameter
''')
        parts.append(_haplotype_entry(
            results_folder, asm_id, buscoDataBaseName, "hap1", "asm1",
            "this name was autocompleted by GEP, check the correct naming, valid types are hap1, pri, collapsed",
            curated,
        ))

        alt_asm = row['ALT_asm']
        # The alternate assembly may be the literal text "None" or a missing
        # value, depending on how pandas parsed the cell.
        if alt_asm != "None" and not pd.isna(alt_asm):
            parts.append(_haplotype_entry(
                results_folder, asm_id, buscoDataBaseName, "hap2", "asm2",
                "this name was autocompleted by GEP, check the correct naming",
                curated,
            ))
        parts.append("\n")

    with open(output_file_path, 'w', encoding="utf-8") as file:
        file.write("".join(parts))


if __name__ == "__main__":
    if len(sys.argv) < 9:
        sys.exit(_USAGE.format(prog=sys.argv[0]))
    output_file_path = sys.argv[1]
    samples_tsv_path = sys.argv[2]
    merylDB = sys.argv[3]
    buscoDataBaseName = sys.argv[4]
    results_folder = sys.argv[5]
    smudgeplot = sys.argv[6] == "True"
    hic_coverage = sys.argv[7]
    kmer_coverage = sys.argv[8]
    create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage, kmer_coverage)