Skip to content
Snippets Groups Projects
Commit 0f3a6311 authored by ddepanis's avatar ddepanis
Browse files

Add new script for EAR

parent 209f1b03
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
import os
import sys
import pandas as pd
def _coverage_is_zero(value):
    """Return True when *value* is numerically zero (e.g. 0, 0.0, "0", "0.0").

    Values that cannot be interpreted as a number (None, "", arbitrary text)
    are reported as non-zero so they are written to the yaml unchanged.
    """
    try:
        return float(value) == 0
    except (TypeError, ValueError):
        return False


def create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage, kmer_coverage):
    """Write a pre-filled yaml template for the ERGA Assembly Report (EAR).

    Args:
        output_file_path: Path of the yaml file to write (overwritten if present).
        samples_tsv_path: Whitespace-delimited table read with pandas; the
            columns ``ID``, ``ASM_LEVEL`` and ``ALT_asm`` are used.
        merylDB: Name inserted into the kmer-profiling result paths.
        buscoDataBaseName: BUSCO lineage name inserted into the BUSCO
            short-summary file name (``<name>_odb10``).
        results_folder: Root folder prefixed to every generated result path.
        smudgeplot: When truthy, a Smudgeplot section is added under PROFILING.
        hic_coverage: Hi-C coverage estimate. A numerically zero value (int or
            string, as received from the command line) means "not available"
            and produces an insert-placeholder instead of a number.
        kmer_coverage: Kmer-based coverage written for the hifi/illumina entry.

    Returns:
        None. The yaml text is written to ``output_file_path``.
    """
    genomescope_version = "2.0"
    smudgeplot_version = "0.2.5"

    # NOTE(review): the template literals below are kept exactly as found in
    # this file; confirm the YAML nesting indentation against the EAR example
    # yaml before relying on the generated output.
    yaml_content = '''# This is the yaml file for generating the ERGA Assembly Report (EAR) using the make_EAR.py script (https://github.com/ERGA-consortium/EARs)
# Please complete the required information pointed as #<Insert ...>
# The file [example]rCarCar2_EAR.yaml contains an example of a completed yaml file (https://github.com/ERGA-consortium/EARs)
# GENERAL INFORMATION
ToLID: #<Insert ToLID>
Species: #<Insert species name>
Sex: <Insert species sex> # for example: XX, XY, ZZ, ZW, unknown, NA...
Submitter: #<Insert submitter full name>
Affiliation: #<Insert affiliation>
Tag: #<Insert tag> # valid tags are ERGA-Pilot, ERGA-BGE, ERGA-Satellite
# SEQUENCING DATA
DATA:
- hic: '''

    # The coverage usually arrives as a string from sys.argv, so a plain
    # `!= 0` comparison would never detect "0"; compare numerically instead.
    if not _coverage_is_zero(hic_coverage):
        yaml_content += f'''{hic_coverage} # this is an estimate of the coverage of the Hi-C data
'''
    else:
        yaml_content += f''':#<insert data coverage> # if coverage is not available, leave it empty
'''

    yaml_content += f''' - hifi/illumina #<select data type>: {kmer_coverage} # this is an estimate of the coverage based on the kmer coverage computed by genomescope2
# GENOME PROFILING DATA
PROFILING:
GenomeScope:
version: {genomescope_version}
results_folder: {results_folder}/1_evaluation/kmer_profiling/{merylDB}/genomescope/
'''

    if smudgeplot:
        yaml_content += f''' Smudgeplot:
version: {smudgeplot_version}
results_folder: {results_folder}/1_evaluation/kmer_profiling/{merylDB}/smudgeplot/
'''

    yaml_content += '''ASSEMBLIES:
'''

    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True (pandas >= 2.2).
    samples = pd.read_csv(samples_tsv_path, dtype=str, index_col=False, sep=r"\s+", skip_blank_lines=True)
    samples = samples.set_index(['ID'])

    for asmID, row in samples.iterrows():
        # Only "scaff" rows are reported as pre-curation; every other level
        # is treated as a curated assembly.
        assembly_level = "Pre-curation" if row['ASM_LEVEL'] == "scaff" else "Curated"
        yaml_content += f''' {assembly_level}:
pipeline: [<Insert ToolA_v1.2.3|ParamX|ParamY>, <Insert ToolB_v2.3.4>] # valid input is empty or between brackets ToolName followed by _v followed by versionNumber followed by | followed by keyToolParameter
hap1: # this name was autocompleted by GEP, check the correct naming, valid types are hap1, pri, collapsed
gfastats--nstar-report_txt: {results_folder}/1_evaluation/{asmID}/ASSEMBLY_STATISTICS/asm1/{asmID}_gfastats.txt
busco_short_summary_txt: {results_folder}/1_evaluation/{asmID}/BUSCOs/{asmID}/asm1/short_summary.specific.{buscoDataBaseName}_odb10.{asmID}.txt
merqury_folder: {results_folder}/1_evaluation/{asmID}/QV.KMER-COMPLETENESS.CN-SPECTRA
'''
        if assembly_level == "Curated":
            yaml_content += f''' hic_FullMap_png: {results_folder}/1_evaluation/{asmID}/HiC_MAPS/{asmID}.asm1.HiC.COMBINED.FILTERED_FullMap.png
hic_FullMap_link: # Upload the file {results_folder}/1_evaluation/{asmID}/HiC_MAPS/{asmID}.asm1.HiC.COMBINED.FILTERED.pretext to the web and insert the link here
blobplot_cont_png: #<Insert blobplot contamination .png file full path>
'''

        # A second haplotype section is written only when an alternate
        # assembly is given (neither the literal "None" nor a missing value).
        if (row['ALT_asm'] != "None") and (not pd.isna(row['ALT_asm'])):
            yaml_content += f''' hap2: # this name was autocompleted by GEP, check the correct naming
gfastats--nstar-report_txt: {results_folder}/1_evaluation/{asmID}/ASSEMBLY_STATISTICS/asm2/{asmID}_gfastats.txt
busco_short_summary_txt: {results_folder}/1_evaluation/{asmID}/BUSCOs/{asmID}/asm2/short_summary.specific.{buscoDataBaseName}_odb10.{asmID}.txt
merqury_folder: {results_folder}/1_evaluation/{asmID}/QV.KMER-COMPLETENESS.CN-SPECTRA
'''
            if assembly_level == "Curated":
                yaml_content += f''' hic_FullMap_png: {results_folder}/1_evaluation/{asmID}/HiC_MAPS/{asmID}.asm2.HiC.COMBINED.FILTERED_FullMap.png
hic_FullMap_link: # Upload the file {results_folder}/1_evaluation/{asmID}/HiC_MAPS/{asmID}.asm2.HiC.COMBINED.FILTERED.pretext to the web and insert the link here
blobplot_cont_png: #<Insert blobplot contamination .png file full path>
'''
        # Blank line after each assembly entry.
        yaml_content += "\n"

    with open(output_file_path, 'w') as file:
        file.write(yaml_content)
if __name__ == "__main__":
    # Command-line entry point. Eight positional arguments are required, in
    # this order: output yaml path, samples table path, meryl DB name, BUSCO
    # database name, results folder, smudgeplot flag, Hi-C coverage and
    # kmer coverage. Extra trailing arguments are ignored.
    if len(sys.argv) < 9:
        # Fail with a readable usage message instead of a bare IndexError.
        sys.exit(
            f"Usage: {sys.argv[0]} <output_yaml> <samples_tsv> <merylDB> "
            "<buscoDataBaseName> <results_folder> <smudgeplot: True|False> "
            "<hic_coverage> <kmer_coverage>"
        )
    output_file_path = sys.argv[1]
    samples_tsv_path = sys.argv[2]
    merylDB = sys.argv[3]
    buscoDataBaseName = sys.argv[4]
    results_folder = sys.argv[5]
    # Only the exact string "True" enables the Smudgeplot section.
    smudgeplot = sys.argv[6] == "True"
    # Coverage values are passed through as the strings given on the command line.
    hic_coverage = sys.argv[7]
    kmer_coverage = sys.argv[8]
    create_yaml_file(output_file_path, samples_tsv_path, merylDB, buscoDataBaseName, results_folder, smudgeplot, hic_coverage, kmer_coverage)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment