Code owners
Assign users and groups as approvers for specific file changes. Learn more.
makePDF_indivMD.py 9.19 KiB
#!/usr/bin/env python3
#import os
#import numpy as np
import pandas as pd
from tabulate import tabulate
import re
gfastats_asm1=snakemake.input[0]
# extract gfastats values
def extract_gfastats_values(content, keys):
return [re.findall(f"{key}: (.+)", content)[0] for key in keys]
keys = ["Total scaffold length",
"GC content %",
"# gaps in scaffolds",
"Total gap length in scaffolds",
"# scaffolds",
"Largest scaffold",
"Scaffold N50",
"Scaffold L50",
"Scaffold L90",
"# contigs",
"Largest contig",
"Contig N50",
"Contig L50",
"Contig L90",
"Expected genome size",
]
with open(gfastats_asm1, 'r') as file:
content = file.read()
data = extract_gfastats_values(content, keys)
gfastats_table = pd.DataFrame({"Metric": keys, "asm1": data})
expected_genome_size = gfastats_table[gfastats_table["Metric"] == "Expected genome size"]["asm1"].values[0]
gfastats_table = gfastats_table.iloc[:-1]
genomescope_linear=snakemake.input[1]
genomescope_log = ""
smudgeplot_plot = ""
run_smudgeplot = snakemake.params[6]
kmer_plot_flat = snakemake.params[7]
if run_smudgeplot:
smudgeplot_plot=snakemake.input[2]
else:
genomescope_log=snakemake.input[2]
QV_score=snakemake.input[3]
QV_score_table = pd.read_csv(QV_score, dtype=str, delim_whitespace=True, skip_blank_lines=True, names=["assembly_name","Assembly_Only", "Total_Kmers", "QV_Score", "Error"])
QV_score_table = QV_score_table.drop(['Error'], axis=1).set_index(['assembly_name'])
kmer_completeness=snakemake.input[4]
kmer_completeness_table = pd.read_csv(kmer_completeness, dtype=str, delim_whitespace=True, skip_blank_lines=True, names=["assembly_name","all","Kmers_Assembly", "Kmers_Reads", "%"])
kmer_completeness_table = kmer_completeness_table.drop(["all"], axis=1).set_index(['assembly_name'])
busco_asm1=snakemake.input[5]
with open(busco_asm1) as scoresOnlyFile:
lines = scoresOnlyFile.readlines()
lines = [line.rstrip() for line in lines]
#create the busco table:
percentage_data = re.split(r'[,\[\]]', lines[0])
percentage_values = {item.split(':')[0]: item.split(':')[1].split('[')[0] for item in percentage_data if len(item) > 0}
names = ["Complete", "Complete and single-copy", "Complete and duplicated", "Fragmented", "Missing", "Total searched groups"]
letters = list(percentage_values.keys())
numbers = []
percentages = []
# Iterate through the data starting from the second line
i = 0
for line in lines[1:]:
parts = line.split('\t')
number = int(parts[0])
# Extract the corresponding percentage from the dictionary
percentage_key = letters[i] # Using the first letter as the key to match the percentage
percentage = float(percentage_values.get(percentage_key, '0.0').rstrip('%'))
numbers.append(number)
percentages.append(percentage)
i += 1
# Create a pandas DataFrame
busco_table = pd.DataFrame({
'Metric': names,
'asm1 #': numbers,
'asm1 %': percentages
})
kmer_flat=snakemake.input[6]
kmer_stacked=snakemake.input[7]
kmer_flat_asm1=snakemake.input[8]
kmer_stacked_asm1=snakemake.input[9]
input_gscopeSum=snakemake.input[10]
with open(input_gscopeSum, "r") as f:
for line in f:
if 'Heterozygous (ab)' in line:
heterozygous_ab_value = float(line.split()[3][:-1])
heterozygous_ab_value = round(heterozygous_ab_value, 3)
break
params_assemblyID=snakemake.params[0]
params_merylKmer=snakemake.params[1]
params_buscoDB=snakemake.params[2]
params_asm2provided=snakemake.params[3]
number_haploptypes=2 if params_asm2provided else 1
if params_asm2provided:
gfastats_asm2=snakemake.params[4]
with open(gfastats_asm2, 'r') as file:
content = file.read()
data = extract_gfastats_values(content, keys)
gfastats_asm2_table = pd.DataFrame({"Metric": keys, "asm2": data})
gfastats_asm2_table = gfastats_asm2_table.iloc[:-1]
gfastats_table = pd.concat([gfastats_table, gfastats_asm2_table["asm2"]], axis=1)
busco_asm2=snakemake.params[5]
with open(busco_asm2) as scoresOnlyFile:
lines = scoresOnlyFile.readlines()
lines = [line.rstrip() for line in lines]
percentage_data = re.split(r'[,\[\]]', lines[0])
percentage_values = {item.split(':')[0]: item.split(':')[1].split('[')[0] for item in percentage_data if len(item) > 0}
percentages = []
numbers = []
# Iterate through the data starting from the second line
i = 0
for line in lines[1:]:
parts = line.split('\t')
number = int(parts[0])
# Extract the corresponding percentage from the dictionary
percentage_key = letters[i] # Using the first letter as the key to match the percentage
percentage = float(percentage_values.get(percentage_key, '0.0').rstrip('%'))
numbers.append(number)
percentages.append(percentage)
i += 1
busco_table_2 = pd.DataFrame({
'Metric': names,
'asm2 #': numbers,
'asm2 %': percentages
})
busco_table = pd.merge(busco_table, busco_table_2, on='Metric')
print(busco_table)
else:
gfastats_asm2=""
busco_asm2=""
# Change the percentage values to 100.0 for 'Total searched groups'
busco_table.loc[busco_table['Metric'] == 'Total searched groups', 'asm1 %'] = 100.0
if 'asm2 %' in busco_table.columns:
busco_table.loc[busco_table['Metric'] == 'Total searched groups', 'asm2 %'] = 100.0
output_file=snakemake.output[0]
with open(output_file, 'w') as outFile:
# print("---", file=outFile)
# print("geometry: b3paper, margin=1.3cm", file=outFile)
# print("classoption:", file=outFile)
# print(" - table", file=outFile)
# print(" - twocolumn", file=outFile)
# print("urlcolor: blue", file=outFile)
# print("colorlinks: true", file=outFile)
# print("header-includes:", file=outFile)
# print("- |", file=outFile)
# print(" ```{=latex}", file=outFile)
# print(r" \rowcolors{2}{gray!10}{gray!25}", file=outFile)
# print(r" \usepackage{longtable}", file=outFile)
# print(r" \usepackage{supertabular}", file=outFile)
# print(r" \let\longtable\supertabular", file=outFile)
# print(r" \let\endlongtable\endsupertabular", file=outFile)
# print(r" \let\endhead\relax", file=outFile)
# print(r" ```", file=outFile)
# print("output: pdf_document", file=outFile)
# print("---", file=outFile)
print("", file=outFile)
print("\\Large", file=outFile)
print("\\twocolumn", file=outFile)
print("# \\LARGE Assembly ID:", params_assemblyID, file=outFile)
print("\\hrule", file=outFile) # Add a horizontal line
print("Database built with kmer size: ", params_merylKmer, "bp\n", file=outFile)
print("Number of haploptypes/assemblies: ", number_haploptypes, file=outFile)
print("\nExpected genome size: ", expected_genome_size, file=outFile)
print("\nHeterozygosity: ", heterozygous_ab_value, " %", file=outFile)
print("", file=outFile)
print("\\large", file=outFile)
print("## QV Score", file=outFile)
print(tabulate(QV_score_table, headers='keys',tablefmt="pipe", showindex=True), file=outFile)
print("", file=outFile)
print("\\large", file=outFile)
print("## Kmer Completeness", file=outFile)
print(tabulate(kmer_completeness_table, headers='keys',tablefmt="pipe", showindex=True), file=outFile)
print("", file=outFile)
print("## BUSCOv5 (database: ", params_buscoDB, ")", file=outFile)
print(tabulate(busco_table, headers='keys',tablefmt="pipe", showindex=False, disable_numparse=True), file=outFile)
print("## Genomescope2 Profile (Linear)", file=outFile)
print("{ width=38% }", file=outFile)
print("", file=outFile)
if run_smudgeplot:
print("## Smudgeplot Profile", file=outFile)
print("{ width=38% }", file=outFile)
print("", file=outFile)
else:
print("## Genomescope2 Profile (Log)", file=outFile)
print("{ width=38% }", file=outFile)
print("\\", file=outFile)
print("\\pagebreak", file=outFile)
print("\\", file=outFile)
print("\\Large", file=outFile)
print("", file=outFile)
print("", file=outFile)
print("", file=outFile)
print("## Genome Statistics with gfastats", file=outFile)
print(tabulate(gfastats_table, headers='keys',tablefmt="pipe", showindex=False, disable_numparse=True), file=outFile)
print("", file=outFile)
print("", file=outFile)
print("\\", file=outFile)
print("\\", file=outFile)
print("\\large", file=outFile)
print("", file=outFile)
print("\\large", file=outFile)
print("", file=outFile)
print("\\large", file=outFile)
print("", file=outFile)
if kmer_plot_flat:
print("## K-mer Multiplicity asm1 Only (Flat)", file=outFile)
print("{ width=43% }", file=outFile)
print("\\", file=outFile)
print("\\Large", file=outFile)
print("", file=outFile)
else:
print("## K-mer Multiplicity asm1 Only (Stacked)", file=outFile)
print("{ width=43% }", file=outFile)
print("\\", file=outFile)
print("\\Large", file=outFile)
print("", file=outFile)
if kmer_plot_flat and params_asm2provided:
print("## K-mer Multiplicity asm1 + asm2 (Flat)", file=outFile)
print("{ width=43% }", file=outFile)
print("\\", file=outFile)
print("\\Large", file=outFile)
print("", file=outFile)
elif params_asm2provided:
print("## K-mer Multiplicity asm1 + asm2 (Stacked)", file=outFile)
print("{ width=43% }", file=outFile)
print("\\", file=outFile)
print("\\Large", file=outFile)
print("", file=outFile)