makePDF_indivMD.py

#!/usr/bin/env python3

#import os
#import numpy as np
import pandas as pd
from tabulate import tabulate
import re

gfastats_asm1=snakemake.input[0]

# extract gfastats values
def extract_gfastats_values(content, keys):
    return [re.findall(f"{key}: (.+)", content)[0] for key in keys]

keys = ["Total scaffold length",
        "GC content %",
        "# gaps in scaffolds",
        "Total gap length in scaffolds",
        "# scaffolds",
        "Largest scaffold",
        "Scaffold N50",
        "Scaffold L50",
        "Scaffold L90",
        "# contigs",
        "Largest contig",
        "Contig N50",
        "Contig L50",
        "Contig L90",
        "Expected genome size",
]

with open(gfastats_asm1, 'r') as file:
	content = file.read()
	data = extract_gfastats_values(content, keys)

gfastats_table = pd.DataFrame({"Metric": keys, "asm1": data})
expected_genome_size = gfastats_table[gfastats_table["Metric"] == "Expected genome size"]["asm1"].values[0]

gfastats_table = gfastats_table.iloc[:-1]


genomescope_linear=snakemake.input[1]

genomescope_log = ""
smudgeplot_plot = ""
run_smudgeplot = snakemake.params[6]
kmer_plot_flat = snakemake.params[7]

if run_smudgeplot:
	smudgeplot_plot=snakemake.input[2]
else:
	genomescope_log=snakemake.input[2]


QV_score=snakemake.input[3]
QV_score_table = pd.read_csv(QV_score, dtype=str, delim_whitespace=True, skip_blank_lines=True, names=["assembly_name","Assembly_Only", "Total_Kmers", "QV_Score", "Error"])
QV_score_table	= QV_score_table.drop(['Error'], axis=1).set_index(['assembly_name'])

kmer_completeness=snakemake.input[4]
kmer_completeness_table = pd.read_csv(kmer_completeness, dtype=str, delim_whitespace=True, skip_blank_lines=True, names=["assembly_name","all","Kmers_Assembly", "Kmers_Reads", "%"])
kmer_completeness_table	= kmer_completeness_table.drop(["all"], axis=1).set_index(['assembly_name'])

busco_asm1=snakemake.input[5]

with open(busco_asm1) as scoresOnlyFile:
	lines = scoresOnlyFile.readlines()
	lines = [line.rstrip() for line in lines]

#create the busco table:
percentage_data = re.split(r'[,\[\]]', lines[0])
percentage_values = {item.split(':')[0]: item.split(':')[1].split('[')[0] for item in percentage_data if len(item) > 0}

names = ["Complete", "Complete and single-copy", "Complete and duplicated", "Fragmented", "Missing", "Total searched groups"]
letters = list(percentage_values.keys())
numbers = []
percentages = []

# Iterate through the data starting from the second line
i = 0
for line in lines[1:]:
	parts = line.split('\t')
	number = int(parts[0])

	# Extract the corresponding percentage from the dictionary
	percentage_key = letters[i]  # Using the first letter as the key to match the percentage
	percentage = float(percentage_values.get(percentage_key, '0.0').rstrip('%'))

	numbers.append(number)
	percentages.append(percentage)

	i += 1

 # Create a pandas DataFrame
busco_table = pd.DataFrame({
	'Metric': names,
	'asm1 #': numbers,
	'asm1 %': percentages
})

kmer_flat=snakemake.input[6]
kmer_stacked=snakemake.input[7]
kmer_flat_asm1=snakemake.input[8]
kmer_stacked_asm1=snakemake.input[9]
input_gscopeSum=snakemake.input[10]

with open(input_gscopeSum, "r") as f:
    for line in f:
        if 'Heterozygous (ab)' in line:
            heterozygous_ab_value = float(line.split()[3][:-1])
            heterozygous_ab_value = round(heterozygous_ab_value, 3)
            break


params_assemblyID=snakemake.params[0]

params_merylKmer=snakemake.params[1]

params_buscoDB=snakemake.params[2]

params_asm2provided=snakemake.params[3]

number_haploptypes=2 if params_asm2provided else 1

if params_asm2provided:
	gfastats_asm2=snakemake.params[4]

	with open(gfastats_asm2, 'r') as file:
		content = file.read()
		data = extract_gfastats_values(content, keys)

	gfastats_asm2_table = pd.DataFrame({"Metric": keys, "asm2": data})

	gfastats_asm2_table = gfastats_asm2_table.iloc[:-1]
	gfastats_table = pd.concat([gfastats_table, gfastats_asm2_table["asm2"]], axis=1)

	busco_asm2=snakemake.params[5]

	with open(busco_asm2) as scoresOnlyFile:
		lines = scoresOnlyFile.readlines()
		lines = [line.rstrip() for line in lines]

		percentage_data = re.split(r'[,\[\]]', lines[0])
		percentage_values = {item.split(':')[0]: item.split(':')[1].split('[')[0] for item in percentage_data if len(item) > 0}

		percentages = []
		numbers = []
		# Iterate through the data starting from the second line
		i = 0
		for line in lines[1:]:
			parts = line.split('\t')
			number = int(parts[0])

			# Extract the corresponding percentage from the dictionary
			percentage_key = letters[i]  # Using the first letter as the key to match the percentage
			percentage = float(percentage_values.get(percentage_key, '0.0').rstrip('%'))

			numbers.append(number)
			percentages.append(percentage)

			i += 1

		busco_table_2 = pd.DataFrame({
			'Metric': names,
			'asm2 #': numbers,
			'asm2 %': percentages
		})

		busco_table = pd.merge(busco_table, busco_table_2, on='Metric')
		print(busco_table)


else:
	gfastats_asm2=""
	busco_asm2=""

# Change the percentage values to 100.0 for 'Total searched groups'
busco_table.loc[busco_table['Metric'] == 'Total searched groups', 'asm1 %'] = 100.0
if 'asm2 %' in busco_table.columns:
    busco_table.loc[busco_table['Metric'] == 'Total searched groups', 'asm2 %'] = 100.0

output_file=snakemake.output[0]

with open(output_file, 'w') as outFile:
#	print("---", file=outFile)
#	print("geometry: b3paper, margin=1.3cm", file=outFile)
#	print("classoption:", file=outFile)
#	print(" - table", file=outFile)
#	print(" - twocolumn", file=outFile)
#	print("urlcolor: blue", file=outFile)
#	print("colorlinks: true", file=outFile)
#	print("header-includes:", file=outFile)
#	print("- |", file=outFile)
#	print("  ```{=latex}", file=outFile)
#	print(r"  \rowcolors{2}{gray!10}{gray!25}", file=outFile)
#	print(r"  \usepackage{longtable}", file=outFile)
#	print(r"  \usepackage{supertabular}", file=outFile)
#	print(r"  \let\longtable\supertabular", file=outFile)
#	print(r"  \let\endlongtable\endsupertabular", file=outFile)
#	print(r"  \let\endhead\relax", file=outFile)
#	print(r"  ```", file=outFile)
#	print("output: pdf_document", file=outFile)
#	print("---", file=outFile)
	print("", file=outFile)
	print("\\Large", file=outFile)
	print("\\twocolumn", file=outFile)
	print("# \\LARGE Assembly ID:", params_assemblyID, file=outFile)
	print("\\hrule", file=outFile)  # Add a horizontal line
	print("Database built with kmer size: ", params_merylKmer, "bp\n", file=outFile)
	print("Number of haploptypes/assemblies: ", number_haploptypes, file=outFile)
	print("\nExpected genome size: ", expected_genome_size, file=outFile)
	print("\nHeterozygosity: ", heterozygous_ab_value, " %",  file=outFile)
	print("", file=outFile)
	print("\\large", file=outFile)
	print("## QV Score", file=outFile)
	print(tabulate(QV_score_table, headers='keys',tablefmt="pipe", showindex=True), file=outFile)
	print("", file=outFile)
	print("\\large", file=outFile)
	print("## Kmer Completeness", file=outFile)
	print(tabulate(kmer_completeness_table, headers='keys',tablefmt="pipe", showindex=True), file=outFile)
	print("", file=outFile)
	print("## BUSCOv5 (database: ", params_buscoDB, ")", file=outFile)
	print(tabulate(busco_table, headers='keys',tablefmt="pipe", showindex=False, disable_numparse=True), file=outFile)
	print("## Genomescope2 Profile (Linear)", file=outFile)
	print("![](", genomescope_linear, "){ width=38% }", file=outFile)
	print("", file=outFile)

	if run_smudgeplot:
		print("## Smudgeplot Profile", file=outFile)
		print("![](", smudgeplot_plot, "){ width=38% }", file=outFile)
		print("", file=outFile)
	else:
		print("## Genomescope2 Profile (Log)", file=outFile)
		print("![](", genomescope_log, "){ width=38% }", file=outFile)
		print("\\", file=outFile)

	print("\\pagebreak", file=outFile)
	print("\\", file=outFile)
	print("\\Large", file=outFile)
	print("", file=outFile)
	print("", file=outFile)
	print("", file=outFile)
	print("## Genome Statistics with gfastats", file=outFile)
	print(tabulate(gfastats_table, headers='keys',tablefmt="pipe", showindex=False, disable_numparse=True), file=outFile)
	print("", file=outFile)
	print("", file=outFile)
	print("\\", file=outFile)
	print("\\", file=outFile)
	print("\\large", file=outFile)
	print("", file=outFile)
	print("\\large", file=outFile)
	print("", file=outFile)
	print("\\large", file=outFile)
	print("", file=outFile)

	if kmer_plot_flat:
		print("## K-mer Multiplicity asm1 Only (Flat)", file=outFile)
		print("![](", kmer_flat_asm1, "){ width=43% }", file=outFile)
		print("\\", file=outFile)
		print("\\Large", file=outFile)
		print("", file=outFile)

	else:
		print("## K-mer Multiplicity asm1 Only (Stacked)", file=outFile)
		print("![](", kmer_stacked_asm1, "){ width=43% }", file=outFile)
		print("\\", file=outFile)
		print("\\Large", file=outFile)
		print("", file=outFile)

	if kmer_plot_flat and params_asm2provided:
		print("## K-mer Multiplicity asm1 + asm2 (Flat)", file=outFile)
		print("![](", kmer_flat, "){ width=43% }", file=outFile)
		print("\\", file=outFile)
		print("\\Large", file=outFile)
		print("", file=outFile)

	elif params_asm2provided:
		print("## K-mer Multiplicity asm1 + asm2 (Stacked)", file=outFile)
		print("![](", kmer_stacked, "){ width=43% }", file=outFile)
		print("\\", file=outFile)
		print("\\Large", file=outFile)
		print("", file=outFile)