diff --git a/SUBMIT_CONFIG/local/config.yaml b/SUBMIT_CONFIG/local/config.yaml deleted file mode 100644 index d0c1ee42ee2ba9dff711c2ca1a3f4ee0e2882c50..0000000000000000000000000000000000000000 --- a/SUBMIT_CONFIG/local/config.yaml +++ /dev/null @@ -1,10 +0,0 @@ -cores: 64 -dry-run: False -use-conda: True -latency-wait: 60 -rerun-incomplete: True -printshellcmds: False -keep-going: False - -#shadow-prefix: /srv/public/user/<user>/tempfileshere -#use-singularity: True diff --git a/scripts/assembly_stats/stats.py b/scripts/assembly_stats/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..497b6e80794c2edebac968292b563397e779de37 --- /dev/null +++ b/scripts/assembly_stats/stats.py @@ -0,0 +1,368 @@ +import numpy as np +from itertools import groupby +import json +import sys +import csv +import os +import pandas as pd +from tabulate import tabulate + +#pd.set_option('display.float_format', lambda x: '%,g' % x) +with open(sys.argv[2]) as f: + ff=f.readlines() + +# estSizeFile = open() +# +# size_bp = estSizeFile.readlines() + +estSize=int(float(sys.argv[4])) + +if estSize == 0: + size_bp = ff[0][:-1] + print("No estimated Genome Size provided, using estimation from Genomescope2: ", size_bp) +else: + print("Estimated Genome Size provided as:", estSize, " using this size for NG and LG stats") + size_bp = estSize + +def fasta_iter(fasta_file): + """ + Takes a FASTA file, and produces a generator of Header and Sequences. + This is a memory-efficient way of analyzing a FASTA files -- without + reading the entire file into memory. + + Parameters + ---------- + fasta_file : str + The file location of the FASTA file + + Returns + ------- + header: str + The string contained in the header portion of the sequence record + (everything after the '>') + seq: str + The sequence portion of the sequence record + """ + + fh = open(fasta_file) + fa_iter = (x[1] for x in groupby(fh, lambda line: line[0] == ">")) + for header in fa_iter: + # drop the ">" + header = next(header)[1:].strip() + # join all sequence lines to one. + seq = "".join(s.upper().strip() for s in next(fa_iter)) + yield header, seq + + +def read_genome(fasta_file): + """ + Takes a FASTA file, and produces 2 lists of sequence lengths. It also + calculates the GC Content, since this is the only statistic that is not + calculated based on sequence lengths. + + Parameters + ---------- + fasta_file : str + The file location of the FASTA file + + Returns + ------ + contig_lens: list + A list of lengths of all contigs in the genome. + scaffold_lens: list + A list of lengths of all scaffolds in the genome. + gc_cont: float + The percentage of total basepairs in the genome that are either G or C. + """ + + gc = 0 + total_len = 0 + contig_lens = [] + scaffold_lens = [] + for _, seq in fasta_iter(fasta_file): + scaffold_lens.append(len(seq)) + if "NN" in seq: + contig_list = seq.split("NN") + else: + contig_list = [seq] + for contig in contig_list: + if len(contig): + gc += contig.count('G') + contig.count('C') + total_len += len(contig) + contig_lens.append(len(contig)) + gc_cont = (gc / total_len) * 100 +# print("this is gc_content", gc_cont) + return contig_lens, scaffold_lens, gc_cont + + +def calculate_stats(seq_lens, gc_cont): + naming = sys.argv[3] + stats = {} + seq_array = np.array(seq_lens) +# stats['Assembly:']=naming + stats['sequence_count'] = seq_array.size + testsize=stats['sequence_count'] + stats['number_of_gaps'] = 0 +# print("this is the count",naming," ", testsize) + stats['gc_content (%)'] = gc_cont + sorted_lens = seq_array[np.argsort(-seq_array)] + stats['longest (bp)'] = int(sorted_lens[0]) + testlongest= stats['longest (bp)'] +# print("this is the longest", naming," ",testlongest) + stats['shortest (bp)'] = int(sorted_lens[-1]) +# stats['median'] = np.median(sorted_lens) +# stats['mean'] = np.mean(sorted_lens) + stats['total_bps (bp)'] = int(np.sum(sorted_lens)) + testprint=stats['total_bps (bp)'] +# print("total_bp is", naming," ",testprint) + stats['estimated_size (bp)'] = int(size_bp) + csum = np.cumsum(sorted_lens) + # if stats['total_bps (bp)'] < stats['estimated_size (bp)']: + # csum_ng = np.append(csum, stats['estimated_size (bp)']) + # else: + # csum_ng=csum + for level in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 96, 97, 98, 99, 100]: + nx = int(stats['total_bps (bp)'] * (level / 100)) + csumn = min(csum[csum >= nx]) + l_level = int(np.where(csum == csumn)[0]) + 1 + stats['L' + str(level)] = l_level + for level in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 96, 97, 98, 99, 100]: +# print("the totalbps are:", stats['total_bps (bp)']) + nx = int(stats['total_bps (bp)'] * (level / 100)) +# print("this is the nx", nx) +# print("this is the csum", csum) + csumn = min(csum[csum >= nx]) +# print("this is the csumn", csumn) + l_level = int(np.where(csum == csumn)[0]) + n_level = int(sorted_lens[l_level]) + stats['N' + str(level)] = n_level +# print(level, " ", n_level) + for level in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 96, 97, 98, 99, 100]: +# print("the estbps are:", stats['estimated_size (bp)']) + ngx = int(stats['estimated_size (bp)'] * (level / 100)) +# print("this is the ngx", ngx) +# print("this is the csum", csum_ng) +# print("this is the csum", csum) +# print("this is the [csum >= ngx]", np.array(csum >= ngx)) + new_array=np.array(csum >= ngx) + # print(np.any(new_array)) + if np.any(new_array) == False: + csumng = csum[seq_array.size-1] + # print("this is the csumng", csumng) + lg_level = int(np.where(csum == csumng)[0]) + 1 + stats['LG' + str(level)] = lg_level + elif np.any(new_array) == True: + csumng = min(csum[csum >= ngx]) + # print("this is the csumng", csumng) + lg_level = int(np.where(csum == csumng)[0]) + 1 + stats['LG' + str(level)] = lg_level + for level in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 96, 97, 98, 99, 100]: + ngx = int(stats['estimated_size (bp)'] * (level / 100)) + new_array=np.array(csum >= ngx) + # print(np.any(new_array)) + if np.any(new_array) == False: + csumng = csum[seq_array.size-1] + # print("this is the csumng", csumng) + lg_level = int(np.where(csum == csumng)[0]) + ng_level = int(sorted_lens[lg_level]) + stats['NG' + str(level)] = ng_level + elif np.any(new_array) == True: + csumng = min(csum[csum >= ngx]) + # print("this is the csumng", csumng) + lg_level = int(np.where(csum == csumng)[0]) + ng_level = int(sorted_lens[lg_level]) + stats['NG' + str(level)] = ng_level + return stats + + +if __name__ == "__main__": +# print(size_bp) +# print(type(size_bp)) + naming = sys.argv[3] + infilename = sys.argv[1] + contig_lens, scaffold_lens, gc_cont = read_genome(infilename) +# contig_stats = calculate_stats(contig_lens, gc_cont) + scaffold_stats = calculate_stats(scaffold_lens, gc_cont) + rounded_gc = round(gc_cont, 2) +# print("this is the rounded_gc: ", rounded_gc) + contig_stats = calculate_stats(contig_lens, gc_cont) + gaps=contig_stats.get('sequence_count') - scaffold_stats.get('sequence_count') + scaffold_stats['number_of_gaps'] = gaps + contig_stats['number_of_gaps'] = gaps +# print("this is the scaffold_stats: ", scaffold_stats) +# print(scaffold_stats) +# df_scaffold_all= pd.DataFrame.from_dict(scaffold_stats, orient= 'index') +# print(df_scaffold_all) +# df2 = pd.DataFrame([scaffold_stats]).T +# print(df2) +# s = pd.Series(scaffold_stats, name=naming) +# s.index.name = 'Assembly:' +# s.reset_index() +# print(s) + scaff_seq=pd.DataFrame(scaffold_stats.items(), columns=['Assembly:', naming]) +# print("this is the scaff_seq: ", scaff_seq) + df_scaffold_top=scaff_seq.iloc[0:7,0:2] +# print("this is GC CONTENT", gc_cont) + df_scaffold_top[naming]=df_scaffold_top[naming].astype('int64').apply('{:,}'.format) +# print("this is the top: ", df_scaffold_top) + df_scaffold_top[naming][2] = rounded_gc +# print("this is df_scaffold_top[naming][2]", df_scaffold_top[naming][2]) +# print("this is the top: ", df_scaffold_top) + # df_scaffold_top=df_scaffold_top.style.hide_index() +# df_scaffold_top[naming].round(decimals=0) + # df_contig_all=pd.DataFrame(data=contig_stats) + # df_contig_top=df_contig_all.iloc[0:6,0:2] + df_scaffold_Nxx=pd.DataFrame(scaffold_stats.items(), columns=['Nxx Level (%)', 'Length of Nxx Scaffold (bp)']) + df_scaffold_Nxx=df_scaffold_Nxx.iloc[31:55,0:2] + df_scaffold_Nxx=df_scaffold_Nxx.reset_index() +# print(df_scaffold_Nxx) + df_scaffold_NGxx=pd.DataFrame(scaffold_stats.items(), columns=['NGxx Level (%)', 'Length of NGxx Scaffold (bp)']) + df_scaffold_NGxx=df_scaffold_NGxx.iloc[79:104,0:2] + df_scaffold_NGxx=df_scaffold_NGxx.reset_index() +# print(df_scaffold_NGxx) + df_scaffold_N_NG=pd.concat([df_scaffold_Nxx,df_scaffold_NGxx], axis=1) + df_scaffold_N_NG=df_scaffold_N_NG.drop(df_scaffold_N_NG.columns[[0,3]], axis = 1) + df_scaffold_N_NG['Length of Nxx Scaffold (bp)']=df_scaffold_N_NG['Length of Nxx Scaffold (bp)'].astype('int64').apply('{:,}'.format) + df_scaffold_N_NG['Length of NGxx Scaffold (bp)']=df_scaffold_N_NG['Length of NGxx Scaffold (bp)'].astype('int64').apply('{:,}'.format) + # df_scaffold_N_NG=df_scaffold_N_NG.style.hide_index() + df_scaffold_Lxx=pd.DataFrame(scaffold_stats.items(), columns=['Lxx Level (%)', 'Count of scaffolds (for Lxx Level)']) + df_scaffold_Lxx=df_scaffold_Lxx.iloc[7:31,0:2] + df_scaffold_Lxx=df_scaffold_Lxx.reset_index() +# print(df_scaffold_Nxx) + df_scaffold_LGxx=pd.DataFrame(scaffold_stats.items(), columns=['LGxx Level (%)', 'Count of scaffolds (for LGxx Level)']) + df_scaffold_LGxx=df_scaffold_LGxx.iloc[55:79,0:2] + df_scaffold_LGxx=df_scaffold_LGxx.reset_index() +# print(df_scaffold_NGxx) + df_scaffold_L_LG=pd.concat([df_scaffold_Lxx,df_scaffold_LGxx], axis=1) + df_scaffold_L_LG=df_scaffold_L_LG.drop(df_scaffold_L_LG.columns[[0,3]], axis = 1) + df_scaffold_L_LG['Count of scaffolds (for Lxx Level)']=df_scaffold_L_LG['Count of scaffolds (for Lxx Level)'].astype('int64').apply('{:,}'.format) + df_scaffold_L_LG['Count of scaffolds (for LGxx Level)']=df_scaffold_L_LG['Count of scaffolds (for LGxx Level)'].astype('int64').apply('{:,}'.format) +################################################################################################################ + + contig_seq=pd.DataFrame(contig_stats.items(), columns=['Assembly:', naming]) + df_contig_top=contig_seq.iloc[0:7,0:2] + df_contig_top[naming]=df_contig_top[naming].astype('int64').apply('{:,}'.format) + df_contig_top[naming][2] = rounded_gc + # df_scaffold_top=df_scaffold_top.style.hide_index() +# df_scaffold_top[naming].round(decimals=0) + # df_contig_all=pd.DataFrame(data=contig_stats) + # df_contig_top=df_contig_all.iloc[0:6,0:2] + df_contig_Nxx=pd.DataFrame(contig_stats.items(), columns=['Nxx Level (%)', 'Length of Nxx contig (bp)']) + df_contig_Nxx=df_contig_Nxx.iloc[31:55,0:2] + df_contig_Nxx=df_contig_Nxx.reset_index() +# print(df_scaffold_Nxx) + df_contig_NGxx=pd.DataFrame(contig_stats.items(), columns=['NGxx Level (%)', 'Length of NGxx contig (bp)']) + df_contig_NGxx=df_contig_NGxx.iloc[79:104,0:2] + df_contig_NGxx=df_contig_NGxx.reset_index() +# print(df_scaffold_NGxx) + df_contig_N_NG=pd.concat([df_contig_Nxx,df_contig_NGxx], axis=1) + df_contig_N_NG=df_contig_N_NG.drop(df_contig_N_NG.columns[[0,3]], axis = 1) + df_contig_N_NG['Length of Nxx contig (bp)']=df_contig_N_NG['Length of Nxx contig (bp)'].astype('int64').apply('{:,}'.format) + df_contig_N_NG['Length of NGxx contig (bp)']=df_contig_N_NG['Length of NGxx contig (bp)'].astype('int64').apply('{:,}'.format) + # df_scaffold_N_NG=df_scaffold_N_NG.style.hide_index() + df_contig_Lxx=pd.DataFrame(contig_stats.items(), columns=['Lxx Level (%)', 'Count of contig (for Lxx Level)']) + df_contig_Lxx=df_contig_Lxx.iloc[7:31,0:2] + df_contig_Lxx=df_contig_Lxx.reset_index() +# print(df_scaffold_Nxx) + df_contig_LGxx=pd.DataFrame(contig_stats.items(), columns=['LGxx Level (%)', 'Count of contig (for LGxx Level)']) + df_contig_LGxx=df_contig_LGxx.iloc[55:79,0:2] + df_contig_LGxx=df_contig_LGxx.reset_index() +# print(df_scaffold_NGxx) + df_contig_L_LG=pd.concat([df_contig_Lxx,df_contig_LGxx], axis=1) + df_contig_L_LG=df_contig_L_LG.drop(df_contig_L_LG.columns[[0,3]], axis = 1) + df_contig_L_LG['Count of contig (for Lxx Level)']=df_contig_L_LG['Count of contig (for Lxx Level)'].astype('int64').apply('{:,}'.format) + df_contig_L_LG['Count of contig (for LGxx Level)']=df_contig_L_LG['Count of contig (for LGxx Level)'].astype('int64').apply('{:,}'.format) + # df_scaffold_L_LG=df_scaffold_L_LG.style.hide_index() + # print(df_contig_top) +# print(scaffold_stats) +# stat_output = {'Contig Stats': contig_stats, +# 'Scaffold Stats': scaffold_stats} +# print(json.dumps(stat_output, indent=2, sort_keys=False)) +# scaffold_out = {scaffold_stats} +# contig_out = {contig_stats} + + +# print(json.dumps(scaffold_stats, indent=2, sort_keys=False)) +# print(json.dumps(contig_stats, indent=2, sort_keys=False)) + # dict_writer = csv.DictWriter(stat_output, fieldnames=None, delimiter='\t') + # print(dict_writer) +# df_raw = pd.DataFrame.from_dict(data, orient='index') + # print(df_raw) +# with open('statdata.txt', 'w') as outfile: +# json.dump(scaffold_stats, outfile) +# with open('contigdata.txt', 'w') as out2file: +# json.dump(contig_stats, out2file) + # scaff = csv.writer(open(naming + "_scaffold_stats.tsv", "w"), delimiter='\t') + # for key, val in scaffold_stats.items(): + # scaff.writerow([key, val]) + # + # contig = csv.writer(open(naming + "_contig_stats.tsv", "w"), delimiter='\t') + # for key, val in contig_stats.items(): + # + with open(sys.argv[5], 'w') as outputfile: +# # print('#' + libraryName, file=outputfile) +# # print("Total Reads Processed (Paired): " + total_processed + " ( 100 %)", file=outputfile) +# # print("Discarded reads (Paired): " + discarded + " ( "+str(discarded_perc)+"%)", file=outputfile) +# # print("Successfully Processed reads (Paired): " + successfull + " ( "+str(successfull_perc)+"%)", file=outputfile) + print(df_scaffold_top.to_string(index=False), file=outputfile) + print("", file=outputfile) + print(df_scaffold_N_NG.to_string(index=False), file=outputfile) + print("", file=outputfile) + print(df_scaffold_L_LG.to_string(index=False), file=outputfile) + + with open(sys.argv[6], 'w') as outputfile2: +# # print('#' + libraryName, file=outputfile) +# # print("Total Reads Processed (Paired): " + total_processed + " ( 100 %)", file=outputfile) +# # print("Discarded reads (Paired): " + discarded + " ( "+str(discarded_perc)+"%)", file=outputfile) +# # print("Successfully Processed reads (Paired): " + successfull + " ( "+str(successfull_perc)+"%)", file=outputfile) + print(df_contig_top.to_string(index=False), file=outputfile2) + print("", file=outputfile2) + print(df_contig_N_NG.to_string(index=False), file=outputfile2) + print("", file=outputfile2) + print(df_contig_L_LG.to_string(index=False), file=outputfile2) + + # with open(sys.argv[4], 'w') as outputRst: + # print(tabulate(df_scaffold_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst) + # print("", file=outputRst) + # print(tabulate(df_scaffold_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst) + # print("", file=outputRst) + # print(tabulate(df_scaffold_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst) + # print("", file=outputRst) + # # + # with open(sys.argv[5], 'w') as outputRst2: + # print(tabulate(df_contig_top, headers='keys',tablefmt="rst", showindex=False), file=outputRst2) + # print("", file=outputRst2) + # print(tabulate(df_contig_N_NG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2) + # print("", file=outputRst2) + # print(tabulate(df_contig_L_LG, headers='keys',tablefmt="rst", showindex=False), file=outputRst2) + # print("", file=outputRst2) + + # with open(sys.argv[4], 'w') as outputRst: + # print(tabulate(df_scaffold_top, headers='keys',tablefmt="pipe", showindex=False), file=outputRst) + # print("", file=outputRst) + # print(tabulate(df_scaffold_N_NG, headers='keys',tablefmt="pipe", showindex=False), file=outputRst) + # print("", file=outputRst) + # print(tabulate(df_scaffold_L_LG, headers='keys',tablefmt="pipe", showindex=False), file=outputRst) + # print("", file=outputRst) + # # + # with open(sys.argv[5], 'w') as outputRst2: + # print(tabulate(df_contig_top, headers='keys',tablefmt="pipe", showindex=False), file=outputRst2) + # print("", file=outputRst2) + # print(tabulate(df_contig_N_NG, headers='keys',tablefmt="pipe", showindex=False), file=outputRst2) + # print("", file=outputRst2) + # print(tabulate(df_contig_L_LG, headers='keys',tablefmt="pipe", showindex=False), file=outputRst2) + # print("", file=outputRst2) + # list_of_dfs=[df_scaffold_top,df_scaffold_N_NG,df_scaffold_L_LG] + # for df in list_of_dfs: + # with open('all_dfs.tsv','a') as f: + # df.to_csv(f) + # f.write("\n") +# with open("testok.tsv", 'w') as outputfile: +# # print('#' + libraryName, file=outputfile) +# # print("Total Reads Processed (Paired): " + total_processed + " ( 100 %)", file=outputfile) +# # print("Discarded reads (Paired): " + discarded + " ( "+str(discarded_perc)+"%)", file=outputfile) +# # print("Successfully Processed reads (Paired): " + successfull + " ( "+str(successfull_perc)+"%)", file=outputfile) +# print(tabulate(loadinTop, headers='keys',tablefmt="rst", showindex=False), file=outputfile) +# # print('', file=outputfile) +# print(tabulate(result, headers='keys', tablefmt="psql", showindex=False), file=outputfile) +# print('', file=outputfile) diff --git a/scripts/compare_results/colouredHeatmap_legend.tsv b/scripts/compare_results/colouredHeatmap_legend.tsv new file mode 100644 index 0000000000000000000000000000000000000000..526c788361c6c9c8f6db83c0fe27a95d290939a0 --- /dev/null +++ b/scripts/compare_results/colouredHeatmap_legend.tsv @@ -0,0 +1,5 @@ +Gaps_per_Gb Scaff_NG50_Mb Cont_NG50_Mb QV Completeness Comp_Single_BUSCOs_% +'< 200' '> 100Mbp' '> 10Mbp' '> 50' '> 95%' '> 95%' +'200 - 1000' '10Mbp - 100Mbp' '1Mbp - 10Mbp' '40 - 50' '90% - 95%' '90% - 95%' +'1000 - 10000' '0.1Mbp - 10Mbp' '0.01Mbp - 1Mbp' '35 - 40' '80% - 90%' '80% - 90%' +'> 10000' '< 0.1Mbp' '< 0.01Mbp' '< 35' '< 80%' '< 80%' diff --git a/scripts/compare_results/fullTable_heatmap_external.R b/scripts/compare_results/fullTable_heatmap_external.R new file mode 100644 index 0000000000000000000000000000000000000000..3d251c8f0614b3d1d62e96d5394fb41ef1adaa15 --- /dev/null +++ b/scripts/compare_results/fullTable_heatmap_external.R @@ -0,0 +1,132 @@ +suppressMessages(library(dplyr)) +suppressMessages(library(formattable)) +#suppressMessages(library(data.table)) + + +fullTableOfStats<-read.table(file = snakemake@input[[1]], sep = "", header = TRUE, row.names=NULL, check.names = FALSE) + + + +fullTableOfStats$Gaps_per_Gb <- ((fullTableOfStats$Gaps * fullTableOfStats$Est_Size_Mb)/1000) +fullTableOfStats$Gaps_per_Gb <- as.integer(fullTableOfStats$Gaps_per_Gb) + + +customPurple='#c699e8' +customGreen='#8fc773' + +selectionOfStats_colouredHeatmap <- fullTableOfStats %>% + select(c('ASM_ID','ASM_LEVEL', + 'Gaps_per_Gb', 'Scaff_NG50_Mb', + 'Cont_NG50_Mb','QV', + 'Completeness', 'Comp_Single_BUSCOs_%')) + +sink(file = snakemake@output[[1]]) +format_table(selectionOfStats_colouredHeatmap, + align =c("l","c","c","c","c", "c", "c", "c", "c"), + list(Gaps_per_Gb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(ASM_LEVEL == 'cont', "#666666", + ifelse(between(Gaps_per_Gb,1001, 10000), "#FFCC99", + ifelse(between(Gaps_per_Gb,201 , 1000), customGreen, + ifelse(Gaps_per_Gb <= 200, customPurple, "#FF9999")))))), + Scaff_NG50_Mb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(between(Scaff_NG50_Mb,0.1, 9.99999), "#FFCC99", + ifelse(between(Scaff_NG50_Mb,10, 99.99999), customGreen, + ifelse(Scaff_NG50_Mb >= 100, customPurple, "#FF9999"))))), + Cont_NG50_Mb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(between(Cont_NG50_Mb,0.01, 0.99999), "#FFCC99", + ifelse(between(Cont_NG50_Mb,1, 9.999999), customGreen, + ifelse(Cont_NG50_Mb >= 10, customPurple, "#FF9999"))))), + Completeness = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(between(Completeness,80, 89.9999), "#FFCC99", + ifelse(between(Completeness,90, 94.99999), customGreen, + ifelse(Completeness >= 95, customPurple, "#FF9999"))))), + `Comp_Single_BUSCOs_%` = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(between(`Comp_Single_BUSCOs_%`,80, 89.9999), "#FFCC99", + ifelse(between(`Comp_Single_BUSCOs_%`,90, 95), customGreen, + ifelse(`Comp_Single_BUSCOs_%` >= 95, customPurple, "#FF9999"))))), + QV = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(between(QV,35, 39.9999999999), "#FFCC99", + ifelse(between(QV,40, 49.999999999), customGreen, + ifelse(QV >= 50, customPurple, "#FF9999"))))))) + +cat('<br>') +cat("\n") +cat('<br>') + +legendTable<-read.table(file = snakemake@params[[1]], sep = "", header = TRUE, row.names=NULL, check.names = FALSE) + +format_table(legendTable, + align =c("c","c","c", "c", "c", "c", "c"), + list(Gaps_per_Gb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + "font.size" = "12px", + "width" = '150px', + `background-color` = ifelse(Gaps_per_Gb == '> 10000',"#FF9999", + ifelse(Gaps_per_Gb == '1000 - 10000',"#FFCC99", + ifelse(Gaps_per_Gb == '200 - 1000',customGreen, customPurple))))), + Scaff_NG50_Mb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + "font.size" = "12px", + "width" = '150px', + `background-color` = ifelse(Scaff_NG50_Mb == '< 0.1Mbp',"#FF9999", + ifelse(Scaff_NG50_Mb == '0.1Mbp - 10Mbp',"#FFCC99", + ifelse(Scaff_NG50_Mb == '10Mbp - 100Mbp',customGreen, customPurple))))), + Cont_NG50_Mb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + "font.size" = "12px", + "width" = '150px', + `background-color` = ifelse(Cont_NG50_Mb == '< 0.01Mbp',"#FF9999", + ifelse(Cont_NG50_Mb == '0.01Mbp - 1Mbp',"#FFCC99", + ifelse(Cont_NG50_Mb == '1Mbp - 10Mbp',customGreen, customPurple))))), + QV = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + "font.size" = "12px", + "width" = '100px', + `background-color` = ifelse(QV == '< 35',"#FF9999", + ifelse(QV == '35 - 40',"#FFCC99", + ifelse(QV == '40 - 50',customGreen, customPurple))))), + Completeness = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + "font.size" = "12px", + "width" = '150px', + `background-color` = ifelse(Completeness == '< 80%',"#FF9999", + ifelse(Completeness == '80% - 90%',"#FFCC99", + ifelse(Completeness == '90% - 95%',customGreen, customPurple))))), + `Comp_Single_BUSCOs_%` = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + "font.size" = "12px", + "width" = '220px', + `background-color` = ifelse(`Comp_Single_BUSCOs_%` == '< 80%',"#FF9999", + ifelse(`Comp_Single_BUSCOs_%` == '80% - 90%',"#FFCC99", + ifelse(`Comp_Single_BUSCOs_%` == '90% - 95%',customGreen, customPurple))))))) +sink(file = NULL) diff --git a/scripts/compare_results/fullTable_heatmap_internalComparison.R b/scripts/compare_results/fullTable_heatmap_internalComparison.R new file mode 100644 index 0000000000000000000000000000000000000000..e1d5ae732bd41144b15084f8d84e04df8de7b749 --- /dev/null +++ b/scripts/compare_results/fullTable_heatmap_internalComparison.R @@ -0,0 +1,140 @@ +suppressMessages(library(dplyr)) +suppressMessages(library(formattable)) + +fullTableOfStats<-read.table(file = snakemake@input[[1]], sep = "", header = TRUE, row.names=NULL, check.names = FALSE) + +fullTableOfStats$Gaps_per_Gb <- ((fullTableOfStats$Gaps * fullTableOfStats$Est_Size_Mb)/1000) +fullTableOfStats$Gaps_per_Gb <- as.integer(fullTableOfStats$Gaps_per_Gb) + + + +selectionOfStats_internalComparisonHeatmap <- fullTableOfStats %>% + select(c('ASM_ID','ASM_LEVEL', 'Bases_Mb', 'Het_%','GC_%', 'Gaps_per_Gb', 'Scaff', 'Cont', + 'Longest_Scaff_Mb','Scaff_NG50_Mb', 'Scaff_NG95_Mb', + 'Longest_Cont_Mb', 'Cont_NG50_Mb','Cont_NG95_Mb', + 'QV', 'Completeness', + 'Comp_BUSCOs_%','Comp_Single_BUSCOs_%')) + + + + +customBlue_max = "#2e96ff" + +customBlue_min = "#dcecfc" + + +customGray_max = "#8c8c8c" + +customGray_min = "#e6e6e6" + +sink(file = snakemake@output[[1]]) + +format_table(selectionOfStats_internalComparisonHeatmap, + align =c("l","c","c","c","c", "c", "c", "c", "c"), + list(Bases_Mb = color_tile(customGray_min, customGray_max), + `Het_%` = color_tile(customGray_min, customGray_max), + `GC_%` = color_tile(customGray_min, customGray_max), + Gaps_per_Gb = color_tile(customBlue_max, customBlue_min), + Scaff = color_tile(customBlue_max,customBlue_min), + Cont = color_tile(customBlue_max,customBlue_min), + Longest_Scaff_Mb = color_tile(customBlue_min, customBlue_max), + Scaff_NG50_Mb = color_tile(customBlue_min, customBlue_max), + Scaff_NG95_Mb = color_tile(customBlue_min, customBlue_max), + Longest_Cont_Mb = color_tile(customBlue_min, customBlue_max), + Cont_NG50_Mb = color_tile(customBlue_min, customBlue_max), + Cont_NG95_Mb = color_tile(customBlue_min, customBlue_max), + QV = color_tile(customBlue_min, customBlue_max), + Completeness = color_tile(customBlue_min, customBlue_max), + `Comp_BUSCOs_%` = color_tile(customBlue_min, customBlue_max), + `Comp_Single_BUSCOs_%` = color_tile(customBlue_min, customBlue_max))) + +cat('<br>') +cat("\n") +cat('<br>') + +legendTable<-read.table(file = snakemake@params[[1]], sep = "", header = TRUE, row.names=NULL, check.names = FALSE) + +format_table(legendTable, + align =c("c","c","c", "c", "c", "c", "c","c","c","c", "c", "c", "c", "c","c", "c"), + list(Bases_Mb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(Bases_Mb == 'Max',customGray_max, customGray_min))), + `Het_%` = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(`Het_%` == 'Max',customGray_max, customGray_min))), + `GC_%` = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(`GC_%` == 'Max',customGray_max, customGray_min))), + Gaps_per_Gb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(Gaps_per_Gb == 'Min',customBlue_max, customBlue_min))), + Scaff = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(Scaff == 'Min',customBlue_max, customBlue_min))), + Cont = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(Cont == 'Min',customBlue_max, customBlue_min))), + Longest_Scaff_Mb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(Longest_Scaff_Mb == 'Max',customBlue_max, customBlue_min))), + Scaff_NG50_Mb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(Scaff_NG50_Mb == 'Max',customBlue_max, customBlue_min))), + Scaff_NG95_Mb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(Scaff_NG95_Mb == 'Max',customBlue_max, customBlue_min))), + Longest_Cont_Mb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(Longest_Cont_Mb == 'Max',customBlue_max, customBlue_min))), + Cont_NG50_Mb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(Cont_NG50_Mb == 'Max',customBlue_max, customBlue_min))), + Cont_NG95_Mb = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(Cont_NG95_Mb == 'Max',customBlue_max, customBlue_min))), + QV = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(QV == 'Max',customBlue_max, customBlue_min))), + Completeness = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(Completeness == 'Max',customBlue_max, customBlue_min))), + `Comp_BUSCOs_%` = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(`Comp_BUSCOs_%` == 'Max',customBlue_max, customBlue_min))), + `Comp_Single_BUSCOs_%` = formatter("span", + style = ~style(display = "block", + padding = "0 4px", + `border-radius` = "3px", + `background-color` = ifelse(`Comp_Single_BUSCOs_%` == 'Max',customBlue_max, customBlue_min))))) + +sink(file = NULL) diff --git a/scripts/compare_results/internalComparison_legend.tsv b/scripts/compare_results/internalComparison_legend.tsv new file mode 100644 index 0000000000000000000000000000000000000000..b8e16cc4f892ca64d9fc1510ca0686cde3119064 --- /dev/null +++ b/scripts/compare_results/internalComparison_legend.tsv @@ -0,0 +1,3 @@ +Bases_Mb Het_% GC_% Gaps_per_Gb Scaff Cont Longest_Scaff_Mb Scaff_NG50_Mb Scaff_NG95_Mb Longest_Cont_Mb Cont_NG50_Mb Cont_NG95_Mb QV Completeness Comp_BUSCOs_% Comp_Single_BUSCOs_% +'Max' 'Max' 'Max' 'Max' 'Max' 'Max' 'Max' 'Max' 'Max' 'Max' 'Max' 'Max' 'Max' 'Max' 'Max' 'Max' +'Min' 'Min' 'Min' 'Min' 'Min' 'Min' 'Min' 'Min' 'Min' 'Min' 'Min' 'Min' 'Min' 'Min' 'Min' 'Min' diff --git a/scripts/process_HiC/filter_five_end.pl b/scripts/process_HiC/filter_five_end.pl new file mode 100644 index 0000000000000000000000000000000000000000..79d320536c29824816c1098a7798db4a4ff6904a --- /dev/null +++ b/scripts/process_HiC/filter_five_end.pl @@ -0,0 +1,110 @@ +#!/usr/bin/perl +use strict; +use warnings; + +my $prev_id = ""; +my @five; +my @three; +my @unmap; +my @mid; +my @all; +my $counter = 0; + + +while (<STDIN>){ + chomp; + if (/^@/){ + print $_."\n"; + next; + } + my ($id, $flag, $chr_from, $loc_from, $mapq, $cigar, $d1, $d2, $d3, $read, $read_qual, @rest) = split /\t/; + my $bin = reverse(dec2bin($flag)); + my @binary = split(//,$bin); + if ($prev_id ne $id && $prev_id ne ""){ + if ($counter == 1){ + if (@five == 1){ + print $five[0]."\n"; + } + else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); + } + } + elsif ($counter == 2 && @five == 1){ + print $five[0]."\n"; + } + else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); + } + + $counter = 0; + undef @unmap; + undef @five; + undef @three; + undef @mid; + undef @all; + } + + $counter++; + $prev_id = $id; + push @all,$_; + if ($binary[2]==1){ + push @unmap,$_; + } + elsif ($binary[4]==0 && $cigar =~ m/^[0-9]*M/ || $binary[4]==1 && $cigar =~ m/.*M$/){ + push @five, $_; + } + elsif ($binary[4]==1 && $cigar =~ m/^[0-9]*M/ || $binary[4]==0 && $cigar =~ m/.*M$/){ + push @three, $_; + } + elsif ($cigar =~ m/^[0-9]*[HS].*M.*[HS]$/){ + push @mid, $_; + } +} + +if ($counter == 1){ + if (@five == 1){ + print $five[0]."\n"; + } + else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); + } +} +elsif ($counter == 2 && @five == 1){ + print $five[0]."\n"; +} +else{ + my ($id_1, $flag_1, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1) = split /\t/, $all[0]; + my $bin_1 = reverse(dec2bin($flag_1)); + my @binary_1 = split(//,$bin_1); + $binary_1[2] = 1; + my $bin_1_new = reverse(join("",@binary_1)); + my $flag_1_new = bin2dec($bin_1_new); + print(join("\t",$id_1, $flag_1_new, $chr_from_1, $loc_from_1, $mapq_1, $cigar_1, $d1_1, $d2_1, $d3_1, $read_1, $read_qual_1, @rest_1)."\n"); +} + +sub dec2bin { + my $str = unpack("B32", pack("N", shift)); + return $str; +} + +sub bin2dec { + return unpack("N", pack("B32", substr("0" x 32 . shift, -32))); +} diff --git a/scripts/process_HiC/two_read_bam_combiner.pl b/scripts/process_HiC/two_read_bam_combiner.pl new file mode 100644 index 0000000000000000000000000000000000000000..96e6149c93a46905e897a7bb0a346aa3990dbbcf --- /dev/null +++ b/scripts/process_HiC/two_read_bam_combiner.pl @@ -0,0 +1,124 @@ +#!/usr/bin/perl + +use strict; + +MAIN : { + + my ($read1_bam, $read2_bam) = @ARGV; + + if ((not defined $read1_bam) || + (not defined $read2_bam)) { + die ("Usage: ./two_read_bam_combiner.pl <read 1 bam> <read 2 bam>\n"); + } + + open(FILE1,"samtools view -h $read1_bam |"); + open(FILE2,"samtools view -h $read2_bam |"); + + my $line1 = <FILE1>; + my $line2 = <FILE2>; + + my $counter = 0; + my $new_counter = 0; + + while (defined $line1) { + ## the line directly below this needed to be modified slightly from the genotyping pipeline ## + if ($line1 =~ /^(\@)SQ/){ + if ($line1 ne $line2){print $line1;print $line2; die ("inconsistent bam header.");} + else{ + print $line1; + } + $line1 = <FILE1>; + $line2 = <FILE2>; + next; + } + + $counter++; + if ($counter == ($new_counter + 1000000)) { + print STDERR $counter . "\n"; + $new_counter = $counter; + } + + chomp $line1; + chomp $line2; + + my ($id1, $flag1, $chr_from1, $loc_from1, $mapq1, $cigar1, $d1_1, $d2_1, $d3_1, $read1, $read_qual1, @rest1) = split(/\t/,$line1); + my ($id2, $flag2, $chr_from2, $loc_from2, $mapq2, $cigar2, $d1_2, $d2_2, $d3_2, $read2, $read_qual2, @rest2) = split(/\t/,$line2); + + if ($id1 ne $id2) { + die ("The id's of the two files do not match up at line number $counter\n"); + } + + my $bin1 = reverse(dec2bin($flag1)); + my $bin2 = reverse(dec2bin($flag2)); + + my @binary1 = split(//,$bin1); + my @binary2 = split(//,$bin2); + + my $trouble = 0; + if (($binary1[2] == 1) || ($mapq1 < 10)) { + $trouble = 1; + } + if (($binary2[2]== 1) || ($mapq2 < 10)) { + $trouble = 1; + } + + my $proper_pair1; + my $proper_pair2; + my $dist1; + my $dist2; + + if (($binary1[2] == 0) && ($binary2[2] == 0)) { + $proper_pair1 = 1; + $proper_pair2 = 1; + if ($chr_from1 eq $chr_from2) { + my $dist = abs($loc_from1 - $loc_from2); + if ($loc_from1 >= $loc_from2) { + $dist1 = -1*$dist; + $dist2 = $dist; + } else { + $dist1 = $dist; + $dist2 = -1*$dist; + } + } else { + $dist1 = 0; + $dist2 = 0; + } + } else { + $proper_pair1 = 0; + $proper_pair2 = 0; + $dist1 = 0; + $dist2 = 0; + } + + my $new_bin1 = join("","000000000000000000000",$binary1[10],$binary1[9],"0","0","1",$binary2[4],$binary1[4],$binary2[2],$binary1[2],$proper_pair1,"1"); + my $new_bin2 = join("","000000000000000000000",$binary2[10],$binary2[9],"0","1","0",$binary1[4],$binary2[4],$binary1[2],$binary2[2],$proper_pair2,"1"); + + my $new_flag1 = bin2dec($new_bin1); + my $new_flag2 = bin2dec($new_bin2); + + unless ($trouble == 1) { + + print(join("\t",$id1,$new_flag1,$chr_from1,$loc_from1,$mapq1,$cigar1,$chr_from2,$loc_from2,$dist1,$read1,$read_qual1,@rest1) . "\n"); + print(join("\t",$id2,$new_flag2,$chr_from2,$loc_from2,$mapq2,$cigar2,$chr_from1,$loc_from1,$dist2,$read2,$read_qual2,@rest2) . "\n"); + + } + + $line1 = <FILE1>; + $line2 = <FILE2>; + + } + +} + + + +sub dec2bin { + + my $str = unpack("B32", pack("N", shift)); + return $str; + +} + +sub bin2dec { + return unpack("N", pack("B32", substr("0" x 32 . shift, -32))); +} diff --git a/scripts/report/addFullTableForReport.py b/scripts/report/addFullTableForReport.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac763be71fb80fdf5dc06b9b9e9d3279e96f7bc --- /dev/null +++ b/scripts/report/addFullTableForReport.py @@ -0,0 +1,26 @@ +import pandas as pd +from tabulate import tabulate +samples=pd.read_csv(snakemake.input.results, dtype=str, index_col=0, delim_whitespace=True, skip_blank_lines=True) +# samples=samples.reset_index +turn2FloatAndMb=['Bases_Mb','Est_Size_Mb','Longest_Scaff_Mb','Scaff_N50_Mb','Scaff_NG50_Mb','Scaff_N95_Mb','Scaff_NG95_Mb','Longest_Cont_Mb','Cont_N50_Mb','Cont_NG50_Mb','Cont_N95_Mb','Cont_NG95_Mb'] +roundDecimals=['Comp_BUSCOs_%','Comp_Single_BUSCOs_%','Het_%','GC_%','QV','Completeness','Bases_Mb','Est_Size_Mb','Longest_Scaff_Mb','Scaff_N50_Mb','Scaff_NG50_Mb','Scaff_N95_Mb','Scaff_NG95_Mb','Longest_Cont_Mb','Cont_N50_Mb','Cont_NG50_Mb','Cont_N95_Mb','Cont_NG95_Mb'] +print('this is samples',samples) +sampleTransposed=samples.T +print('this is sampleTransposed',sampleTransposed) +for header in turn2FloatAndMb: + sampleTransposed[header]=sampleTransposed[header].astype(float).div(1000000) +for i in range(0,4): + sampleTransposed[roundDecimals[i]]=sampleTransposed[roundDecimals[i]].str.replace('%','') +for roundHeader in roundDecimals: + sampleTransposed[roundHeader]=sampleTransposed[roundHeader].astype(float).round(2) +with open(snakemake.output[0], "w") as out_markdown: + print("", file=out_markdown) + print("\\blandscape", file=out_markdown) + print("", file=out_markdown) + print("\\tiny", file=out_markdown) + print(tabulate(sampleTransposed.rename_axis('ASM_ID'), headers='keys',tablefmt="pipe", showindex=True), file=out_markdown) + print("\\elandscape", file=out_markdown) + print("", file=out_markdown) + +with open(snakemake.output[1], "w") as out_plain: + print(tabulate(sampleTransposed.rename_axis('ASM_ID'), headers='keys',tablefmt="plain", showindex=True), file=out_plain) diff --git a/scripts/report/makePDF_indivMD.py b/scripts/report/makePDF_indivMD.py new file mode 100644 index 0000000000000000000000000000000000000000..a32501c44423c73feaf04f833013edb1b55af2a8 --- /dev/null +++ b/scripts/report/makePDF_indivMD.py @@ -0,0 +1,121 @@ +import os +import numpy as np +import pandas as pd +from tabulate import tabulate + +key_stats=snakemake.input[0] +#key_stats="/srv/public/users/james94/data/Lsceleratus/GEP_results/L_sceleratus/5_Key_Results/L_sceleratus_aggregatedResults.tsv" +key_stats_table = pd.read_csv(key_stats, dtype=str, delim_whitespace=True, skip_blank_lines=True) +key_stats_table = key_stats_table.iloc[:-4 , :] + +genomescope_linear=snakemake.input[1] +#genomescope_linear="/srv/public/users/james94/data/Lsceleratus/GEP_results/L_sceleratus/5_Key_Results/L_sceleratus_gScope_linear_plot.png" +genomescope_log=snakemake.input[2] +#genomescope_log="/srv/public/users/james94/data/Lsceleratus/GEP_results/L_sceleratus/5_Key_Results/L_sceleratus_gScope_log_plot.png" + +QV_score=snakemake.input[3] +#QV_score="/srv/public/users/james94/data/Lsceleratus/GEP_results/L_sceleratus/5_Key_Results/L_sceleratus_merq.qv" +QV_score_table = pd.read_csv(QV_score, dtype=str, delim_whitespace=True, skip_blank_lines=True, names=["assembly_name","Assembly_Only", "Total_Kmers", "QV_Score", "Error"]) +QV_score_table = QV_score_table.drop(['Error'], axis=1).set_index(['assembly_name']) + +kmer_completeness=snakemake.input[4] +##kmer_completeness="/srv/public/users/james94/data/Lsceleratus/GEP_results/L_sceleratus/5_Key_Results/L_sceleratus_merq.completeness.stats" +kmer_completeness_table = pd.read_csv(kmer_completeness, dtype=str, delim_whitespace=True, skip_blank_lines=True, names=["assembly_name","all","Kmers_Assembly", "Kmers_Reads", "%"]) +kmer_completeness_table = kmer_completeness_table.drop(["all"], axis=1).set_index(['assembly_name']) + + +busco=snakemake.input[5] +#busco="/srv/public/users/james94/data/Lsceleratus/GEP_results/L_sceleratus/5_Key_Results/only_busco_scores.txt" + +with open(busco) as scoresOnlyFile: + lines = scoresOnlyFile.readlines() + lines = [line.rstrip() for line in lines] + +kmer_flat=snakemake.input[6] +#kmer_flat="/srv/public/users/james94/data/Lsceleratus/GEP_results/L_sceleratus/5_Key_Results/L_sceleratus_merq.L_sceleratus.spectra-cn.fl.png" + +kmer_stacked=snakemake.input[7] +#kmer_stacked="/srv/public/users/james94/data/Lsceleratus/GEP_results/L_sceleratus/5_Key_Results/L_sceleratus_merq.L_sceleratus.spectra-cn.st.png" + + +params_assemblyID=snakemake.params[0] +#params_assemblyID="Lagocephalus sceleratus" + +params_merylKmer=snakemake.params[1] +# params_merylKmer="21" + +params_buscoDB=snakemake.params[2] +# params_buscoDB="Vertebrata" + + + +with open(snakemake.output[0], 'w') as outFile: +# print("---", "title: 'GEP Quick-View'", "author: James Sullivan", "date: 11/09/2021", "geometry: margin=2cm", "classoption: table", "documentclass: extarticle", "urlcolor: blue", "colorlinks: true", "header-includes: |", " \\rowcolors{2}{gray!10}{gray!25}" , " ```{=latex}", "---", sep="\n") + print("", file=outFile) + print("\\twocolumn", file=outFile) + print("\\Large", file=outFile) + print("# Assembly:",params_assemblyID, file=outFile) + print("", file=outFile) + print(" - db built with kmer size ", params_merylKmer, "bp", file=outFile) + print("", file=outFile) + print("### Genome Statistics", file=outFile) + print("", file=outFile) + print(tabulate(key_stats_table, headers='keys',tablefmt="pipe", showindex=False), file=outFile) + print("", file=outFile) + print("### Genomescope2 Profile (Linear)", file=outFile) + print("{ width=37% }", file=outFile) + print("", file=outFile) + print("### Genomescope2 Profile (Log)", file=outFile) + print("{ width=37% }", file=outFile) + print("\\", file=outFile) + print("\\pagebreak", file=outFile) + print("\\", file=outFile) + print("", file=outFile) + print("\\large", file=outFile) + print("### QV Score", file=outFile) + print(tabulate(QV_score_table, headers='keys',tablefmt="pipe", showindex=True), file=outFile) + print("", file=outFile) + print("\\large", file=outFile) + print("### Kmer Completeness", file=outFile) + print(tabulate(kmer_completeness_table, headers='keys',tablefmt="pipe", showindex=True), file=outFile) + print("", file=outFile) + print("\\Large", file=outFile) + print("### BUSCOv5 (database: ", params_buscoDB, ")", file=outFile) + print("```", file=outFile) + for line in lines: + print(line, file=outFile) + print("```", file=outFile) + print("\\", file=outFile) + print("\\large", file=outFile) + print("", file=outFile) + print("### K-mer Multiplicity PRI Only (Stacked)", file=outFile) + print("{ width=44% }", file=outFile) + print("\\", file=outFile) + print("", file=outFile) + print("### K-mer Multiplicity PRI + ALT (Stacked)", file=outFile) + print("{ width=44% }", file=outFile) + print("\\", file=outFile) + print("\\pagebreak", file=outFile) + + +# --- +# title: "GEP Quick-View" +# date: 11/09/2021 +# geometry: a4paper +# classoption: table +# documentclass: extarticle +# urlcolor: blue +# #pdf_document: null +# colorlinks: true +# header-includes: | +# \rowcolors{2}{gray!10}{gray!25} +# ```{=latex} +# \usepackage{pdflscape} +# \usepackage{longtable} +# \newcommand{\blandscape}{\begin{landscape}} +# \newcommand{\elandscape}{\end{landscape}} +# ``` +# output: pdf_document +# --- +# +# print("---", "title: 'GEP Quick-View'", "author: James Sullivan", "date: 11/09/2021", "geometry: margin=2cm", "classoption: table", "documentclass: extarticle", "urlcolor: blue", "colorlinks: true", "header-includes: |", " \\rowcolors{2}{gray!10}{gray!25}" , " ```{=latex}", "" sep="\n") diff --git a/scripts/report/pretextMapsToMarkdown.py b/scripts/report/pretextMapsToMarkdown.py new file mode 100644 index 0000000000000000000000000000000000000000..bd87669d1752a36ed51cb5916bc72fa62069366f --- /dev/null +++ b/scripts/report/pretextMapsToMarkdown.py @@ -0,0 +1,6 @@ +import shutil +shutil.copyfile(snakemake.input.PretextMap, snakemake.output.pretextCP2keyResults) +with open(snakemake.output.IndividualPretextMD, "w") as out: + print("", file=out) + print("###",snakemake.params.assemblyName," HiC Heatmap", file=out) + print("{ height=30% }", file=out) diff --git a/scripts/report/reportLandingPage.md b/scripts/report/reportLandingPage.md new file mode 100644 index 0000000000000000000000000000000000000000..f72f3be761f4823134c64be5205e6e0564bfccde --- /dev/null +++ b/scripts/report/reportLandingPage.md @@ -0,0 +1,19 @@ +--- +geometry: b3paper, margin=1.3cm +classoption: + - table + - twocolumn +urlcolor: blue +colorlinks: true +header-includes: +- | + ```{=latex} + \rowcolors{2}{gray!10}{gray!25} + \usepackage{longtable} + \usepackage{supertabular} + \let\longtable\supertabular + \let\endlongtable\endsupertabular + \let\endhead\relax + ``` +output: pdf_document +--- diff --git a/scripts/report/tableOnSamePage.css b/scripts/report/tableOnSamePage.css new file mode 100644 index 0000000000000000000000000000000000000000..ad4e7addc582e765020db0b1bd15e0c337b739d3 --- /dev/null +++ b/scripts/report/tableOnSamePage.css @@ -0,0 +1,3 @@ +<style> +.main-container { width: 1200px; max-width:2800px;} +</style> diff --git a/scripts/standIN_files/ALT_missing.fasta b/scripts/standIN_files/ALT_missing.fasta new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391