diff --git a/final_project/README.md b/final_project/README.md index f642db9a54e3f7ccd2dd8413363837b64f5f9bdf..fe28bdc4c0aa1aec76d8335ba0786f2935b4319a 100644 --- a/final_project/README.md +++ b/final_project/README.md @@ -1 +1,14 @@ -snakemake --config samples=input.tsv reference=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.fasta annotation=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.gff output_directory=/storage/mi/fisched99/output wgs=wgs.tsv --conda-prefix=/storage/mi/fisched99/condaCache --profile config/local --rerun-incomplete \ No newline at end of file +### Run the workflow + +To run the workflow in reference based mode using a reference and annotation file use the following command: + + +``` +snakemake --config samples=input.tsv reference=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.fasta annotation=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.gff output_directory=output metadata=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/Lib2M/samples_sheet_2M.tsv --conda-prefix=/storage/mi/fisched99/condaCache --profile config/local --rerun-incomplete +``` + +To run in denovo assembly mode add the path to the samplesheet (e.g.: wgs.tsv) for your WGS read data as an argument to the workflow as follows: + +``` +wgs=wgs.tsv +``` \ No newline at end of file diff --git a/final_project/workflow/Snakefile b/final_project/workflow/Snakefile index a4558c7072715b94156fc9ce8b7849b4202103bf..97b906599fdd7bb38d18d55388bc2420b10b2535 100644 --- a/final_project/workflow/Snakefile +++ b/final_project/workflow/Snakefile @@ -27,33 +27,18 @@ include: "rules/phylogeny.smk" include: "rules/decontamination.smk" - - - - # define rule all rule all: input: - #expand(os.path.join(config['output_directory'], "02-ASSEMBLY", "samtools", "{sample}_consensus.fasta"), sample = list(samples.index)), - #expand(os.path.join(config['output_directory'], "03-ANNOTATION", "bedtools", "{sample}"), sample = list(samples.index)), - #os.path.join(config['output_directory'], "04-MSA", "gene_list.txt"), - #expand(os.path.join(config['output_directory'], "07-PHYLOGENY", "{gene}_msa.fasta"), gene = list(gene)) - #checkpointbla - #os.path.join(config['output_directory'], "04-COUNTS", "featurecounts", "counts.tsv"), - #os.path.join(config['output_directory'], "00-Log", "phylogeny", "{gene}_mafft.log") - #os.path.join(config['output_directory'], "04-MSA", "gene_list.txt") os.path.join(config['output_directory'], "06-DGE", "dge_results.csv"), - os.path.join(config['output_directory'], "07-PHYLOGENY", "tree.nwk"), - os.path.join(config['output_directory'], "01-QC", "multiqc.html") - - #os.path.join(config['output_directory'], "00-Log", "phylogeny", "concat_msa.log") - #checkcheck, - #os.path.join(config['output_directory'], "07-PHYLOGENY", "msa_all.fasta"), - #os.path.join(config['output_directory'], "04-MSA", "gene_list.txt") - + os.path.join(config['output_directory'], "01-QC", "multiqc.html"), + os.path.join(config['output_directory'], "07-PHYLOGENY", "tree.png") onsuccess: - print(f"Workflow finished successfully.\nTryin' to hold tryin' to hold but there's nothing to grasp so I leet goooooo.\nGoodbye.") + # remove the intermediate gene files each containing one gene that we can't mark as temporary + # because they are output of the checkpoint and it wont run if we mark it as temp() + shutil.rmtree(os.path.join(config['output_directory'], "03-ANNOTATION", "bedtools")) + print(f"Workflow and cleanup finished successfully.\nTryin' to hold tryin' to hold but there's nothing to grasp so I leet goooooo.\nGoodbye.") diff --git a/final_project/workflow/envs/draw_tree.yaml b/final_project/workflow/envs/draw_tree.yaml index 96c6851cafefb5b243fea51d0c454ebf0c25a60b..ff42b3532f4e44b53694a0bb4421523a3a65c2a4 100644 --- a/final_project/workflow/envs/draw_tree.yaml +++ b/final_project/workflow/envs/draw_tree.yaml @@ -2,8 +2,7 @@ name: draw_tree channels: - bioconda - conda-forge - - anaconda dependencies: + - python>=3.7 - biopython - - matplotlib - - argparse \ No newline at end of file + - matplotlib \ No newline at end of file diff --git a/final_project/workflow/rules/annotation.smk b/final_project/workflow/rules/annotation.smk index 92785d87407c4d13cc534729699088afe9ce4a90..1e79806728b5cc8e642d6e8b67b11e599095a225 100644 --- a/final_project/workflow/rules/annotation.smk +++ b/final_project/workflow/rules/annotation.smk @@ -4,6 +4,7 @@ def get_gff2bed_input(wildcards): else: return os.path.join(config['annotation']) + rule gff2bed: input: get_gff2bed_input @@ -44,6 +45,7 @@ checkpoint cut_genes: awk '/^>/{{OUT="{output.genes}/"substr($0,2)".fasta"; system("touch "OUT" 2>> {log}"); print ">"substr($0,2)"_sample_{wildcards.sample}" > OUT; next}}{{print > OUT}}' {output.genes_all} 2>> {log} """ + def get_prokka_input(wildcards): if config['polish']=="yes": return os.path.join(config['output_directory'], "02-ASSEMBLY", "polish", "assembly_polished.fasta") @@ -68,12 +70,14 @@ rule prokka: prokka --outdir {output.outdir} --prefix annotation {input.assembly} --force --cpus {threads} &> {log} """ + def get_annotation(wildcards): if config['wgs']: return os.path.join(config['output_directory'], "03-ANNOTATION", "prokka", "annotation.gff") else: return config['annotation'] + rule gff2gtf: input: get_annotation diff --git a/final_project/workflow/rules/assembly.smk b/final_project/workflow/rules/assembly.smk index c6e3ac09252e18f35df22065b285970b084f212e..dfc784c3728c9f9f004b374f0bf71f3620c7054d 100644 --- a/final_project/workflow/rules/assembly.smk +++ b/final_project/workflow/rules/assembly.smk @@ -7,6 +7,7 @@ def get_hisat_ref(wildcards): else: return config['reference'] + rule hisat2_idx: input: reference = get_hisat_ref @@ -48,6 +49,7 @@ rule hisat2_map: hisat2 -x {params.idx_prefix} -1 {input.r1} -2 {input.r2} -p {threads} 2> {log} | samtools view -bS 2> {log} | samtools sort -o {output} -T tmp --threads {threads} 2> {log} """ + rule samtools_consensus: input: sorted_bams = os.path.join(config['output_directory'], "02-ASSEMBLY", "hisat2_map", "{sample}_sorted.bam") @@ -64,6 +66,7 @@ rule samtools_consensus: samtools consensus -f fasta -o {output.consensus} -d 5 {input.sorted_bams} 2> {log} """ + def get_spades_input(wildcards): if config["host_sequence"]: return { "r1" : expand(os.path.join(config["output_directory"], "01-QC", "screen", "samtools", "{sample}_decontaminated.1.fastq"), sample = list(wgs.index)), @@ -95,6 +98,7 @@ rule spades: spades.py -o {output.outdir} -1 {output.reads_1} -2 {output.reads_2} -t {threads} --phred-offset 33 > {log} """ + rule polishing: input: assembly = os.path.join(config['output_directory'], "02-ASSEMBLY", "spades", "contigs.fasta"), diff --git a/final_project/workflow/rules/screen.smk b/final_project/workflow/rules/screen.smk index 8358a5b789a02e550c9b47c473bef12cbf9c905b..adf389662b3749c801d21a7bbae36c5bf3237d41 100644 --- a/final_project/workflow/rules/screen.smk +++ b/final_project/workflow/rules/screen.smk @@ -17,6 +17,7 @@ rule kraken2: kraken2 --db {params.kraken2_db} --threads {threads} --paired --report {output} {input.r1} {input.r2} 2> {log} > /dev/null # we dont care about the actual output that is written to stdout (and is quite large) """ + def screen_input(wildcards):#otherwise wgs is not defined if we do not have wgs data if config['wgs']: return expand(os.path.join(config['output_directory'], "01-QC", "screen", "kraken2", "{sample}_report.txt"), sample = list(wgs.index)) @@ -24,6 +25,7 @@ def screen_input(wildcards):#otherwise wgs is not defined if we do not have wgs print("WARNING: Screening requested but not WGS reads were provided to screen. Exiting..") exit() + rule screen: input: screen_input diff --git a/final_project/workflow/scripts/DGE.R b/final_project/workflow/scripts/DGE.R index 9f5a7ee5c5eae4434fc8ce5de7320b9c90ad83b2..dbb15fbd028505b18b7b1b7d5dd1ae14cb9a90d1 100644 --- a/final_project/workflow/scripts/DGE.R +++ b/final_project/workflow/scripts/DGE.R @@ -14,15 +14,10 @@ colorblind_friendly_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E44 ### read in feature counts output and with first line as header and column 'geneid' as rownames counts <- read.table(snakemake@input[[1]], header=TRUE, row.names = 1) -#counts <- read.table('/home/david/Uni/appliedsequenceanalysis/final_project/output/05-COUNTS/featurecounts/counts.tsv', header=TRUE, row.names = 1) - - condition <- read.table(snakemake@input[[2]], header=TRUE, row.names = 1) -#condition <- read.table('/home/david/Uni/appliedsequenceanalysis/Lib2M/samples_group_sheet.tsv', header=TRUE, row.names = 1) colnames(condition)[1] <- 'group' - ### remove first 6 columns since they contain gene information but we only want count data now and convert to numeric counts <- counts[,6:ncol(counts)] counts[] <- sapply(counts, as.numeric) @@ -55,8 +50,8 @@ dds <- DESeqDataSetFromMatrix(countData = counts, dds <- DESeq(dds) res <- results(dds) -sorted_res <- res[order(res$padj, -res$log2FoldChange),]#sort rows for padj and then decreasing logfoldchange -sorted_res <- sorted_res[1:20,!names(res) %in% c("stat", "baseMean", "pvalue","lfcSE")] #only take the first 20 rows and some columns +sorted_res <- res[order(res$padj, -res$log2FoldChange),] # sort rows for padj and then decreasing logfoldchange +sorted_res <- sorted_res[1:20,!names(res) %in% c("stat", "baseMean", "pvalue","lfcSE")] # only take the first 20 rows and some columns print(paste('Writing results to output file as', as.character(snakemake@output[['dge_results']]), '..')) write.csv(as.data.frame(sorted_res), snakemake@output[['dge_results']], row.names=TRUE) @@ -64,13 +59,6 @@ write.csv(as.data.frame(sorted_res), snakemake@output[['dge_results']], row.name print(paste('Creating volcano plot as ', snakemake@output[['volcano_plot']], '..')) volcano_data <- data.frame(Log2FoldChange = res$log2FoldChange, Log10AdjustedPvalue = -log10(res$padj)) -#volcano_plot <- ggplot(volcano_data, aes(x=Log2FoldChange, y=-Log10AdjustedPvalue)) + -# geom_point(aes(color=ifelse(abs(Log2FoldChange) >= 1 & res$padj < 0.05, "red", "black"))) + -# labs(title="Volcano Plot", -# x="log2(Fold Change)", -# y="-log10(Adjusted p-value)") + -# theme_minimal() -#bin <- ggsave(plot=volcano_plot, filename=snakemake@output[['volcano_plot']], width=6, height=4, dpi=300) plot.volcano <-function(df, title_txt, subtitle_txt){ @@ -91,7 +79,6 @@ bin <- ggsave(plot=volcano_plot, filename=snakemake@output[['volcano_plot']], wi #Heatmap -# Assuming 'samples' is a vector of sample names print(paste('Creating clustered heatmap as', snakemake@output[['heatmap']], '..')) de_genes <- rownames(subset(res, padj < 0.05)) de_counts <- counts(dds)[de_genes, rownames(condition)] diff --git a/final_project/workflow/scripts/draw_tree.py b/final_project/workflow/scripts/draw_tree.py index e80ed40e93bc2e7791c8223029db8956de116439..966822650bb710637c9a8f253b928d6b55d813a6 100644 --- a/final_project/workflow/scripts/draw_tree.py +++ b/final_project/workflow/scripts/draw_tree.py @@ -1,12 +1,17 @@ import matplotlib.pyplot as plt from Bio import Phylo -import argparse +import sys -parser = argparse.ArgumentParser(description="Description of your program", epilog="Example usage: python3 draw_tree.py <TREE_NWK> <PLOT_PNG>" ) -parser.add_argument("TREE_NWK", help="Tree input file name. (newick formatted)") -parser.add_argument("PLOT_PNG", help="Tree plot output file name.") -args = parser.parse_args() -Phylo.draw(Phylo.read(tree_nwk, "newick"), do_show = False) -plt.title(f"Phylogenetic tree over all samples clustered by variance of genes above coverage threshold") -plt.savefig(tree_plot) \ No newline at end of file +stdout = sys.stdout +stderr = sys.stderr + +with open(snakemake.log[0], 'w') as sys.stdout: + + sys.stderr = sys.stdout + Phylo.draw(Phylo.read(snakemake.input[0], "newick"), do_show = False) + plt.title(f"Phylogenetic tree over all samples clustered by variance of genes") + plt.savefig(snakemake.output[0]) + +sys.stdout = stdout +sys.stderr = stderr \ No newline at end of file