Commit 74b60dd8 authored by fischer-hub

ABAGBEEEBBBBBEEEEEEEEE

parent e6232233
snakemake --config samples=input.tsv reference=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.fasta annotation=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.gff output_directory=/storage/mi/fisched99/output wgs=wgs.tsv --conda-prefix=/storage/mi/fisched99/condaCache --profile config/local --rerun-incomplete
### Run the workflow
To run the workflow in reference-based mode with a reference and an annotation file, use the following command:
```
snakemake --config samples=input.tsv reference=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.fasta annotation=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.gff output_directory=output metadata=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/Lib2M/samples_sheet_2M.tsv --conda-prefix=/storage/mi/fisched99/condaCache --profile config/local --rerun-incomplete
```
To run in de novo assembly mode, additionally pass the path to the sample sheet for your WGS read data (e.g. wgs.tsv) as an argument to the workflow, as follows (a full example command is shown after the snippet):
```
wgs=wgs.tsv
```
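For example, combining the reference-based command above with the `wgs` argument, a de novo run could look like the following (paths are illustrative and simply mirror the example above):
```
snakemake --config samples=input.tsv reference=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.fasta annotation=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.gff output_directory=output metadata=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/Lib2M/samples_sheet_2M.tsv wgs=wgs.tsv --conda-prefix=/storage/mi/fisched99/condaCache --profile config/local --rerun-incomplete
```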
@@ -27,33 +27,18 @@ include: "rules/phylogeny.smk"
include: "rules/decontamination.smk"
# define rule all
rule all:
    input:
        #expand(os.path.join(config['output_directory'], "02-ASSEMBLY", "samtools", "{sample}_consensus.fasta"), sample = list(samples.index)),
        #expand(os.path.join(config['output_directory'], "03-ANNOTATION", "bedtools", "{sample}"), sample = list(samples.index)),
        #os.path.join(config['output_directory'], "04-MSA", "gene_list.txt"),
        #expand(os.path.join(config['output_directory'], "07-PHYLOGENY", "{gene}_msa.fasta"), gene = list(gene))
        #checkpointbla
        #os.path.join(config['output_directory'], "04-COUNTS", "featurecounts", "counts.tsv"),
        #os.path.join(config['output_directory'], "00-Log", "phylogeny", "{gene}_mafft.log")
        #os.path.join(config['output_directory'], "04-MSA", "gene_list.txt")
        os.path.join(config['output_directory'], "06-DGE", "dge_results.csv"),
        os.path.join(config['output_directory'], "07-PHYLOGENY", "tree.nwk"),
        os.path.join(config['output_directory'], "01-QC", "multiqc.html")
        #os.path.join(config['output_directory'], "00-Log", "phylogeny", "concat_msa.log")
        #checkcheck,
        #os.path.join(config['output_directory'], "07-PHYLOGENY", "msa_all.fasta"),
        #os.path.join(config['output_directory'], "04-MSA", "gene_list.txt")
        os.path.join(config['output_directory'], "01-QC", "multiqc.html"),
        os.path.join(config['output_directory'], "07-PHYLOGENY", "tree.png")
onsuccess:
    print("Workflow finished successfully.\nTryin' to hold tryin' to hold but there's nothing to grasp so I leet goooooo.\nGoodbye.")
    # remove the intermediate per-gene files (each containing one gene) that we can't mark as temporary,
    # because they are output of the checkpoint and it won't run if we mark them as temp()
    shutil.rmtree(os.path.join(config['output_directory'], "03-ANNOTATION", "bedtools"))
    print("Workflow and cleanup finished successfully.\nTryin' to hold tryin' to hold but there's nothing to grasp so I leet goooooo.\nGoodbye.")
@@ -2,8 +2,7 @@ name: draw_tree
channels:
- bioconda
- conda-forge
- anaconda
dependencies:
- python>=3.7
- biopython
- matplotlib
- argparse
- matplotlib
@@ -4,6 +4,7 @@ def get_gff2bed_input(wildcards):
    else:
        return os.path.join(config['annotation'])

rule gff2bed:
    input:
        get_gff2bed_input
@@ -44,6 +45,7 @@ checkpoint cut_genes:
        awk '/^>/{{OUT="{output.genes}/"substr($0,2)".fasta"; system("touch "OUT" 2>> {log}"); print ">"substr($0,2)"_sample_{wildcards.sample}" > OUT; next}}{{print > OUT}}' {output.genes_all} 2>> {log}
        """
def get_prokka_input(wildcards):
    if config['polish']=="yes":
        return os.path.join(config['output_directory'], "02-ASSEMBLY", "polish", "assembly_polished.fasta")
@@ -68,12 +70,14 @@ rule prokka:
        prokka --outdir {output.outdir} --prefix annotation {input.assembly} --force --cpus {threads} &> {log}
        """
def get_annotation(wildcards):
    if config['wgs']:
        return os.path.join(config['output_directory'], "03-ANNOTATION", "prokka", "annotation.gff")
    else:
        return config['annotation']

rule gff2gtf:
    input:
        get_annotation
@@ -7,6 +7,7 @@ def get_hisat_ref(wildcards):
    else:
        return config['reference']

rule hisat2_idx:
    input:
        reference = get_hisat_ref
@@ -48,6 +49,7 @@ rule hisat2_map:
        hisat2 -x {params.idx_prefix} -1 {input.r1} -2 {input.r2} -p {threads} 2> {log} | samtools view -bS 2> {log} | samtools sort -o {output} -T tmp --threads {threads} 2> {log}
        """

rule samtools_consensus:
    input:
        sorted_bams = os.path.join(config['output_directory'], "02-ASSEMBLY", "hisat2_map", "{sample}_sorted.bam")
@@ -64,6 +66,7 @@ rule samtools_consensus:
        samtools consensus -f fasta -o {output.consensus} -d 5 {input.sorted_bams} 2> {log}
        """

def get_spades_input(wildcards):
    if config["host_sequence"]:
        return { "r1" : expand(os.path.join(config["output_directory"], "01-QC", "screen", "samtools", "{sample}_decontaminated.1.fastq"), sample = list(wgs.index)),
@@ -95,6 +98,7 @@ rule spades:
        spades.py -o {output.outdir} -1 {output.reads_1} -2 {output.reads_2} -t {threads} --phred-offset 33 > {log}
        """

rule polishing:
    input:
        assembly = os.path.join(config['output_directory'], "02-ASSEMBLY", "spades", "contigs.fasta"),
@@ -17,6 +17,7 @@ rule kraken2:
        kraken2 --db {params.kraken2_db} --threads {threads} --paired --report {output} {input.r1} {input.r2} 2> {log} > /dev/null  # we don't care about the actual output written to stdout (and it is quite large)
        """
def screen_input(wildcards):  # otherwise wgs is not defined if we do not have wgs data
    if config['wgs']:
        return expand(os.path.join(config['output_directory'], "01-QC", "screen", "kraken2", "{sample}_report.txt"), sample = list(wgs.index))
@@ -24,6 +25,7 @@ def screen_input(wildcards):#otherwise wgs is not defined if we do not have wgs
print("WARNING: Screening requested but not WGS reads were provided to screen. Exiting..")
exit()
rule screen:
input:
screen_input
@@ -14,15 +14,10 @@ colorblind_friendly_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E44
### read in featureCounts output with the first line as header and column 'geneid' as rownames
counts <- read.table(snakemake@input[[1]], header=TRUE, row.names = 1)
#counts <- read.table('/home/david/Uni/appliedsequenceanalysis/final_project/output/05-COUNTS/featurecounts/counts.tsv', header=TRUE, row.names = 1)
condition <- read.table(snakemake@input[[2]], header=TRUE, row.names = 1)
#condition <- read.table('/home/david/Uni/appliedsequenceanalysis/Lib2M/samples_group_sheet.tsv', header=TRUE, row.names = 1)
colnames(condition)[1] <- 'group'
### keep only the count columns (drop the leading gene-annotation columns) and convert them to numeric
counts <- counts[,6:ncol(counts)]
counts[] <- sapply(counts, as.numeric)
@@ -55,8 +50,8 @@ dds <- DESeqDataSetFromMatrix(countData = counts,
dds <- DESeq(dds)
res <- results(dds)
sorted_res <- res[order(res$padj, -res$log2FoldChange),]#sort rows for padj and then decreasing logfoldchange
sorted_res <- sorted_res[1:20,!names(res) %in% c("stat", "baseMean", "pvalue","lfcSE")] #only take the first 20 rows and some columns
sorted_res <- res[order(res$padj, -res$log2FoldChange),] # sort rows for padj and then decreasing logfoldchange
sorted_res <- sorted_res[1:20,!names(res) %in% c("stat", "baseMean", "pvalue","lfcSE")] # only take the first 20 rows and some columns
print(paste('Writing results to output file as', as.character(snakemake@output[['dge_results']]), '..'))
write.csv(as.data.frame(sorted_res), snakemake@output[['dge_results']], row.names=TRUE)
@@ -64,13 +59,6 @@ write.csv(as.data.frame(sorted_res), snakemake@output[['dge_results']], row.name
print(paste('Creating volcano plot as ', snakemake@output[['volcano_plot']], '..'))
volcano_data <- data.frame(Log2FoldChange = res$log2FoldChange, Log10AdjustedPvalue = -log10(res$padj))
#volcano_plot <- ggplot(volcano_data, aes(x=Log2FoldChange, y=-Log10AdjustedPvalue)) +
# geom_point(aes(color=ifelse(abs(Log2FoldChange) >= 1 & res$padj < 0.05, "red", "black"))) +
# labs(title="Volcano Plot",
# x="log2(Fold Change)",
# y="-log10(Adjusted p-value)") +
# theme_minimal()
#bin <- ggsave(plot=volcano_plot, filename=snakemake@output[['volcano_plot']], width=6, height=4, dpi=300)
plot.volcano <- function(df, title_txt, subtitle_txt){
@@ -91,7 +79,6 @@ bin <- ggsave(plot=volcano_plot, filename=snakemake@output[['volcano_plot']], wi
#Heatmap
# Assuming 'samples' is a vector of sample names
print(paste('Creating clustered heatmap as', snakemake@output[['heatmap']], '..'))
de_genes <- rownames(subset(res, padj < 0.05))
de_counts <- counts(dds)[de_genes, rownames(condition)]
import matplotlib.pyplot as plt
from Bio import Phylo
import argparse
import sys
parser = argparse.ArgumentParser(description="Draw a phylogenetic tree from a newick file and save the plot as an image.", epilog="Example usage: python3 draw_tree.py <TREE_NWK> <PLOT_PNG>")
parser.add_argument("TREE_NWK", help="Tree input file name. (newick formatted)")
parser.add_argument("PLOT_PNG", help="Tree plot output file name.")
args = parser.parse_args()
Phylo.draw(Phylo.read(args.TREE_NWK, "newick"), do_show = False)
plt.title("Phylogenetic tree over all samples clustered by variance of genes above coverage threshold")
plt.savefig(args.PLOT_PNG)
stdout = sys.stdout
stderr = sys.stderr
with open(snakemake.log[0], 'w') as sys.stdout:
    sys.stderr = sys.stdout
    Phylo.draw(Phylo.read(snakemake.input[0], "newick"), do_show = False)
    plt.title("Phylogenetic tree over all samples clustered by variance of genes")
    plt.savefig(snakemake.output[0])

sys.stdout = stdout
sys.stderr = stderr
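The stdout/stderr juggling above could equivalently be written with context managers; a minimal sketch, assuming the same `snakemake.log`, `snakemake.input` and `snakemake.output` entries as the script above:
```
# Sketch: capture stdout and stderr in the Snakemake log via contextlib
# instead of reassigning sys.stdout/sys.stderr by hand.
from contextlib import redirect_stdout, redirect_stderr

import matplotlib.pyplot as plt
from Bio import Phylo

with open(snakemake.log[0], "w") as log, redirect_stdout(log), redirect_stderr(log):
    Phylo.draw(Phylo.read(snakemake.input[0], "newick"), do_show=False)
    plt.title("Phylogenetic tree over all samples clustered by variance of genes")
    plt.savefig(snakemake.output[0])
```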