Commit 74b60dd8 authored by fischer-hub

ABAGBEEEBBBBBEEEEEEEEE

parent e6232233
snakemake --config samples=input.tsv reference=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.fasta annotation=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.gff output_directory=/storage/mi/fisched99/output wgs=wgs.tsv --conda-prefix=/storage/mi/fisched99/condaCache --profile config/local --rerun-incomplete
### Run the workflow
To run the workflow in reference-based mode with a reference and an annotation file, use the following command:
```
snakemake --config samples=input.tsv reference=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.fasta annotation=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.gff output_directory=output metadata=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/Lib2M/samples_sheet_2M.tsv --conda-prefix=/storage/mi/fisched99/condaCache --profile config/local --rerun-incomplete
```
To run in de novo assembly mode, additionally pass the path to the sample sheet for your WGS read data (e.g. wgs.tsv) as an argument to the workflow, as follows (a full example command is shown after the snippet):
```
wgs=wgs.tsv
```
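For example, combining the reference-based command above with the `wgs` argument, a de novo run could look like the following (paths are illustrative and simply mirror the example above):
```
snakemake --config samples=input.tsv reference=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.fasta annotation=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/REF/shigella_flexneri.gff output_directory=output metadata=/buffer/ag_bsc/PS_SEQAN_DATA/2023/DATA/Lib2M/samples_sheet_2M.tsv wgs=wgs.tsv --conda-prefix=/storage/mi/fisched99/condaCache --profile config/local --rerun-incomplete
```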
@@ -27,33 +27,18 @@ include: "rules/phylogeny.smk"
include: "rules/decontamination.smk"
# define rule all
rule all:
    input:
        #expand(os.path.join(config['output_directory'], "02-ASSEMBLY", "samtools", "{sample}_consensus.fasta"), sample = list(samples.index)),
        #expand(os.path.join(config['output_directory'], "03-ANNOTATION", "bedtools", "{sample}"), sample = list(samples.index)),
        #os.path.join(config['output_directory'], "04-MSA", "gene_list.txt"),
        #expand(os.path.join(config['output_directory'], "07-PHYLOGENY", "{gene}_msa.fasta"), gene = list(gene))
        #checkpointbla
        #os.path.join(config['output_directory'], "04-COUNTS", "featurecounts", "counts.tsv"),
        #os.path.join(config['output_directory'], "00-Log", "phylogeny", "{gene}_mafft.log")
        #os.path.join(config['output_directory'], "04-MSA", "gene_list.txt")
        os.path.join(config['output_directory'], "06-DGE", "dge_results.csv"),
        os.path.join(config['output_directory'], "07-PHYLOGENY", "tree.nwk"),
        os.path.join(config['output_directory'], "01-QC", "multiqc.html")
        #os.path.join(config['output_directory'], "00-Log", "phylogeny", "concat_msa.log")
        #checkcheck,
        #os.path.join(config['output_directory'], "07-PHYLOGENY", "msa_all.fasta"),
        #os.path.join(config['output_directory'], "04-MSA", "gene_list.txt")
        os.path.join(config['output_directory'], "01-QC", "multiqc.html"),
        os.path.join(config['output_directory'], "07-PHYLOGENY", "tree.png")
onsuccess:
    print("Workflow finished successfully.\nTryin' to hold tryin' to hold but there's nothing to grasp so I leet goooooo.\nGoodbye.")
    # remove the intermediate per-gene files (each containing one gene) that we can't mark as temporary,
    # because they are output of the checkpoint and it won't run if we mark them as temp()
    shutil.rmtree(os.path.join(config['output_directory'], "03-ANNOTATION", "bedtools"))
    print("Workflow and cleanup finished successfully.\nTryin' to hold tryin' to hold but there's nothing to grasp so I leet goooooo.\nGoodbye.")
@@ -2,8 +2,7 @@ name: draw_tree
channels:
- bioconda
- conda-forge
- anaconda
dependencies:
- python>=3.7
- biopython
- matplotlib
- argparse
- matplotlib
@@ -4,6 +4,7 @@ def get_gff2bed_input(wildcards):
    else:
        return os.path.join(config['annotation'])

rule gff2bed:
    input:
        get_gff2bed_input
@@ -44,6 +45,7 @@ checkpoint cut_genes:
        awk '/^>/{{OUT="{output.genes}/"substr($0,2)".fasta"; system("touch "OUT" 2>> {log}"); print ">"substr($0,2)"_sample_{wildcards.sample}" > OUT; next}}{{print > OUT}}' {output.genes_all} 2>> {log}
        """
def get_prokka_input(wildcards):
    if config['polish']=="yes":
        return os.path.join(config['output_directory'], "02-ASSEMBLY", "polish", "assembly_polished.fasta")
@@ -68,12 +70,14 @@ rule prokka:
        prokka --outdir {output.outdir} --prefix annotation {input.assembly} --force --cpus {threads} &> {log}
        """
def get_annotation(wildcards):
    if config['wgs']:
        return os.path.join(config['output_directory'], "03-ANNOTATION", "prokka", "annotation.gff")
    else:
        return config['annotation']

rule gff2gtf:
    input:
        get_annotation
@@ -7,6 +7,7 @@ def get_hisat_ref(wildcards):
    else:
        return config['reference']

rule hisat2_idx:
    input:
        reference = get_hisat_ref
@@ -48,6 +49,7 @@ rule hisat2_map:
        hisat2 -x {params.idx_prefix} -1 {input.r1} -2 {input.r2} -p {threads} 2> {log} | samtools view -bS 2> {log} | samtools sort -o {output} -T tmp --threads {threads} 2> {log}
        """

rule samtools_consensus:
    input:
        sorted_bams = os.path.join(config['output_directory'], "02-ASSEMBLY", "hisat2_map", "{sample}_sorted.bam")
@@ -64,6 +66,7 @@ rule samtools_consensus:
        samtools consensus -f fasta -o {output.consensus} -d 5 {input.sorted_bams} 2> {log}
        """

def get_spades_input(wildcards):
    if config["host_sequence"]:
        return { "r1" : expand(os.path.join(config["output_directory"], "01-QC", "screen", "samtools", "{sample}_decontaminated.1.fastq"), sample = list(wgs.index)),
@@ -95,6 +98,7 @@ rule spades:
        spades.py -o {output.outdir} -1 {output.reads_1} -2 {output.reads_2} -t {threads} --phred-offset 33 > {log}
        """

rule polishing:
    input:
        assembly = os.path.join(config['output_directory'], "02-ASSEMBLY", "spades", "contigs.fasta"),
@@ -17,6 +17,7 @@ rule kraken2:
        kraken2 --db {params.kraken2_db} --threads {threads} --paired --report {output} {input.r1} {input.r2} 2> {log} > /dev/null  # we don't care about the actual output written to stdout (and it is quite large)
        """
def screen_input(wildcards):  # otherwise wgs is not defined if we do not have wgs data
    if config['wgs']:
        return expand(os.path.join(config['output_directory'], "01-QC", "screen", "kraken2", "{sample}_report.txt"), sample = list(wgs.index))
@@ -24,6 +25,7 @@ def screen_input(wildcards):#otherwise wgs is not defined if we do not have wgs
print("WARNING: Screening requested but not WGS reads were provided to screen. Exiting..")
exit()
rule screen:
input:
screen_input
@@ -14,15 +14,10 @@ colorblind_friendly_pal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E44
### read in featureCounts output with the first line as header and column 'geneid' as rownames
counts <- read.table(snakemake@input[[1]], header=TRUE, row.names = 1)
#counts <- read.table('/home/david/Uni/appliedsequenceanalysis/final_project/output/05-COUNTS/featurecounts/counts.tsv', header=TRUE, row.names = 1)
condition <- read.table(snakemake@input[[2]], header=TRUE, row.names = 1)
#condition <- read.table('/home/david/Uni/appliedsequenceanalysis/Lib2M/samples_group_sheet.tsv', header=TRUE, row.names = 1)
colnames(condition)[1] <- 'group'
### keep only the count columns (drop the leading gene-annotation columns) and convert them to numeric
counts <- counts[,6:ncol(counts)]
counts[] <- sapply(counts, as.numeric)
@@ -55,8 +50,8 @@ dds <- DESeqDataSetFromMatrix(countData = counts,
dds <- DESeq(dds)
res <- results(dds)
sorted_res <- res[order(res$padj, -res$log2FoldChange),]#sort rows for padj and then decreasing logfoldchange
sorted_res <- sorted_res[1:20,!names(res) %in% c("stat", "baseMean", "pvalue","lfcSE")] #only take the first 20 rows and some columns
sorted_res <- res[order(res$padj, -res$log2FoldChange),] # sort rows for padj and then decreasing logfoldchange
sorted_res <- sorted_res[1:20,!names(res) %in% c("stat", "baseMean", "pvalue","lfcSE")] # only take the first 20 rows and some columns
print(paste('Writing results to output file as', as.character(snakemake@output[['dge_results']]), '..'))
write.csv(as.data.frame(sorted_res), snakemake@output[['dge_results']], row.names=TRUE)
@@ -64,13 +59,6 @@ write.csv(as.data.frame(sorted_res), snakemake@output[['dge_results']], row.name
print(paste('Creating volcano plot as ', snakemake@output[['volcano_plot']], '..'))
volcano_data <- data.frame(Log2FoldChange = res$log2FoldChange, Log10AdjustedPvalue = -log10(res$padj))
#volcano_plot <- ggplot(volcano_data, aes(x=Log2FoldChange, y=-Log10AdjustedPvalue)) +
# geom_point(aes(color=ifelse(abs(Log2FoldChange) >= 1 & res$padj < 0.05, "red", "black"))) +
# labs(title="Volcano Plot",
# x="log2(Fold Change)",
# y="-log10(Adjusted p-value)") +
# theme_minimal()
#bin <- ggsave(plot=volcano_plot, filename=snakemake@output[['volcano_plot']], width=6, height=4, dpi=300)
plot.volcano <- function(df, title_txt, subtitle_txt){
@@ -91,7 +79,6 @@ bin <- ggsave(plot=volcano_plot, filename=snakemake@output[['volcano_plot']], wi
#Heatmap
# Assuming 'samples' is a vector of sample names
print(paste('Creating clustered heatmap as', snakemake@output[['heatmap']], '..'))
de_genes <- rownames(subset(res, padj < 0.05))
de_counts <- counts(dds)[de_genes, rownames(condition)]
import matplotlib.pyplot as plt
from Bio import Phylo
import argparse
import sys
parser = argparse.ArgumentParser(description="Draw a phylogenetic tree from a newick file and save the plot as an image.", epilog="Example usage: python3 draw_tree.py <TREE_NWK> <PLOT_PNG>")
parser.add_argument("TREE_NWK", help="Tree input file name. (newick formatted)")
parser.add_argument("PLOT_PNG", help="Tree plot output file name.")
args = parser.parse_args()
Phylo.draw(Phylo.read(args.TREE_NWK, "newick"), do_show = False)
plt.title("Phylogenetic tree over all samples clustered by variance of genes above coverage threshold")
plt.savefig(args.PLOT_PNG)
stdout = sys.stdout
stderr = sys.stderr
with open(snakemake.log[0], 'w') as sys.stdout:
    sys.stderr = sys.stdout
    Phylo.draw(Phylo.read(snakemake.input[0], "newick"), do_show = False)
    plt.title("Phylogenetic tree over all samples clustered by variance of genes")
    plt.savefig(snakemake.output[0])

sys.stdout = stdout
sys.stderr = stderr
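The stdout/stderr juggling above could equivalently be written with context managers; a minimal sketch, assuming the same `snakemake.log`, `snakemake.input` and `snakemake.output` entries as the script above:
```
# Sketch: capture stdout and stderr in the Snakemake log via contextlib
# instead of reassigning sys.stdout/sys.stderr by hand.
from contextlib import redirect_stdout, redirect_stderr

import matplotlib.pyplot as plt
from Bio import Phylo

with open(snakemake.log[0], "w") as log, redirect_stdout(log), redirect_stderr(log):
    Phylo.draw(Phylo.read(snakemake.input[0], "newick"), do_show=False)
    plt.title("Phylogenetic tree over all samples clustered by variance of genes")
    plt.savefig(snakemake.output[0])
```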