From fd06c1adc1044d3fcedd211b6b8e3b7309d01608 Mon Sep 17 00:00:00 2001 From: David Fischer <davidfischer99@gmx.de> Date: Tue, 13 Jun 2023 20:15:54 +0200 Subject: [PATCH] the forking worst task ever --- project2/workflow/Snakefile | 3 +- project2/workflow/envs/assembly.yaml | 4 +- project2/workflow/rules/assembly.smk | 96 +++++++++++++++++++++------ project2/workflow/rules/phylogeny.smk | 2 +- 4 files changed, 80 insertions(+), 25 deletions(-) diff --git a/project2/workflow/Snakefile b/project2/workflow/Snakefile index 061f593..62ce4f3 100644 --- a/project2/workflow/Snakefile +++ b/project2/workflow/Snakefile @@ -31,7 +31,8 @@ rule all: input: os.path.join(config['output_directory'], "04-PHYLOGENY", "variability.png"), os.path.join(config['output_directory'], "04-PHYLOGENY", "tree.png"), - os.path.join(config['output_directory'], "02-SCREEN", "multiqc_screen_report.html") + os.path.join(config['output_directory'], "02-SCREEN", "multiqc_screen_report.html"), + expand(os.path.join(config['output_directory'], "03-ASSEMBLY", "blast", "{sample}_blast.tsv"), sample=list(samples.index)) diff --git a/project2/workflow/envs/assembly.yaml b/project2/workflow/envs/assembly.yaml index 1e7ff43..d9c127a 100644 --- a/project2/workflow/envs/assembly.yaml +++ b/project2/workflow/envs/assembly.yaml @@ -6,4 +6,6 @@ channels: dependencies: - spades==3.15.5 - ragtag==2.1.0 - - masurca==4.1.0 \ No newline at end of file + - masurca==4.1.0 + - blast==2.14.0 + - seqkit==2.4.0 \ No newline at end of file diff --git a/project2/workflow/rules/assembly.smk b/project2/workflow/rules/assembly.smk index 7d03bdd..5d66f39 100644 --- a/project2/workflow/rules/assembly.smk +++ b/project2/workflow/rules/assembly.smk @@ -1,3 +1,5 @@ +import glob + def get_inbutt(wildcards): if config["host_sequence"]: return [ os.path.join(config["output_directory"], "02-SCREEN", "bowtie2", "{wildcards.sample}_decontaminated.1.fastq.gz".format(wildcards=wildcards)), @@ -25,48 +27,98 @@ rule spades: spades.py -o {output.outdir} -1 {input[0]} -2 {input[1]} -t {threads} --phred-offset 33 > {log} """ -rule ragtag: + +rule polishing: input: - contigs = os.path.join(config['output_directory'], "03-ASSEMBLY", "spades", "{sample}", "contigs.fasta") + get_inbutt, + assembly = os.path.join(config['output_directory'], "03-ASSEMBLY", "spades", "{sample}", "contigs.fasta"), output: - scaffold = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "ragtag.scaffold.fasta"), - main = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta"), - outdir = directory(os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}")) + polished = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta") log: - os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_ragtag.log") + os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_polca.log") threads: 2 - params: - ref = config["ref"] conda: os.path.join("..", "envs", "assembly.yaml") + shadow: # we need to do this becauzse polca saves output files to the project directory by default and overwrites the other instances output and crashes because its dumb + "minimal" shell: """ - ragtag.py scaffold -w -t {threads} -o {output.outdir} {params.ref} {input.contigs} 2> {log} + polca.sh -a {input.assembly} -r '{input[0]} {input[1]}' -t {threads} &> {log} - # hack to replace default header and cut off unaligned sequences because ragtag is stupid and doesnt provide these features - echo ">{wildcards.sample}" > {output.outdir}/main.scaffold.fasta - head -n 2 {output.outdir}/ragtag.scaffold.fasta | tail -n 1 >> {output.outdir}/main.scaffold.fasta + # move output file to output h7ujzgnb cm because polca is stupid and doersnt work + mv *.PolcaCorrected.fa {output.polished} 2>> {log} """ -rule polishing: +def get_blast_references(wildcards): + file_paths = [] + + if config["blast"]["reference"][-1] == '/': + for file in glob.glob(f"{config['blast']['reference']}/*.fasta"): + file_paths.append(file) + else: + file_paths.append(config["blast"]["reference"]) + return file_paths + + +rule blastn: input: - get_inbutt, - assembly = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta") + references = get_blast_references, + contigs = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta") output: - polished = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta") + all_refs = temp("{sample}_all_refs.fasta"), + result = os.path.join(config['output_directory'], "03-ASSEMBLY", "blast", "{sample}_blast.tsv") log: - os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_polca.log") + os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_blast.log") + conda: + os.path.join("..", "envs", "assembly.yaml") + threads: + 6 + shell: + """ + cat {input.references} > {wildcards.sample}_all_refs.fasta 2> {log} + blastn -num_threads {threads} -query {input.contigs} -subject {output.all_refs} -out {output.result} -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" 2>> {log} + """ + +rule ragtag: + input: + all_refs = "{sample}_all_refs.fasta", + blast = os.path.join(config['output_directory'], "03-ASSEMBLY", "blast", "{sample}_blast.tsv"), + contigs = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta") + output: + scaffold = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "ragtag.scaffold.fasta"), + main = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta"), + outdir = directory(os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}")), + tsv = temp("{sample}_temp.tsv"), + best_ref = temp("{sample}_best_ref.fasta") + log: + os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_ragtag.log") threads: 2 + params: + ref = "resources/references/ref.fasta" conda: os.path.join("..", "envs", "assembly.yaml") - shadow: # we need to do this becauzse polca saves output files to the project directory by default and overwrites the other instances output and crashes because its dumb - "minimal" shell: """ - polca.sh -a {input.assembly} -r '{input[0]} {input[1]}' -t {threads} &> {log} + # do some awk magic to get from the best reference sequence from blast output to actualy reference sequence (sorry) + # filter for longest contig because small contigs often have high percentage identity for all references + awk '{{if(max<=$4){{max=$4;line=$0;print line}}}}' {input.blast} > {wildcards.sample}_temp.tsv 2> {log} + + # filter from the longest contig the line with highest percentage identity and get reference sequence from line + ref_id=$(awk '{{if(max<$3){{max=$3;ref_id=$2}}}}END{{print ref_id}}' {wildcards.sample}_temp.tsv) 2>> {log} - # move output file to output h7ujzgnb cm because polca is stupid and doersnt work - mv *.PolcaCorrected.fa {output.polished} + seqkit grep -p "$ref_id" {input.all_refs} > {wildcards.sample}_best_ref.fasta 2>> {log} + + ### code graveyard (this didnt work but we spent like 4h on it) + # count reference (multi) fasta length and grep sequence by reference id + #ref_len=$(cat {input.all_refs} | wc -l) + #cat {input.all_refs} | grep -A $ref_len -e $ref_id | grep -m 2 -B $ref_len '>' | head -n -1 > {wildcards.sample}_best_ref.fasta + + # run ragtag on this best reference + ragtag.py scaffold -w -t {threads} -o {output.outdir} {wildcards.sample}_best_ref.fasta {input.contigs} 2>> {log} + + # hacky way to replace default header and cut off unaligned sequences because ragtag is stupid and doesnt provide these features + echo ">{wildcards.sample}" > {output.outdir}/main.scaffold.fasta + head -n 2 {output.outdir}/ragtag.scaffold.fasta | tail -n 1 >> {output.outdir}/main.scaffold.fasta 2>> {log} """ \ No newline at end of file diff --git a/project2/workflow/rules/phylogeny.smk b/project2/workflow/rules/phylogeny.smk index b0cadf0..306df18 100644 --- a/project2/workflow/rules/phylogeny.smk +++ b/project2/workflow/rules/phylogeny.smk @@ -1,6 +1,6 @@ rule mafft: input: - assemblies = expand(os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta"), sample = list(samples.index)) + assemblies = expand(os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta"), sample = list(samples.index)) output: msa = os.path.join(config['output_directory'], "04-PHYLOGENY", "msa.fasta"), all = temp("all.fasta") -- GitLab