From fd06c1adc1044d3fcedd211b6b8e3b7309d01608 Mon Sep 17 00:00:00 2001
From: David Fischer <davidfischer99@gmx.de>
Date: Tue, 13 Jun 2023 20:15:54 +0200
Subject: [PATCH] the forking worst task ever

---
 project2/workflow/Snakefile           |  3 +-
 project2/workflow/envs/assembly.yaml  |  4 +-
 project2/workflow/rules/assembly.smk  | 96 +++++++++++++++++++++------
 project2/workflow/rules/phylogeny.smk |  2 +-
 4 files changed, 80 insertions(+), 25 deletions(-)

diff --git a/project2/workflow/Snakefile b/project2/workflow/Snakefile
index 061f593..62ce4f3 100644
--- a/project2/workflow/Snakefile
+++ b/project2/workflow/Snakefile
@@ -31,7 +31,8 @@ rule all:
     input:
         os.path.join(config['output_directory'], "04-PHYLOGENY", "variability.png"),
         os.path.join(config['output_directory'], "04-PHYLOGENY", "tree.png"),
-        os.path.join(config['output_directory'], "02-SCREEN", "multiqc_screen_report.html")
+        os.path.join(config['output_directory'], "02-SCREEN", "multiqc_screen_report.html"),
+        expand(os.path.join(config['output_directory'], "03-ASSEMBLY", "blast", "{sample}_blast.tsv"), sample=list(samples.index))
 
 
 
diff --git a/project2/workflow/envs/assembly.yaml b/project2/workflow/envs/assembly.yaml
index 1e7ff43..d9c127a 100644
--- a/project2/workflow/envs/assembly.yaml
+++ b/project2/workflow/envs/assembly.yaml
@@ -6,4 +6,6 @@ channels:
 dependencies:
   - spades==3.15.5
   - ragtag==2.1.0
-  - masurca==4.1.0
\ No newline at end of file
+  - masurca==4.1.0
+  - blast==2.14.0
+  - seqkit==2.4.0
\ No newline at end of file
diff --git a/project2/workflow/rules/assembly.smk b/project2/workflow/rules/assembly.smk
index 7d03bdd..5d66f39 100644
--- a/project2/workflow/rules/assembly.smk
+++ b/project2/workflow/rules/assembly.smk
@@ -1,3 +1,5 @@
+import glob
+
 def get_inbutt(wildcards):
     if config["host_sequence"]:
         return [ os.path.join(config["output_directory"], "02-SCREEN", "bowtie2", "{wildcards.sample}_decontaminated.1.fastq.gz".format(wildcards=wildcards)),
@@ -25,48 +27,98 @@ rule spades:
         spades.py -o {output.outdir} -1 {input[0]} -2 {input[1]} -t {threads} --phred-offset 33 > {log}
         """
 
-rule ragtag:
+
+rule polishing:
     input:
-        contigs     = os.path.join(config['output_directory'], "03-ASSEMBLY", "spades", "{sample}", "contigs.fasta")
+        get_inbutt,
+        assembly   = os.path.join(config['output_directory'], "03-ASSEMBLY", "spades", "{sample}", "contigs.fasta"),
     output:
-        scaffold    = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "ragtag.scaffold.fasta"),
-        main        = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta"),
-        outdir      = directory(os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}"))
+        polished     = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta")
     log:
-        os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_ragtag.log")
+        os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_polca.log")
     threads: 
         2
-    params:
-        ref = config["ref"]
     conda:
         os.path.join("..", "envs", "assembly.yaml")
+    shadow: # we need to do this becauzse polca saves output files to the project directory by default and overwrites the other instances output and crashes because its dumb
+        "minimal"
     shell:
         """
-        ragtag.py scaffold -w -t {threads} -o {output.outdir} {params.ref} {input.contigs} 2> {log}
+        polca.sh -a {input.assembly} -r '{input[0]} {input[1]}' -t {threads} &> {log}
 
-        # hack to replace default header and cut off unaligned sequences because ragtag is stupid and doesnt provide these features
-        echo ">{wildcards.sample}" > {output.outdir}/main.scaffold.fasta
-        head -n 2 {output.outdir}/ragtag.scaffold.fasta | tail -n 1 >> {output.outdir}/main.scaffold.fasta
+        # move output file to output h7ujzgnb cm because polca is stupid and doersnt work
+        mv *.PolcaCorrected.fa {output.polished} 2>> {log}
         """
 
-rule polishing:
+def get_blast_references(wildcards):
+    file_paths = []
+
+    if config["blast"]["reference"][-1] == '/':
+        for file in glob.glob(f"{config['blast']['reference']}/*.fasta"):
+            file_paths.append(file)
+    else:
+        file_paths.append(config["blast"]["reference"])
+    return file_paths
+
+
+rule blastn:
     input:
-        get_inbutt,
-        assembly     = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta")
+        references = get_blast_references,
+        contigs = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta")
     output:
-        polished     = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta")
+        all_refs = temp("{sample}_all_refs.fasta"),
+        result   = os.path.join(config['output_directory'], "03-ASSEMBLY", "blast", "{sample}_blast.tsv")
     log:
-        os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_polca.log")
+        os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_blast.log")
+    conda:
+        os.path.join("..", "envs", "assembly.yaml")
+    threads:
+        6
+    shell:
+        """
+        cat {input.references} > {wildcards.sample}_all_refs.fasta 2> {log}
+        blastn -num_threads {threads} -query {input.contigs} -subject {output.all_refs} -out {output.result} -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" 2>> {log}
+        """
+
+rule ragtag:
+    input:
+        all_refs    = "{sample}_all_refs.fasta",
+        blast       = os.path.join(config['output_directory'], "03-ASSEMBLY", "blast", "{sample}_blast.tsv"),
+        contigs     = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta")
+    output:
+        scaffold    = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "ragtag.scaffold.fasta"),
+        main        = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta"),
+        outdir      = directory(os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}")),
+        tsv         = temp("{sample}_temp.tsv"),
+        best_ref    = temp("{sample}_best_ref.fasta")
+    log:
+        os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_ragtag.log")
     threads: 
         2
+    params:
+        ref = "resources/references/ref.fasta"
     conda:
         os.path.join("..", "envs", "assembly.yaml")
-    shadow: # we need to do this becauzse polca saves output files to the project directory by default and overwrites the other instances output and crashes because its dumb
-        "minimal"
     shell:
         """
-        polca.sh -a {input.assembly} -r '{input[0]} {input[1]}' -t {threads} &> {log}
+        # do some awk magic to get from the best reference sequence from blast output to actualy reference sequence (sorry)
+        # filter for longest contig because small contigs often have high percentage identity for all references
+        awk '{{if(max<=$4){{max=$4;line=$0;print line}}}}' {input.blast} > {wildcards.sample}_temp.tsv 2> {log}
+        
+        # filter from the longest contig the line with highest percentage identity and get reference sequence from line
+        ref_id=$(awk '{{if(max<$3){{max=$3;ref_id=$2}}}}END{{print ref_id}}' {wildcards.sample}_temp.tsv) 2>> {log}
 
-        # move output file to output h7ujzgnb cm because polca is stupid and doersnt work
-        mv *.PolcaCorrected.fa {output.polished}
+        seqkit grep -p "$ref_id" {input.all_refs} > {wildcards.sample}_best_ref.fasta 2>> {log}
+        
+        ### code graveyard (this didnt work but we spent like 4h on it)
+        # count reference (multi) fasta length and grep sequence by reference id
+        #ref_len=$(cat {input.all_refs} | wc -l)
+        #cat {input.all_refs} | grep -A $ref_len -e $ref_id | grep -m 2 -B $ref_len '>' | head -n -1 > {wildcards.sample}_best_ref.fasta
+        
+        # run ragtag on this best reference
+        ragtag.py scaffold -w -t {threads} -o {output.outdir} {wildcards.sample}_best_ref.fasta {input.contigs} 2>> {log}
+
+        # hacky way to replace default header and cut off unaligned sequences because ragtag is stupid and doesnt provide these features
+        echo ">{wildcards.sample}" > {output.outdir}/main.scaffold.fasta
+        head -n 2 {output.outdir}/ragtag.scaffold.fasta | tail -n 1 >> {output.outdir}/main.scaffold.fasta 2>> {log}
         """
\ No newline at end of file
diff --git a/project2/workflow/rules/phylogeny.smk b/project2/workflow/rules/phylogeny.smk
index b0cadf0..306df18 100644
--- a/project2/workflow/rules/phylogeny.smk
+++ b/project2/workflow/rules/phylogeny.smk
@@ -1,6 +1,6 @@
 rule mafft:
     input:
-        assemblies = expand(os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta"), sample = list(samples.index))
+        assemblies = expand(os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta"), sample = list(samples.index))
     output:
         msa    = os.path.join(config['output_directory'], "04-PHYLOGENY", "msa.fasta"),
         all    = temp("all.fasta")
-- 
GitLab