Skip to content
Snippets Groups Projects
Commit fd06c1ad authored by David Fischer's avatar David Fischer
Browse files

the forking worst task ever

parent fdc7fc2b
No related branches found
No related tags found
No related merge requests found
...@@ -31,7 +31,8 @@ rule all: ...@@ -31,7 +31,8 @@ rule all:
input: input:
os.path.join(config['output_directory'], "04-PHYLOGENY", "variability.png"), os.path.join(config['output_directory'], "04-PHYLOGENY", "variability.png"),
os.path.join(config['output_directory'], "04-PHYLOGENY", "tree.png"), os.path.join(config['output_directory'], "04-PHYLOGENY", "tree.png"),
os.path.join(config['output_directory'], "02-SCREEN", "multiqc_screen_report.html") os.path.join(config['output_directory'], "02-SCREEN", "multiqc_screen_report.html"),
expand(os.path.join(config['output_directory'], "03-ASSEMBLY", "blast", "{sample}_blast.tsv"), sample=list(samples.index))
......
...@@ -6,4 +6,6 @@ channels: ...@@ -6,4 +6,6 @@ channels:
dependencies: dependencies:
- spades==3.15.5 - spades==3.15.5
- ragtag==2.1.0 - ragtag==2.1.0
- masurca==4.1.0 - masurca==4.1.0
\ No newline at end of file - blast==2.14.0
- seqkit==2.4.0
\ No newline at end of file
import glob
def get_inbutt(wildcards): def get_inbutt(wildcards):
if config["host_sequence"]: if config["host_sequence"]:
return [ os.path.join(config["output_directory"], "02-SCREEN", "bowtie2", "{wildcards.sample}_decontaminated.1.fastq.gz".format(wildcards=wildcards)), return [ os.path.join(config["output_directory"], "02-SCREEN", "bowtie2", "{wildcards.sample}_decontaminated.1.fastq.gz".format(wildcards=wildcards)),
...@@ -25,48 +27,98 @@ rule spades: ...@@ -25,48 +27,98 @@ rule spades:
spades.py -o {output.outdir} -1 {input[0]} -2 {input[1]} -t {threads} --phred-offset 33 > {log} spades.py -o {output.outdir} -1 {input[0]} -2 {input[1]} -t {threads} --phred-offset 33 > {log}
""" """
rule ragtag:
rule polishing:
input: input:
contigs = os.path.join(config['output_directory'], "03-ASSEMBLY", "spades", "{sample}", "contigs.fasta") get_inbutt,
assembly = os.path.join(config['output_directory'], "03-ASSEMBLY", "spades", "{sample}", "contigs.fasta"),
output: output:
scaffold = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "ragtag.scaffold.fasta"), polished = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta")
main = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta"),
outdir = directory(os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}"))
log: log:
os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_ragtag.log") os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_polca.log")
threads: threads:
2 2
params:
ref = config["ref"]
conda: conda:
os.path.join("..", "envs", "assembly.yaml") os.path.join("..", "envs", "assembly.yaml")
shadow: # we need to do this becauzse polca saves output files to the project directory by default and overwrites the other instances output and crashes because its dumb
"minimal"
shell: shell:
""" """
ragtag.py scaffold -w -t {threads} -o {output.outdir} {params.ref} {input.contigs} 2> {log} polca.sh -a {input.assembly} -r '{input[0]} {input[1]}' -t {threads} &> {log}
# hack to replace default header and cut off unaligned sequences because ragtag is stupid and doesnt provide these features # move output file to output h7ujzgnb cm because polca is stupid and doersnt work
echo ">{wildcards.sample}" > {output.outdir}/main.scaffold.fasta mv *.PolcaCorrected.fa {output.polished} 2>> {log}
head -n 2 {output.outdir}/ragtag.scaffold.fasta | tail -n 1 >> {output.outdir}/main.scaffold.fasta
""" """
rule polishing: def get_blast_references(wildcards):
file_paths = []
if config["blast"]["reference"][-1] == '/':
for file in glob.glob(f"{config['blast']['reference']}/*.fasta"):
file_paths.append(file)
else:
file_paths.append(config["blast"]["reference"])
return file_paths
rule blastn:
input: input:
get_inbutt, references = get_blast_references,
assembly = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta") contigs = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta")
output: output:
polished = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta") all_refs = temp("{sample}_all_refs.fasta"),
result = os.path.join(config['output_directory'], "03-ASSEMBLY", "blast", "{sample}_blast.tsv")
log: log:
os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_polca.log") os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_blast.log")
conda:
os.path.join("..", "envs", "assembly.yaml")
threads:
6
shell:
"""
cat {input.references} > {wildcards.sample}_all_refs.fasta 2> {log}
blastn -num_threads {threads} -query {input.contigs} -subject {output.all_refs} -out {output.result} -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" 2>> {log}
"""
rule ragtag:
input:
all_refs = "{sample}_all_refs.fasta",
blast = os.path.join(config['output_directory'], "03-ASSEMBLY", "blast", "{sample}_blast.tsv"),
contigs = os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta")
output:
scaffold = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "ragtag.scaffold.fasta"),
main = os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta"),
outdir = directory(os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}")),
tsv = temp("{sample}_temp.tsv"),
best_ref = temp("{sample}_best_ref.fasta")
log:
os.path.join(config['output_directory'], "00-Log", "assembly", "{sample}_ragtag.log")
threads: threads:
2 2
params:
ref = "resources/references/ref.fasta"
conda: conda:
os.path.join("..", "envs", "assembly.yaml") os.path.join("..", "envs", "assembly.yaml")
shadow: # we need to do this becauzse polca saves output files to the project directory by default and overwrites the other instances output and crashes because its dumb
"minimal"
shell: shell:
""" """
polca.sh -a {input.assembly} -r '{input[0]} {input[1]}' -t {threads} &> {log} # do some awk magic to get from the best reference sequence from blast output to actualy reference sequence (sorry)
# filter for longest contig because small contigs often have high percentage identity for all references
awk '{{if(max<=$4){{max=$4;line=$0;print line}}}}' {input.blast} > {wildcards.sample}_temp.tsv 2> {log}
# filter from the longest contig the line with highest percentage identity and get reference sequence from line
ref_id=$(awk '{{if(max<$3){{max=$3;ref_id=$2}}}}END{{print ref_id}}' {wildcards.sample}_temp.tsv) 2>> {log}
# move output file to output h7ujzgnb cm because polca is stupid and doersnt work seqkit grep -p "$ref_id" {input.all_refs} > {wildcards.sample}_best_ref.fasta 2>> {log}
mv *.PolcaCorrected.fa {output.polished}
### code graveyard (this didnt work but we spent like 4h on it)
# count reference (multi) fasta length and grep sequence by reference id
#ref_len=$(cat {input.all_refs} | wc -l)
#cat {input.all_refs} | grep -A $ref_len -e $ref_id | grep -m 2 -B $ref_len '>' | head -n -1 > {wildcards.sample}_best_ref.fasta
# run ragtag on this best reference
ragtag.py scaffold -w -t {threads} -o {output.outdir} {wildcards.sample}_best_ref.fasta {input.contigs} 2>> {log}
# hacky way to replace default header and cut off unaligned sequences because ragtag is stupid and doesnt provide these features
echo ">{wildcards.sample}" > {output.outdir}/main.scaffold.fasta
head -n 2 {output.outdir}/ragtag.scaffold.fasta | tail -n 1 >> {output.outdir}/main.scaffold.fasta 2>> {log}
""" """
\ No newline at end of file
rule mafft: rule mafft:
input: input:
assemblies = expand(os.path.join(config['output_directory'], "03-ASSEMBLY", "polish", "{sample}_polished.fasta"), sample = list(samples.index)) assemblies = expand(os.path.join(config['output_directory'], "03-ASSEMBLY", "ragtag", "{sample}", "main.scaffold.fasta"), sample = list(samples.index))
output: output:
msa = os.path.join(config['output_directory'], "04-PHYLOGENY", "msa.fasta"), msa = os.path.join(config['output_directory'], "04-PHYLOGENY", "msa.fasta"),
all = temp("all.fasta") all = temp("all.fasta")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment