Skip to content
Snippets Groups Projects
Commit d28837db authored by Philipp Harlos's avatar Philipp Harlos
Browse files

HiC_to_Bio workflow

parent 124329c4
No related branches found
No related tags found
No related merge requests found
...@@ -11,10 +11,11 @@ samples = pd.read_table(config["samples"], dtype=str) ...@@ -11,10 +11,11 @@ samples = pd.read_table(config["samples"], dtype=str)
rule all: rule all:
input: input:
expand(config["salsa_scaffolding"] + "scaffolds/{sample}_scaffolds_FINAL.fasta", zip, sample=samples["sample"]), #expand(config["salsa_scaffolding"] + "scaffolds/scaffolds_FINAL.fasta", zip, sample=samples["sample"]),
#expand(config["arima_mapping"] + "merge_bio_repl/{sample}.bam.stats", zip, sample=samples["sample"]) #config["results"] + "quality/quast"
#config["results"] + "bionano" config["results"] + "bionano"
include: "rules/arima_mapping.smk" include: "rules/arima_mapping.smk"
include: "rules/salsa.smk" include: "rules/salsa.smk"
include: "rules/bionano.smk" include: "rules/bionano.smk"
\ No newline at end of file include: "rules/quality.smk"
\ No newline at end of file
rule bwa_indexing: rule bwa_indexing:
input: input:
#config["PacBio_assembly"], assembly = config["PacBio_assembly"],
config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta"
output: output:
output1 = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.amb", output1 = config["results"] + "bionano/hybrid_scaffolds/pac_bio_assembly.fna.amb",
output2 = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.ann", output2 = config["results"] + "bionano/hybrid_scaffolds/pac_bio_assembly.fna.ann",
output3 = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.bwt", output3 = config["results"] + "bionano/hybrid_scaffolds/pac_bio_assembly.fna.bwt",
output4 = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.pac", output4 = config["results"] + "bionano/hybrid_scaffolds/pac_bio_assembly.fna.pac",
output5 = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.sa" output5 = config["results"] + "bionano/hybrid_scaffolds/pac_bio_assembly.fna.sa"
params: params:
algorithm = "bwtsw", algorithm = "bwtsw",
#basename = "bwa_index" #basename = "bwa_index"
...@@ -16,13 +15,13 @@ rule bwa_indexing: ...@@ -16,13 +15,13 @@ rule bwa_indexing:
log: log:
config["logs"] + "arima_mapping/bwa_indexing.log" config["logs"] + "arima_mapping/bwa_indexing.log"
shell: shell:
"bwa index -a {params.algorithm} {input} 2> {log}" "bwa index -a {params.algorithm} {input.assembly} 2> {log}"
rule r1_mapping: rule r1_mapping:
input: input:
read1 = lambda wc: samples[(samples["sample"] == wc.sample) & (samples["unit_bio"] == wc.unit_bio) & (samples["unit_tech"] == wc.unit_tech)].fq1, read1 = lambda wc: samples[(samples["sample"] == wc.sample) & (samples["unit_bio"] == wc.unit_bio) & (samples["unit_tech"] == wc.unit_tech)].fq1,
ref = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta", ref = config["PacBio_assembly"],
linker = config["index"] + "bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.sa" linker = config["PacBio_assembly"] + ".sa"
output: output:
config["arima_mapping"] + "unprocessed_bam/{sample}_{unit_bio}_{unit_tech}_R1.bam" config["arima_mapping"] + "unprocessed_bam/{sample}_{unit_bio}_{unit_tech}_R1.bam"
params: params:
...@@ -39,8 +38,8 @@ rule r1_mapping: ...@@ -39,8 +38,8 @@ rule r1_mapping:
rule r2_mapping: rule r2_mapping:
input: input:
read2 = lambda wildcards: samples[(samples["sample"] == wildcards.sample) & (samples["unit_bio"] == wildcards.unit_bio) & (samples["unit_tech"] == wildcards.unit_tech)].fq2, read2 = lambda wildcards: samples[(samples["sample"] == wildcards.sample) & (samples["unit_bio"] == wildcards.unit_bio) & (samples["unit_tech"] == wildcards.unit_tech)].fq2,
ref = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta", ref = config["PacBio_assembly"],
linker = config["index"] + "bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.amb" linker = config["PacBio_assembly"] + ".sa"
output: output:
config["arima_mapping"] + "unprocessed_bam/{sample}_{unit_bio}_{unit_tech}_R2.bam" config["arima_mapping"] + "unprocessed_bam/{sample}_{unit_bio}_{unit_tech}_R2.bam"
params: params:
...@@ -82,10 +81,10 @@ rule r2_filter5end: ...@@ -82,10 +81,10 @@ rule r2_filter5end:
rule samtools_faidx: rule samtools_faidx:
input: input:
#samtools_faidx soll erst nach bwa indexing starten da sie sonst beide auf die selbe input Datei zugreifen sollen #samtools_faidx soll erst nach bwa indexing starten da sie sonst beide auf die selbe input Datei zugreifen sollen
linker = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.sa", linker = config["PacBio_assembly"] + ".sa",
reference = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta" reference = config["PacBio_assembly"],
output: output:
config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.fai" config["PacBio_assembly"] + ".fai"
#params: #params:
#IsFastq = "-f" #IsFastq = "-f"
conda: conda:
...@@ -99,7 +98,7 @@ rule pair_reads: ...@@ -99,7 +98,7 @@ rule pair_reads:
input: input:
read1 = config["arima_mapping"] + "filtered_bam/{sample}_{unit_bio}_{unit_tech}_R1.bam", read1 = config["arima_mapping"] + "filtered_bam/{sample}_{unit_bio}_{unit_tech}_R1.bam",
read2 = config["arima_mapping"] + "filtered_bam/{sample}_{unit_bio}_{unit_tech}_R2.bam", read2 = config["arima_mapping"] + "filtered_bam/{sample}_{unit_bio}_{unit_tech}_R2.bam",
faidx = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.fai" faidx = config["PacBio_assembly"] + ".fai"
output: output:
config["arima_mapping"] + "paired/{sample}_{unit_bio}_{unit_tech}.bam" config["arima_mapping"] + "paired/{sample}_{unit_bio}_{unit_tech}.bam"
params: params:
......
rule hybrid_scaffolding: rule hybrid_scaffolding:
input: input:
seq = config["PacBio_assembly"], seq = config["salsa_scaffolding"] + "scaffolds/scaffolds_FINAL.fasta",
cmap = config["cmap"] cmap = config["cmap"]
output: output:
directory(config["results"] + "bionano") dir = directory(config["results"] + "bionano")
# conda: # conda:
#"../envs/bionano.yaml" #"../envs/bionano.yaml"
params: params:
...@@ -16,4 +16,4 @@ rule hybrid_scaffolding: ...@@ -16,4 +16,4 @@ rule hybrid_scaffolding:
log: log:
config["logs"] + "bionano/hybrid_scaffolding.log" config["logs"] + "bionano/hybrid_scaffolding.log"
shell: shell:
"perl {params.script1} -n {input.seq} -b {input.cmap} -u {params.enzyme} -c {params.xml} -r {params.script2} -o {output} -B {params.con1} -N {params.con2} 2> {log}" "perl {params.script1} -n {input.seq} -b {input.cmap} -u {params.enzyme} -c {params.xml} -r {params.script2} -o {output.dir} -B {params.con1} -N {params.con2} 2> {log}"
\ No newline at end of file \ No newline at end of file
rule quast:
input:
reference = config["VGP_Ref"],
assembly = config["salsa_scaffolding"] + "scaffolds/scaffolds/scaffolds_FINAL.fasta"
output:
directory(config["results"] + "quality/quast")
params:
cell_type = "--eukaryote",
genome_size = "--large", # genome > 100Mbp
k_mer_stats = "--k-mer-stats", #By default 101bp increade if high level of heterozygosity.
fragmented = "--fragmented"
threads:
20
conda:
"../envs/quality.yaml"
log:
config["logs"] + "quality/quast.log"
shell:
"quast {input.assembly} -r {input.reference} -o {output} --threads {threads} {params.cell_type} {params.genome_size} {params.k_mer_stats} {params.fragmented} 2> {log}"
\ No newline at end of file
...@@ -24,7 +24,7 @@ rule sort_bed: ...@@ -24,7 +24,7 @@ rule sort_bed:
rule generate_contig_lengths: rule generate_contig_lengths:
input: input:
config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta" config["PacBio_assembly"]
output: output:
config["salsa_scaffolding"] + "contig_lengths/contigs.fasta.fai" config["salsa_scaffolding"] + "contig_lengths/contigs.fasta.fai"
conda: conda:
...@@ -37,24 +37,25 @@ rule generate_contig_lengths: ...@@ -37,24 +37,25 @@ rule generate_contig_lengths:
rule salsa_scaffolding: rule salsa_scaffolding:
input: input:
assembly = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta", assembly = config["PacBio_assembly"],
lengths = config["salsa_scaffolding"] + "contig_lengths/contigs.fasta.fai", lengths = config["salsa_scaffolding"] + "contig_lengths/contigs.fasta.fai",
alignment = config["salsa_scaffolding"] + "sorted_bed/{sample}.bed" alignment = expand(config["salsa_scaffolding"] + "sorted_bed/{sample}.bed", zip, sample=samples["sample"])
output: output:
directory(config["salsa_scaffolding"] + "scaffolds/{sample}_scaffolds_FINAL.fasta") config["salsa_scaffolding"] + "scaffolds/scaffolds_FINAL.fasta"
params: params:
enzymes = config["enzyme_sites"], enzymes = config["enzyme_sites"],
iterations = 5, iterations = 5,
#look_for_missassemblies #look_for_missassemblies
m = "yes", m = "yes",
#output_for_each_iteration #output_for_each_iteration
p = "yes" p = "yes",
outdir = config["salsa_scaffolding"] + "scaffolds"
conda: conda:
"../envs/salsa2.yaml" "../envs/salsa2.yaml"
log: log:
config["logs"] + "scaffolding/{sample}.log" config["logs"] + "scaffolding.log"
shell: shell:
"run_pipeline.py -a {input.assembly} -l {input.lengths} -b {input.alignment} -e {params.enzymes} -m {params.m} -p {params.p} -i {params.iterations} -o {output}" "run_pipeline.py -a {input.assembly} -l {input.lengths} -b {input.alignment} -e {params.enzymes} -m {params.m} -p {params.p} -i {params.iterations} -o {params.outdir}"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment