Skip to content
Snippets Groups Projects
Commit 124329c4 authored by Philipp Harlos's avatar Philipp Harlos
Browse files

Bio_to_HiC workflow

parent 95ac5f47
No related branches found
No related tags found
No related merge requests found
import pandas as pd import pandas as pd
import os import os
import yaml
from collections import Counter
configfile: "config.yaml" configfile: "config.yaml"
samples = pd.read_table(config["samples"], index_col="sample")
samples = pd.read_table(config["samples"], dtype=str)
rule all: rule all:
input: input:
expand(config["arima_mapping"] + "final/{sample}.bam", sample=list(samples.index)), expand(config["salsa_scaffolding"] + "scaffolds/{sample}_scaffolds_FINAL.fasta", zip, sample=samples["sample"]),
expand(config["arima_mapping"] + "final/metric_{sample}.txt", sample=list(samples.index)) #expand(config["arima_mapping"] + "merge_bio_repl/{sample}.bam.stats", zip, sample=samples["sample"])
#config["results"] + "bionano"
include: "rules/arima_mapping.smk" include: "rules/arima_mapping.smk"
include: "rules/salsa.smk"
include: "rules/bionano.smk"
\ No newline at end of file
...@@ -2,11 +2,46 @@ samples: ./units.tsv ...@@ -2,11 +2,46 @@ samples: ./units.tsv
arima_mapping: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/intermed/arima_mapping/ arima_mapping: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/intermed/arima_mapping/
salsa_scaffolding: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/intermed/salsa_scaffolding/
results: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/results/
logs: ../logs/ logs: ../logs/
HIC_reads: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/data/arima_HiC/ HIC_reads: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/data/arima_HiC/
PacBio_assembly: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/data/assemblies/pac_bio_assembly.fna PacBio_assembly: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/data/assemblies/pac_bio_assembly.fna
index: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/data/assemblies/ raw_Filename_asse:
index: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/results/bionano/hybrid_scaffolds/
# --------------- Arima Mapping ---------------
# --------------- Bionano Scaffolding ---------------
# -------- Data --------
cmap: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/data/bionano/bCalAnn1_Saphyr_DLE1.cmap
rawFilename_cmap: bCalAnn1_Saphyr_DLE1
# -------- Scripts --------
hybridScript: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/workflow/scripts/bionano/tools/pipeline/1.0/HybridScaffold/1.0/hybridScaffold.pl
refAligner: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/workflow/scripts/bionano/tools/pipeline/1.0/RefAligner/1.0/RefAligner
# -------- Parameters --------
# DLE1: CTTAAG ;
enzyme_sites: "CTTAAG"
enzyme_xml: /buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/workflow/scripts/bionano/tools/pipeline/1.0/HybridScaffold/1.0/hybridScaffold_DLE1_config.xml
#conflict filter level genome maps: 1, 2, or 3
conflictLvlMap: 2
#conflict filter level sequences: 1, 2, or 3
conflictLvlSeq: 2
File added
channels:
- bioconda
- conda-forge
- anaconda
- bioconda-legacy
dependencies:
- pandas
- lxml = 4.9.1
- perl-threaded = 5.32.1
- gcc_linux-64 = 11.2.0
- expat = 2.5.0
- perl-app-cpanminus = 1.7044
channels:
- bioconda
- conda-forge
- anaconda
dependencies:
- bedtools = 2.30.0
- samtools = 1.15.1
channels:
- conda-forge
- anaconda
- bioconda
dependencies:
- salsa2 = 2.3.
...@@ -82,10 +82,10 @@ rule r2_filter5end: ...@@ -82,10 +82,10 @@ rule r2_filter5end:
rule samtools_faidx: rule samtools_faidx:
input: input:
#samtools_faidx soll erst nach bwa indexing starten da sie sonst beide auf die selbe input Datei zugreifen sollen #samtools_faidx soll erst nach bwa indexing starten da sie sonst beide auf die selbe input Datei zugreifen sollen
linker = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.sa" linker = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.sa",
reference = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta" reference = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta"
output: output:
config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fai" config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.fai"
#params: #params:
#IsFastq = "-f" #IsFastq = "-f"
conda: conda:
...@@ -99,8 +99,7 @@ rule pair_reads: ...@@ -99,8 +99,7 @@ rule pair_reads:
input: input:
read1 = config["arima_mapping"] + "filtered_bam/{sample}_{unit_bio}_{unit_tech}_R1.bam", read1 = config["arima_mapping"] + "filtered_bam/{sample}_{unit_bio}_{unit_tech}_R1.bam",
read2 = config["arima_mapping"] + "filtered_bam/{sample}_{unit_bio}_{unit_tech}_R2.bam", read2 = config["arima_mapping"] + "filtered_bam/{sample}_{unit_bio}_{unit_tech}_R2.bam",
faidx = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fai" faidx = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.fai"
output: output:
config["arima_mapping"] + "paired/{sample}_{unit_bio}_{unit_tech}.bam" config["arima_mapping"] + "paired/{sample}_{unit_bio}_{unit_tech}.bam"
params: params:
...@@ -123,7 +122,7 @@ rule add_read_groups: ...@@ -123,7 +122,7 @@ rule add_read_groups:
platform = "ILLUMINA", platform = "ILLUMINA",
sampleName = "{sample}", sampleName = "{sample}",
library = "{sample}", library = "{sample}",
platform_unit ="None" platform_unit ="None",
conda: conda:
"../envs/arima_mapping.yaml" "../envs/arima_mapping.yaml"
log: log:
...@@ -167,13 +166,14 @@ rule mark_duplicates: ...@@ -167,13 +166,14 @@ rule mark_duplicates:
output: output:
bam = config["arima_mapping"] + "final/{sample}_{unit_bio}.bam", bam = config["arima_mapping"] + "final/{sample}_{unit_bio}.bam",
metric = config["arima_mapping"] + "final/metric_{sample}_{unit_bio}.txt" metric = config["arima_mapping"] + "final/metric_{sample}_{unit_bio}.txt"
#params:
conda: conda:
"../envs/arima_mapping.yaml" "../envs/arima_mapping.yaml"
log: log:
config["logs"] + "arima_mapping/mark_duplicates/{sample}_{unit_bio}.log" config["logs"] + "arima_mapping/mark_duplicates/{sample}_{unit_bio}.log"
#wrapper:
#"v1.19.0/bio/picard/markduplicates"
shell: shell:
"picard MarkDuplicates I={input} O={output.bam} M={output.metric} 2> {log}" "picard MarkDuplicates -Xmx20g I={input} O={output.bam} M={output.metric} 2> {log}"
def bio_input(wildcards): def bio_input(wildcards):
......
sample fq1 fq2 sample unit_bio unit_tech fq1 fq2
bCalAnn1_A9_USPD16081032_HGCKKALXX_L3 "/buffer/ag_bsc/Theses/harlos_assembly/data/arima_HiC/bCalAnn1_A9_USPD16081032_HGCKKALXX_L3_R1.fastq.gz" "/buffer/ag_bsc/Theses/harlos_assembly/data/arima_HiC/bCalAnn1_A9_USPD16081032_HGCKKALXX_L3_R2.fastq.gz" bCalAnn1 1 1 "/buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/data/arima_HiC/bCalAnn1_R1.fastq.gz" "/buffer/ag_bsc/Theses/harlos_assembly/assembly_downstream/data/arima_HiC/bCalAnn1_R2.fastq.gz"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment