diff --git a/workflow/Snakefile b/workflow/Snakefile index 7b437cb9fbec4d38b124b1479e0a2501205bdf3a..9d012809dac048e1e3d841a38c1f3d9152155182 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -11,10 +11,11 @@ samples = pd.read_table(config["samples"], dtype=str) rule all: input: - expand(config["salsa_scaffolding"] + "scaffolds/{sample}_scaffolds_FINAL.fasta", zip, sample=samples["sample"]), - #expand(config["arima_mapping"] + "merge_bio_repl/{sample}.bam.stats", zip, sample=samples["sample"]) - #config["results"] + "bionano" + #expand(config["salsa_scaffolding"] + "scaffolds/scaffolds_FINAL.fasta", zip, sample=samples["sample"]), + #config["results"] + "quality/quast" + config["results"] + "bionano" include: "rules/arima_mapping.smk" include: "rules/salsa.smk" -include: "rules/bionano.smk" \ No newline at end of file +include: "rules/bionano.smk" +include: "rules/quality.smk" \ No newline at end of file diff --git a/workflow/rules/arima_mapping.smk b/workflow/rules/arima_mapping.smk index a026340d310c17fae6d41cb5abc442d491143b73..c271fe2d317691f8950ebc81430605cf6113beb8 100644 --- a/workflow/rules/arima_mapping.smk +++ b/workflow/rules/arima_mapping.smk @@ -1,13 +1,12 @@ rule bwa_indexing: input: - #config["PacBio_assembly"], - config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta" + assembly = config["PacBio_assembly"], output: - output1 = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.amb", - output2 = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.ann", - output3 = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.bwt", - output4 = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.pac", - output5 = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.sa" + output1 = config["results"] + "bionano/hybrid_scaffolds/pac_bio_assembly.fna.amb", + output2 = config["results"] + "bionano/hybrid_scaffolds/pac_bio_assembly.fna.ann", + output3 = config["results"] + "bionano/hybrid_scaffolds/pac_bio_assembly.fna.bwt", + output4 = config["results"] + "bionano/hybrid_scaffolds/pac_bio_assembly.fna.pac", + output5 = config["results"] + "bionano/hybrid_scaffolds/pac_bio_assembly.fna.sa" params: algorithm = "bwtsw", #basename = "bwa_index" @@ -16,13 +15,13 @@ rule bwa_indexing: log: config["logs"] + "arima_mapping/bwa_indexing.log" shell: - "bwa index -a {params.algorithm} {input} 2> {log}" + "bwa index -a {params.algorithm} {input.assembly} 2> {log}" rule r1_mapping: input: read1 = lambda wc: samples[(samples["sample"] == wc.sample) & (samples["unit_bio"] == wc.unit_bio) & (samples["unit_tech"] == wc.unit_tech)].fq1, - ref = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta", - linker = config["index"] + "bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.sa" + ref = config["PacBio_assembly"], + linker = config["PacBio_assembly"] + ".sa" output: config["arima_mapping"] + "unprocessed_bam/{sample}_{unit_bio}_{unit_tech}_R1.bam" params: @@ -39,8 +38,8 @@ rule r1_mapping: rule r2_mapping: input: read2 = lambda wildcards: samples[(samples["sample"] == wildcards.sample) & (samples["unit_bio"] == wildcards.unit_bio) & (samples["unit_tech"] == wildcards.unit_tech)].fq2, - ref = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta", - linker = config["index"] + "bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.amb" + ref = config["PacBio_assembly"], + linker = config["PacBio_assembly"] + ".sa" output: config["arima_mapping"] + "unprocessed_bam/{sample}_{unit_bio}_{unit_tech}_R2.bam" params: @@ -82,10 +81,10 @@ rule r2_filter5end: rule samtools_faidx: input: #samtools_faidx soll erst nach bwa indexing starten da sie sonst beide auf die selbe input Datei zugreifen sollen - linker = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.sa", - reference = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta" + linker = config["PacBio_assembly"] + ".sa", + reference = config["PacBio_assembly"], output: - config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.fai" + config["PacBio_assembly"] + ".fai" #params: #IsFastq = "-f" conda: @@ -99,7 +98,7 @@ rule pair_reads: input: read1 = config["arima_mapping"] + "filtered_bam/{sample}_{unit_bio}_{unit_tech}_R1.bam", read2 = config["arima_mapping"] + "filtered_bam/{sample}_{unit_bio}_{unit_tech}_R2.bam", - faidx = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta.fai" + faidx = config["PacBio_assembly"] + ".fai" output: config["arima_mapping"] + "paired/{sample}_{unit_bio}_{unit_tech}.bam" params: diff --git a/workflow/rules/bionano.smk b/workflow/rules/bionano.smk index dc4b264f546525cdd998a0cc49ea9820e8056e24..337ed8d472ae8dbf661a7f5e8ddb77c77fc805fb 100644 --- a/workflow/rules/bionano.smk +++ b/workflow/rules/bionano.smk @@ -1,9 +1,9 @@ rule hybrid_scaffolding: input: - seq = config["PacBio_assembly"], + seq = config["salsa_scaffolding"] + "scaffolds/scaffolds_FINAL.fasta", cmap = config["cmap"] output: - directory(config["results"] + "bionano") + dir = directory(config["results"] + "bionano") # conda: #"../envs/bionano.yaml" params: @@ -16,4 +16,4 @@ rule hybrid_scaffolding: log: config["logs"] + "bionano/hybrid_scaffolding.log" shell: - "perl {params.script1} -n {input.seq} -b {input.cmap} -u {params.enzyme} -c {params.xml} -r {params.script2} -o {output} -B {params.con1} -N {params.con2} 2> {log}" \ No newline at end of file + "perl {params.script1} -n {input.seq} -b {input.cmap} -u {params.enzyme} -c {params.xml} -r {params.script2} -o {output.dir} -B {params.con1} -N {params.con2} 2> {log}" \ No newline at end of file diff --git a/workflow/rules/quality.smk b/workflow/rules/quality.smk new file mode 100644 index 0000000000000000000000000000000000000000..4b4dd18c5c0950b0a825b0098b189762b0eacd80 --- /dev/null +++ b/workflow/rules/quality.smk @@ -0,0 +1,20 @@ +rule quast: + input: + reference = config["VGP_Ref"], + assembly = config["salsa_scaffolding"] + "scaffolds/scaffolds/scaffolds_FINAL.fasta" + output: + directory(config["results"] + "quality/quast") + params: + cell_type = "--eukaryote", + genome_size = "--large", # genome > 100Mbp + k_mer_stats = "--k-mer-stats", #By default 101bp increade if high level of heterozygosity. + fragmented = "--fragmented" + threads: + 20 + conda: + "../envs/quality.yaml" + log: + config["logs"] + "quality/quast.log" + shell: + "quast {input.assembly} -r {input.reference} -o {output} --threads {threads} {params.cell_type} {params.genome_size} {params.k_mer_stats} {params.fragmented} 2> {log}" + \ No newline at end of file diff --git a/workflow/rules/salsa.smk b/workflow/rules/salsa.smk index 99b85eb758ebbd4ed21dc6168f03d340b643c103..8b0481e48312499f62ebb3b4e1c64c8d850f8f2b 100644 --- a/workflow/rules/salsa.smk +++ b/workflow/rules/salsa.smk @@ -24,7 +24,7 @@ rule sort_bed: rule generate_contig_lengths: input: - config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta" + config["PacBio_assembly"] output: config["salsa_scaffolding"] + "contig_lengths/contigs.fasta.fai" conda: @@ -37,24 +37,25 @@ rule generate_contig_lengths: rule salsa_scaffolding: input: - assembly = config["results"] + "bionano/hybrid_scaffolds/bCalAnn1_Saphyr_DLE1_bppAdjust_cmap_pac_bio_assembly_fna_NGScontigs_HYBRID_SCAFFOLD.fasta", + assembly = config["PacBio_assembly"], lengths = config["salsa_scaffolding"] + "contig_lengths/contigs.fasta.fai", - alignment = config["salsa_scaffolding"] + "sorted_bed/{sample}.bed" + alignment = expand(config["salsa_scaffolding"] + "sorted_bed/{sample}.bed", zip, sample=samples["sample"]) output: - directory(config["salsa_scaffolding"] + "scaffolds/{sample}_scaffolds_FINAL.fasta") + config["salsa_scaffolding"] + "scaffolds/scaffolds_FINAL.fasta" params: enzymes = config["enzyme_sites"], iterations = 5, #look_for_missassemblies m = "yes", #output_for_each_iteration - p = "yes" + p = "yes", + outdir = config["salsa_scaffolding"] + "scaffolds" conda: "../envs/salsa2.yaml" log: - config["logs"] + "scaffolding/{sample}.log" + config["logs"] + "scaffolding.log" shell: - "run_pipeline.py -a {input.assembly} -l {input.lengths} -b {input.alignment} -e {params.enzymes} -m {params.m} -p {params.p} -i {params.iterations} -o {output}" + "run_pipeline.py -a {input.assembly} -l {input.lengths} -b {input.alignment} -e {params.enzymes} -m {params.m} -p {params.p} -i {params.iterations} -o {params.outdir}"