wd update

c0588869 · seehagec01 · f0f82661 · c0588869 · c0588869 · c0588869
Commit c0588869 authored 3 years ago by seehagec01
--- a/project/config/config.yaml
+++ b/project/config/config.yaml
 ---
 samples: "resources/sample/sample.tsv"
+human_reference: "resources/reference/human.fa"
 index: "results/reference/index"
 bowtieparams: "-q"
+kraken_db: "/buffer/ag_bsc/pmsb_workflows_2022/kraken_standard_db/standard_db"
+viral_reference_alignment: "resources/reference/SARS-CoV-2.fasta"
+shiver_config_file: "config/shiver_config.sh"
+viral_sequencing_adapters: "resources/adapters/SARS-CoV-2.fasta"
+viral_sequencing_primers: "resources/primers/SARS-CoV-2.fasta"
--- a/project/config/shiver_config.sh
+++ b/project/config/shiver_config.sh
+#!/usr/bin/env bash
+################################################################################
+# Note that for boolean variables, only the exact value "true" (all lower case)
+# will be interpreted as true, anything else is taken to mean false.
+# Two options only needed for the fully automatic version: the maximum allowed
+# percentage of gaps inside contigs when aligned to their closest reference (too
+# much gap content indicates misalignment, rather than deletions), and the
+# minimum fraction of a contig's length that blasts to HIV.
+MaxContigGappiness=0.05
+MinContigHitFrac=0.9
+# What do you have to type into the command line to make these commands execute?
+# (If the binary file lives in a directory that is not included in your $PATH
+# variable, you will need to include the path here.)
+BlastDBcommand='makeblastdb'
+BlastNcommand='blastn'
+smalt='smalt'
+bwa='bwa'
+bowtie2='bowtie2'
+bowtie2_build='bowtie2-build'
+samtools='samtools'
+mafft='mafft'
+fastaq='fastaq'
+# If you've downloaded the trimmomatic executable file (ending in .jar), to run
+# it you probably need to type something like this:
+# java -jar path/to/where/it/lives/trimmomatic-0.36.jar
+# If someone else installed it for you (e.g. on MRC CLIMB) there may be an alias
+# which means you just type 'trimmomatic' to run it:
+trimmomatic="trimmomatic"
+# If you leave 'GiveHXB2coords', below, as 'true', we'll do pairwise alignment
+# of the mapping reference with HXB2. You may as well use mafft options to make
+# it more accurate (though slower).
+MafftArgsForPairwise='--maxiterate 1000 --localpair'
+# Minimum contig length: contigs shorter than this will be discarded at the
+# start. In addition, when contigs are blasted against the existing reference 
+# set, we will only keep hits for which the length of the hit multipled by its 
+# identity to the reference is at least this length.
+MinContigLength=40
+# A contig will be split/cut if it has multiple blast hits. After alignment to
+# a set of references, it may be split again if it contains a large gap: this
+# parameter sets the gap size that will result in such a splitting...
+MinGapSizeToSplitGontig=20
+# ...and as such splitting occasionally results in cutting off a small bit of
+# a contig into a new separate contig that you might not want to bother keeping,
+# we have a second length threshold (which you could set to equal the
+# MinContigLength parameter above but by default we are more permissive).
+MinContigFragmentLength=10
+# Blast's "Word size for wordfinder algorithm" when blasting contigs against
+# references.
+BlastWordSize=17
+# If you have a more recent mafft installation that includes the --addfragments
+# option, we will use both --addfragments and --add to align the contigs to the
+# existing reference alignment, then automatically choose one to keep. There are
+# two available strategies for doing this: one is to use the alignment with the
+# shortest length ("MinAlnLength"), the other is to calculate the fractional gap
+# content of each contig after alignment, find the maximum over all contigs, and
+# use the alignment with the smaller maximum ("MinMaxGappiness").
+MafftTestingStrategy="MinAlnLength"
+# Shall we trim adapaters and low quality bases from reads, using trimmomatic?
+TrimReadsForAdaptersAndQual=false
+# The trimmomatic manual explains at length the parameters controlling read
+# trimming; the reader is referred to it for explanations of the following
+# variables and other options not used here:
+IlluminaClipParams='2:10:7:1:true'
+BaseQualityParams='MINLEN:50 LEADING:20 TRAILING:20 SLIDINGWINDOW:4:20'
+# How many threads Trimmomatic should use (it sometimes multithreads unless told
+# not to, which can be problematic on clusters).
+NumThreadsTrimmomatic=1
+# Shall we trim exact matches to PCR primers from the end of reads using fastaq?
+TrimReadsForPrimers=true
+# Shall we also trim matches to the PCR primers that differ by a single base
+# change?
+TrimPrimerWithOneSNP=false
+# Shall we clean (remove read pairs that look like contaminants)?
+CleanReads=true
+# Which mapper to use? "smalt", "bwa" or "bowtie"? You can ignore the options
+# for a mapper you're not using, and it doesn't need to be installed.
+mapper="smalt"
+# Check the smalt documentation for a full explanation of options,
+# including those not used by default here.
+# A summary of the index options used below:
+# -k sets the word (kmer) length, -s the sampling step size (i.e. is every word
+#  hashed, or every second word, or one word in every 3, ...), when a hash table
+# is made using the reference.
+# A summary of the mapping options used below:
+# -x means a read and its mate are mapped independently (not constraining them
+# to be close), -y sets the minimum fraction of identical nucleotides a read
+# must have to its reference before it is considered mapped, -j is the minimum 
+# insert size and -i the maximum insert size: outside of this range, the read
+# pair is still mapped, but flagged as improperly paired.
+smaltIndexOptions="-k 15 -s 3"
+smaltMapOptions="-x -y 0.7 -j 0 -i 2000"
+# Check the bowtie2 documentation for a full explanation of options,
+# including those not used by default here.
+# A summary of the options used below:
+# --local means bowtie might soft-clip read ends if doing so maximizes the
+# alignment score.
+# --maxins 2000 means the maximum allowed insert size is 2000
+# --no-discordant stops bowtie from looking for discordant alignments of mates
+# in a pair (incorrectly oriented or exceeding the specified maximum insert
+# size).
+# --no-unal keeps unaligned reads out of the output (see also shiver's
+# samtoolsReadFlags option below).
+# --quiet means "Print nothing besides alignments and serious errors".
+bowtieOptions="--local --maxins 2000 --no-discordant --no-unal --quiet"
+# Check the bwa mem documentation for a full explanation of options,
+# including those not used by default here.
+# A summary of the options used below:
+# -v 2 sets the verbosity to "warnings and errors" but not "normal messages".
+bwaOptions='-v 2'
+# After mapping, the choice of what kinds of reads should be kept is specified
+# with SAM format flags, whose documentation is here:
+# https://samtools.github.io/hts-specs/SAMv1.pdf
+# Flags are combined in a bitwise manner, which is fiddly. This page
+# https://broadinstitute.github.io/picard/explain-flags.html
+# gives a more user-friendly correspondance between SAM flags and kinds of
+# reads.
+# The flags used below mean unmapped reads are excluded (-F 4) and only properly
+# aligned pairs are kept (-f 3).
+samtoolsReadFlags='-f 3 -F 4'
+# See http://www.htslib.org/doc/samtools.html for a description of samtools
+# mpileup options. Those used below mean that the minimum of the base quality
+# the 'BAQ' quantity (see http://samtools.sourceforge.net/mpileup.shtml for an
+# explanation) must be at least 5, and only the first 1000000 reads mapped to
+# each point will be considered (NB a limit must be provided; the default is
+# 250).
+mpileupOptions='--min-BQ 5 --max-depth 1000000'
+# Parameters for calling the consensus base at each position:
+# The minimum coverage (number of reads) to call a base instead of a '?'
+MinCov1=15
+# The minimum coverage to use upper case for the base (to signal increased
+# confidence)
+MinCov2=30
+# The minimum fraction of reads at a position before we call that base (or those
+# bases, when one base alone does not reach that threshold fraction; e.g. say
+# you have 60% A, 30% C and 10% G: if you set this fraction to 0.6 or lower we
+# call an A, if you set it to 0.6-0.9 we call an M for "A or C", if you set it
+# to 0.9-1 we call a V for "A, C or G".). Alternatively, if you choose a
+# negative value, we always call the single most common base regardless of its
+# fraction, unless two or more bases are equally (most) common, then we call the
+# ambiguity code for those bases.
+MinBaseFrac=-1
+# Shall we remove read pairs marked as duplicates? i.e. using picard, for each
+# set of pairs sharing the same mapped coordinates (start & end of each mate),
+# keep only one pair and discard the rest? This can cause loss of diversity in
+# the reads due to true biological variation as well sequencing error. We
+# suggest you use this only if you understand duplication in your sequencing
+# data... 
+deduplicate=false
+# Desired command (note that MarkDuplicatesWithMateCigar exists, which may be
+# better, however it still seems to have beta status; also note that you can
+# include options in this command, such as a non-default
+# DUPLICATE_SCORING_STRATEGY, but do not include options relating to file-naming
+# or the associated shiver commands will break):
+DeduplicationCommand="picard MarkDuplicates"
+# Shall we remap to the consensus? (For remapping, gaps in coverage in the 
+# consensus will filled in by the corresponding part of the orginal reference,
+# and ambiguity codes will simplified to just one of the bases they represent.
+# Because of this, if remapping to the consensus, you are strongly advised to 
+# set the MinBaseFrac parameter above to any negative value.)
+remap=true
+# Shall we map contaminant reads to the reference (separately), to see which 
+# reads would have contaminanted our final bam file had they not been removed?
+MapContaminantReads=false
+# Shall we generate a version of the base frequencies file that also includes
+# HXB2 coordinates (by aligning the reference used for mapping to HXB2)? Useful
+# for HIV, clearly inappropriate for other viruses.
+# If you are using HXB2 as the reference for mapping (instead of a reference
+# constructed out of contigs as is normal for shiver), set this to false or
+# there will be a problem with two identically named sequences.
+GiveHXB2coords=false
+# Shall we align the contigs to the consensus, for comparison?
+AlignContigsToConsensus=false
+# Suffixes we'll append to the sample ID for output files.
+# If you change the extension, you may well break something.
+OutputRefSuffix='_ref.fasta'
+DeduplicationStatsSuffix='_DedupStats.txt'
+PreDeduplicationBamSuffix='_PreDedup'
+MappedContaminantReadsSuffix='_ContaminantReads'
+BaseFreqsSuffix='_BaseFreqs.csv'
+BaseFreqsWGlobalSuffix='_BaseFreqs_ForGlobalAln.csv'
+BaseFreqsWHXB2Suffix='_BaseFreqs_WithHXB2.csv'
+InsertSizeCountsSuffix='_InsertSizeCounts.csv'
+CoordsDictSuffix='_coords.csv'
+LongEnoughContigsSuffix='_contigs_NoShortOnes.fasta'
+BlastSuffix='.blast'
+CleanedReads1Suffix='_clean_1.fastq' # .gz will be added when they're zipped
+CleanedReads2Suffix='_clean_2.fastq' # .gz will be added when they're zipped
+GlobalAlnSuffix='_ForGlobalAln.fasta'
+BestContigToRefAlignmentSuffix='_ContigsAndBestRef.fasta' # only for fully auto.
+################################################################################
+# The names of temporary files we'll create in the working directory.
+# If you change the extension, you may well break something.
+RawContigFile1='temp_HIVcontigs_uncut1.fasta'
+RawContigFile2='temp_HIVcontigs_uncut2.fasta'
+CutContigFile='temp_HIVcontigs_cut.fasta'
+TempContigAlignment1='temp_HIVcontigs_wRefs_MafftAdd.fasta'
+TempContigAlignment2='temp_HIVcontigs_wRefs_MafftAddFrags.fasta'
+TempContigAlignment3='temp_HIVcontigs_wRefs_3.fasta'
+TempRefAlignment='temp_RefAlignment.fasta'
+GappyRefWithExtraSeq='temp_GappyRefWithExtraSeq.fasta'
+FlattenedContigs='temp_FlattenedContigs.fasta'
+AllContigsList='temp_AllContigsList.txt'
+HIVContigsListOrig='temp_HIVContigsListOrig.txt'
+HIVContigsListUser='temp_HIVContigsListUser.txt'
+ContaminantContigsList='temp_ContaminantContigsList.txt'
+RefAndContaminantContigs='temp_RefAndContaminantContigs.fasta' # no whitespace!
+BlastDB='temp_BlastDB' # no whitespace!
+BadReadsBaseName='temp_ContaminantReads'
+smaltIndex='temp_smaltRefIndex'
+bowtieIndex='temp_bowtieRefIndex'
+AllMappedContaminantReads='temp_ContaminantReads_IncUnmapped.sam'
+RefFromAlignment='temp_RefFromAlignment.fasta'
+AllSeqsInAln='temp_AllSeqsInAln.txt'
+reads1asFasta='temp_reads1.fasta'
+reads2asFasta='temp_reads2.fasta'
+reads1blast1='temp_reads1_1.blast'
+reads2blast1='temp_reads2_1.blast'
+reads1blast2='temp_reads1_2.blast'
+reads2blast2='temp_reads2_2.blast'
+reads1sorted='temp_1_sorted.fastq'
+reads2sorted='temp_2_sorted.fastq'
+MapOutAsSam='temp_MapOut.sam'
+MapOutConversion1='temp_MapOutStep1'
+MapOutConversion2='temp_MapOutStep2'
+MapOutConversion3='temp_MapOutStep3'
+InsertSizes1='temp_InsertSizes.txt'
+InsertSizes2='temp_InsertSizes2.txt'
+PileupFile='temp_MapOut.pileup'
+RefWithGaps='temp_RefWithGaps.fasta'
+reads1trim1='temp_reads1trim1.fastq'
+reads1trim2='temp_reads1trim2.fastq'
+reads2trim1='temp_reads2trim1.fastq'
+reads2trim2='temp_reads2trim2.fastq'
+reads1trimmings='temp_trimmings1.fastq'
+reads2trimmings='temp_trimmings2.fastq'
+AlignmentForTesting='temp_test.fasta'
+ContigsWith1ref='temp_ContigsWith1ref.fasta'
+RefMatchLog='temp_RefMatchLog.txt'
+ContigAlignmentsToRefsDir='temp_ContigAlignmentsToRefsDir' # no whitespace!
+SamtoolsSortFile='temp_SamtoolsSortFile'
+RefWHXB2unaln='temp_RefWHXB2unaln.fasta'
+RefWHXB2aln='temp_RefWHXB2aln.fasta'
\ No newline at end of file
--- a/project/workflow/envs/shiver.yaml
+++ b/project/workflow/envs/shiver.yaml
+name: shiver
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - shiver=1.3.5
+  - bc
+  - seqkit=0.14.0
\ No newline at end of file
--- a/project/workflow/envs/yourenv.yaml
+++ b/project/workflow/envs/yourenv.yaml
@@ -20,4 +20,5 @@ dependencies:
  - kmc=3.2.1
  - Trimmomatic=0.32
  - kraken2=2.1.2
+  - fastqc=0.11.9
 prefix: /buffer/ag_bsc/pmsb_workflows_2022/virus/anna-swp-workflows/project/workflow/envs/yourenv.yaml
--- a/project/workflow/rules/IVA.smk
+++ b/project/workflow/rules/IVA.smk
 rule IVA:
    input:
-        "results/unmapped_fastq/{sample}_1_unmapped.fastq",
+        fq1="results/unmapped_fastq/{sample}_1_unmapped.fastq",
-        "results/unmapped_fastq/{sample}_2_unmapped.fastq"
+        fq2="results/unmapped_fastq/{sample}_2_unmapped.fastq"
    output:
        directory("results/IVA/{sample}/")
    log:
@@ -10,6 +10,6 @@ rule IVA:
    conda:
       "../envs/yourenv.yaml"
    shell:
-        "iva -f {input[0]} -r {input[1]} {output} --threads {threads} 2> {log}"
+        "iva -f {input.fq1} -r {input.fq2} {output} --threads {threads} 2> {log}"
\ No newline at end of file