to get running on HPC

17e047db · james94 · b4759e13 · 17e047db · 17e047db · 17e047db
Commit 17e047db authored 4 years ago by james94
--- a/envs/genomescope.yaml
+++ b/envs/genomescope.yaml
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  - genomescope2=2.0
+  - python=3.9.2
--- a/envs/merylMerq.yaml
+++ b/envs/merylMerq.yaml
 channels:
-  - bioconda
  - conda-forge
-  - r
+  - bioconda
 dependencies:
  - igvtools
  - openjdk=11.0.8
-  - r=3
+  - r-base=3.6.3
  - r-ggplot2
  - r-argparse
  - r-scales
-  - genomescope2=2.0
  - samtools
  - bedtools
-  - python=3.9.2
--- a/programs/merqury-1.1/LICENSE
+++ b/programs/merqury-1.1/LICENSE
+
+  PUBLIC DOMAIN NOTICE
+
+  This software is "United States Government Work" under the terms of the United
+  States Copyright Act. It was written as part of the authors' official duties
+  for the United States Government and thus cannot be copyrighted. This software
+  is freely available to the public for use without a copyright
+  notice. Restrictions cannot be placed on its present or future use.
+
+  Although all reasonable efforts have been taken to ensure the accuracy and
+  reliability of the software and associated data, the National Human Genome
+  Research Institute (NHGRI), National Institutes of Health (NIH) and the
+  U.S. Government do not and cannot warrant the performance or results that may
+  be obtained by using this software or data. NHGRI, NIH and the U.S. Government
+  disclaim all warranties as to performance, merchantability or fitness for any
+  particular purpose.
+
+  Please cite the authors in any work or product based on this material.
+
--- a/programs/merqury-1.1/README.md
+++ b/programs/merqury-1.1/README.md
+# Merqury
+
+## Evaluate genome assemblies with k-mers and more
+
+Often, genome assembly projects have illumina whole genome sequencing reads available for the assembled individual.
+The k-mer spectrum of this read set can be used for independently evaluating assembly quality without the need of a high quality reference.
+Merqury provides a set of tools for this purpose.
+
+## Dependency
+* gcc 4.8 or higher
+* meryl
+* Java run time environment (JRE)
+* R with argparse, ggplot2, and scales (tested on R 3.6.1)
+* bedtools
+* samtools
+* igvtools
+
+## Installation
+
+### Get a working meryl in your PATH
+Download meryl release: https://github.com/marbl/meryl/releases/tag/v1.0
+
+If the binary doesn't work, download the source and compile:
+```shell
+cd meryl/src
+make -j 24
+export PATH=/path/to/meryl/…/bin:$PATH
+```
+See if we get help message for `meryl`.
+
+### Add a path variable MERQURY
+```shell
+git clone https://github.com/marbl/merqury.git
+cd merqury
+export MERQURY=$PWD
+```
+Add the “export” part to your environment (~/.bash_profile or ~/.profile).
+Add installation dir paths for `bedtools`, `samtools` and `igvtools` to your enviroenment.
+`source` it.
+
+
+## Run
+
+* !! Merqury assumes all meryl dbs (dirs) are named with `.meryl`. !!
+
+On a single machine:
+```
+ln -s $MERQURY/merqury.sh		# Link merqury
+./merqury.sh <read-db.meryl> [<mat.meryl> <pat.meryl>] <asm1.fasta> [asm2.fasta] <out>
+
+Usage: merqury.sh <read-db.meryl> [<mat.meryl> <pat.meryl>] <asm1.fasta> [asm2.fasta] <out>
+	<read-db.meryl>	: k-mer counts of the read set
+	<mat.meryl>		: k-mer counts of the maternal haplotype (ex. mat.only.meryl or mat.hapmer.meryl)
+	<pat.meryl>		: k-mer counts of the paternal haplotype (ex. pat.only.meryl or pat.hapmer.meryl)
+	<asm1.fasta>	: Assembly fasta file (ex. pri.fasta, hap1.fasta or maternal.fasta)
+	[asm2.fasta]	: Additional fasta file (ex. alt.fasta, hap2.fasta or paternal.fasta)
+	*asm1.meryl and asm2.meryl will be generated. Avoid using the same names as the hap-mer dbs
+	<out>		: Output prefix
+```
+`< >` : required  
+`[ ]` : optional
+
+## Example
+
+Below is showing examples how to run Merqury using the prebuilt meryl dbs on a. thaliana F1 hybrid.
+The fasta files are the trio-binned assemblies from [Koren et al](https://doi.org/10.1038/nbt.4277).
+
+```shell
+### Download assemblies ###
+wget https://gembox.cbcb.umd.edu/triobinning/athal_COL.fasta
+wget https://gembox.cbcb.umd.edu/triobinning/athal_CVI.fasta
+
+### Download prebuilt meryl dbs ###
+# read.meryl of the F1 hybrid between COL-0 and CVI-0
+wget https://obj.umiacs.umd.edu/marbl_publications/merqury/athal/a_thal.k18.meryl.tar.gz
+# hap-mers for COL-0 haplotype
+wget https://obj.umiacs.umd.edu/marbl_publications/merqury/athal/a_thal.col0.hapmer.meryl.tar.gz
+# hap-mers for CVI-0 haplotype
+wget https://obj.umiacs.umd.edu/marbl_publications/merqury/athal/a_thal.cvi0.hapmer.meryl.tar.gz
+
+# Untar
+for gz in *.tar.gz
+do
+    tar -zxf $gz
+done
+
+# Run merqury
+$MERQURY/merqury.sh F1.k18.meryl col0.hapmer.meryl cvi0.hapmer.meryl athal_COL.fasta athal_CVI.fasta test
+```
+
+
+### 1. I have one assembly (pseudo-haplotype or mixed-haplotype)
+```shell
+# I don't have the hap-mers
+$MERQURY/merqury.sh read-db.meryl asm1.fasta out_prefix
+# Using the example above
+$MERQURY/merqury.sh F1.k18.meryl athal_COL.fasta test-1
+
+# I have the hap-mers
+$MERQURY/merqury.sh read-db.meryl mat.meryl pat.meryl asm1.fasta out_prefix
+# Using the example above
+$MERQURY/merqury.sh F1.k18.meryl col0.hapmer.meryl cvi0.hapmer.meryl athal_COL.fasta test-1
+```
+
+### 2. I have two assemblies (diploid)
+```shell
+# I don't have the hap-mers
+$MERQURY/merqury.sh read-db.meryl asm1.fasta asm2.fasta out_prefix
+# Using the example above
+$MERQURY/merqury.sh F1.k18.meryl athal_COL.fasta athal_CVI.fasta test-2
+
+# I have the hap-mers
+$MERQURY/merqury.sh read-db.meryl mat.meryl pat.meryl asm1.fasta asm2.fasta out_prefix
+# Using the example above
+$MERQURY/merqury.sh F1.k18.meryl col0.hapmer.meryl cvi0.hapmer.meryl athal_COL.fasta athal_CVI.fasta test-2
+```
+
+* Note there is no need to run merqury per-assemblies. Give two fasta files, Merqury generates stats for each and combined.
+
+
+
+### How to parallelize
+Merqury starts with `eval/spectra_cn.sh`.
+When hap-mers are provided, merqury runs modules under `trio/` in addition to `eval/spectra_cn.sh`.
+
+
+The following can run at the same time. Modules with dependency are followed by arrows (->).
+* `eval/spectra_cn.sh` -> `trio/spectra_hap.sh`
+* `trio/hap_blob.sh`
+* `trio/phase_block.sh` per assembly -> `trio/block_n_stats.sh`
+
+
+Meryl, the k-mer counter inside, uses the maximum cpus available.
+Set `OMP_NUM_THREADS=24` for example to use 24 threads.
+
+On slurm environment, simply run:
+```
+ln -s $MERQURY/_submit_merqury.sh	# Link merqury
+./_submit_merqury.sh <read-db.meryl> [<mat.meryl> <pat.meryl>] <asm1.fasta> [asm2.fasta] <out>
+```
+Change the `sbatch` to match your environment. (ex. partition)
+
+
+## Outputs from each modules
+* `eval/spectra_cn.sh`: k-mer completeness, qv, spectra-cn and spectra-asm plots, asm-only `.bed` and `.tdf` for tracking errors
+* `eval/qv.sh`: just get the qv stats and quit.
+* `trio/spectra_hap.sh`: hap-mer level spectra-cn plots, hap-mer completeness
+* `trio/hap_blob.sh`: blob plots of the hap-mers in each contg/scaffold
+* `trio/phase_block.sh`: phase block statistics, phase block N* plots, hap-mer tracks (`.bed` and `.tdf` files)
+* `trio/block_n_stats.sh`: continuity plots (phase block N* or NG* plots, phase block vs. contig/scaffold plots)
+* `trio/switch_error.sh`: this is run part of `phase_blck.sh`, however can be re-run with desired short-range switch parameters. Run `trio/block_n_stats.sh` along with it to get the associated plots.
+
+## Tips for helps
+Run each script without any parameters if not sure what to do.
+For example, `./trio/switch_error.sh` will give a help message and quit.
+
+Following wiki pages have more detailed examples.
+
+## 1. Prepare meryl dbs ([details](https://github.com/marbl/merqury/wiki/1.-Prepare-meryl-dbs))
+1. Get the right k size
+2. Build k-mer dbs with meryl
+3. Build hap-mers for trios
+
+## 2. Overall assembly evaluation ([details](https://github.com/marbl/merqury/wiki/2.-Overall-k-mer-evaluation))
+1. Reference free QV estimate
+2. k-mer completeness (recovery rate)
+3. Spectra copy number analysis
+4. Track error bases in the assembly
+
+## 3. Phasing assessment with hap-mers ([details](https://github.com/marbl/merqury/wiki/3.-Phasing-assessment-with-hap-mers))
+1. Inherited hap-mer plots
+2. Hap-mer blob plots
+3. Hap-mer completeness (recovery rate)
+4. Spectra copy number analysis per hap-mers
+5. Phased block statistics and switch error rates
+6. Track each haplotype block in the assembly
+
+## Available pre-built meryl dbs
+Meryl dbs from Illumina WGS and hapmers are available [here](https://obj.umiacs.umd.edu/marbl_publications/merqury/index.html) for
+* A. thaliana COL-0 x CVI-0 F1
+* NA12878 (HG001)
+* HG002
+
+## Citing merqury
+
+Please use the following [preprint](https://www.biorxiv.org/content/10.1101/2020.03.15.992941v1) to cite Merqury:
+
+Arang Rhie, Brian P. Walenz, Sergey Koren, Adam M. Phillippy, Merqury: reference-free quality and phasing assessment for genome assemblies, bioRxiv (2020). doi: https://doi.org/10.1101/2020.03.15.992941
+
+
+
--- a/programs/merqury_June2020/_submit_build.sh
+++ b/programs/merqury_June2020/_submit_build.sh
@@ -45,11 +45,11 @@ for i in $(seq 1 $LEN)
 do
 	fq=`sed -n ${i}p $input_fofn`
 	GB=`du -k $fq  | awk '{printf "%.0f", $1/1024/1024}'`
-	if [[ $GB -lt 12 ]]; then
-	    echo "$fq is $GB, less than 12GB. Skip splitting."
+	if [[ $GB -lt 15 ]]; then
+	    echo "$fq is $GB, less than 15GB. Skip splitting."
 	    echo $fq >> $input_fofn.$i
 	else
-	    echo "$fq is $GB, over 12GB. Will split and run meryl in parallel."
+	    echo "$fq is $GB, over 15GB. Will split and run meryl in parallel."
 	    echo "Split files will be in $input_fofn.$i"
 	    args="$input_fofn"
 	    split_arrs="$split_arrs$i,"

--- a/programs/merqury_June2020/_submit_build_10x.sh
+++ b/programs/merqury_June2020/_submit_build_10x.sh
 #!/bin/bash

-build=/srv/public/users/tomas/programs/merqury/build
+build=$MERQURY/build

 if [ -z $1 ]; then
    echo "Usage: ./_submit_build.sh <k-size> <R1.fofn> <R2.fofn> <out_prefix> [mem=T]"

--- a/programs/merqury_June2020/_submit_hapmers.sh
+++ b/programs/merqury_June2020/_submit_hapmers.sh
--- a/programs/merqury_June2020/_submit_merqury.sh
+++ b/programs/merqury_June2020/_submit_merqury.sh
--- a/programs/merqury_June2020/_submit_split.sh
+++ b/programs/merqury_June2020/_submit_split.sh
--- a/programs/merqury_June2020/best_k.sh
+++ b/programs/merqury_June2020/best_k.sh
--- a/programs/merqury_June2020/build/concat_splits.sh
+++ b/programs/merqury_June2020/build/concat_splits.sh
--- a/programs/merqury_June2020/build/count.sh
+++ b/programs/merqury_June2020/build/count.sh
--- a/programs/merqury_June2020/build/count_10x.sh
+++ b/programs/merqury_June2020/build/count_10x.sh
@@ -28,9 +28,19 @@ else
    line_num=$SLURM_ARRAY_TASK_ID
 fi

+if [[ ! -z $SLURM_CPUS_PER_TASK ]]; then
+	cpus="threads=$SLURM_CPUS_PER_TASK"
+fi
+
+# If SLURM_MEM_PER_NODE exist; give extra 4Gb
+# otherwise, let meryl determine
+if [[ ! -z $SLURM_MEM_PER_NODE ]]; then
 	# Note: Provide memory in Gb unit. SLURM provides $SLURM_MEM_PER_NODE in Mb.
 	# Give extra 4Gb to avoid 'Bus Error' form running out of memory.
 	mem=$(((SLURM_MEM_PER_NODE/1024)-4))
+	mem="memory=$mem"
+fi
+
 line_num=$(((offset * 1000) + $line_num))

 # Read in the input path
@@ -45,9 +55,9 @@ output=$name.$k.$line_num.meryl
 if [ ! -d $output ]; then
    # Run meryl count: Collect k-mer frequencies
    echo "
-    zcat $input | awk '{if (NR%2==1) {print $1} else {print substr($1,24)}}' | meryl k=$k threads=$SLURM_CPUS_PER_TASK memory=$mem count output $output -
+    zcat $input | awk '{if (NR%2==1) {print $1} else {print substr($1,24)}}' | meryl k=$k $cpus $mem count output $output -
    "
-    zcat $input | awk '{if (NR%2==1) {print $1} else {print substr($1,24)}}' | meryl k=$k threads=$SLURM_CPUS_PER_TASK memory=$mem count output $output -
+    zcat $input | awk '{if (NR%2==1) {print $1} else {print substr($1,24)}}' | meryl k=$k $cpus $mem count output $output -
 else
    echo "$output dir already exist. Nothing to do with $name."
 fi

--- a/programs/merqury_June2020/build/diff.sh
+++ b/programs/merqury_June2020/build/diff.sh
--- a/programs/merqury_June2020/build/filt.sh
+++ b/programs/merqury_June2020/build/filt.sh
--- a/programs/merqury_June2020/build/intersect.sh
+++ b/programs/merqury_June2020/build/intersect.sh
--- a/programs/merqury_June2020/build/split.sh
+++ b/programs/merqury_June2020/build/split.sh
@@ -3,7 +3,7 @@
 echo "Usage: ./split.sh <input.fofn> [LINE_NUM]"
 echo -e "\t<input.fofn>: fastq.gz files we want to split by every 300 million lines."
 echo -e "\t<input.fofn>.LINE_NUM will be generated."
-echo -e "\t\tUse it for building meryl dbs"
+echo -e "\t\tUse it for building meryl dbs. pigz will use maximum of 8 processes by default."
 echo

 FOFN=$1
@@ -25,6 +25,9 @@ if [ -z $tid ]; then
 fi

 cpus=$SLURM_CPUS_PER_TASK
+if [[ -z $cpus ]]; then
+    cpus=8
+fi

 fq=`sed -n ${tid}p $FOFN`
 fq_prefix=`echo $fq | sed 's/.fastq.gz$//g' | sed 's/.fq.gz$//g' | sed 's/.fastq$//g' | sed 's/.fq$//g'`

--- a/programs/merqury_June2020/build/split_10x.sh
+++ b/programs/merqury_June2020/build/split_10x.sh
@@ -3,7 +3,7 @@
 echo "Usage: ./split_10x.sh <input.fofn> [LINE_NUM]"
 echo -e "\t<input.fofn>: 10XG fastq R1.gz files to split per 300 million lines."
 echo -e "\t<input.fofn>.LINE_NUM will be generated."
-echo -e "\t\tUse it for building meryl dbs"
+echo -e "\t\tUse it for building meryl dbs. pigz will use maximum of 8 processes by default."
 echo

 FOFN=$1
@@ -12,7 +12,7 @@ if [ -z $FOFN ]; then
    exit -1
 fi

-tid=300000000	# slurm environment variable for job arrays
+tid=$SLURM_ARRAY_TASK_ID	# slurm environment variable for job arrays
 LINE_NUM=$2

 if [ -z $tid ]; then
@@ -24,11 +24,13 @@ if [ -z $tid ]; then
    exit -1
 fi

-cpus=20
+cpus=$SLURM_CPUS_PER_TASK
+if [[ -z $cpus ]]; then
+    cpus=8
+fi

 fq=`sed -n ${tid}p $FOFN`
 fq_prefix=`echo $fq | sed 's/.fastq.gz$//g' | sed 's/.fq.gz$//g' | sed 's/.fastq$//g' | sed 's/.fq$//g'`
-echo $fq_prefix
 fq_prefix=`basename $fq_prefix`

 mkdir -p split

--- a/programs/merqury_June2020/build/union_sum.sh
+++ b/programs/merqury_June2020/build/union_sum.sh
@@ -16,8 +16,6 @@ output_prefix=$3.k$k
 LEN=`wc -l $input_fofn | awk '{print $1}'`
 NUM_DBS_TO_JOIN=100	# Join every $NUM_DBS_TO_JOIN as intermediates, then merge at the end
 JOIN_IDX=0
-CPU=$SLURM_CPUS_PER_TASK
-MEM=$SLURM_MEM_PER_NODE

 echo "Set ulimit: ulimit -Sn 32000"
 ulimit -Sn 32000
@@ -44,8 +42,6 @@ do
 	if [ ! -d $output ]; then
 	    echo "
 	    meryl \
-	        threads=$CPU \
-	        memory=$((MEM/1024)) \
 	        k=$k \
 	        union-sum \
 	        output $output \
@@ -53,8 +49,6 @@ do
 	    "

 	    meryl \
-	        threads=$CPU \
-	        memory=$((MEM/1024)) \
 	        k=$k \
 	        union-sum \
 	        output $output \
@@ -97,8 +91,6 @@ echo "union-sum of $output_prefix.[ 1 - $JOIN_IDX ] :"
 if [ ! -d $output_prefix ]; then
    echo "
    meryl \
-        threads=$CPU \
-        memory=$((MEM/1024)) \
        k=$k \
        union-sum \
        output $output_prefix.meryl \
@@ -106,8 +98,6 @@ if [ ! -d $output_prefix ]; then
    "

    meryl \
-        threads=$CPU \
-        memory=$((MEM/1024)) \
        k=$k \
        union-sum \
        output $output_prefix.meryl \

--- a/programs/merqury-1.1/eval/asm_multiplicity.sh
+++ b/programs/merqury-1.1/eval/asm_multiplicity.sh
+#!/bin/sh
+
+if [[ "$#" -lt 3 ]]; then
+	echo "Usage: ./asm_multiplicity.sh <asm.fasta> <asm.meryl> <out>"
+	echo -e "\t<asm.fasta>: assembly fasta file"
+	echo -e "\t<asm.meryl>: assembly meryl dir"
+	echo -e "\t<out>: output file prefix. <out>.copies.wig and <out>.asm_multiplicity.bigWig will be generated."
+	exit 0
+fi
+
+asm_fa=$1
+asm=$2
+out=$3
+
+# Requirements: samtools, ucsc kent utils
+module load samtools
+module load ucsc/396
+
+if [[ ! -e $asm_fa ]]; then
+	samtools faidx $asm_fa
+fi
+
+
+echo "
+# Collect copy numbers in assembly"
+meryl-lookup -dump -memory 12 -sequence $asm_fa -mers $asm | awk '$4=="T"' | java -jar -Xmx1g $MERQURY/util/merylDumpToWig.jar - > $out.copies.wig
+
+echo "
+# Convert to bigwig"
+wigToBigWig $out.copies.wig $asm_fa.fai $out.asm_multiplicity.bigWig
+
+echo "Done!"
+