diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/pileup.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/pileup.sh	Tue Mar 18 17:55:14 2025 -0400
@@ -0,0 +1,155 @@
+#!/bin/bash
+
+usage(){
+echo "
+Written by Brian Bushnell
+Last modified April 30, 2020
+
+Description:  Calculates per-scaffold or per-base coverage information from an unsorted sam or bam file.
+Supports SAM/BAM format for reads and FASTA for reference.
+Sorting is not needed, so output may be streamed directly from a mapping program.
+Requires a minimum of 1 bit per reference base plus 100 bytes per scaffold (even if no reference is specified).
+If per-base coverage is needed (including for stdev and median), at least 4 bytes per base is needed.
+
+Usage:        pileup.sh in=<input> out=<output>
+
+Input Parameters:
+in=<file>           The input sam file; this is the only required parameter.
+ref=<file>          Scans a reference fasta for per-scaffold GC counts, not otherwise needed.
+fastaorf=<file>     An optional fasta file with ORF header information in PRODIGAL's output format.  Must also specify 'outorf'.
+unpigz=t            Decompress with pigz for faster decompression.
+addfromref=t        Allow ref scaffolds not present in sam header to be added from the reference.
+addfromreads=f      Allow ref scaffolds not present in sam header to be added from the reads.
+                    Note that in this case the ref scaffold lengths will be inaccurate.
+
+Output Parameters:
+out=<file>          (covstats) Per-scaffold coverage info.
+rpkm=<file>         Per-scaffold RPKM/FPKM counts.
+twocolumn=f         Change to true to print only ID and Avg_fold instead of all 6 columns.
+countgc=t           Enable/disable counting of read GC content.
+outorf=<file>       Per-orf coverage info to this file (only if 'fastaorf' is specified).
+outsam=<file>       Print the input sam stream to this file (or stdout).  Useful for piping data.
+hist=<file>         Histogram of # occurrences of each depth level.
+basecov=<file>      Coverage per base location.
+bincov=<file>       Binned coverage per location (one line per X bases).
+binsize=1000        Binsize for binned coverage output.
+keepshortbins=t     (ksb) Keep residual bins shorter than binsize.
+normcov=<file>      Normalized coverage by normalized location (X lines per scaffold).
+normcovo=<file>     Overall normalized coverage by normalized location (X lines for the entire assembly).
+normb=-1            If positive, use a fixed number of bins per scaffold; affects 'normcov' and 'normcovo'.
+normc=f             Normalize coverage to fraction of max per scaffold; affects 'normcov' and 'normcovo'.
+delta=f             Only print base coverage lines when the coverage differs from the previous base.
+nzo=f               Only print scaffolds with nonzero coverage.
+concise=f           Write 'basecov' in a more concise format.
+header=t            (hdr) Include headers in output files.
+headerpound=t       (#) Prepend header lines with '#' symbol.
+stdev=t             Calculate coverage standard deviation.
+covminscaf=0        (minscaf) Don't print coverage for scaffolds shorter than this.
+covwindow=0         Calculate how many bases are in windows of this size with
+                    low average coverage.  Produces an extra stats column.
+covwindowavg=5      Average coverage below this will be classified as low.
+k=0                 If positive, calculate kmer coverage statstics for this kmer length.
+keyvalue=f          Output statistics to screen as key=value pairs.
+mincov=1            When calculating percent covered, ignore bases under this depth.
+
+Processing Parameters:
+strandedcov=f       Track coverage for plus and minus strand independently.
+startcov=f          Only track start positions of reads.
+stopcov=f           Only track stop positions of reads.
+secondary=t         Use secondary alignments, if present.
+softclip=f          Include soft-clipped bases in coverage.
+minmapq=0           (minq) Ignore alignments with mapq below this.
+physical=f          (physcov) Calculate physical coverage for paired reads.  This includes the unsequenced bases.
+tlen=t              Track physical coverage from the tlen field rather than recalculating it.
+arrays=auto         Set to t/f to manually force the use of coverage arrays.  Arrays and bitsets are mutually exclusive.
+bitsets=auto        Set to t/f to manually force the use of coverage bitsets.
+32bit=f             Set to true if you need per-base coverage over 64k; does not affect per-scaffold coverage precision.
+                    This option will double RAM usage (when calculating per-base coverage).
+delcoverage=t       (delcov) Count bases covered by deletions or introns as covered.
+                    True is faster than false.
+dupecoverage=t      (dupes) Include reads flagged as duplicates in coverage.
+samstreamer=t       (ss) Load reads multithreaded to increase speed.
+
+Trimming Parameters: 
+** NOTE: These are applied before adding coverage, to allow mimicking **
+** tools like CallVariants, which uses 'qtrim=r trimq=10 border=5'    **
+qtrim=f             Quality-trim.  May be set to:
+                       f (false): Don't quality-trim.
+                       r (right): Trim right (3') end only.
+                       l (left): Trim right (5') end only.
+                       rl (both): Trim both ends.
+trimq=-1            If positive, quality-trim to this threshold.
+border=0            Ignore this many bases on the left and right end.
+
+Output format (tab-delimited):
+ID, Avg_fold, Length, Ref_GC, Covered_percent, Covered_bases, Plus_reads, Minus_reads, Read_GC, Median_fold, Std_Dev
+
+ID:                Scaffold ID
+Length:            Scaffold length
+Ref_GC:            GC ratio of reference
+Avg_fold:          Average fold coverage of this scaffold
+Covered_percent:   Percent of scaffold with any coverage (only if arrays or bitsets are used)
+Covered_bases:     Number of bases with any coverage (only if arrays or bitsets are used)
+Plus_reads:        Number of reads mapped to plus strand
+Minus_reads:       Number of reads mapped to minus strand
+Read_GC:           Average GC ratio of reads mapped to this scaffold
+Median_fold:       Median fold coverage of this scaffold (only if arrays are used)
+Std_Dev:           Standard deviation of coverage (only if arrays are used)
+
+Java Parameters:
+
+-Xmx               This will set Java's memory usage, overriding 
+                   autodetection. -Xmx20g will 
+                   specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.  
+                   The max is typically 85% of physical memory.
+-eoom              This flag will cause the process to exit if an out-of-memory
+                   exception occurs.  Requires Java 8u92+.
+-da                Disable assertions.
+
+Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
+"
+}
+
+#This block allows symlinked shellscripts to correctly set classpath.
+pushd . > /dev/null
+DIR="${BASH_SOURCE[0]}"
+while [ -h "$DIR" ]; do
+  cd "$(dirname "$DIR")"
+  DIR="$(readlink "$(basename "$DIR")")"
+done
+cd "$(dirname "$DIR")"
+DIR="$(pwd)/"
+popd > /dev/null
+
+#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
+CP="$DIR""current/"
+
+z="-Xmx1g"
+z2="-Xms1g"
+set=0
+
+if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
+	usage
+	exit
+fi
+
+calcXmx () {
+	source "$DIR""/calcmem.sh"
+	setEnvironment
+	parseXmx "$@"
+	if [[ $set == 1 ]]; then
+		return
+	fi
+	freeRam 3200m 84
+	z="-Xmx${RAM}m"
+	z2="-Xms${RAM}m"
+}
+calcXmx "$@"
+
+pileup() {
+	local CMD="java $EA $EOOM $z -cp $CP jgi.CoveragePileup $@"
+	echo $CMD >&2
+	eval $CMD
+}
+
+pileup "$@"