jpayne@69: #!/bin/bash
jpayne@69: 
jpayne@69: usage(){
jpayne@69: echo "
jpayne@69: Written by Brian Bushnell and Jonathan Rood
jpayne@69: Last modified February 19, 2020
jpayne@69: 
jpayne@69: Description:  Accepts one or more files containing sets of sequences (reads or scaffolds).
jpayne@69: Removes duplicate sequences, which may be specified to be exact matches, subsequences, or sequences within some percent identity.
jpayne@69: Can also find overlapping sequences and group them into clusters.
jpayne@69: Please read bbmap/docs/guides/DedupeGuide.txt for more information.
jpayne@69: 
jpayne@69: Usage:     dedupe.sh in=<file or stdin> out=<file or stdout>
jpayne@69: 
jpayne@69: An example of running Dedupe for clustering short reads:
jpayne@69: dedupe.sh in=x.fq am=f ac=f fo c pc rnc=f mcs=4 mo=100 s=1 pto cc qin=33 csf=stats.txt pattern=cluster_%.fq dot=graph.dot
jpayne@69: 
jpayne@69: Input may be fasta or fastq, compressed or uncompressed.
jpayne@69: Output may be stdout or a file.  With no output parameter, data will be written to stdout.
jpayne@69: If 'out=null', there will be no output, but statistics will still be printed.
jpayne@69: You can also use 'dedupe <infile> <outfile>' without the 'in=' and 'out='.
jpayne@69: 
jpayne@69: I/O parameters:
jpayne@69: in=<file,file>        A single file or a comma-delimited list of files.
jpayne@69: out=<file>            Destination for all output contigs.
jpayne@69: pattern=<file>        Clusters will be written to individual files, where the '%' symbol in the pattern is replaced by cluster number.
jpayne@69: outd=<file>           Optional; removed duplicates will go here.
jpayne@69: csf=<file>            (clusterstatsfile) Write a list of cluster names and sizes.
jpayne@69: dot=<file>            (graph) Write a graph in dot format.  Requires 'fo' and 'pc' flags.
jpayne@69: threads=auto          (t) Set number of threads to use; default is number of logical processors.
jpayne@69: overwrite=t           (ow) Set to false to force the program to abort rather than overwrite an existing file.
jpayne@69: showspeed=t           (ss) Set to 'f' to suppress display of processing speed.
jpayne@69: minscaf=0             (ms) Ignore contigs/scaffolds shorter than this.
jpayne@69: interleaved=auto      If true, forces fastq input to be paired and interleaved.
jpayne@69: ziplevel=2            Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.
jpayne@69: 
jpayne@69: Output format parameters:
jpayne@69: storename=t           (sn) Store scaffold names (set false to save memory).
jpayne@69: #addpairnum=f         Add .1 and .2 to numeric id of read1 and read2.
jpayne@69: storequality=t        (sq) Store quality values for fastq assemblies (set false to save memory).
jpayne@69: uniquenames=t         (un) Ensure all output scaffolds have unique names.  Uses more memory.
jpayne@69: mergenames=f          When a sequence absorbs another, concatenate their headers.
jpayne@69: mergedelimiter=>      Delimiter between merged headers.  Can be a symbol name like greaterthan.
jpayne@69: numbergraphnodes=t    (ngn) Label dot graph nodes with read numbers rather than read names.
jpayne@69: sort=f                Sort output (otherwise it will be random).  Options:
jpayne@69:                          length:  Sort by length
jpayne@69:                          quality: Sort by quality
jpayne@69:                          name:    Sort by name
jpayne@69:                          id:      Sort by input order
jpayne@69: ascending=f           Sort in ascending order.
jpayne@69: ordered=f             Output sequences in input order.  Equivalent to sort=id ascending.
jpayne@69: renameclusters=f      (rnc) Rename contigs to indicate which cluster they are in.
jpayne@69: printlengthinedges=f  (ple) Print the length of contigs in edges.
jpayne@69: 
jpayne@69: Processing parameters:
jpayne@69: absorbrc=t            (arc) Absorb reverse-complements as well as normal orientation.
jpayne@69: absorbmatch=t         (am) Absorb exact matches of contigs.
jpayne@69: absorbcontainment=t   (ac) Absorb full containments of contigs.
jpayne@69: #absorboverlap=f      (ao) Absorb (merge) non-contained overlaps of contigs (TODO).
jpayne@69: findoverlap=f         (fo) Find overlaps between contigs (containments and non-containments).  Necessary for clustering.
jpayne@69: uniqueonly=f          (uo) If true, all copies of duplicate reads will be discarded, rather than keeping 1.
jpayne@69: rmn=f                 (requirematchingnames) If true, both names and sequence must match.
jpayne@69: usejni=f              (jni) Do alignments in C code, which is faster, if an edit distance is allowed.
jpayne@69:                       This will require compiling the C code; details are in /jni/README.txt.
jpayne@69: 
jpayne@69: Subset parameters:
jpayne@69: subsetcount=1         (sstc) Number of subsets used to process the data; higher uses less memory.
jpayne@69: subset=0              (sst) Only process reads whose ((ID%subsetcount)==subset).
jpayne@69: 
jpayne@69: Clustering parameters:
jpayne@69: cluster=f             (c) Group overlapping contigs into clusters.
jpayne@69: pto=f                 (preventtransitiveoverlaps) Do not look for new edges between nodes in the same cluster.
jpayne@69: minclustersize=1      (mcs) Do not output clusters smaller than this.
jpayne@69: pbr=f                 (pickbestrepresentative) Only output the single highest-quality read per cluster.
jpayne@69: 
jpayne@69: Cluster postprocessing parameters:
jpayne@69: processclusters=f     (pc) Run the cluster processing phase, which performs the selected operations in this category.
jpayne@69:                       For example, pc AND cc must be enabled to perform cc.
jpayne@69: fixmultijoins=t       (fmj) Remove redundant overlaps between the same two contigs.
jpayne@69: removecycles=t        (rc) Remove all cycles so clusters form trees.
jpayne@69: cc=t                  (canonicizeclusters) Flip contigs so clusters have a single orientation.
jpayne@69: fcc=f                 (fixcanoncontradictions) Truncate graph at nodes with canonization disputes.
jpayne@69: foc=f                 (fixoffsetcontradictions) Truncate graph at nodes with offset disputes.
jpayne@69: mst=f                 (maxspanningtree) Remove cyclic edges, leaving only the longest edges that form a tree.
jpayne@69: 
jpayne@69: Overlap Detection Parameters
jpayne@69: exact=t               (ex) Only allow exact symbol matches.  When false, an 'N' will match any symbol.
jpayne@69: touppercase=t         (tuc) Convert input bases to upper-case; otherwise, lower-case will not match.
jpayne@69: maxsubs=0             (s) Allow up to this many mismatches (substitutions only, no indels).  May be set higher than maxedits.
jpayne@69: maxedits=0            (e) Allow up to this many edits (subs or indels).  Higher is slower.
jpayne@69: minidentity=100       (mid) Absorb contained sequences with percent identity of at least this (includes indels).
jpayne@69: minlengthpercent=0    (mlp) Smaller contig must be at least this percent of larger contig's length to be absorbed.
jpayne@69: minoverlappercent=0   (mop) Overlap must be at least this percent of smaller contig's length to cluster and merge.
jpayne@69: minoverlap=200        (mo) Overlap must be at least this long to cluster and merge.
jpayne@69: depthratio=0          (dr) When non-zero, overlaps will only be formed between reads with a depth ratio of at most this.
jpayne@69:                       Should be above 1.  Depth is determined by parsing the read names; this information can be added
jpayne@69:                       by running KmerNormalize (khist.sh, bbnorm.sh, or ecc.sh) with the flag 'rename'
jpayne@69: k=31                  Seed length used for finding containments and overlaps.  Anything shorter than k will not be found.
jpayne@69: numaffixmaps=1        (nam) Number of prefixes/suffixes to index per contig. Higher is more sensitive, if edits are allowed.
jpayne@69: hashns=f              Set to true to search for matches using kmers containing Ns.  Can lead to extreme slowdown in some cases.
jpayne@69: #ignoreaffix1=f       (ia1) Ignore first affix (for testing).
jpayne@69: #storesuffix=f        (ss) Store suffix as well as prefix.  Automatically set to true when doing inexact matches.
jpayne@69: 
jpayne@69: Other Parameters
jpayne@69: qtrim=f               Set to qtrim=rl to trim leading and trailing Ns.
jpayne@69: trimq=6               Quality trim level.
jpayne@69: forcetrimleft=-1      (ftl) If positive, trim bases to the left of this position (exclusive, 0-based).
jpayne@69: forcetrimright=-1     (ftr) If positive, trim bases to the right of this position (exclusive, 0-based).
jpayne@69: 
jpayne@69: Note on Proteins / Amino Acids
jpayne@69: Dedupe supports amino acid space via the 'amino' flag.  This also changes the default kmer length to 10.
jpayne@69: In amino acid mode, all flags related to canonicity and reverse-complementation are disabled,
jpayne@69: and nam (numaffixmaps) is currently limited to 2 per tip.
jpayne@69: 
jpayne@69: Java Parameters:
jpayne@69: -Xmx                  This will set Java's memory usage, overriding autodetection.
jpayne@69:                       -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
jpayne@69:                     The max is typically 85% of physical memory.
jpayne@69: -eoom                 This flag will cause the process to exit if an out-of-memory exception occurs.  Requires Java 8u92+.
jpayne@69: -da                   Disable assertions.
jpayne@69: 
jpayne@69: Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
jpayne@69: "
jpayne@69: }
jpayne@69: 
jpayne@69: #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69: pushd . > /dev/null
jpayne@69: DIR="${BASH_SOURCE[0]}"
jpayne@69: while [ -h "$DIR" ]; do
jpayne@69:   cd "$(dirname "$DIR")"
jpayne@69:   DIR="$(readlink "$(basename "$DIR")")"
jpayne@69: done
jpayne@69: cd "$(dirname "$DIR")"
jpayne@69: DIR="$(pwd)/"
jpayne@69: popd > /dev/null
jpayne@69: 
jpayne@69: #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69: CP="$DIR""current/"
jpayne@69: JNI="-Djava.library.path=""$DIR""jni/"
jpayne@69: JNI=""
jpayne@69: 
jpayne@69: z="-Xmx1g"
jpayne@69: z2="-Xms1g"
jpayne@69: set=0
jpayne@69: 
jpayne@69: if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
jpayne@69: 	usage
jpayne@69: 	exit
jpayne@69: fi
jpayne@69: 
jpayne@69: calcXmx () {
jpayne@69: 	source "$DIR""/calcmem.sh"
jpayne@69: 	setEnvironment
jpayne@69: 	parseXmx "$@"
jpayne@69: 	if [[ $set == 1 ]]; then
jpayne@69: 	return
jpayne@69: 	fi
jpayne@69: 	freeRam 3200m 84
jpayne@69: 	z="-Xmx${RAM}m"
jpayne@69: 	z2="-Xms${RAM}m"
jpayne@69: }
jpayne@69: calcXmx "$@"
jpayne@69: 
jpayne@69: dedupe() {
jpayne@69: 	local CMD="java $JNI $EA $EOOM $z $z2 -cp $CP jgi.Dedupe $@"
jpayne@69: 	echo $CMD >&2
jpayne@69: 	eval $CMD
jpayne@69: }
jpayne@69: 
jpayne@69: dedupe "$@"