csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/dedupe.sh annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/dedupe.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

rev	line source
jpayne@69	1 #!/bin/bash
jpayne@69	2
jpayne@69	3 usage(){
jpayne@69	4 echo "
jpayne@69	5 Written by Brian Bushnell and Jonathan Rood
jpayne@69	6 Last modified February 19, 2020
jpayne@69	7
jpayne@69	8 Description: Accepts one or more files containing sets of sequences (reads or scaffolds).
jpayne@69	9 Removes duplicate sequences, which may be specified to be exact matches, subsequences, or sequences within some percent identity.
jpayne@69	10 Can also find overlapping sequences and group them into clusters.
jpayne@69	11 Please read bbmap/docs/guides/DedupeGuide.txt for more information.
jpayne@69	12
jpayne@69	13 Usage: dedupe.sh in=<file or stdin> out=<file or stdout>
jpayne@69	14
jpayne@69	15 An example of running Dedupe for clustering short reads:
jpayne@69	16 dedupe.sh in=x.fq am=f ac=f fo c pc rnc=f mcs=4 mo=100 s=1 pto cc qin=33 csf=stats.txt pattern=cluster_%.fq dot=graph.dot
jpayne@69	17
jpayne@69	18 Input may be fasta or fastq, compressed or uncompressed.
jpayne@69	19 Output may be stdout or a file. With no output parameter, data will be written to stdout.
jpayne@69	20 If 'out=null', there will be no output, but statistics will still be printed.
jpayne@69	21 You can also use 'dedupe <infile> <outfile>' without the 'in=' and 'out='.
jpayne@69	22
jpayne@69	23 I/O parameters:
jpayne@69	24 in=<file,file> A single file or a comma-delimited list of files.
jpayne@69	25 out=<file> Destination for all output contigs.
jpayne@69	26 pattern=<file> Clusters will be written to individual files, where the '%' symbol in the pattern is replaced by cluster number.
jpayne@69	27 outd=<file> Optional; removed duplicates will go here.
jpayne@69	28 csf=<file> (clusterstatsfile) Write a list of cluster names and sizes.
jpayne@69	29 dot=<file> (graph) Write a graph in dot format. Requires 'fo' and 'pc' flags.
jpayne@69	30 threads=auto (t) Set number of threads to use; default is number of logical processors.
jpayne@69	31 overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file.
jpayne@69	32 showspeed=t (ss) Set to 'f' to suppress display of processing speed.
jpayne@69	33 minscaf=0 (ms) Ignore contigs/scaffolds shorter than this.
jpayne@69	34 interleaved=auto If true, forces fastq input to be paired and interleaved.
jpayne@69	35 ziplevel=2 Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.
jpayne@69	36
jpayne@69	37 Output format parameters:
jpayne@69	38 storename=t (sn) Store scaffold names (set false to save memory).
jpayne@69	39 #addpairnum=f Add .1 and .2 to numeric id of read1 and read2.
jpayne@69	40 storequality=t (sq) Store quality values for fastq assemblies (set false to save memory).
jpayne@69	41 uniquenames=t (un) Ensure all output scaffolds have unique names. Uses more memory.
jpayne@69	42 mergenames=f When a sequence absorbs another, concatenate their headers.
jpayne@69	43 mergedelimiter=> Delimiter between merged headers. Can be a symbol name like greaterthan.
jpayne@69	44 numbergraphnodes=t (ngn) Label dot graph nodes with read numbers rather than read names.
jpayne@69	45 sort=f Sort output (otherwise it will be random). Options:
jpayne@69	46 length: Sort by length
jpayne@69	47 quality: Sort by quality
jpayne@69	48 name: Sort by name
jpayne@69	49 id: Sort by input order
jpayne@69	50 ascending=f Sort in ascending order.
jpayne@69	51 ordered=f Output sequences in input order. Equivalent to sort=id ascending.
jpayne@69	52 renameclusters=f (rnc) Rename contigs to indicate which cluster they are in.
jpayne@69	53 printlengthinedges=f (ple) Print the length of contigs in edges.
jpayne@69	54
jpayne@69	55 Processing parameters:
jpayne@69	56 absorbrc=t (arc) Absorb reverse-complements as well as normal orientation.
jpayne@69	57 absorbmatch=t (am) Absorb exact matches of contigs.
jpayne@69	58 absorbcontainment=t (ac) Absorb full containments of contigs.
jpayne@69	59 #absorboverlap=f (ao) Absorb (merge) non-contained overlaps of contigs (TODO).
jpayne@69	60 findoverlap=f (fo) Find overlaps between contigs (containments and non-containments). Necessary for clustering.
jpayne@69	61 uniqueonly=f (uo) If true, all copies of duplicate reads will be discarded, rather than keeping 1.
jpayne@69	62 rmn=f (requirematchingnames) If true, both names and sequence must match.
jpayne@69	63 usejni=f (jni) Do alignments in C code, which is faster, if an edit distance is allowed.
jpayne@69	64 This will require compiling the C code; details are in /jni/README.txt.
jpayne@69	65
jpayne@69	66 Subset parameters:
jpayne@69	67 subsetcount=1 (sstc) Number of subsets used to process the data; higher uses less memory.
jpayne@69	68 subset=0 (sst) Only process reads whose ((ID%subsetcount)==subset).
jpayne@69	69
jpayne@69	70 Clustering parameters:
jpayne@69	71 cluster=f (c) Group overlapping contigs into clusters.
jpayne@69	72 pto=f (preventtransitiveoverlaps) Do not look for new edges between nodes in the same cluster.
jpayne@69	73 minclustersize=1 (mcs) Do not output clusters smaller than this.
jpayne@69	74 pbr=f (pickbestrepresentative) Only output the single highest-quality read per cluster.
jpayne@69	75
jpayne@69	76 Cluster postprocessing parameters:
jpayne@69	77 processclusters=f (pc) Run the cluster processing phase, which performs the selected operations in this category.
jpayne@69	78 For example, pc AND cc must be enabled to perform cc.
jpayne@69	79 fixmultijoins=t (fmj) Remove redundant overlaps between the same two contigs.
jpayne@69	80 removecycles=t (rc) Remove all cycles so clusters form trees.
jpayne@69	81 cc=t (canonicizeclusters) Flip contigs so clusters have a single orientation.
jpayne@69	82 fcc=f (fixcanoncontradictions) Truncate graph at nodes with canonization disputes.
jpayne@69	83 foc=f (fixoffsetcontradictions) Truncate graph at nodes with offset disputes.
jpayne@69	84 mst=f (maxspanningtree) Remove cyclic edges, leaving only the longest edges that form a tree.
jpayne@69	85
jpayne@69	86 Overlap Detection Parameters
jpayne@69	87 exact=t (ex) Only allow exact symbol matches. When false, an 'N' will match any symbol.
jpayne@69	88 touppercase=t (tuc) Convert input bases to upper-case; otherwise, lower-case will not match.
jpayne@69	89 maxsubs=0 (s) Allow up to this many mismatches (substitutions only, no indels). May be set higher than maxedits.
jpayne@69	90 maxedits=0 (e) Allow up to this many edits (subs or indels). Higher is slower.
jpayne@69	91 minidentity=100 (mid) Absorb contained sequences with percent identity of at least this (includes indels).
jpayne@69	92 minlengthpercent=0 (mlp) Smaller contig must be at least this percent of larger contig's length to be absorbed.
jpayne@69	93 minoverlappercent=0 (mop) Overlap must be at least this percent of smaller contig's length to cluster and merge.
jpayne@69	94 minoverlap=200 (mo) Overlap must be at least this long to cluster and merge.
jpayne@69	95 depthratio=0 (dr) When non-zero, overlaps will only be formed between reads with a depth ratio of at most this.
jpayne@69	96 Should be above 1. Depth is determined by parsing the read names; this information can be added
jpayne@69	97 by running KmerNormalize (khist.sh, bbnorm.sh, or ecc.sh) with the flag 'rename'
jpayne@69	98 k=31 Seed length used for finding containments and overlaps. Anything shorter than k will not be found.
jpayne@69	99 numaffixmaps=1 (nam) Number of prefixes/suffixes to index per contig. Higher is more sensitive, if edits are allowed.
jpayne@69	100 hashns=f Set to true to search for matches using kmers containing Ns. Can lead to extreme slowdown in some cases.
jpayne@69	101 #ignoreaffix1=f (ia1) Ignore first affix (for testing).
jpayne@69	102 #storesuffix=f (ss) Store suffix as well as prefix. Automatically set to true when doing inexact matches.
jpayne@69	103
jpayne@69	104 Other Parameters
jpayne@69	105 qtrim=f Set to qtrim=rl to trim leading and trailing Ns.
jpayne@69	106 trimq=6 Quality trim level.
jpayne@69	107 forcetrimleft=-1 (ftl) If positive, trim bases to the left of this position (exclusive, 0-based).
jpayne@69	108 forcetrimright=-1 (ftr) If positive, trim bases to the right of this position (exclusive, 0-based).
jpayne@69	109
jpayne@69	110 Note on Proteins / Amino Acids
jpayne@69	111 Dedupe supports amino acid space via the 'amino' flag. This also changes the default kmer length to 10.
jpayne@69	112 In amino acid mode, all flags related to canonicity and reverse-complementation are disabled,
jpayne@69	113 and nam (numaffixmaps) is currently limited to 2 per tip.
jpayne@69	114
jpayne@69	115 Java Parameters:
jpayne@69	116 -Xmx This will set Java's memory usage, overriding autodetection.
jpayne@69	117 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
jpayne@69	118 The max is typically 85% of physical memory.
jpayne@69	119 -eoom This flag will cause the process to exit if an out-of-memory exception occurs. Requires Java 8u92+.
jpayne@69	120 -da Disable assertions.
jpayne@69	121
jpayne@69	122 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
jpayne@69	123 "
jpayne@69	124 }
jpayne@69	125
jpayne@69	126 #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69	127 pushd . > /dev/null
jpayne@69	128 DIR="${BASH_SOURCE[0]}"
jpayne@69	129 while [ -h "$DIR" ]; do
jpayne@69	130 cd "$(dirname "$DIR")"
jpayne@69	131 DIR="$(readlink "$(basename "$DIR")")"
jpayne@69	132 done
jpayne@69	133 cd "$(dirname "$DIR")"
jpayne@69	134 DIR="$(pwd)/"
jpayne@69	135 popd > /dev/null
jpayne@69	136
jpayne@69	137 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69	138 CP="$DIR""current/"
jpayne@69	139 JNI="-Djava.library.path=""$DIR""jni/"
jpayne@69	140 JNI=""
jpayne@69	141
jpayne@69	142 z="-Xmx1g"
jpayne@69	143 z2="-Xms1g"
jpayne@69	144 set=0
jpayne@69	145
jpayne@69	146 if [ -z "$1" ] \|\| [[ $1 == -h ]] \|\| [[ $1 == --help ]]; then
jpayne@69	147 usage
jpayne@69	148 exit
jpayne@69	149 fi
jpayne@69	150
jpayne@69	151 calcXmx () {
jpayne@69	152 source "$DIR""/calcmem.sh"
jpayne@69	153 setEnvironment
jpayne@69	154 parseXmx "$@"
jpayne@69	155 if [[ $set == 1 ]]; then
jpayne@69	156 return
jpayne@69	157 fi
jpayne@69	158 freeRam 3200m 84
jpayne@69	159 z="-Xmx${RAM}m"
jpayne@69	160 z2="-Xms${RAM}m"
jpayne@69	161 }
jpayne@69	162 calcXmx "$@"
jpayne@69	163
jpayne@69	164 dedupe() {
jpayne@69	165 local CMD="java $JNI $EA $EOOM $z $z2 -cp $CP jgi.Dedupe $@"
jpayne@69	166 echo $CMD >&2
jpayne@69	167 eval $CMD
jpayne@69	168 }
jpayne@69	169
jpayne@69	170 dedupe "$@"

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/dedupe.sh @ 69:33d812a61356