comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/dedupe.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 #!/bin/bash
2
# Prints the built-in help text for dedupe.sh on stdout.
# Takes no arguments. The text is emitted verbatim (quoted here-document, so
# nothing inside it is expanded), framed by one empty line before and after.
usage(){
cat <<'USAGE_TEXT'

Written by Brian Bushnell and Jonathan Rood
Last modified February 19, 2020

Description: Accepts one or more files containing sets of sequences (reads or scaffolds).
Removes duplicate sequences, which may be specified to be exact matches, subsequences, or sequences within some percent identity.
Can also find overlapping sequences and group them into clusters.
Please read bbmap/docs/guides/DedupeGuide.txt for more information.

Usage: dedupe.sh in=<file or stdin> out=<file or stdout>

An example of running Dedupe for clustering short reads:
dedupe.sh in=x.fq am=f ac=f fo c pc rnc=f mcs=4 mo=100 s=1 pto cc qin=33 csf=stats.txt pattern=cluster_%.fq dot=graph.dot

Input may be fasta or fastq, compressed or uncompressed.
Output may be stdout or a file. With no output parameter, data will be written to stdout.
If 'out=null', there will be no output, but statistics will still be printed.
You can also use 'dedupe <infile> <outfile>' without the 'in=' and 'out='.

I/O parameters:
in=<file,file> A single file or a comma-delimited list of files.
out=<file> Destination for all output contigs.
pattern=<file> Clusters will be written to individual files, where the '%' symbol in the pattern is replaced by cluster number.
outd=<file> Optional; removed duplicates will go here.
csf=<file> (clusterstatsfile) Write a list of cluster names and sizes.
dot=<file> (graph) Write a graph in dot format. Requires 'fo' and 'pc' flags.
threads=auto (t) Set number of threads to use; default is number of logical processors.
overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file.
showspeed=t (ss) Set to 'f' to suppress display of processing speed.
minscaf=0 (ms) Ignore contigs/scaffolds shorter than this.
interleaved=auto If true, forces fastq input to be paired and interleaved.
ziplevel=2 Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster.

Output format parameters:
storename=t (sn) Store scaffold names (set false to save memory).
#addpairnum=f Add .1 and .2 to numeric id of read1 and read2.
storequality=t (sq) Store quality values for fastq assemblies (set false to save memory).
uniquenames=t (un) Ensure all output scaffolds have unique names. Uses more memory.
mergenames=f When a sequence absorbs another, concatenate their headers.
mergedelimiter=> Delimiter between merged headers. Can be a symbol name like greaterthan.
numbergraphnodes=t (ngn) Label dot graph nodes with read numbers rather than read names.
sort=f Sort output (otherwise it will be random). Options:
length: Sort by length
quality: Sort by quality
name: Sort by name
id: Sort by input order
ascending=f Sort in ascending order.
ordered=f Output sequences in input order. Equivalent to sort=id ascending.
renameclusters=f (rnc) Rename contigs to indicate which cluster they are in.
printlengthinedges=f (ple) Print the length of contigs in edges.

Processing parameters:
absorbrc=t (arc) Absorb reverse-complements as well as normal orientation.
absorbmatch=t (am) Absorb exact matches of contigs.
absorbcontainment=t (ac) Absorb full containments of contigs.
#absorboverlap=f (ao) Absorb (merge) non-contained overlaps of contigs (TODO).
findoverlap=f (fo) Find overlaps between contigs (containments and non-containments). Necessary for clustering.
uniqueonly=f (uo) If true, all copies of duplicate reads will be discarded, rather than keeping 1.
rmn=f (requirematchingnames) If true, both names and sequence must match.
usejni=f (jni) Do alignments in C code, which is faster, if an edit distance is allowed.
This will require compiling the C code; details are in /jni/README.txt.

Subset parameters:
subsetcount=1 (sstc) Number of subsets used to process the data; higher uses less memory.
subset=0 (sst) Only process reads whose ((ID%subsetcount)==subset).

Clustering parameters:
cluster=f (c) Group overlapping contigs into clusters.
pto=f (preventtransitiveoverlaps) Do not look for new edges between nodes in the same cluster.
minclustersize=1 (mcs) Do not output clusters smaller than this.
pbr=f (pickbestrepresentative) Only output the single highest-quality read per cluster.

Cluster postprocessing parameters:
processclusters=f (pc) Run the cluster processing phase, which performs the selected operations in this category.
For example, pc AND cc must be enabled to perform cc.
fixmultijoins=t (fmj) Remove redundant overlaps between the same two contigs.
removecycles=t (rc) Remove all cycles so clusters form trees.
cc=t (canonicizeclusters) Flip contigs so clusters have a single orientation.
fcc=f (fixcanoncontradictions) Truncate graph at nodes with canonization disputes.
foc=f (fixoffsetcontradictions) Truncate graph at nodes with offset disputes.
mst=f (maxspanningtree) Remove cyclic edges, leaving only the longest edges that form a tree.

Overlap Detection Parameters
exact=t (ex) Only allow exact symbol matches. When false, an 'N' will match any symbol.
touppercase=t (tuc) Convert input bases to upper-case; otherwise, lower-case will not match.
maxsubs=0 (s) Allow up to this many mismatches (substitutions only, no indels). May be set higher than maxedits.
maxedits=0 (e) Allow up to this many edits (subs or indels). Higher is slower.
minidentity=100 (mid) Absorb contained sequences with percent identity of at least this (includes indels).
minlengthpercent=0 (mlp) Smaller contig must be at least this percent of larger contig's length to be absorbed.
minoverlappercent=0 (mop) Overlap must be at least this percent of smaller contig's length to cluster and merge.
minoverlap=200 (mo) Overlap must be at least this long to cluster and merge.
depthratio=0 (dr) When non-zero, overlaps will only be formed between reads with a depth ratio of at most this.
Should be above 1. Depth is determined by parsing the read names; this information can be added
by running KmerNormalize (khist.sh, bbnorm.sh, or ecc.sh) with the flag 'rename'
k=31 Seed length used for finding containments and overlaps. Anything shorter than k will not be found.
numaffixmaps=1 (nam) Number of prefixes/suffixes to index per contig. Higher is more sensitive, if edits are allowed.
hashns=f Set to true to search for matches using kmers containing Ns. Can lead to extreme slowdown in some cases.
#ignoreaffix1=f (ia1) Ignore first affix (for testing).
#storesuffix=f (ss) Store suffix as well as prefix. Automatically set to true when doing inexact matches.

Other Parameters
qtrim=f Set to qtrim=rl to trim leading and trailing Ns.
trimq=6 Quality trim level.
forcetrimleft=-1 (ftl) If positive, trim bases to the left of this position (exclusive, 0-based).
forcetrimright=-1 (ftr) If positive, trim bases to the right of this position (exclusive, 0-based).

Note on Proteins / Amino Acids
Dedupe supports amino acid space via the 'amino' flag. This also changes the default kmer length to 10.
In amino acid mode, all flags related to canonicity and reverse-complementation are disabled,
and nam (numaffixmaps) is currently limited to 2 per tip.

Java Parameters:
-Xmx This will set Java's memory usage, overriding autodetection.
-Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
The max is typically 85% of physical memory.
-eoom This flag will cause the process to exit if an out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.

Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.

USAGE_TEXT
}
125
# Resolve DIR to the directory that really contains this script, following
# symlinks so that a symlinked launcher still finds its sibling files.
# DIR always ends with a trailing slash.
#
# The walk runs inside a command-substitution subshell, so the caller's working
# directory is never changed. CDPATH is cleared there because an exported
# CDPATH can make a relative `cd` land in a different directory and print to
# stdout. Every `cd` is checked: continuing after a failed `cd` would silently
# yield a wrong DIR.
DIR="$(
  unset CDPATH
  script_path="${BASH_SOURCE[0]}"
  while [ -h "$script_path" ]; do
    # Enter the link's directory first so a relative link target resolves
    # against the location of the link itself.
    cd -- "$(dirname "$script_path")" || exit 1
    script_path="$(readlink "$(basename "$script_path")")" || exit 1
  done
  cd -- "$(dirname "$script_path")" || exit 1
  pwd
)" || {
  echo "Unable to determine the directory containing ${BASH_SOURCE[0]}" >&2
  exit 1
}
DIR="${DIR}/"
136
# Simpler alternative that does not follow symlinks (kept for reference):
#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"

# Java classpath: the "current/" directory next to this script.
CP="${DIR}current/"

# The native-library flag is computed and then cleared on the next line, so no
# -Djava.library.path option ends up on the java command line.
JNI="-Djava.library.path=${DIR}jni/"
JNI=""

# Default JVM heap flags (max / initial) and the "memory explicitly set" marker
# that calcXmx inspects after parsing the arguments.
z="-Xmx1g"
z2="-Xms1g"
set=0

# With no first argument, or with -h / --help, show the help text and stop.
case "$1" in
  '' | -h | --help)
    usage
    exit
    ;;
esac
150
# Decides the JVM heap flags, leaving them in the globals z (-Xmx) and z2 (-Xms).
# Arguments: the script's own command-line arguments, handed to parseXmx.
# Relies on setEnvironment, parseXmx and freeRam, which are expected to come
# from calcmem.sh in the script directory (not visible here; confirm there).
# When parseXmx leaves set=1 the current z/z2 are kept untouched; presumably
# that means the user supplied an explicit -Xmx value. Otherwise freeRam is
# called and z/z2 are rebuilt from the RAM variable it is expected to fill in.
calcXmx () {
  source "${DIR}/calcmem.sh"
  setEnvironment
  parseXmx "$@"
  if [[ $set != 1 ]]; then
    freeRam 3200m 84
    z="-Xmx${RAM}m"
    z2="-Xms${RAM}m"
  fi
}
calcXmx "$@"
163
# Runs the Java class jgi.Dedupe with the prepared JVM flags and classpath.
# Globals read: JNI, EA, EOOM, z, z2 (JVM options) and CP (classpath).
#               EA and EOOM are not assigned in this file; presumably they are
#               set by the helpers sourced from calcmem.sh (confirm there).
# Arguments:    every argument is passed to jgi.Dedupe exactly as received.
# Outputs:      the command line is logged to stderr before it is run.
# Returns:      the exit status of the java process.
#
# The command is held in an array and executed directly, not flattened into a
# string and passed to eval. With eval, any argument or install path containing
# spaces, '>', ';', '*' or similar characters was re-parsed by the shell, which
# broke the argument list and allowed unintended commands to run.
dedupe() {
  # Join the logged words with a single space regardless of the caller's IFS.
  local IFS=$' \t\n'
  # The JVM option variables are left unquoted on purpose: each may be empty
  # (and must then disappear) or hold several space-separated flags.
  # shellcheck disable=SC2206
  local -a cmd=(java $JNI $EA $EOOM $z $z2 -cp "$CP" jgi.Dedupe "$@")
  printf '%s\n' "${cmd[*]}" >&2
  "${cmd[@]}"
}

dedupe "$@"