Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/dedupe.sh @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 #!/bin/bash | |
2 | |
# usage: prints the multi-line help text below (description, example, and parameter reference) to stdout via a single echo. Takes no arguments. The script calls it when the first argument is empty, -h, or --help.
3 usage(){ | |
4 echo " | |
5 Written by Brian Bushnell and Jonathan Rood | |
6 Last modified February 19, 2020 | |
7 | |
8 Description: Accepts one or more files containing sets of sequences (reads or scaffolds). | |
9 Removes duplicate sequences, which may be specified to be exact matches, subsequences, or sequences within some percent identity. | |
10 Can also find overlapping sequences and group them into clusters. | |
11 Please read bbmap/docs/guides/DedupeGuide.txt for more information. | |
12 | |
13 Usage: dedupe.sh in=<file or stdin> out=<file or stdout> | |
14 | |
15 An example of running Dedupe for clustering short reads: | |
16 dedupe.sh in=x.fq am=f ac=f fo c pc rnc=f mcs=4 mo=100 s=1 pto cc qin=33 csf=stats.txt pattern=cluster_%.fq dot=graph.dot | |
17 | |
18 Input may be fasta or fastq, compressed or uncompressed. | |
19 Output may be stdout or a file. With no output parameter, data will be written to stdout. | |
20 If 'out=null', there will be no output, but statistics will still be printed. | |
21 You can also use 'dedupe <infile> <outfile>' without the 'in=' and 'out='. | |
22 | |
23 I/O parameters: | |
24 in=<file,file> A single file or a comma-delimited list of files. | |
25 out=<file> Destination for all output contigs. | |
26 pattern=<file> Clusters will be written to individual files, where the '%' symbol in the pattern is replaced by cluster number. | |
27 outd=<file> Optional; removed duplicates will go here. | |
28 csf=<file> (clusterstatsfile) Write a list of cluster names and sizes. | |
29 dot=<file> (graph) Write a graph in dot format. Requires 'fo' and 'pc' flags. | |
30 threads=auto (t) Set number of threads to use; default is number of logical processors. | |
31 overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file. | |
32 showspeed=t (ss) Set to 'f' to suppress display of processing speed. | |
33 minscaf=0 (ms) Ignore contigs/scaffolds shorter than this. | |
34 interleaved=auto If true, forces fastq input to be paired and interleaved. | |
35 ziplevel=2 Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster. | |
36 | |
37 Output format parameters: | |
38 storename=t (sn) Store scaffold names (set false to save memory). | |
39 #addpairnum=f Add .1 and .2 to numeric id of read1 and read2. | |
40 storequality=t (sq) Store quality values for fastq assemblies (set false to save memory). | |
41 uniquenames=t (un) Ensure all output scaffolds have unique names. Uses more memory. | |
42 mergenames=f When a sequence absorbs another, concatenate their headers. | |
43 mergedelimiter=> Delimiter between merged headers. Can be a symbol name like greaterthan. | |
44 numbergraphnodes=t (ngn) Label dot graph nodes with read numbers rather than read names. | |
45 sort=f Sort output (otherwise it will be random). Options: | |
46 length: Sort by length | |
47 quality: Sort by quality | |
48 name: Sort by name | |
49 id: Sort by input order | |
50 ascending=f Sort in ascending order. | |
51 ordered=f Output sequences in input order. Equivalent to sort=id ascending. | |
52 renameclusters=f (rnc) Rename contigs to indicate which cluster they are in. | |
53 printlengthinedges=f (ple) Print the length of contigs in edges. | |
54 | |
55 Processing parameters: | |
56 absorbrc=t (arc) Absorb reverse-complements as well as normal orientation. | |
57 absorbmatch=t (am) Absorb exact matches of contigs. | |
58 absorbcontainment=t (ac) Absorb full containments of contigs. | |
59 #absorboverlap=f (ao) Absorb (merge) non-contained overlaps of contigs (TODO). | |
60 findoverlap=f (fo) Find overlaps between contigs (containments and non-containments). Necessary for clustering. | |
61 uniqueonly=f (uo) If true, all copies of duplicate reads will be discarded, rather than keeping 1. | |
62 rmn=f (requirematchingnames) If true, both names and sequence must match. | |
63 usejni=f (jni) Do alignments in C code, which is faster, if an edit distance is allowed. | |
64 This will require compiling the C code; details are in /jni/README.txt. | |
65 | |
66 Subset parameters: | |
67 subsetcount=1 (sstc) Number of subsets used to process the data; higher uses less memory. | |
68 subset=0 (sst) Only process reads whose ((ID%subsetcount)==subset). | |
69 | |
70 Clustering parameters: | |
71 cluster=f (c) Group overlapping contigs into clusters. | |
72 pto=f (preventtransitiveoverlaps) Do not look for new edges between nodes in the same cluster. | |
73 minclustersize=1 (mcs) Do not output clusters smaller than this. | |
74 pbr=f (pickbestrepresentative) Only output the single highest-quality read per cluster. | |
75 | |
76 Cluster postprocessing parameters: | |
77 processclusters=f (pc) Run the cluster processing phase, which performs the selected operations in this category. | |
78 For example, pc AND cc must be enabled to perform cc. | |
79 fixmultijoins=t (fmj) Remove redundant overlaps between the same two contigs. | |
80 removecycles=t (rc) Remove all cycles so clusters form trees. | |
81 cc=t (canonicizeclusters) Flip contigs so clusters have a single orientation. | |
82 fcc=f (fixcanoncontradictions) Truncate graph at nodes with canonization disputes. | |
83 foc=f (fixoffsetcontradictions) Truncate graph at nodes with offset disputes. | |
84 mst=f (maxspanningtree) Remove cyclic edges, leaving only the longest edges that form a tree. | |
85 | |
86 Overlap Detection Parameters | |
87 exact=t (ex) Only allow exact symbol matches. When false, an 'N' will match any symbol. | |
88 touppercase=t (tuc) Convert input bases to upper-case; otherwise, lower-case will not match. | |
89 maxsubs=0 (s) Allow up to this many mismatches (substitutions only, no indels). May be set higher than maxedits. | |
90 maxedits=0 (e) Allow up to this many edits (subs or indels). Higher is slower. | |
91 minidentity=100 (mid) Absorb contained sequences with percent identity of at least this (includes indels). | |
92 minlengthpercent=0 (mlp) Smaller contig must be at least this percent of larger contig's length to be absorbed. | |
93 minoverlappercent=0 (mop) Overlap must be at least this percent of smaller contig's length to cluster and merge. | |
94 minoverlap=200 (mo) Overlap must be at least this long to cluster and merge. | |
95 depthratio=0 (dr) When non-zero, overlaps will only be formed between reads with a depth ratio of at most this. | |
96 Should be above 1. Depth is determined by parsing the read names; this information can be added | |
97 by running KmerNormalize (khist.sh, bbnorm.sh, or ecc.sh) with the flag 'rename' | |
98 k=31 Seed length used for finding containments and overlaps. Anything shorter than k will not be found. | |
99 numaffixmaps=1 (nam) Number of prefixes/suffixes to index per contig. Higher is more sensitive, if edits are allowed. | |
100 hashns=f Set to true to search for matches using kmers containing Ns. Can lead to extreme slowdown in some cases. | |
101 #ignoreaffix1=f (ia1) Ignore first affix (for testing). | |
102 #storesuffix=f (ss) Store suffix as well as prefix. Automatically set to true when doing inexact matches. | |
103 | |
104 Other Parameters | |
105 qtrim=f Set to qtrim=rl to trim leading and trailing Ns. | |
106 trimq=6 Quality trim level. | |
107 forcetrimleft=-1 (ftl) If positive, trim bases to the left of this position (exclusive, 0-based). | |
108 forcetrimright=-1 (ftr) If positive, trim bases to the right of this position (exclusive, 0-based). | |
109 | |
110 Note on Proteins / Amino Acids | |
111 Dedupe supports amino acid space via the 'amino' flag. This also changes the default kmer length to 10. | |
112 In amino acid mode, all flags related to canonicity and reverse-complementation are disabled, | |
113 and nam (numaffixmaps) is currently limited to 2 per tip. | |
114 | |
115 Java Parameters: | |
116 -Xmx This will set Java's memory usage, overriding autodetection. | |
117 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. | |
118 The max is typically 85% of physical memory. | |
119 -eoom This flag will cause the process to exit if an out-of-memory exception occurs. Requires Java 8u92+. | |
120 -da Disable assertions. | |
121 | |
122 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems. | |
123 " | |
124 } | |
125 | |
# Resolve the directory that really contains this script, following any chain
# of symlinks, so the classpath can be built relative to it. The result is
# stored in DIR as an absolute path with a trailing slash.
# The caller's working directory is saved with pushd and restored with popd.
# Every directory change is checked: if one fails we abort instead of silently
# deriving DIR from whatever directory we happen to be in.
pushd . > /dev/null || exit 1
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
  # Move next to the link first so a relative link target resolves correctly.
  cd "$(dirname "$DIR")" || exit 1
  DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")" || exit 1
DIR="$(pwd)/"
popd > /dev/null || exit 1
136 | |
137 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" | |
# Classpath handed to 'java -cp'; DIR already carries its trailing slash.
CP="${DIR}current/"

# JNI library path flag. It is computed and then immediately cleared, so no
# -Djava.library.path option reaches the JVM unless the second line is removed.
JNI="-Djava.library.path=${DIR}jni/"
JNI=""

# Default JVM heap flags (maximum and initial size). calcXmx overwrites them
# unless 'set' has become 1 by then.
# NOTE(review): 'set' is presumably flipped by parseXmx in calcmem.sh when the
# user passes an explicit -Xmx; confirm there.
z="-Xmx1g"
z2="-Xms1g"
set=0
145 | |
# Show the help text and stop when the script is called without a first
# argument, or with -h / --help as the first argument.
case "$1" in
  "" | -h | --help)
    usage
    exit
    ;;
esac
150 | |
# calcXmx: choose the JVM heap flags for this run.
# Arguments: the script's command-line arguments, forwarded to parseXmx.
# Globals:   DIR (read), set (read), RAM (read), z and z2 (written).
# Returns:   1 if calcmem.sh cannot be loaded, in which case z and z2 are left
#            untouched; otherwise the status of the last command run.
# NOTE(review): setEnvironment, parseXmx and freeRam come from calcmem.sh.
# Presumably parseXmx sets 'set', z and z2 from an explicit -Xmx argument and
# freeRam stores a megabyte count in RAM; confirm against calcmem.sh.
calcXmx () {
  # DIR already ends in '/', so no extra separator is needed.
  if ! source "${DIR}calcmem.sh"; then
    echo "Warning: could not load ${DIR}calcmem.sh; keeping current memory settings." >&2
    return 1
  fi
  setEnvironment
  parseXmx "$@"
  # An explicit memory request wins over autodetection.
  if [[ $set == 1 ]]; then
    return
  fi
  freeRam 3200m 84
  # Only override the current flags when a value was actually produced;
  # an empty RAM would otherwise yield the invalid options -Xmxm / -Xmsm.
  if [[ -n "$RAM" ]]; then
    z="-Xmx${RAM}m"
    z2="-Xms${RAM}m"
  fi
}
162 calcXmx "$@" | |
163 | |
# dedupe: launch the Java class jgi.Dedupe with the computed JVM options.
# Arguments: the script's command-line arguments, passed through unchanged.
# Globals:   JNI, EA, EOOM, z, z2, CP (all read).
# Outputs:   writes the command line to stderr before running it.
# Returns:   the exit status of java.
# The command is held in an array and executed directly rather than being
# flattened into a string and eval'ed. That way arguments and install paths
# containing spaces, quotes, ';', '$(...)' or glob characters reach Java
# exactly as the caller supplied them and are never re-parsed as shell code.
# NOTE(review): EA and EOOM are not set in this file; presumably calcmem.sh
# sets them. Confirm there.
dedupe() {
  local -a cmd=(java)
  # The option variables may be empty or hold several flags, so they are left
  # unquoted on purpose: empty ones vanish and multi-flag ones split.
  # shellcheck disable=SC2206
  cmd+=($JNI $EA $EOOM $z $z2)
  cmd+=(-cp "$CP" jgi.Dedupe "$@")
  echo "${cmd[*]}" >&2
  "${cmd[@]}"
}
169 | |
170 dedupe "$@" |