Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/comparesketch.sh @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 #!/bin/bash | |
2 | |
3 usage(){ | |
4 echo " | |
5 Written by Brian Bushnell | |
6 Last modified Jan 7, 2020 | |
7 | |
8 Description: Compares query sketches to others, and prints their kmer identity. | |
9 The input can be sketches made by sketch.sh, or fasta/fastq files. | |
10 It's recommended to first sketch references with sketch.sh for large files, | |
11 or when taxonomic information is desired. | |
12 | |
13 Please read bbmap/docs/guides/BBSketchGuide.txt for more information. | |
14 | |
15 Usage: comparesketch.sh in=<file,file,file...> ref=<file,file,file...> | |
16 Alternative: comparesketch.sh in=<file,file,file...> file file file | |
17 Alternative: comparesketch.sh in=<file,file,file...> *.sketch | |
18 Alternative: comparesketch.sh alltoall *.sketch.gz | |
19 | |
20 File parameters: | |
21 in=<file,file...> Sketches or fasta files to compare. | |
22 out=stdout Comparison output. Can be set to a file instead. | |
23 outsketch=<file> Optionally write sketch files generated from the input. | |
24 ref=<file,file...> List of sketches to compare against. Files given without | |
25 a prefix (ref=) will be treated as references, | |
26 so you can use *.sketch without ref=. | |
27 You can also do ref=nt#.sketch to load all numbered files | |
28 fitting that pattern. | |
29 On NERSC, you can use these abbreviations (e.g. ref=nt): | |
30 nt: nt sketches | |
31 refseq: Refseq sketches | |
32 silva: Silva sketches | |
33 img: IMG sketches | |
34 mito: RefSeq mitochondrial sketches | |
35 fungi: RefSeq fungi sketches | |
36 protein: RefSeq prokaryotic amino acid sketches | |
37 Using an abbreviation automatically sets the blacklist, | |
38 and k. If the reference is in amino space, the query | |
39 also be in amino acid space with the flag amino added. | |
40 If the query is in nucleotide space, use the flag | |
41 'translate', but this will only work for prokaryotes. | |
42 | |
43 Blacklist and Whitelist parameters: | |
44 blacklist=<file> Ignore keys in this sketch file. Additionally, there are | |
45 built-in blacklists that can be specified: | |
46 nt: Blacklist for nt | |
47 refseq: Blacklist for Refseq | |
48 silva: Blacklist for Silva | |
49 img: Blacklist for IMG | |
50 whitelist=f Ignore keys that are not in the index. Requires index=t. | |
51 | |
52 Sketch-making parameters: | |
53 mode=perfile Possible modes, for sequence input: | |
54 single: Generate one sketch. | |
55 sequence: Generate one sketch per sequence. | |
56 perfile: Generate one sketch per file. | |
57 sketchonly=f Don't run comparisons, just write the output sketch file. | |
58 k=31 Kmer length, 1-32. To maximize sensitivity and | |
59 specificity, dual kmer lengths may be used: k=31,24 | |
60 Dual kmers are fastest if the shorter is a multiple | |
61 of 4. Query and reference k must match. | |
62 samplerate=1 Set to a lower value to sample a fraction of input reads. | |
63 For raw reads (rather than an assembly), 1-3x coverage | |
64 gives best results, by reducing error kmers. Somewhat | |
65 higher is better for high-error-rate data like PacBio. | |
66 minkeycount=1 Ignore kmers that occur fewer times than this. Values | |
67 over 1 can be used with raw reads to avoid error kmers. | |
68 minprob=0.0001 Ignore kmers below this probability of correctness. | |
69 minqual=0 Ignore kmers spanning bases below this quality. | |
70 entropy=0.66 Ignore sequence with entropy below this value. | |
71 merge=f Merge paired reads prior to sketching. | |
72 amino=f Use amino acid mode. Input should be amino acids. | |
73 translate=f Call genes and translate to proteins. Input should be | |
74 nucleotides. Designed for prokaryotes. | |
75 sixframes=f Translate all 6 frames instead of predicting genes. | |
76 ssu=t Scan for and retain full-length SSU sequence. | |
77 printssusequence=f Print the query SSU sequence (JSON mode only). | |
78 | |
79 Size parameters: | |
80 size=10000 Desired size of sketches (if not using autosize). | |
81 mgf=0.01 (maxfraction) Max fraction of genomic kmers to use. | |
82 minsize=100 Do not generate sketches for genomes smaller than this. | |
83 autosize=t Use flexible sizing instead of fixed-length. This is | |
84 nonlinear; a human sketch is only ~6x a bacterial sketch. | |
85 sizemult=1 Multiply the autosized size of sketches by this factor. | |
86 Normally a bacterial-size genome will get a sketch size | |
87 of around 10000; if autosizefactor=2, it would be ~20000. | |
88 density= If this flag is set (to a number between 0 and 1), | |
89 autosize and sizemult are ignored, and this fraction of | |
90 genomic kmers are used. For example, at density=0.001, | |
91 a 4.5Mbp bacteria will get a 4500-kmer sketch. | |
92 sketchheapfactor=4 If minkeycount>1, temporarily track this many kmers until | |
93 counts are known and low-count kmers are discarded. | |
94 | |
95 Sketch comparing parameters: | |
96 threads=auto Use this many threads for comparison. | |
97 index=auto Index the sketches for much faster searching. | |
98 Requires more memory and adds startup time. | |
99 Recommended true for many query sketches, false for few. | |
100 prealloc=f Preallocate the index for greater efficiency. | |
101 Can be set to a number between 0 and 1 to determine how | |
102 much of total memory should be used. | |
103 alltoall (ata) Compare all refs to all. Must be sketches. | |
104 compareself=f In all-to-all mode, compare a sketch to itself. | |
105 | |
106 Taxonomy-related parameters: | |
107 tree=<file> Specify a TaxTree file. On Genepool, use tree=auto. | |
108 Only necessary for use with printtaxa and level. | |
109 Assumes comparisons are done against reference sketches | |
110 with known taxonomy information. | |
111 level=2 Only report the best record per taxa at this level. | |
112 Either level names or numbers may be used. | |
113 0: disabled | |
114 1: subspecies | |
115 2: species | |
116 3: genus | |
117 ...etc | |
118 include= Restrict output to organisms in these clades. | |
119 May be a comma-delimited list of names or NCBI TaxIDs. | |
120 includelevel=0 Promote the include list to this taxonomic level. | |
121 For example, include=h.sapiens includelevel=phylum | |
122 would only include organisms in the same phylum as human. | |
123 includestring= Only report records whose name contains this string. | |
124 exclude= Ignore organisms in these clades. | |
125 May be a comma-delimited list of names or NCBI TaxIDs. | |
126 excludelevel=0 Promote the exclude list to this taxonomic level. | |
127 For example, exclude=h.sapiens excludelevel=phylum | |
128 would exclude all organisms in the same phylum as human. | |
129 excludestring= Do not records whose name contains this string. | |
130 minlevel= Use this to restrict comparisons to distantly-related | |
131 organisms. Intended for finding misclassified organisms | |
132 using all-to-all comparisons. minlevel=order would only | |
133 report hits between organisms related at the order level | |
134 or higher, not between same species or genus. | |
135 banunclassified=f Ignore organisms descending from nodes like | |
136 'unclassified Bacteria' | |
137 banvirus=f Ignore viruses. | |
138 requiressu=f Ignore records without SSUs. | |
139 minrefsize=0 Ignore ref sketches smaller than this (unique kmers). | |
140 minrefsizebases=0 Ignore ref sketches smaller than this (total base pairs). | |
141 | |
142 Output format: | |
143 format=2 2: Default format with, per query, one query header line; | |
144 one column header line; and one reference line per hit. | |
145 3: One line per hit, with columns query, reference, ANI, | |
146 and sizeRatio. Useful for all-to-all comparisons. | |
147 4: JSON (format=json also works). | |
148 5: Constellation (format=constellation also works). | |
149 usetaxidname=f For format 3, print the taxID in the name column. | |
150 usetaxname for format 3, print the taxonomic name in the name column. | |
151 useimgname For format 3, print the img ID in the name column. | |
152 | |
153 Output columns (for format=2): | |
154 printall=f Enable all output columns. | |
155 printani=t (ani) Print average nucleotide identity estimate. | |
156 completeness=t Genome completeness estimate. | |
157 score=f Score (used for sorting the output). | |
158 printmatches=t Number of kmer matches to reference. | |
159 printlength=f Number of kmers compared. | |
160 printtaxid=t NCBI taxID. | |
161 printimg=f IMG identifier (only for IMG data). | |
162 printgbases=f Number of genomic bases. | |
163 printgkmers=f Number of genomic kmers. | |
164 printgsize=t Estimated number of unique genomic kmers. | |
165 printgseqs=t Number of sequences (scaffolds/reads). | |
166 printtaxname=t Name associated with this taxID. | |
167 printname0=f (pn0) Original seqeuence name. | |
168 printfname=t Query filename. | |
169 printtaxa=f Full taxonomy of each record. | |
170 printcontam=t Print contamination estimate, and factor contaminant kmers | |
171 into calculations. Kmers are considered contaminant if | |
172 present in some ref sketch but not the current one. | |
173 printunique=t Number of matches unique to this reference. | |
174 printunique2=f Number of matches unique to this reference's taxa. | |
175 printunique3=f Number of query kmers unique to this reference's taxa, | |
176 regardless of whether they are in this reference sketch. | |
177 printnohit=f Number of kmers that don't hit anything. | |
178 printrefhits=f Average number of ref sketches hit by shared kmers. | |
179 printgc=f GC content. | |
180 printucontam=f Contam hits that hit exactly one reference sketch. | |
181 printcontam2=f Print contamination estimate using only kmer hits | |
182 to unrelated taxa. | |
183 contamlevel=species Taxonomic level to use for contam2/unique2/unique3. | |
184 NOTE: unique2/unique3/contam2/refhits require an index. | |
185 | |
186 printdepth=f (depth) Print average depth of sketch kmers; intended | |
187 for shotgun read input. | |
188 printdepth2=f (depth2) Print depth compensating for genomic repeats. | |
189 Requires reference sketches to be generated with depth. | |
190 actualdepth=t If this is false, the raw average count is printed. | |
191 If true, the raw average (observed depth) is converted | |
192 to estimated actual depth (including uncovered areas). | |
193 printvolume=f (volume) Product of average depth and matches. | |
194 printca=f Print common ancestor, if query taxID is known. | |
195 printcal=f Print common ancestor tax level, if query taxID is known. | |
196 recordsperlevel=0 If query TaxID is known, and this is positive, print this | |
197 many records per common ancestor level. | |
198 | |
199 Sorting: | |
200 sortbyscore=t Default sort order is by score, a composite metric. | |
201 sortbydepth=f Include depth as a factor in sort order. | |
202 sortbydepth2=f Include depth2 as a factor in sort order. | |
203 sortbyvolume=f Include volume as a factor in sort order. | |
204 sortbykid=f Sort strictly by KID. | |
205 sortbyani=f Sort strictly by ANI/AAI/WKID. | |
206 sortbyhits=f Sort strictly by the number of kmer hits. | |
207 | |
208 Other output parameters: | |
209 minhits=3 (hits) Only report records with at least this many hits. | |
210 minani=0 (ani) Only report records with at least this ANI (0-1). | |
211 minwkid=0.0001 (wkid) Only report records with at least this WKID (0-1). | |
212 anifromwkid=t Calculate ani from wkid. If false, use kid. | |
213 minbases=0 Ignore ref sketches of sequences shortert than this. | |
214 minsizeratio=0 Don't compare sketches if the smaller genome is less than | |
215 this fraction of the size of the larger. | |
216 records=20 Report at most this many best-matching records. | |
217 color=family Color records at the family level. color=f will disable. | |
218 Colors work in most terminals but may cause odd characters | |
219 to appear in text editors. So, color defaults to f if | |
220 writing to a file. Requires the taxtree to be loaded. | |
221 intersect=f Print sketch intersections. delta=f is suggested. | |
222 | |
223 Metadata flags (optional, for the query sketch header): | |
224 taxid=-1 Set the NCBI taxid. | |
225 imgid=-1 Set the IMG id. | |
226 spid=-1 Set the JGI sequencing project id. | |
227 name= Set the name (taxname). | |
228 name0= Set name0 (normally the first sequence header). | |
229 fname= Set fname (normally the file name). | |
230 meta_= Set an arbitrary metadata field. | |
231 For example, meta_Month=March. | |
232 | |
233 Other parameters: | |
234 requiredmeta= (rmeta) Required optional metadata values. For example: | |
235 rmeta=subunit:ssu,source:silva | |
236 bannedmeta= (bmeta) Forbidden optional metadata values. | |
237 | |
238 | |
239 Java Parameters: | |
240 -Xmx This will set Java's memory usage, overriding autodetection. | |
241 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. | |
242 The max is typically 85% of physical memory. | |
243 -eoom This flag will cause the process to exit if an | |
244 out-of-memory exception occurs. Requires Java 8u92+. | |
245 -da Disable assertions. | |
246 | |
247 For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt. | |
248 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems. | |
249 " | |
250 } | |
251 | |
252 #This block allows symlinked shellscripts to correctly set classpath. | |
253 pushd . > /dev/null | |
254 DIR="${BASH_SOURCE[0]}" | |
255 while [ -h "$DIR" ]; do | |
256 cd "$(dirname "$DIR")" | |
257 DIR="$(readlink "$(basename "$DIR")")" | |
258 done | |
259 cd "$(dirname "$DIR")" | |
260 DIR="$(pwd)/" | |
261 popd > /dev/null | |
262 | |
263 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" | |
264 CP="$DIR""current/" | |
265 | |
266 z="-Xmx4g" | |
267 z2="-Xms4g" | |
268 set=0 | |
269 | |
270 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then | |
271 usage | |
272 exit | |
273 fi | |
274 | |
275 calcXmx () { | |
276 source "$DIR""/calcmem.sh" | |
277 setEnvironment | |
278 parseXmx "$@" | |
279 if [[ $set == 1 ]]; then | |
280 return | |
281 fi | |
282 freeRam 3200m 84 | |
283 z="-Xmx${RAM}m" | |
284 z2="-Xms${RAM}m" | |
285 } | |
286 calcXmx "$@" | |
287 | |
288 comparesketch() { | |
289 local CMD="java $EA $EOOM $z $z2 -cp $CP sketch.CompareSketch $@" | |
290 # echo $CMD >&2 | |
291 eval $CMD | |
292 } | |
293 | |
294 comparesketch "$@" |