comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/comparesketch.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 #!/bin/bash
2
3 usage(){
4 echo "
5 Written by Brian Bushnell
6 Last modified Jan 7, 2020
7
8 Description: Compares query sketches to others, and prints their kmer identity.
9 The input can be sketches made by sketch.sh, or fasta/fastq files.
10 It's recommended to first sketch references with sketch.sh for large files,
11 or when taxonomic information is desired.
12
13 Please read bbmap/docs/guides/BBSketchGuide.txt for more information.
14
15 Usage: comparesketch.sh in=<file,file,file...> ref=<file,file,file...>
16 Alternative: comparesketch.sh in=<file,file,file...> file file file
17 Alternative: comparesketch.sh in=<file,file,file...> *.sketch
18 Alternative: comparesketch.sh alltoall *.sketch.gz
19
20 File parameters:
21 in=<file,file...> Sketches or fasta files to compare.
22 out=stdout Comparison output. Can be set to a file instead.
23 outsketch=<file> Optionally write sketch files generated from the input.
24 ref=<file,file...> List of sketches to compare against. Files given without
25 a prefix (ref=) will be treated as references,
26 so you can use *.sketch without ref=.
27 You can also do ref=nt#.sketch to load all numbered files
28 fitting that pattern.
29 On NERSC, you can use these abbreviations (e.g. ref=nt):
30 nt: nt sketches
31 refseq: Refseq sketches
32 silva: Silva sketches
33 img: IMG sketches
34 mito: RefSeq mitochondrial sketches
35 fungi: RefSeq fungi sketches
36 protein: RefSeq prokaryotic amino acid sketches
37 Using an abbreviation automatically sets the blacklist,
38 and k. If the reference is in amino space, the query
39 also be in amino acid space with the flag amino added.
40 If the query is in nucleotide space, use the flag
41 'translate', but this will only work for prokaryotes.
42
43 Blacklist and Whitelist parameters:
44 blacklist=<file> Ignore keys in this sketch file. Additionally, there are
45 built-in blacklists that can be specified:
46 nt: Blacklist for nt
47 refseq: Blacklist for Refseq
48 silva: Blacklist for Silva
49 img: Blacklist for IMG
50 whitelist=f Ignore keys that are not in the index. Requires index=t.
51
52 Sketch-making parameters:
53 mode=perfile Possible modes, for sequence input:
54 single: Generate one sketch.
55 sequence: Generate one sketch per sequence.
56 perfile: Generate one sketch per file.
57 sketchonly=f Don't run comparisons, just write the output sketch file.
58 k=31 Kmer length, 1-32. To maximize sensitivity and
59 specificity, dual kmer lengths may be used: k=31,24
60 Dual kmers are fastest if the shorter is a multiple
61 of 4. Query and reference k must match.
62 samplerate=1 Set to a lower value to sample a fraction of input reads.
63 For raw reads (rather than an assembly), 1-3x coverage
64 gives best results, by reducing error kmers. Somewhat
65 higher is better for high-error-rate data like PacBio.
66 minkeycount=1 Ignore kmers that occur fewer times than this. Values
67 over 1 can be used with raw reads to avoid error kmers.
68 minprob=0.0001 Ignore kmers below this probability of correctness.
69 minqual=0 Ignore kmers spanning bases below this quality.
70 entropy=0.66 Ignore sequence with entropy below this value.
71 merge=f Merge paired reads prior to sketching.
72 amino=f Use amino acid mode. Input should be amino acids.
73 translate=f Call genes and translate to proteins. Input should be
74 nucleotides. Designed for prokaryotes.
75 sixframes=f Translate all 6 frames instead of predicting genes.
76 ssu=t Scan for and retain full-length SSU sequence.
77 printssusequence=f Print the query SSU sequence (JSON mode only).
78
79 Size parameters:
80 size=10000 Desired size of sketches (if not using autosize).
81 mgf=0.01 (maxfraction) Max fraction of genomic kmers to use.
82 minsize=100 Do not generate sketches for genomes smaller than this.
83 autosize=t Use flexible sizing instead of fixed-length. This is
84 nonlinear; a human sketch is only ~6x a bacterial sketch.
85 sizemult=1 Multiply the autosized size of sketches by this factor.
86 Normally a bacterial-size genome will get a sketch size
87 of around 10000; if autosizefactor=2, it would be ~20000.
88 density= If this flag is set (to a number between 0 and 1),
89 autosize and sizemult are ignored, and this fraction of
90 genomic kmers are used. For example, at density=0.001,
91 a 4.5Mbp bacteria will get a 4500-kmer sketch.
92 sketchheapfactor=4 If minkeycount>1, temporarily track this many kmers until
93 counts are known and low-count kmers are discarded.
94
95 Sketch comparing parameters:
96 threads=auto Use this many threads for comparison.
97 index=auto Index the sketches for much faster searching.
98 Requires more memory and adds startup time.
99 Recommended true for many query sketches, false for few.
100 prealloc=f Preallocate the index for greater efficiency.
101 Can be set to a number between 0 and 1 to determine how
102 much of total memory should be used.
103 alltoall (ata) Compare all refs to all. Must be sketches.
104 compareself=f In all-to-all mode, compare a sketch to itself.
105
106 Taxonomy-related parameters:
107 tree=<file> Specify a TaxTree file. On Genepool, use tree=auto.
108 Only necessary for use with printtaxa and level.
109 Assumes comparisons are done against reference sketches
110 with known taxonomy information.
111 level=2 Only report the best record per taxa at this level.
112 Either level names or numbers may be used.
113 0: disabled
114 1: subspecies
115 2: species
116 3: genus
117 ...etc
118 include= Restrict output to organisms in these clades.
119 May be a comma-delimited list of names or NCBI TaxIDs.
120 includelevel=0 Promote the include list to this taxonomic level.
121 For example, include=h.sapiens includelevel=phylum
122 would only include organisms in the same phylum as human.
123 includestring= Only report records whose name contains this string.
124 exclude= Ignore organisms in these clades.
125 May be a comma-delimited list of names or NCBI TaxIDs.
126 excludelevel=0 Promote the exclude list to this taxonomic level.
127 For example, exclude=h.sapiens excludelevel=phylum
128 would exclude all organisms in the same phylum as human.
129 excludestring= Do not records whose name contains this string.
130 minlevel= Use this to restrict comparisons to distantly-related
131 organisms. Intended for finding misclassified organisms
132 using all-to-all comparisons. minlevel=order would only
133 report hits between organisms related at the order level
134 or higher, not between same species or genus.
135 banunclassified=f Ignore organisms descending from nodes like
136 'unclassified Bacteria'
137 banvirus=f Ignore viruses.
138 requiressu=f Ignore records without SSUs.
139 minrefsize=0 Ignore ref sketches smaller than this (unique kmers).
140 minrefsizebases=0 Ignore ref sketches smaller than this (total base pairs).
141
142 Output format:
143 format=2 2: Default format with, per query, one query header line;
144 one column header line; and one reference line per hit.
145 3: One line per hit, with columns query, reference, ANI,
146 and sizeRatio. Useful for all-to-all comparisons.
147 4: JSON (format=json also works).
148 5: Constellation (format=constellation also works).
149 usetaxidname=f For format 3, print the taxID in the name column.
150 usetaxname for format 3, print the taxonomic name in the name column.
151 useimgname For format 3, print the img ID in the name column.
152
153 Output columns (for format=2):
154 printall=f Enable all output columns.
155 printani=t (ani) Print average nucleotide identity estimate.
156 completeness=t Genome completeness estimate.
157 score=f Score (used for sorting the output).
158 printmatches=t Number of kmer matches to reference.
159 printlength=f Number of kmers compared.
160 printtaxid=t NCBI taxID.
161 printimg=f IMG identifier (only for IMG data).
162 printgbases=f Number of genomic bases.
163 printgkmers=f Number of genomic kmers.
164 printgsize=t Estimated number of unique genomic kmers.
165 printgseqs=t Number of sequences (scaffolds/reads).
166 printtaxname=t Name associated with this taxID.
167 printname0=f (pn0) Original seqeuence name.
168 printfname=t Query filename.
169 printtaxa=f Full taxonomy of each record.
170 printcontam=t Print contamination estimate, and factor contaminant kmers
171 into calculations. Kmers are considered contaminant if
172 present in some ref sketch but not the current one.
173 printunique=t Number of matches unique to this reference.
174 printunique2=f Number of matches unique to this reference's taxa.
175 printunique3=f Number of query kmers unique to this reference's taxa,
176 regardless of whether they are in this reference sketch.
177 printnohit=f Number of kmers that don't hit anything.
178 printrefhits=f Average number of ref sketches hit by shared kmers.
179 printgc=f GC content.
180 printucontam=f Contam hits that hit exactly one reference sketch.
181 printcontam2=f Print contamination estimate using only kmer hits
182 to unrelated taxa.
183 contamlevel=species Taxonomic level to use for contam2/unique2/unique3.
184 NOTE: unique2/unique3/contam2/refhits require an index.
185
186 printdepth=f (depth) Print average depth of sketch kmers; intended
187 for shotgun read input.
188 printdepth2=f (depth2) Print depth compensating for genomic repeats.
189 Requires reference sketches to be generated with depth.
190 actualdepth=t If this is false, the raw average count is printed.
191 If true, the raw average (observed depth) is converted
192 to estimated actual depth (including uncovered areas).
193 printvolume=f (volume) Product of average depth and matches.
194 printca=f Print common ancestor, if query taxID is known.
195 printcal=f Print common ancestor tax level, if query taxID is known.
196 recordsperlevel=0 If query TaxID is known, and this is positive, print this
197 many records per common ancestor level.
198
199 Sorting:
200 sortbyscore=t Default sort order is by score, a composite metric.
201 sortbydepth=f Include depth as a factor in sort order.
202 sortbydepth2=f Include depth2 as a factor in sort order.
203 sortbyvolume=f Include volume as a factor in sort order.
204 sortbykid=f Sort strictly by KID.
205 sortbyani=f Sort strictly by ANI/AAI/WKID.
206 sortbyhits=f Sort strictly by the number of kmer hits.
207
208 Other output parameters:
209 minhits=3 (hits) Only report records with at least this many hits.
210 minani=0 (ani) Only report records with at least this ANI (0-1).
211 minwkid=0.0001 (wkid) Only report records with at least this WKID (0-1).
212 anifromwkid=t Calculate ani from wkid. If false, use kid.
213 minbases=0 Ignore ref sketches of sequences shortert than this.
214 minsizeratio=0 Don't compare sketches if the smaller genome is less than
215 this fraction of the size of the larger.
216 records=20 Report at most this many best-matching records.
217 color=family Color records at the family level. color=f will disable.
218 Colors work in most terminals but may cause odd characters
219 to appear in text editors. So, color defaults to f if
220 writing to a file. Requires the taxtree to be loaded.
221 intersect=f Print sketch intersections. delta=f is suggested.
222
223 Metadata flags (optional, for the query sketch header):
224 taxid=-1 Set the NCBI taxid.
225 imgid=-1 Set the IMG id.
226 spid=-1 Set the JGI sequencing project id.
227 name= Set the name (taxname).
228 name0= Set name0 (normally the first sequence header).
229 fname= Set fname (normally the file name).
230 meta_= Set an arbitrary metadata field.
231 For example, meta_Month=March.
232
233 Other parameters:
234 requiredmeta= (rmeta) Required optional metadata values. For example:
235 rmeta=subunit:ssu,source:silva
236 bannedmeta= (bmeta) Forbidden optional metadata values.
237
238
239 Java Parameters:
240 -Xmx This will set Java's memory usage, overriding autodetection.
241 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
242 The max is typically 85% of physical memory.
243 -eoom This flag will cause the process to exit if an
244 out-of-memory exception occurs. Requires Java 8u92+.
245 -da Disable assertions.
246
247 For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt.
248 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
249 "
250 }
251
252 #This block allows symlinked shellscripts to correctly set classpath.
253 pushd . > /dev/null
254 DIR="${BASH_SOURCE[0]}"
255 while [ -h "$DIR" ]; do
256 cd "$(dirname "$DIR")"
257 DIR="$(readlink "$(basename "$DIR")")"
258 done
259 cd "$(dirname "$DIR")"
260 DIR="$(pwd)/"
261 popd > /dev/null
262
263 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
264 CP="$DIR""current/"
265
266 z="-Xmx4g"
267 z2="-Xms4g"
268 set=0
269
270 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
271 usage
272 exit
273 fi
274
275 calcXmx () {
276 source "$DIR""/calcmem.sh"
277 setEnvironment
278 parseXmx "$@"
279 if [[ $set == 1 ]]; then
280 return
281 fi
282 freeRam 3200m 84
283 z="-Xmx${RAM}m"
284 z2="-Xms${RAM}m"
285 }
286 calcXmx "$@"
287
288 comparesketch() {
289 local CMD="java $EA $EOOM $z $z2 -cp $CP sketch.CompareSketch $@"
290 # echo $CMD >&2
291 eval $CMD
292 }
293
294 comparesketch "$@"