jpayne@69
|
1 #!/bin/bash
|
jpayne@69
|
2
|
jpayne@69
|
3 usage(){
|
jpayne@69
|
4 echo "
|
jpayne@69
|
5 Written by Brian Bushnell
|
jpayne@69
|
6 Last modified Jan 7, 2020
|
jpayne@69
|
7
|
jpayne@69
|
8 Description: Compares query sketches to others, and prints their kmer identity.
|
jpayne@69
|
9 The input can be sketches made by sketch.sh, or fasta/fastq files.
|
jpayne@69
|
10 It's recommended to first sketch references with sketch.sh for large files,
|
jpayne@69
|
11 or when taxonomic information is desired.
|
jpayne@69
|
12
|
jpayne@69
|
13 Please read bbmap/docs/guides/BBSketchGuide.txt for more information.
|
jpayne@69
|
14
|
jpayne@69
|
15 Usage: comparesketch.sh in=<file,file,file...> ref=<file,file,file...>
|
jpayne@69
|
16 Alternative: comparesketch.sh in=<file,file,file...> file file file
|
jpayne@69
|
17 Alternative: comparesketch.sh in=<file,file,file...> *.sketch
|
jpayne@69
|
18 Alternative: comparesketch.sh alltoall *.sketch.gz
|
jpayne@69
|
19
|
jpayne@69
|
20 File parameters:
|
jpayne@69
|
21 in=<file,file...> Sketches or fasta files to compare.
|
jpayne@69
|
22 out=stdout Comparison output. Can be set to a file instead.
|
jpayne@69
|
23 outsketch=<file> Optionally write sketch files generated from the input.
|
jpayne@69
|
24 ref=<file,file...> List of sketches to compare against. Files given without
|
jpayne@69
|
25 a prefix (ref=) will be treated as references,
|
jpayne@69
|
26 so you can use *.sketch without ref=.
|
jpayne@69
|
27 You can also do ref=nt#.sketch to load all numbered files
|
jpayne@69
|
28 fitting that pattern.
|
jpayne@69
|
29 On NERSC, you can use these abbreviations (e.g. ref=nt):
|
jpayne@69
|
30 nt: nt sketches
|
jpayne@69
|
31 refseq: Refseq sketches
|
jpayne@69
|
32 silva: Silva sketches
|
jpayne@69
|
33 img: IMG sketches
|
jpayne@69
|
34 mito: RefSeq mitochondrial sketches
|
jpayne@69
|
35 fungi: RefSeq fungi sketches
|
jpayne@69
|
36 protein: RefSeq prokaryotic amino acid sketches
|
jpayne@69
|
37 Using an abbreviation automatically sets the blacklist,
|
jpayne@69
|
38 and k. If the reference is in amino space, the query
|
jpayne@69
|
39 also be in amino acid space with the flag amino added.
|
jpayne@69
|
40 If the query is in nucleotide space, use the flag
|
jpayne@69
|
41 'translate', but this will only work for prokaryotes.
|
jpayne@69
|
42
|
jpayne@69
|
43 Blacklist and Whitelist parameters:
|
jpayne@69
|
44 blacklist=<file> Ignore keys in this sketch file. Additionally, there are
|
jpayne@69
|
45 built-in blacklists that can be specified:
|
jpayne@69
|
46 nt: Blacklist for nt
|
jpayne@69
|
47 refseq: Blacklist for Refseq
|
jpayne@69
|
48 silva: Blacklist for Silva
|
jpayne@69
|
49 img: Blacklist for IMG
|
jpayne@69
|
50 whitelist=f Ignore keys that are not in the index. Requires index=t.
|
jpayne@69
|
51
|
jpayne@69
|
52 Sketch-making parameters:
|
jpayne@69
|
53 mode=perfile Possible modes, for sequence input:
|
jpayne@69
|
54 single: Generate one sketch.
|
jpayne@69
|
55 sequence: Generate one sketch per sequence.
|
jpayne@69
|
56 perfile: Generate one sketch per file.
|
jpayne@69
|
57 sketchonly=f Don't run comparisons, just write the output sketch file.
|
jpayne@69
|
58 k=31 Kmer length, 1-32. To maximize sensitivity and
|
jpayne@69
|
59 specificity, dual kmer lengths may be used: k=31,24
|
jpayne@69
|
60 Dual kmers are fastest if the shorter is a multiple
|
jpayne@69
|
61 of 4. Query and reference k must match.
|
jpayne@69
|
62 samplerate=1 Set to a lower value to sample a fraction of input reads.
|
jpayne@69
|
63 For raw reads (rather than an assembly), 1-3x coverage
|
jpayne@69
|
64 gives best results, by reducing error kmers. Somewhat
|
jpayne@69
|
65 higher is better for high-error-rate data like PacBio.
|
jpayne@69
|
66 minkeycount=1 Ignore kmers that occur fewer times than this. Values
|
jpayne@69
|
67 over 1 can be used with raw reads to avoid error kmers.
|
jpayne@69
|
68 minprob=0.0001 Ignore kmers below this probability of correctness.
|
jpayne@69
|
69 minqual=0 Ignore kmers spanning bases below this quality.
|
jpayne@69
|
70 entropy=0.66 Ignore sequence with entropy below this value.
|
jpayne@69
|
71 merge=f Merge paired reads prior to sketching.
|
jpayne@69
|
72 amino=f Use amino acid mode. Input should be amino acids.
|
jpayne@69
|
73 translate=f Call genes and translate to proteins. Input should be
|
jpayne@69
|
74 nucleotides. Designed for prokaryotes.
|
jpayne@69
|
75 sixframes=f Translate all 6 frames instead of predicting genes.
|
jpayne@69
|
76 ssu=t Scan for and retain full-length SSU sequence.
|
jpayne@69
|
77 printssusequence=f Print the query SSU sequence (JSON mode only).
|
jpayne@69
|
78
|
jpayne@69
|
79 Size parameters:
|
jpayne@69
|
80 size=10000 Desired size of sketches (if not using autosize).
|
jpayne@69
|
81 mgf=0.01 (maxfraction) Max fraction of genomic kmers to use.
|
jpayne@69
|
82 minsize=100 Do not generate sketches for genomes smaller than this.
|
jpayne@69
|
83 autosize=t Use flexible sizing instead of fixed-length. This is
|
jpayne@69
|
84 nonlinear; a human sketch is only ~6x a bacterial sketch.
|
jpayne@69
|
85 sizemult=1 Multiply the autosized size of sketches by this factor.
|
jpayne@69
|
86 Normally a bacterial-size genome will get a sketch size
|
jpayne@69
|
87 of around 10000; if autosizefactor=2, it would be ~20000.
|
jpayne@69
|
88 density= If this flag is set (to a number between 0 and 1),
|
jpayne@69
|
89 autosize and sizemult are ignored, and this fraction of
|
jpayne@69
|
90 genomic kmers are used. For example, at density=0.001,
|
jpayne@69
|
91 a 4.5Mbp bacteria will get a 4500-kmer sketch.
|
jpayne@69
|
92 sketchheapfactor=4 If minkeycount>1, temporarily track this many kmers until
|
jpayne@69
|
93 counts are known and low-count kmers are discarded.
|
jpayne@69
|
94
|
jpayne@69
|
95 Sketch comparing parameters:
|
jpayne@69
|
96 threads=auto Use this many threads for comparison.
|
jpayne@69
|
97 index=auto Index the sketches for much faster searching.
|
jpayne@69
|
98 Requires more memory and adds startup time.
|
jpayne@69
|
99 Recommended true for many query sketches, false for few.
|
jpayne@69
|
100 prealloc=f Preallocate the index for greater efficiency.
|
jpayne@69
|
101 Can be set to a number between 0 and 1 to determine how
|
jpayne@69
|
102 much of total memory should be used.
|
jpayne@69
|
103 alltoall (ata) Compare all refs to all. Must be sketches.
|
jpayne@69
|
104 compareself=f In all-to-all mode, compare a sketch to itself.
|
jpayne@69
|
105
|
jpayne@69
|
106 Taxonomy-related parameters:
|
jpayne@69
|
107 tree=<file> Specify a TaxTree file. On Genepool, use tree=auto.
|
jpayne@69
|
108 Only necessary for use with printtaxa and level.
|
jpayne@69
|
109 Assumes comparisons are done against reference sketches
|
jpayne@69
|
110 with known taxonomy information.
|
jpayne@69
|
111 level=2 Only report the best record per taxa at this level.
|
jpayne@69
|
112 Either level names or numbers may be used.
|
jpayne@69
|
113 0: disabled
|
jpayne@69
|
114 1: subspecies
|
jpayne@69
|
115 2: species
|
jpayne@69
|
116 3: genus
|
jpayne@69
|
117 ...etc
|
jpayne@69
|
118 include= Restrict output to organisms in these clades.
|
jpayne@69
|
119 May be a comma-delimited list of names or NCBI TaxIDs.
|
jpayne@69
|
120 includelevel=0 Promote the include list to this taxonomic level.
|
jpayne@69
|
121 For example, include=h.sapiens includelevel=phylum
|
jpayne@69
|
122 would only include organisms in the same phylum as human.
|
jpayne@69
|
123 includestring= Only report records whose name contains this string.
|
jpayne@69
|
124 exclude= Ignore organisms in these clades.
|
jpayne@69
|
125 May be a comma-delimited list of names or NCBI TaxIDs.
|
jpayne@69
|
126 excludelevel=0 Promote the exclude list to this taxonomic level.
|
jpayne@69
|
127 For example, exclude=h.sapiens excludelevel=phylum
|
jpayne@69
|
128 would exclude all organisms in the same phylum as human.
|
jpayne@69
|
129 excludestring= Do not records whose name contains this string.
|
jpayne@69
|
130 minlevel= Use this to restrict comparisons to distantly-related
|
jpayne@69
|
131 organisms. Intended for finding misclassified organisms
|
jpayne@69
|
132 using all-to-all comparisons. minlevel=order would only
|
jpayne@69
|
133 report hits between organisms related at the order level
|
jpayne@69
|
134 or higher, not between same species or genus.
|
jpayne@69
|
135 banunclassified=f Ignore organisms descending from nodes like
|
jpayne@69
|
136 'unclassified Bacteria'
|
jpayne@69
|
137 banvirus=f Ignore viruses.
|
jpayne@69
|
138 requiressu=f Ignore records without SSUs.
|
jpayne@69
|
139 minrefsize=0 Ignore ref sketches smaller than this (unique kmers).
|
jpayne@69
|
140 minrefsizebases=0 Ignore ref sketches smaller than this (total base pairs).
|
jpayne@69
|
141
|
jpayne@69
|
142 Output format:
|
jpayne@69
|
143 format=2 2: Default format with, per query, one query header line;
|
jpayne@69
|
144 one column header line; and one reference line per hit.
|
jpayne@69
|
145 3: One line per hit, with columns query, reference, ANI,
|
jpayne@69
|
146 and sizeRatio. Useful for all-to-all comparisons.
|
jpayne@69
|
147 4: JSON (format=json also works).
|
jpayne@69
|
148 5: Constellation (format=constellation also works).
|
jpayne@69
|
149 usetaxidname=f For format 3, print the taxID in the name column.
|
jpayne@69
|
150 usetaxname for format 3, print the taxonomic name in the name column.
|
jpayne@69
|
151 useimgname For format 3, print the img ID in the name column.
|
jpayne@69
|
152
|
jpayne@69
|
153 Output columns (for format=2):
|
jpayne@69
|
154 printall=f Enable all output columns.
|
jpayne@69
|
155 printani=t (ani) Print average nucleotide identity estimate.
|
jpayne@69
|
156 completeness=t Genome completeness estimate.
|
jpayne@69
|
157 score=f Score (used for sorting the output).
|
jpayne@69
|
158 printmatches=t Number of kmer matches to reference.
|
jpayne@69
|
159 printlength=f Number of kmers compared.
|
jpayne@69
|
160 printtaxid=t NCBI taxID.
|
jpayne@69
|
161 printimg=f IMG identifier (only for IMG data).
|
jpayne@69
|
162 printgbases=f Number of genomic bases.
|
jpayne@69
|
163 printgkmers=f Number of genomic kmers.
|
jpayne@69
|
164 printgsize=t Estimated number of unique genomic kmers.
|
jpayne@69
|
165 printgseqs=t Number of sequences (scaffolds/reads).
|
jpayne@69
|
166 printtaxname=t Name associated with this taxID.
|
jpayne@69
|
167 printname0=f (pn0) Original seqeuence name.
|
jpayne@69
|
168 printfname=t Query filename.
|
jpayne@69
|
169 printtaxa=f Full taxonomy of each record.
|
jpayne@69
|
170 printcontam=t Print contamination estimate, and factor contaminant kmers
|
jpayne@69
|
171 into calculations. Kmers are considered contaminant if
|
jpayne@69
|
172 present in some ref sketch but not the current one.
|
jpayne@69
|
173 printunique=t Number of matches unique to this reference.
|
jpayne@69
|
174 printunique2=f Number of matches unique to this reference's taxa.
|
jpayne@69
|
175 printunique3=f Number of query kmers unique to this reference's taxa,
|
jpayne@69
|
176 regardless of whether they are in this reference sketch.
|
jpayne@69
|
177 printnohit=f Number of kmers that don't hit anything.
|
jpayne@69
|
178 printrefhits=f Average number of ref sketches hit by shared kmers.
|
jpayne@69
|
179 printgc=f GC content.
|
jpayne@69
|
180 printucontam=f Contam hits that hit exactly one reference sketch.
|
jpayne@69
|
181 printcontam2=f Print contamination estimate using only kmer hits
|
jpayne@69
|
182 to unrelated taxa.
|
jpayne@69
|
183 contamlevel=species Taxonomic level to use for contam2/unique2/unique3.
|
jpayne@69
|
184 NOTE: unique2/unique3/contam2/refhits require an index.
|
jpayne@69
|
185
|
jpayne@69
|
186 printdepth=f (depth) Print average depth of sketch kmers; intended
|
jpayne@69
|
187 for shotgun read input.
|
jpayne@69
|
188 printdepth2=f (depth2) Print depth compensating for genomic repeats.
|
jpayne@69
|
189 Requires reference sketches to be generated with depth.
|
jpayne@69
|
190 actualdepth=t If this is false, the raw average count is printed.
|
jpayne@69
|
191 If true, the raw average (observed depth) is converted
|
jpayne@69
|
192 to estimated actual depth (including uncovered areas).
|
jpayne@69
|
193 printvolume=f (volume) Product of average depth and matches.
|
jpayne@69
|
194 printca=f Print common ancestor, if query taxID is known.
|
jpayne@69
|
195 printcal=f Print common ancestor tax level, if query taxID is known.
|
jpayne@69
|
196 recordsperlevel=0 If query TaxID is known, and this is positive, print this
|
jpayne@69
|
197 many records per common ancestor level.
|
jpayne@69
|
198
|
jpayne@69
|
199 Sorting:
|
jpayne@69
|
200 sortbyscore=t Default sort order is by score, a composite metric.
|
jpayne@69
|
201 sortbydepth=f Include depth as a factor in sort order.
|
jpayne@69
|
202 sortbydepth2=f Include depth2 as a factor in sort order.
|
jpayne@69
|
203 sortbyvolume=f Include volume as a factor in sort order.
|
jpayne@69
|
204 sortbykid=f Sort strictly by KID.
|
jpayne@69
|
205 sortbyani=f Sort strictly by ANI/AAI/WKID.
|
jpayne@69
|
206 sortbyhits=f Sort strictly by the number of kmer hits.
|
jpayne@69
|
207
|
jpayne@69
|
208 Other output parameters:
|
jpayne@69
|
209 minhits=3 (hits) Only report records with at least this many hits.
|
jpayne@69
|
210 minani=0 (ani) Only report records with at least this ANI (0-1).
|
jpayne@69
|
211 minwkid=0.0001 (wkid) Only report records with at least this WKID (0-1).
|
jpayne@69
|
212 anifromwkid=t Calculate ani from wkid. If false, use kid.
|
jpayne@69
|
213 minbases=0 Ignore ref sketches of sequences shortert than this.
|
jpayne@69
|
214 minsizeratio=0 Don't compare sketches if the smaller genome is less than
|
jpayne@69
|
215 this fraction of the size of the larger.
|
jpayne@69
|
216 records=20 Report at most this many best-matching records.
|
jpayne@69
|
217 color=family Color records at the family level. color=f will disable.
|
jpayne@69
|
218 Colors work in most terminals but may cause odd characters
|
jpayne@69
|
219 to appear in text editors. So, color defaults to f if
|
jpayne@69
|
220 writing to a file. Requires the taxtree to be loaded.
|
jpayne@69
|
221 intersect=f Print sketch intersections. delta=f is suggested.
|
jpayne@69
|
222
|
jpayne@69
|
223 Metadata flags (optional, for the query sketch header):
|
jpayne@69
|
224 taxid=-1 Set the NCBI taxid.
|
jpayne@69
|
225 imgid=-1 Set the IMG id.
|
jpayne@69
|
226 spid=-1 Set the JGI sequencing project id.
|
jpayne@69
|
227 name= Set the name (taxname).
|
jpayne@69
|
228 name0= Set name0 (normally the first sequence header).
|
jpayne@69
|
229 fname= Set fname (normally the file name).
|
jpayne@69
|
230 meta_= Set an arbitrary metadata field.
|
jpayne@69
|
231 For example, meta_Month=March.
|
jpayne@69
|
232
|
jpayne@69
|
233 Other parameters:
|
jpayne@69
|
234 requiredmeta= (rmeta) Required optional metadata values. For example:
|
jpayne@69
|
235 rmeta=subunit:ssu,source:silva
|
jpayne@69
|
236 bannedmeta= (bmeta) Forbidden optional metadata values.
|
jpayne@69
|
237
|
jpayne@69
|
238
|
jpayne@69
|
239 Java Parameters:
|
jpayne@69
|
240 -Xmx This will set Java's memory usage, overriding autodetection.
|
jpayne@69
|
241 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
|
jpayne@69
|
242 The max is typically 85% of physical memory.
|
jpayne@69
|
243 -eoom This flag will cause the process to exit if an
|
jpayne@69
|
244 out-of-memory exception occurs. Requires Java 8u92+.
|
jpayne@69
|
245 -da Disable assertions.
|
jpayne@69
|
246
|
jpayne@69
|
247 For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt.
|
jpayne@69
|
248 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
|
jpayne@69
|
249 "
|
jpayne@69
|
250 }
|
jpayne@69
|
251
|
jpayne@69
|
252 #This block allows symlinked shellscripts to correctly set classpath.
|
jpayne@69
|
253 pushd . > /dev/null
|
jpayne@69
|
254 DIR="${BASH_SOURCE[0]}"
|
jpayne@69
|
255 while [ -h "$DIR" ]; do
|
jpayne@69
|
256 cd "$(dirname "$DIR")"
|
jpayne@69
|
257 DIR="$(readlink "$(basename "$DIR")")"
|
jpayne@69
|
258 done
|
jpayne@69
|
259 cd "$(dirname "$DIR")"
|
jpayne@69
|
260 DIR="$(pwd)/"
|
jpayne@69
|
261 popd > /dev/null
|
jpayne@69
|
262
|
jpayne@69
|
263 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
|
jpayne@69
|
264 CP="$DIR""current/"
|
jpayne@69
|
265
|
jpayne@69
|
266 z="-Xmx4g"
|
jpayne@69
|
267 z2="-Xms4g"
|
jpayne@69
|
268 set=0
|
jpayne@69
|
269
|
jpayne@69
|
270 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
|
jpayne@69
|
271 usage
|
jpayne@69
|
272 exit
|
jpayne@69
|
273 fi
|
jpayne@69
|
274
|
jpayne@69
|
275 calcXmx () {
|
jpayne@69
|
276 source "$DIR""/calcmem.sh"
|
jpayne@69
|
277 setEnvironment
|
jpayne@69
|
278 parseXmx "$@"
|
jpayne@69
|
279 if [[ $set == 1 ]]; then
|
jpayne@69
|
280 return
|
jpayne@69
|
281 fi
|
jpayne@69
|
282 freeRam 3200m 84
|
jpayne@69
|
283 z="-Xmx${RAM}m"
|
jpayne@69
|
284 z2="-Xms${RAM}m"
|
jpayne@69
|
285 }
|
jpayne@69
|
286 calcXmx "$@"
|
jpayne@69
|
287
|
jpayne@69
|
288 comparesketch() {
|
jpayne@69
|
289 local CMD="java $EA $EOOM $z $z2 -cp $CP sketch.CompareSketch $@"
|
jpayne@69
|
290 # echo $CMD >&2
|
jpayne@69
|
291 eval $CMD
|
jpayne@69
|
292 }
|
jpayne@69
|
293
|
jpayne@69
|
294 comparesketch "$@"
|