jpayne@69
|
1 #!/bin/bash
|
jpayne@69
|
2
|
jpayne@69
|
3 usage(){
|
jpayne@69
|
4 echo "
|
jpayne@69
|
5 Written by Brian Bushnell
|
jpayne@69
|
6 Last modified December 19, 2019
|
jpayne@69
|
7
|
jpayne@69
|
8 Description: Compares query sketches to reference sketches hosted on a
|
jpayne@69
|
9 remote server via the Internet. The input can be sketches made by sketch.sh,
|
jpayne@69
|
10 or fasta/fastq files from which SendSketch will generate sketches.
|
jpayne@69
|
11 Only sketches will sent, not sequences.
|
jpayne@69
|
12
|
jpayne@69
|
13 Please read bbmap/docs/guides/BBSketchGuide.txt for more information.
|
jpayne@69
|
14
|
jpayne@69
|
15 Usage:
|
jpayne@69
|
16 sendsketch.sh in=file
|
jpayne@69
|
17
|
jpayne@69
|
18 To change nucleotide servers, add the server name, e.g.:
|
jpayne@69
|
19 sendsketch.sh in=file nt
|
jpayne@69
|
20
|
jpayne@69
|
21 For the protein server with nucleotide input:
|
jpayne@69
|
22 sendsketch.sh in=file protein
|
jpayne@69
|
23
|
jpayne@69
|
24 for the protein server with amino input:
|
jpayne@69
|
25 sendsketch.sh in=file amino protein
|
jpayne@69
|
26
|
jpayne@69
|
27
|
jpayne@69
|
28 Standard parameters:
|
jpayne@69
|
29 in=<file> Sketch or fasta file to compare.
|
jpayne@69
|
30 out=stdout Comparison output. Can be set to a file instead.
|
jpayne@69
|
31 outsketch= Optional, to write the sketch to a file.
|
jpayne@69
|
32 local=f For local files, have the server load the sketches.
|
jpayne@69
|
33 Allows use of whitelists; recommended for Silva.
|
jpayne@69
|
34 Local can only be used when the client and server access
|
jpayne@69
|
35 the same filesystem - e.g., Genepool and Cori.
|
jpayne@69
|
36 address= Address of remote server. Default address:
|
jpayne@69
|
37 https://refseq-sketch.jgi-psf.org/sketch
|
jpayne@69
|
38 You can also specify these abbreviations:
|
jpayne@69
|
39 nt: nt server
|
jpayne@69
|
40 refseq: Refseq server
|
jpayne@69
|
41 silva: Silva server
|
jpayne@69
|
42 protein: RefSeq prokaryotic amino acid sketches
|
jpayne@69
|
43 img: IMG server (Not Yet Available)
|
jpayne@69
|
44 mito: RefSeq mitochondrial server (NYA)
|
jpayne@69
|
45 fungi: RefSeq fungi sketches (NYA)
|
jpayne@69
|
46 Using an abbreviation automatically sets the address,
|
jpayne@69
|
47 the blacklist, and k.
|
jpayne@69
|
48 aws=f Set aws=t to use the aws servers instead of NERSC.
|
jpayne@69
|
49 When, for example, NERSC (or the whole SF Bay area) is down.
|
jpayne@69
|
50
|
jpayne@69
|
51 Sketch-making parameters:
|
jpayne@69
|
52 mode=single Possible modes, for fasta input:
|
jpayne@69
|
53 single: Generate one sketch per file.
|
jpayne@69
|
54 sequence: Generate one sketch per sequence.
|
jpayne@69
|
55 k=31 Kmer length, 1-32. This is automatic and does not need to
|
jpayne@69
|
56 be set for JGI servers, only for locally-hosted servers.
|
jpayne@69
|
57 samplerate=1 Set to a lower value to sample a fraction of input reads.
|
jpayne@69
|
58 For raw reads (rather than an assembly), 1-3x coverage
|
jpayne@69
|
59 gives best results, by reducing error kmers. Somewhat
|
jpayne@69
|
60 higher is better for high-error-rate data like PacBio.
|
jpayne@69
|
61 minkeycount=1 Ignore kmers that occur fewer times than this. Values
|
jpayne@69
|
62 over 1 can be used with raw reads to avoid error kmers.
|
jpayne@69
|
63 minprob=0.0001 Ignore kmers below this probability of correctness.
|
jpayne@69
|
64 minqual=0 Ignore kmers spanning bases below this quality.
|
jpayne@69
|
65 entropy=0.66 Ignore sequence with entropy below this value.
|
jpayne@69
|
66 merge=f Merge paired reads prior to sketching.
|
jpayne@69
|
67 amino=f Use amino acid mode. Input should be amino acids.
|
jpayne@69
|
68 translate=f Call genes and translate to proteins. Input should be
|
jpayne@69
|
69 nucleotides. Designed for prokaryotes.
|
jpayne@69
|
70 sixframes=f Translate all 6 frames instead of predicting genes.
|
jpayne@69
|
71 ssu=t Scan for and retain full-length SSU sequence.
|
jpayne@69
|
72 printssusequence=f Print the query SSU sequence (JSON mode only).
|
jpayne@69
|
73 refid= Instead of a query file, specify a reference sketch by name
|
jpayne@69
|
74 or taxid; e.g. refid=h.sapiens or refid=9606.
|
jpayne@69
|
75
|
jpayne@69
|
76 Size parameters:
|
jpayne@69
|
77 size=10000 Desired size of sketches (if not using autosize).
|
jpayne@69
|
78 mgf=0.01 (maxfraction) Max fraction of genomic kmers to use.
|
jpayne@69
|
79 minsize=100 Do not generate sketches for genomes smaller than this.
|
jpayne@69
|
80 autosize=t Use flexible sizing instead of fixed-length. This is
|
jpayne@69
|
81 nonlinear; a human sketch is only ~6x a bacterial sketch.
|
jpayne@69
|
82 sizemult=1 Multiply the autosized size of sketches by this factor.
|
jpayne@69
|
83 Normally a bacterial-size genome will get a sketch size
|
jpayne@69
|
84 of around 10000; if autosizefactor=2, it would be ~20000.
|
jpayne@69
|
85 density= If this flag is set (to a number between 0 and 1),
|
jpayne@69
|
86 autosize and sizemult are ignored, and this fraction of
|
jpayne@69
|
87 genomic kmers are used. For example, at density=0.001,
|
jpayne@69
|
88 a 4.5Mbp bacteria will get a 4500-kmer sketch.
|
jpayne@69
|
89 sketchheapfactor=4 If minkeycount>1, temporarily track this many kmers until
|
jpayne@69
|
90 counts are known and low-count kmers are discarded.
|
jpayne@69
|
91
|
jpayne@69
|
92 Taxonomy and filtering parameters:
|
jpayne@69
|
93 level=2 Only report the best record per taxa at this level.
|
jpayne@69
|
94 Either level names or numbers may be used.
|
jpayne@69
|
95 0: disabled
|
jpayne@69
|
96 1: subspecies
|
jpayne@69
|
97 2: species
|
jpayne@69
|
98 3: genus
|
jpayne@69
|
99 ...etc
|
jpayne@69
|
100 include= Restrict output to organisms in these clades.
|
jpayne@69
|
101 May be a comma-delimited list of names or NCBI TaxIDs.
|
jpayne@69
|
102 includelevel=0 Promote the include list to this taxonomic level.
|
jpayne@69
|
103 For example, include=h.sapiens includelevel=phylum
|
jpayne@69
|
104 would only include organisms in the same phylum as human.
|
jpayne@69
|
105 includestring= Only report records whose name contains this string.
|
jpayne@69
|
106 exclude= Ignore organisms in these clades.
|
jpayne@69
|
107 May be a comma-delimited list of names or NCBI TaxIDs.
|
jpayne@69
|
108 excludelevel=0 Promote the exclude list to this taxonomic level.
|
jpayne@69
|
109 For example, exclude=h.sapiens excludelevel=phylum
|
jpayne@69
|
110 would exclude all organisms in the same phylum as human.
|
jpayne@69
|
111 excludestring= Do not records whose name contains this string.
|
jpayne@69
|
112 banunclassified=f Ignore organisms descending from nodes like
|
jpayne@69
|
113 'unclassified Bacteria'
|
jpayne@69
|
114 banvirus=f Ignore viruses.
|
jpayne@69
|
115 requiressu=f Ignore records without SSUs.
|
jpayne@69
|
116 minrefsize=0 Ignore ref sketches smaller than this (unique kmers).
|
jpayne@69
|
117 minrefsizebases=0 Ignore ref sketches smaller than this (total base pairs).
|
jpayne@69
|
118
|
jpayne@69
|
119 Output format:
|
jpayne@69
|
120 format=2 2: Default format with, per query, one query header line;
|
jpayne@69
|
121 one column header line; and one reference line per hit.
|
jpayne@69
|
122 3: One line per hit, with columns query, reference, ANI,
|
jpayne@69
|
123 and sizeRatio.
|
jpayne@69
|
124 4: JSON (format=json also works).
|
jpayne@69
|
125 5: Constellation (format=constellation also works).
|
jpayne@69
|
126 usetaxidname=f For format 3, print the taxID in the name column.
|
jpayne@69
|
127 usetaxname for format 3, print the taxonomic name in the name column.
|
jpayne@69
|
128 useimgname For format 3, print the img ID in the name column.
|
jpayne@69
|
129 d3=f Output in JSON format, with a tree for visualization.
|
jpayne@69
|
130
|
jpayne@69
|
131 Output columns (for format=2):
|
jpayne@69
|
132 printall=f Enable all output columns.
|
jpayne@69
|
133 printani=t (ani) Print average nucleotide identity estimate.
|
jpayne@69
|
134 completeness=t Genome completeness estimate.
|
jpayne@69
|
135 score=f Score (used for sorting the output).
|
jpayne@69
|
136 printmatches=t Number of kmer matches to reference.
|
jpayne@69
|
137 printlength=f Number of kmers compared.
|
jpayne@69
|
138 printtaxid=t NCBI taxID.
|
jpayne@69
|
139 printimg=f IMG identifier (only for IMG data).
|
jpayne@69
|
140 printgbases=f Number of genomic bases.
|
jpayne@69
|
141 printgkmers=f Number of genomic kmers.
|
jpayne@69
|
142 printgsize=t Estimated number of unique genomic kmers.
|
jpayne@69
|
143 printgseqs=t Number of sequences (scaffolds/reads).
|
jpayne@69
|
144 printtaxname=t Name associated with this taxID.
|
jpayne@69
|
145 printname0=f (pn0) Original seqeuence name.
|
jpayne@69
|
146 printqfname=t Query filename.
|
jpayne@69
|
147 printrfname=f Reference filename.
|
jpayne@69
|
148 printtaxa=f Full taxonomy of each record.
|
jpayne@69
|
149 printcontam=t Print contamination estimate, and factor contaminant kmers
|
jpayne@69
|
150 into calculations. Kmers are considered contaminant if
|
jpayne@69
|
151 present in some ref sketch but not the current one.
|
jpayne@69
|
152 printunique=t Number of matches unique to this reference.
|
jpayne@69
|
153 printunique2=f Number of matches unique to this reference's taxa.
|
jpayne@69
|
154 printunique3=f Number of query kmers unique to this reference's taxa,
|
jpayne@69
|
155 regardless of whether they are in this reference sketch.
|
jpayne@69
|
156 printnohit=f Number of kmers that don't hit anything.
|
jpayne@69
|
157 printrefhits=f Average number of ref sketches hit by shared kmers.
|
jpayne@69
|
158 printgc=f GC content.
|
jpayne@69
|
159 printucontam=f Contam hits that hit exactly one reference sketch.
|
jpayne@69
|
160 printcontam2=f Print contamination estimate using only kmer hits
|
jpayne@69
|
161 to unrelated taxa.
|
jpayne@69
|
162 contamlevel=species Taxonomic level to use for contam2/unique2/unique3.
|
jpayne@69
|
163 NOTE: unique2/unique3/contam2/refhits require an index.
|
jpayne@69
|
164
|
jpayne@69
|
165 printdepth=f (depth) Print average depth of sketch kmers; intended
|
jpayne@69
|
166 for shotgun read input.
|
jpayne@69
|
167 printdepth2=f (depth2) Print depth compensating for genomic repeats.
|
jpayne@69
|
168 Requires reference sketches to be generated with depth.
|
jpayne@69
|
169 actualdepth=t If this is false, the raw average count is printed.
|
jpayne@69
|
170 If true, the raw average (observed depth) is converted
|
jpayne@69
|
171 to estimated actual depth (including uncovered areas).
|
jpayne@69
|
172 printvolume=f (volume) Product of average depth and matches.
|
jpayne@69
|
173 printca=f Print common ancestor, if query taxID is known.
|
jpayne@69
|
174 printcal=f Print common ancestor tax level, if query taxID is known.
|
jpayne@69
|
175 recordsperlevel=0 If query TaxID is known, and this is positive, print at
|
jpayne@69
|
176 most this many records per common ancestor level.
|
jpayne@69
|
177
|
jpayne@69
|
178 Sorting:
|
jpayne@69
|
179 sortbyscore=t Default sort order is by score.
|
jpayne@69
|
180 sortbydepth=f Include depth as a factor in sort order.
|
jpayne@69
|
181 sortbydepth2=f Include depth2 as a factor in sort order.
|
jpayne@69
|
182 sortbyvolume=f Include volume as a factor in sort order.
|
jpayne@69
|
183 sortbykid=f Sort strictly by KID.
|
jpayne@69
|
184 sortbyani=f Sort strictly by ANI/AAI/WKID.
|
jpayne@69
|
185 sortbyhits=f Sort strictly by the number of kmer hits.
|
jpayne@69
|
186
|
jpayne@69
|
187 Other output parameters:
|
jpayne@69
|
188 minhits=3 (hits) Only report records with at least this many hits.
|
jpayne@69
|
189 minani=0 (ani) Only report records with at least this ANI (0-1).
|
jpayne@69
|
190 minwkid=0.0001 (wkid) Only report records with at least this WKID (0-1).
|
jpayne@69
|
191 anifromwkid=t Calculate ani from wkid. If false, use kid.
|
jpayne@69
|
192 minbases=0 Ignore ref sketches of sequences shortert than this.
|
jpayne@69
|
193 minsizeratio=0 Don't compare sketches if the smaller genome is less than
|
jpayne@69
|
194 this fraction of the size of the larger.
|
jpayne@69
|
195 records=20 Report at most this many best-matching records.
|
jpayne@69
|
196 color=family Color records at the family level. color=f will disable.
|
jpayne@69
|
197 Colors work in most terminals but may cause odd characters
|
jpayne@69
|
198 to appear in text editors. So, color defaults to f if
|
jpayne@69
|
199 writing to a file.
|
jpayne@69
|
200 intersect=f Print sketch intersections. delta=f is suggested.
|
jpayne@69
|
201
|
jpayne@69
|
202 Metadata parameters (optional, for the query sketch header):
|
jpayne@69
|
203 taxid=-1 Set the NCBI taxid.
|
jpayne@69
|
204 imgid=-1 Set the IMG id.
|
jpayne@69
|
205 spid=-1 Set the sequencing project id (JGI-specific).
|
jpayne@69
|
206 name= Set the name (taxname).
|
jpayne@69
|
207 name0= Set name0 (normally the first sequence header).
|
jpayne@69
|
208 fname= Set fname (normally the file name).
|
jpayne@69
|
209 meta_= Set an arbitrary metadata field.
|
jpayne@69
|
210 For example, meta_Month=March.
|
jpayne@69
|
211
|
jpayne@69
|
212 Other parameters:
|
jpayne@69
|
213 requiredmeta= (rmeta) Required optional metadata values. For example:
|
jpayne@69
|
214 rmeta=subunit:ssu,source:silva
|
jpayne@69
|
215 bannedmeta= (bmeta) Forbidden optional metadata values.
|
jpayne@69
|
216
|
jpayne@69
|
217 Java Parameters:
|
jpayne@69
|
218 -Xmx This will set Java's memory usage, overriding autodetection.
|
jpayne@69
|
219 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
|
jpayne@69
|
220 The max is typically 85% of physical memory.
|
jpayne@69
|
221 -eoom This flag will cause the process to exit if an out-of-memory
|
jpayne@69
|
222 exception occurs. Requires Java 8u92+.
|
jpayne@69
|
223 -da Disable assertions.
|
jpayne@69
|
224
|
jpayne@69
|
225 For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt.
|
jpayne@69
|
226 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
|
jpayne@69
|
227 "
|
jpayne@69
|
228 }
|
jpayne@69
|
229
|
jpayne@69
|
230 #This block allows symlinked shellscripts to correctly set classpath.
|
jpayne@69
|
231 pushd . > /dev/null
|
jpayne@69
|
232 DIR="${BASH_SOURCE[0]}"
|
jpayne@69
|
233 while [ -h "$DIR" ]; do
|
jpayne@69
|
234 cd "$(dirname "$DIR")"
|
jpayne@69
|
235 DIR="$(readlink "$(basename "$DIR")")"
|
jpayne@69
|
236 done
|
jpayne@69
|
237 cd "$(dirname "$DIR")"
|
jpayne@69
|
238 DIR="$(pwd)/"
|
jpayne@69
|
239 popd > /dev/null
|
jpayne@69
|
240
|
jpayne@69
|
241 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
|
jpayne@69
|
242 CP="$DIR""current/"
|
jpayne@69
|
243
|
jpayne@69
|
244 z="-Xmx4g"
|
jpayne@69
|
245 z2="-Xms4g"
|
jpayne@69
|
246 set=0
|
jpayne@69
|
247
|
jpayne@69
|
248 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
|
jpayne@69
|
249 usage
|
jpayne@69
|
250 exit
|
jpayne@69
|
251 fi
|
jpayne@69
|
252
|
jpayne@69
|
253 calcXmx () {
|
jpayne@69
|
254 source "$DIR""/calcmem.sh"
|
jpayne@69
|
255 setEnvironment
|
jpayne@69
|
256 parseXmx "$@"
|
jpayne@69
|
257 if [[ $set == 1 ]]; then
|
jpayne@69
|
258 return
|
jpayne@69
|
259 fi
|
jpayne@69
|
260 freeRam 3200m 84
|
jpayne@69
|
261 z="-Xmx${RAM}m"
|
jpayne@69
|
262 z2="-Xms${RAM}m"
|
jpayne@69
|
263 }
|
jpayne@69
|
264 calcXmx "$@"
|
jpayne@69
|
265
|
jpayne@69
|
266 sendsketch() {
|
jpayne@69
|
267 local CMD="java $EA $EOOM $z -cp $CP sketch.SendSketch $@"
|
jpayne@69
|
268 # echo $CMD >&2
|
jpayne@69
|
269 eval $CMD
|
jpayne@69
|
270 }
|
jpayne@69
|
271
|
jpayne@69
|
272 sendsketch "$@"
|