jpayne@69
|
1 #!/bin/bash
|
jpayne@69
|
2
|
jpayne@69
|
3 usage(){
|
jpayne@69
|
4 echo "
|
jpayne@69
|
5 Written by Brian Bushnell
|
jpayne@69
|
6 Last modified January 28, 2020
|
jpayne@69
|
7
|
jpayne@69
|
8 Description: Creates one or more sketches from a fasta file,
|
jpayne@69
|
9 optionally annotated with taxonomic information.
|
jpayne@69
|
10
|
jpayne@69
|
11 Please read bbmap/docs/guides/BBSketchGuide.txt for more information.
|
jpayne@69
|
12
|
jpayne@69
|
13 Usage: sketch.sh in=<fasta file> out=<sketch file>
|
jpayne@69
|
14
|
jpayne@69
|
15 Standard parameters:
|
jpayne@69
|
16 in=<file> A fasta file containing one or more sequences.
|
jpayne@69
|
17 out=<file> Output filename. If multiple files are desired it must
|
jpayne@69
|
18 contain the # symbol.
|
jpayne@69
|
19 blacklist=<file> Ignore keys in this sketch file. Additionaly, there are
|
jpayne@69
|
20 built-in blacklists that can be specified:
|
jpayne@69
|
21 nt: Blacklist for nt
|
jpayne@69
|
22 refseq: Blacklist for Refseq
|
jpayne@69
|
23 silva: Blacklist for Silva
|
jpayne@69
|
24 img: Blacklist for IMG
|
jpayne@69
|
25 files=1 Number of output sketch files to produce, for parallel
|
jpayne@69
|
26 loading. Independent of the number of sketches produced;
|
jpayne@69
|
27 sketches will be randomly distributed between files.
|
jpayne@69
|
28 k=32,24 Kmer length, 1-32. To maximize sensitivity and
|
jpayne@69
|
29 specificity, dual kmer lengths may be used, e.g. k=32,24
|
jpayne@69
|
30 Query and reference k must match.
|
jpayne@69
|
31 rcomp=t Look at reverse-complement kmers also.
|
jpayne@69
|
32 amino=f Use amino acid mode. Input should be amino acids.
|
jpayne@69
|
33 translate=f Call genes and translate to proteins. Input should be
|
jpayne@69
|
34 nucleotides. Designed for prokaryotes.
|
jpayne@69
|
35 mode=single Possible modes:
|
jpayne@69
|
36 single: Write one sketch.
|
jpayne@69
|
37 sequence: Write one sketch per sequence.
|
jpayne@69
|
38 taxa: Write one sketch per taxonomic unit.
|
jpayne@69
|
39 Requires more memory, and taxonomic annotation.
|
jpayne@69
|
40 img: Write one sketch per IMG id.
|
jpayne@69
|
41 delta=t Delta-compress sketches.
|
jpayne@69
|
42 a48=t Encode sketches as ASCII-48 rather than hex.
|
jpayne@69
|
43 depth=f Track the number of times kmers appear.
|
jpayne@69
|
44 Required for the depth2 field in comparisons.
|
jpayne@69
|
45 entropy=0.66 Ignore sequence with entropy below this value.
|
jpayne@69
|
46 ssu=t Scan for and retain full-length SSU sequence.
|
jpayne@69
|
47
|
jpayne@69
|
48 Size parameters:
|
jpayne@69
|
49 size=10000 Desired size of sketches (if not using autosize).
|
jpayne@69
|
50 maxfraction=0.01 (mgf) Max fraction of genomic kmers to use.
|
jpayne@69
|
51 minsize=100 Do not generate sketches for genomes smaller than this.
|
jpayne@69
|
52 autosize=t Use flexible sizing instead of fixed-length. This is
|
jpayne@69
|
53 nonlinear; a human sketch is only ~6x a bacterial sketch.
|
jpayne@69
|
54 sizemult=1 Multiply the autosized size of sketches by this factor.
|
jpayne@69
|
55 Normally a bacterial-size genome will get a sketch size
|
jpayne@69
|
56 of around 10000; if autosizefactor=2, it would be ~20000.
|
jpayne@69
|
57 density= If this flag is set (to a number between 0 and 1),
|
jpayne@69
|
58 autosize and sizemult are ignored, and this fraction of
|
jpayne@69
|
59 genomic kmers are used. For example, at density=0.001,
|
jpayne@69
|
60 a 4.5Mbp bacteria will get a 4500-kmer sketch.
|
jpayne@69
|
61
|
jpayne@69
|
62 Metadata flags (optional; intended for single-sketch mode):
|
jpayne@69
|
63 taxid=-1 Set the NCBI taxid.
|
jpayne@69
|
64 imgid=-1 Set the IMG id.
|
jpayne@69
|
65 spid=-1 Set the JGI sequencing project id.
|
jpayne@69
|
66 name= Set the name (taxname).
|
jpayne@69
|
67 name0= Set name0 (normally the first sequence header).
|
jpayne@69
|
68 fname= Set fname (normally the file name).
|
jpayne@69
|
69 meta_= Set an arbitrary metadata field.
|
jpayne@69
|
70 For example, meta_Month=March.
|
jpayne@69
|
71
|
jpayne@69
|
72 Taxonomy-specific flags:
|
jpayne@69
|
73 tree= Specify a taxtree file. On Genepool, use 'auto'.
|
jpayne@69
|
74 gi= Specify a gitable file. On Genepool, use 'auto'.
|
jpayne@69
|
75 accession= Specify one or more comma-delimited NCBI accession to
|
jpayne@69
|
76 taxid files. On Genepool, use 'auto'.
|
jpayne@69
|
77 imgdump= Specify an IMG dump file containing NCBI taxIDs,
|
jpayne@69
|
78 for IMG mode.
|
jpayne@69
|
79 taxlevel=subspecies Taxa hits below this rank will be promoted and merged
|
jpayne@69
|
80 with others.
|
jpayne@69
|
81 prefilter=f For huge datasets full of junk like nt, this flag
|
jpayne@69
|
82 will save memory by ignoring taxa smaller than minsize.
|
jpayne@69
|
83 Requires taxonomic information (tree and gi).
|
jpayne@69
|
84 tossjunk=t For taxa mode, discard taxonomically uninformative
|
jpayne@69
|
85 sequences. This includes sequences with no taxid,
|
jpayne@69
|
86 with a tax level NO_RANK, of parent taxid of LIFE.
|
jpayne@69
|
87 silva=f Parse headers using Silva or semicolon-delimited syntax.
|
jpayne@69
|
88
|
jpayne@69
|
89 Ribosomal flags, which allow SSU sequences to be attached to sketches:
|
jpayne@69
|
90 processSSU=t Run gene-calling to detect ribosomal SSU sequences.
|
jpayne@69
|
91 16Sfile=<file> Optional file of 16S sequences, annotated with TaxIDs.
|
jpayne@69
|
92 18Sfile=<file> Optional file of 18S sequences, annotated with TaxIDs.
|
jpayne@69
|
93 preferSSUMap=f Prefer file SSUs over called SSUs.
|
jpayne@69
|
94 preferSSUMapEuks=t Prefer file SSUs over called SSUs for Eukaryotes.
|
jpayne@69
|
95 SSUMapOnly=f Only use file SSUs.
|
jpayne@69
|
96 SSUMapOnlyEuks=f Only use file SSUs for Eukaryotes. This prevents
|
jpayne@69
|
97 associating an organism with its mitochondrial or
|
jpayne@69
|
98 chloroplast 16S/18S, which is otherwise a problem.
|
jpayne@69
|
99
|
jpayne@69
|
100
|
jpayne@69
|
101 Java Parameters:
|
jpayne@69
|
102 -Xmx This will set Java's memory usage, overriding autodetection.
|
jpayne@69
|
103 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
|
jpayne@69
|
104 The max is typically 85% of physical memory.
|
jpayne@69
|
105 -eoom This flag will cause the process to exit if an
|
jpayne@69
|
106 out-of-memory exception occurs. Requires Java 8u92+.
|
jpayne@69
|
107 -da Disable assertions.
|
jpayne@69
|
108
|
jpayne@69
|
109 For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt.
|
jpayne@69
|
110 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
|
jpayne@69
|
111 "
|
jpayne@69
|
112 }
|
jpayne@69
|
113
|
jpayne@69
|
114 #This block allows symlinked shellscripts to correctly set classpath.
|
jpayne@69
|
115 pushd . > /dev/null
|
jpayne@69
|
116 DIR="${BASH_SOURCE[0]}"
|
jpayne@69
|
117 while [ -h "$DIR" ]; do
|
jpayne@69
|
118 cd "$(dirname "$DIR")"
|
jpayne@69
|
119 DIR="$(readlink "$(basename "$DIR")")"
|
jpayne@69
|
120 done
|
jpayne@69
|
121 cd "$(dirname "$DIR")"
|
jpayne@69
|
122 DIR="$(pwd)/"
|
jpayne@69
|
123 popd > /dev/null
|
jpayne@69
|
124
|
jpayne@69
|
125 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
|
jpayne@69
|
126 CP="$DIR""current/"
|
jpayne@69
|
127
|
jpayne@69
|
128 z="-Xmx4g"
|
jpayne@69
|
129 z2="-Xms4g"
|
jpayne@69
|
130 set=0
|
jpayne@69
|
131
|
jpayne@69
|
132 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
|
jpayne@69
|
133 usage
|
jpayne@69
|
134 exit
|
jpayne@69
|
135 fi
|
jpayne@69
|
136
|
jpayne@69
|
137 calcXmx () {
|
jpayne@69
|
138 source "$DIR""/calcmem.sh"
|
jpayne@69
|
139 setEnvironment
|
jpayne@69
|
140 parseXmx "$@"
|
jpayne@69
|
141 if [[ $set == 1 ]]; then
|
jpayne@69
|
142 return
|
jpayne@69
|
143 fi
|
jpayne@69
|
144 freeRam 4000m 84
|
jpayne@69
|
145 z="-Xmx${RAM}m"
|
jpayne@69
|
146 z2="-Xms${RAM}m"
|
jpayne@69
|
147 }
|
jpayne@69
|
148 calcXmx "$@"
|
jpayne@69
|
149
|
jpayne@69
|
150 sketch() {
|
jpayne@69
|
151 local CMD="java $EA $EOOM $z $z2 -cp $CP sketch.SketchMaker $@"
|
jpayne@69
|
152 echo $CMD >&2
|
jpayne@69
|
153 eval $CMD
|
jpayne@69
|
154 }
|
jpayne@69
|
155
|
jpayne@69
|
156 sketch "$@"
|