Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/sketch.sh @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 #!/bin/bash | |
2 | |
3 usage(){ | |
4 echo " | |
5 Written by Brian Bushnell | |
6 Last modified January 28, 2020 | |
7 | |
8 Description: Creates one or more sketches from a fasta file, | |
9 optionally annotated with taxonomic information. | |
10 | |
11 Please read bbmap/docs/guides/BBSketchGuide.txt for more information. | |
12 | |
13 Usage: sketch.sh in=<fasta file> out=<sketch file> | |
14 | |
15 Standard parameters: | |
16 in=<file> A fasta file containing one or more sequences. | |
17 out=<file> Output filename. If multiple files are desired it must | |
18 contain the # symbol. | |
19 blacklist=<file> Ignore keys in this sketch file. Additionally, there are | |
20 built-in blacklists that can be specified: | |
21 nt: Blacklist for nt | |
22 refseq: Blacklist for Refseq | |
23 silva: Blacklist for Silva | |
24 img: Blacklist for IMG | |
25 files=1 Number of output sketch files to produce, for parallel | |
26 loading. Independent of the number of sketches produced; | |
27 sketches will be randomly distributed between files. | |
28 k=32,24 Kmer length, 1-32. To maximize sensitivity and | |
29 specificity, dual kmer lengths may be used, e.g. k=32,24 | |
30 Query and reference k must match. | |
31 rcomp=t Look at reverse-complement kmers also. | |
32 amino=f Use amino acid mode. Input should be amino acids. | |
33 translate=f Call genes and translate to proteins. Input should be | |
34 nucleotides. Designed for prokaryotes. | |
35 mode=single Possible modes: | |
36 single: Write one sketch. | |
37 sequence: Write one sketch per sequence. | |
38 taxa: Write one sketch per taxonomic unit. | |
39 Requires more memory, and taxonomic annotation. | |
40 img: Write one sketch per IMG id. | |
41 delta=t Delta-compress sketches. | |
42 a48=t Encode sketches as ASCII-48 rather than hex. | |
43 depth=f Track the number of times kmers appear. | |
44 Required for the depth2 field in comparisons. | |
45 entropy=0.66 Ignore sequence with entropy below this value. | |
46 ssu=t Scan for and retain full-length SSU sequence. | |
47 | |
48 Size parameters: | |
49 size=10000 Desired size of sketches (if not using autosize). | |
50 maxfraction=0.01 (mgf) Max fraction of genomic kmers to use. | |
51 minsize=100 Do not generate sketches for genomes smaller than this. | |
52 autosize=t Use flexible sizing instead of fixed-length. This is | |
53 nonlinear; a human sketch is only ~6x a bacterial sketch. | |
54 sizemult=1 Multiply the autosized size of sketches by this factor. | |
55 Normally a bacterial-size genome will get a sketch size | |
56 of around 10000; if sizemult=2, it would be ~20000. | |
57 density= If this flag is set (to a number between 0 and 1), | |
58 autosize and sizemult are ignored, and this fraction of | |
59 genomic kmers are used. For example, at density=0.001, | |
60 a 4.5Mbp bacteria will get a 4500-kmer sketch. | |
61 | |
62 Metadata flags (optional; intended for single-sketch mode): | |
63 taxid=-1 Set the NCBI taxid. | |
64 imgid=-1 Set the IMG id. | |
65 spid=-1 Set the JGI sequencing project id. | |
66 name= Set the name (taxname). | |
67 name0= Set name0 (normally the first sequence header). | |
68 fname= Set fname (normally the file name). | |
69 meta_= Set an arbitrary metadata field. | |
70 For example, meta_Month=March. | |
71 | |
72 Taxonomy-specific flags: | |
73 tree= Specify a taxtree file. On Genepool, use 'auto'. | |
74 gi= Specify a gitable file. On Genepool, use 'auto'. | |
75 accession= Specify one or more comma-delimited NCBI accession to | |
76 taxid files. On Genepool, use 'auto'. | |
77 imgdump= Specify an IMG dump file containing NCBI taxIDs, | |
78 for IMG mode. | |
79 taxlevel=subspecies Taxa hits below this rank will be promoted and merged | |
80 with others. | |
81 prefilter=f For huge datasets full of junk like nt, this flag | |
82 will save memory by ignoring taxa smaller than minsize. | |
83 Requires taxonomic information (tree and gi). | |
84 tossjunk=t For taxa mode, discard taxonomically uninformative | |
85 sequences. This includes sequences with no taxid, | |
86 with a tax level NO_RANK, or a parent taxid of LIFE. | |
87 silva=f Parse headers using Silva or semicolon-delimited syntax. | |
88 | |
89 Ribosomal flags, which allow SSU sequences to be attached to sketches: | |
90 processSSU=t Run gene-calling to detect ribosomal SSU sequences. | |
91 16Sfile=<file> Optional file of 16S sequences, annotated with TaxIDs. | |
92 18Sfile=<file> Optional file of 18S sequences, annotated with TaxIDs. | |
93 preferSSUMap=f Prefer file SSUs over called SSUs. | |
94 preferSSUMapEuks=t Prefer file SSUs over called SSUs for Eukaryotes. | |
95 SSUMapOnly=f Only use file SSUs. | |
96 SSUMapOnlyEuks=f Only use file SSUs for Eukaryotes. This prevents | |
97 associating an organism with its mitochondrial or | |
98 chloroplast 16S/18S, which is otherwise a problem. | |
99 | |
100 | |
101 Java Parameters: | |
102 -Xmx This will set Java's memory usage, overriding autodetection. | |
103 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. | |
104 The max is typically 85% of physical memory. | |
105 -eoom This flag will cause the process to exit if an | |
106 out-of-memory exception occurs. Requires Java 8u92+. | |
107 -da Disable assertions. | |
108 | |
109 For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt. | |
110 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems. | |
111 " | |
112 } | |
113 | |
#This block allows symlinked shellscripts to correctly set classpath.
# It follows the chain of symlinks from the path this script was invoked by
# until it reaches a non-link, then stores that file's directory (with a
# trailing slash) in DIR. pushd/popd restore the caller's working directory.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
	# cd into the link's directory first so that a relative link target
	# returned by readlink is resolved against the right location.
	# Abort on failure: continuing would compute DIR from the wrong directory.
	cd "$(dirname "$DIR")" || { echo "Cannot cd to $(dirname "$DIR")" >&2; exit 1; }
	DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")" || { echo "Cannot cd to $(dirname "$DIR")" >&2; exit 1; }
DIR="$(pwd)/"
popd > /dev/null
124 | |
#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
# Java classpath: the "current/" directory next to this script.
CP="${DIR}current/"

# Default JVM heap flags (-Xmx / -Xms); calcXmx may replace them.
z="-Xmx4g"
z2="-Xms4g"
# Flag tested by calcXmx: when it is 1, the defaults above are kept as-is.
set=0

# With no arguments, or an explicit help flag, print the usage text and stop.
case "$1" in
	""|-h|--help)
		usage
		exit
		;;
esac
136 | |
# Decide the JVM heap flags z (-Xmx) and z2 (-Xms).
# Sources calcmem.sh from the script directory and calls its setEnvironment
# and parseXmx helpers with the script's arguments. Unless $set is 1
# afterwards, freeRam is called and both flags are rebuilt from $RAM.
# NOTE(review): parseXmx presumably sets set=1 when the user passed their own
# -Xmx flag, and freeRam presumably fills RAM (in megabytes); both live in
# calcmem.sh, which is not visible here -- confirm there.
calcXmx () {
	source "${DIR}/calcmem.sh"
	setEnvironment
	parseXmx "$@"
	if [[ $set != 1 ]]; then
		freeRam 4000m 84
		z="-Xmx${RAM}m"
		z2="-Xms${RAM}m"
	fi
}
calcXmx "$@"
149 | |
#######################################
# Run sketch.SketchMaker in a JVM, forwarding every script argument.
# Globals (read): EA, EOOM, z, z2 (JVM flags), CP (classpath)
# Arguments:      the script's command-line arguments, passed on verbatim
# Outputs:        echoes the java command line to stderr before running it
# Returns:        the exit status of java
#######################################
sketch() {
	# Build an argv array and execute it directly instead of eval-ing a flat
	# string: eval re-parses the text, which splits file names containing
	# spaces and executes metacharacters (; $(...) > #) found in arguments.
	# Arguments now reach java exactly as typed, so quotes embedded inside an
	# argument are no longer stripped by a second round of shell parsing.
	# The flag variables stay unquoted on purpose so that empty ones vanish
	# and multi-word ones still split into separate options.
	# shellcheck disable=SC2206
	local -a cmd=(java $EA $EOOM $z $z2 -cp "$CP" sketch.SketchMaker "$@")
	echo "${cmd[*]}" >&2
	"${cmd[@]}"
}
155 | |
156 sketch "$@" |