comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/sketch.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 #!/bin/bash
2
# Print the help text for sketch.sh to stdout.
# Takes no arguments. The text is emitted with a single echo, so the output
# begins and ends with a blank line.
usage(){
echo "
Written by Brian Bushnell
Last modified January 28, 2020

Description: Creates one or more sketches from a fasta file,
optionally annotated with taxonomic information.

Please read bbmap/docs/guides/BBSketchGuide.txt for more information.

Usage: sketch.sh in=<fasta file> out=<sketch file>

Standard parameters:
in=<file> A fasta file containing one or more sequences.
out=<file> Output filename. If multiple files are desired it must
contain the # symbol.
blacklist=<file> Ignore keys in this sketch file. Additionally, there are
built-in blacklists that can be specified:
nt: Blacklist for nt
refseq: Blacklist for Refseq
silva: Blacklist for Silva
img: Blacklist for IMG
files=1 Number of output sketch files to produce, for parallel
loading. Independent of the number of sketches produced;
sketches will be randomly distributed between files.
k=32,24 Kmer length, 1-32. To maximize sensitivity and
specificity, dual kmer lengths may be used, e.g. k=32,24
Query and reference k must match.
rcomp=t Look at reverse-complement kmers also.
amino=f Use amino acid mode. Input should be amino acids.
translate=f Call genes and translate to proteins. Input should be
nucleotides. Designed for prokaryotes.
mode=single Possible modes:
single: Write one sketch.
sequence: Write one sketch per sequence.
taxa: Write one sketch per taxonomic unit.
Requires more memory, and taxonomic annotation.
img: Write one sketch per IMG id.
delta=t Delta-compress sketches.
a48=t Encode sketches as ASCII-48 rather than hex.
depth=f Track the number of times kmers appear.
Required for the depth2 field in comparisons.
entropy=0.66 Ignore sequence with entropy below this value.
ssu=t Scan for and retain full-length SSU sequence.

Size parameters:
size=10000 Desired size of sketches (if not using autosize).
maxfraction=0.01 (mgf) Max fraction of genomic kmers to use.
minsize=100 Do not generate sketches for genomes smaller than this.
autosize=t Use flexible sizing instead of fixed-length. This is
nonlinear; a human sketch is only ~6x a bacterial sketch.
sizemult=1 Multiply the autosized size of sketches by this factor.
Normally a bacterial-size genome will get a sketch size
of around 10000; if sizemult=2, it would be ~20000.
density= If this flag is set (to a number between 0 and 1),
autosize and sizemult are ignored, and this fraction of
genomic kmers are used. For example, at density=0.001,
a 4.5Mbp bacteria will get a 4500-kmer sketch.

Metadata flags (optional; intended for single-sketch mode):
taxid=-1 Set the NCBI taxid.
imgid=-1 Set the IMG id.
spid=-1 Set the JGI sequencing project id.
name= Set the name (taxname).
name0= Set name0 (normally the first sequence header).
fname= Set fname (normally the file name).
meta_= Set an arbitrary metadata field.
For example, meta_Month=March.

Taxonomy-specific flags:
tree= Specify a taxtree file. On Genepool, use 'auto'.
gi= Specify a gitable file. On Genepool, use 'auto'.
accession= Specify one or more comma-delimited NCBI accession to
taxid files. On Genepool, use 'auto'.
imgdump= Specify an IMG dump file containing NCBI taxIDs,
for IMG mode.
taxlevel=subspecies Taxa hits below this rank will be promoted and merged
with others.
prefilter=f For huge datasets full of junk like nt, this flag
will save memory by ignoring taxa smaller than minsize.
Requires taxonomic information (tree and gi).
tossjunk=t For taxa mode, discard taxonomically uninformative
sequences. This includes sequences with no taxid,
with a tax level NO_RANK, or parent taxid of LIFE.
silva=f Parse headers using Silva or semicolon-delimited syntax.

Ribosomal flags, which allow SSU sequences to be attached to sketches:
processSSU=t Run gene-calling to detect ribosomal SSU sequences.
16Sfile=<file> Optional file of 16S sequences, annotated with TaxIDs.
18Sfile=<file> Optional file of 18S sequences, annotated with TaxIDs.
preferSSUMap=f Prefer file SSUs over called SSUs.
preferSSUMapEuks=t Prefer file SSUs over called SSUs for Eukaryotes.
SSUMapOnly=f Only use file SSUs.
SSUMapOnlyEuks=f Only use file SSUs for Eukaryotes. This prevents
associating an organism with its mitochondrial or
chloroplast 16S/18S, which is otherwise a problem.


Java Parameters:
-Xmx This will set Java's memory usage, overriding autodetection.
-Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
The max is typically 85% of physical memory.
-eoom This flag will cause the process to exit if an
out-of-memory exception occurs. Requires Java 8u92+.
-da Disable assertions.

For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}
113
#This block allows symlinked shellscripts to correctly set classpath.
# DIR becomes the absolute directory (with a trailing slash) that holds the
# real script file, after following any chain of symlinks. Relative link
# targets are resolved against the directory of the link itself. The walk runs
# inside a command-substitution subshell, so the working directory of the
# script is never changed, and a failed cd aborts the script instead of
# silently producing a wrong classpath.
DIR="$(
  link="${BASH_SOURCE[0]}"
  while [ -h "$link" ]; do
    cd "$(dirname "$link")" || exit 1
    link="$(readlink "$(basename "$link")")"
  done
  cd "$(dirname "$link")" || exit 1
  pwd
)/" || exit 1

#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
# Java classpath: the "current/" directory next to the resolved script.
CP="$DIR""current/"

# Default JVM heap flags (max and initial heap); calcXmx overwrites z and z2
# when it computes a size from available memory.
z="-Xmx4g"
z2="-Xms4g"
# calcXmx skips memory autodetection when this is 1 after parseXmx has run.
set=0
131
# Print the help text and stop when the script is called without arguments
# or with -h / --help as the first argument.
case "$1" in
  ''|-h|--help)
    usage
    exit
    ;;
esac
136
# Choose the JVM heap flags and store them in the globals z (-Xmx...) and
# z2 (-Xms...).
#
# Globals:   DIR (read), set (read), RAM (read), z and z2 (written)
# Arguments: the script's command-line arguments, forwarded to parseXmx
# Returns:   0 on success; 1 when calcmem.sh is unreadable or no RAM value was
#            produced, in which case z and z2 keep their previous values.
#
# setEnvironment, parseXmx and freeRam come from calcmem.sh, which is not part
# of this file. NOTE(review): parseXmx is presumed to set set=1 (and z/z2) when
# the user passed an explicit -Xmx flag, and freeRam is presumed to store a
# megabyte count in RAM - confirm against calcmem.sh.
calcXmx () {
  # ${DIR%/} tolerates DIR with or without its trailing slash.
  local calcmem_path="${DIR%/}/calcmem.sh"
  if [[ ! -r "$calcmem_path" ]]; then
    echo "Warning: cannot read $calcmem_path; keeping memory settings $z $z2" >&2
    return 1
  fi
  # shellcheck source=/dev/null
  source "$calcmem_path"
  setEnvironment
  parseXmx "$@"
  if [[ $set == 1 ]]; then
    return
  fi
  freeRam 4000m 84
  # An empty RAM would yield the invalid flags -Xmxm / -Xmsm.
  if [[ -z "$RAM" ]]; then
    echo "Warning: memory autodetection gave no result; keeping memory settings $z $z2" >&2
    return 1
  fi
  z="-Xmx${RAM}m"
  z2="-Xms${RAM}m"
}
calcXmx "$@"
149
# Run sketch.SketchMaker in a JVM, echoing the command line to stderr first.
#
# Globals:   EA, EOOM, z, z2, CP (read). EA and EOOM are not assigned in this
#            file; NOTE(review): presumably set by calcmem.sh from the -da/-ea
#            and -eoom flags - confirm.
# Arguments: the script's command-line arguments, passed through to Java
# Returns:   the exit status of java
#
# The command is built as an array and executed directly instead of being
# flattened into a string and run through eval. Each argument therefore reaches
# Java exactly as the user typed it: spaces are preserved, and characters such
# as ; & | < > in an argument or in the install path are never interpreted by
# the shell.
sketch() {
  local -a cmd
  # The flag variables are deliberately unquoted so that an empty one
  # disappears instead of becoming an empty argument.
  # shellcheck disable=SC2206
  cmd=(java $EA $EOOM $z $z2 -cp "$CP" sketch.SketchMaker "$@")
  printf '%s\n' "${cmd[*]}" >&2
  "${cmd[@]}"
}

sketch "$@"