csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/comparesketch.sh annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/comparesketch.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

rev	line source
jpayne@69	1 #!/bin/bash
jpayne@69	2
jpayne@69	3 usage(){
jpayne@69	4 echo "
jpayne@69	5 Written by Brian Bushnell
jpayne@69	6 Last modified Jan 7, 2020
jpayne@69	7
jpayne@69	8 Description: Compares query sketches to others, and prints their kmer identity.
jpayne@69	9 The input can be sketches made by sketch.sh, or fasta/fastq files.
jpayne@69	10 It's recommended to first sketch references with sketch.sh for large files,
jpayne@69	11 or when taxonomic information is desired.
jpayne@69	12
jpayne@69	13 Please read bbmap/docs/guides/BBSketchGuide.txt for more information.
jpayne@69	14
jpayne@69	15 Usage: comparesketch.sh in=<file,file,file...> ref=<file,file,file...>
jpayne@69	16 Alternative: comparesketch.sh in=<file,file,file...> file file file
jpayne@69	17 Alternative: comparesketch.sh in=<file,file,file...> *.sketch
jpayne@69	18 Alternative: comparesketch.sh alltoall *.sketch.gz
jpayne@69	19
jpayne@69	20 File parameters:
jpayne@69	21 in=<file,file...> Sketches or fasta files to compare.
jpayne@69	22 out=stdout Comparison output. Can be set to a file instead.
jpayne@69	23 outsketch=<file> Optionally write sketch files generated from the input.
jpayne@69	24 ref=<file,file...> List of sketches to compare against. Files given without
jpayne@69	25 a prefix (ref=) will be treated as references,
jpayne@69	26 so you can use *.sketch without ref=.
jpayne@69	27 You can also do ref=nt#.sketch to load all numbered files
jpayne@69	28 fitting that pattern.
jpayne@69	29 On NERSC, you can use these abbreviations (e.g. ref=nt):
jpayne@69	30 nt: nt sketches
jpayne@69	31 refseq: Refseq sketches
jpayne@69	32 silva: Silva sketches
jpayne@69	33 img: IMG sketches
jpayne@69	34 mito: RefSeq mitochondrial sketches
jpayne@69	35 fungi: RefSeq fungi sketches
jpayne@69	36 protein: RefSeq prokaryotic amino acid sketches
jpayne@69	37 Using an abbreviation automatically sets the blacklist,
jpayne@69	38 and k. If the reference is in amino space, the query
jpayne@69	39 also be in amino acid space with the flag amino added.
jpayne@69	40 If the query is in nucleotide space, use the flag
jpayne@69	41 'translate', but this will only work for prokaryotes.
jpayne@69	42
jpayne@69	43 Blacklist and Whitelist parameters:
jpayne@69	44 blacklist=<file> Ignore keys in this sketch file. Additionally, there are
jpayne@69	45 built-in blacklists that can be specified:
jpayne@69	46 nt: Blacklist for nt
jpayne@69	47 refseq: Blacklist for Refseq
jpayne@69	48 silva: Blacklist for Silva
jpayne@69	49 img: Blacklist for IMG
jpayne@69	50 whitelist=f Ignore keys that are not in the index. Requires index=t.
jpayne@69	51
jpayne@69	52 Sketch-making parameters:
jpayne@69	53 mode=perfile Possible modes, for sequence input:
jpayne@69	54 single: Generate one sketch.
jpayne@69	55 sequence: Generate one sketch per sequence.
jpayne@69	56 perfile: Generate one sketch per file.
jpayne@69	57 sketchonly=f Don't run comparisons, just write the output sketch file.
jpayne@69	58 k=31 Kmer length, 1-32. To maximize sensitivity and
jpayne@69	59 specificity, dual kmer lengths may be used: k=31,24
jpayne@69	60 Dual kmers are fastest if the shorter is a multiple
jpayne@69	61 of 4. Query and reference k must match.
jpayne@69	62 samplerate=1 Set to a lower value to sample a fraction of input reads.
jpayne@69	63 For raw reads (rather than an assembly), 1-3x coverage
jpayne@69	64 gives best results, by reducing error kmers. Somewhat
jpayne@69	65 higher is better for high-error-rate data like PacBio.
jpayne@69	66 minkeycount=1 Ignore kmers that occur fewer times than this. Values
jpayne@69	67 over 1 can be used with raw reads to avoid error kmers.
jpayne@69	68 minprob=0.0001 Ignore kmers below this probability of correctness.
jpayne@69	69 minqual=0 Ignore kmers spanning bases below this quality.
jpayne@69	70 entropy=0.66 Ignore sequence with entropy below this value.
jpayne@69	71 merge=f Merge paired reads prior to sketching.
jpayne@69	72 amino=f Use amino acid mode. Input should be amino acids.
jpayne@69	73 translate=f Call genes and translate to proteins. Input should be
jpayne@69	74 nucleotides. Designed for prokaryotes.
jpayne@69	75 sixframes=f Translate all 6 frames instead of predicting genes.
jpayne@69	76 ssu=t Scan for and retain full-length SSU sequence.
jpayne@69	77 printssusequence=f Print the query SSU sequence (JSON mode only).
jpayne@69	78
jpayne@69	79 Size parameters:
jpayne@69	80 size=10000 Desired size of sketches (if not using autosize).
jpayne@69	81 mgf=0.01 (maxfraction) Max fraction of genomic kmers to use.
jpayne@69	82 minsize=100 Do not generate sketches for genomes smaller than this.
jpayne@69	83 autosize=t Use flexible sizing instead of fixed-length. This is
jpayne@69	84 nonlinear; a human sketch is only ~6x a bacterial sketch.
jpayne@69	85 sizemult=1 Multiply the autosized size of sketches by this factor.
jpayne@69	86 Normally a bacterial-size genome will get a sketch size
jpayne@69	87 of around 10000; if autosizefactor=2, it would be ~20000.
jpayne@69	88 density= If this flag is set (to a number between 0 and 1),
jpayne@69	89 autosize and sizemult are ignored, and this fraction of
jpayne@69	90 genomic kmers are used. For example, at density=0.001,
jpayne@69	91 a 4.5Mbp bacteria will get a 4500-kmer sketch.
jpayne@69	92 sketchheapfactor=4 If minkeycount>1, temporarily track this many kmers until
jpayne@69	93 counts are known and low-count kmers are discarded.
jpayne@69	94
jpayne@69	95 Sketch comparing parameters:
jpayne@69	96 threads=auto Use this many threads for comparison.
jpayne@69	97 index=auto Index the sketches for much faster searching.
jpayne@69	98 Requires more memory and adds startup time.
jpayne@69	99 Recommended true for many query sketches, false for few.
jpayne@69	100 prealloc=f Preallocate the index for greater efficiency.
jpayne@69	101 Can be set to a number between 0 and 1 to determine how
jpayne@69	102 much of total memory should be used.
jpayne@69	103 alltoall (ata) Compare all refs to all. Must be sketches.
jpayne@69	104 compareself=f In all-to-all mode, compare a sketch to itself.
jpayne@69	105
jpayne@69	106 Taxonomy-related parameters:
jpayne@69	107 tree=<file> Specify a TaxTree file. On Genepool, use tree=auto.
jpayne@69	108 Only necessary for use with printtaxa and level.
jpayne@69	109 Assumes comparisons are done against reference sketches
jpayne@69	110 with known taxonomy information.
jpayne@69	111 level=2 Only report the best record per taxa at this level.
jpayne@69	112 Either level names or numbers may be used.
jpayne@69	113 0: disabled
jpayne@69	114 1: subspecies
jpayne@69	115 2: species
jpayne@69	116 3: genus
jpayne@69	117 ...etc
jpayne@69	118 include= Restrict output to organisms in these clades.
jpayne@69	119 May be a comma-delimited list of names or NCBI TaxIDs.
jpayne@69	120 includelevel=0 Promote the include list to this taxonomic level.
jpayne@69	121 For example, include=h.sapiens includelevel=phylum
jpayne@69	122 would only include organisms in the same phylum as human.
jpayne@69	123 includestring= Only report records whose name contains this string.
jpayne@69	124 exclude= Ignore organisms in these clades.
jpayne@69	125 May be a comma-delimited list of names or NCBI TaxIDs.
jpayne@69	126 excludelevel=0 Promote the exclude list to this taxonomic level.
jpayne@69	127 For example, exclude=h.sapiens excludelevel=phylum
jpayne@69	128 would exclude all organisms in the same phylum as human.
jpayne@69	129 excludestring= Do not records whose name contains this string.
jpayne@69	130 minlevel= Use this to restrict comparisons to distantly-related
jpayne@69	131 organisms. Intended for finding misclassified organisms
jpayne@69	132 using all-to-all comparisons. minlevel=order would only
jpayne@69	133 report hits between organisms related at the order level
jpayne@69	134 or higher, not between same species or genus.
jpayne@69	135 banunclassified=f Ignore organisms descending from nodes like
jpayne@69	136 'unclassified Bacteria'
jpayne@69	137 banvirus=f Ignore viruses.
jpayne@69	138 requiressu=f Ignore records without SSUs.
jpayne@69	139 minrefsize=0 Ignore ref sketches smaller than this (unique kmers).
jpayne@69	140 minrefsizebases=0 Ignore ref sketches smaller than this (total base pairs).
jpayne@69	141
jpayne@69	142 Output format:
jpayne@69	143 format=2 2: Default format with, per query, one query header line;
jpayne@69	144 one column header line; and one reference line per hit.
jpayne@69	145 3: One line per hit, with columns query, reference, ANI,
jpayne@69	146 and sizeRatio. Useful for all-to-all comparisons.
jpayne@69	147 4: JSON (format=json also works).
jpayne@69	148 5: Constellation (format=constellation also works).
jpayne@69	149 usetaxidname=f For format 3, print the taxID in the name column.
jpayne@69	150 usetaxname for format 3, print the taxonomic name in the name column.
jpayne@69	151 useimgname For format 3, print the img ID in the name column.
jpayne@69	152
jpayne@69	153 Output columns (for format=2):
jpayne@69	154 printall=f Enable all output columns.
jpayne@69	155 printani=t (ani) Print average nucleotide identity estimate.
jpayne@69	156 completeness=t Genome completeness estimate.
jpayne@69	157 score=f Score (used for sorting the output).
jpayne@69	158 printmatches=t Number of kmer matches to reference.
jpayne@69	159 printlength=f Number of kmers compared.
jpayne@69	160 printtaxid=t NCBI taxID.
jpayne@69	161 printimg=f IMG identifier (only for IMG data).
jpayne@69	162 printgbases=f Number of genomic bases.
jpayne@69	163 printgkmers=f Number of genomic kmers.
jpayne@69	164 printgsize=t Estimated number of unique genomic kmers.
jpayne@69	165 printgseqs=t Number of sequences (scaffolds/reads).
jpayne@69	166 printtaxname=t Name associated with this taxID.
jpayne@69	167 printname0=f (pn0) Original seqeuence name.
jpayne@69	168 printfname=t Query filename.
jpayne@69	169 printtaxa=f Full taxonomy of each record.
jpayne@69	170 printcontam=t Print contamination estimate, and factor contaminant kmers
jpayne@69	171 into calculations. Kmers are considered contaminant if
jpayne@69	172 present in some ref sketch but not the current one.
jpayne@69	173 printunique=t Number of matches unique to this reference.
jpayne@69	174 printunique2=f Number of matches unique to this reference's taxa.
jpayne@69	175 printunique3=f Number of query kmers unique to this reference's taxa,
jpayne@69	176 regardless of whether they are in this reference sketch.
jpayne@69	177 printnohit=f Number of kmers that don't hit anything.
jpayne@69	178 printrefhits=f Average number of ref sketches hit by shared kmers.
jpayne@69	179 printgc=f GC content.
jpayne@69	180 printucontam=f Contam hits that hit exactly one reference sketch.
jpayne@69	181 printcontam2=f Print contamination estimate using only kmer hits
jpayne@69	182 to unrelated taxa.
jpayne@69	183 contamlevel=species Taxonomic level to use for contam2/unique2/unique3.
jpayne@69	184 NOTE: unique2/unique3/contam2/refhits require an index.
jpayne@69	185
jpayne@69	186 printdepth=f (depth) Print average depth of sketch kmers; intended
jpayne@69	187 for shotgun read input.
jpayne@69	188 printdepth2=f (depth2) Print depth compensating for genomic repeats.
jpayne@69	189 Requires reference sketches to be generated with depth.
jpayne@69	190 actualdepth=t If this is false, the raw average count is printed.
jpayne@69	191 If true, the raw average (observed depth) is converted
jpayne@69	192 to estimated actual depth (including uncovered areas).
jpayne@69	193 printvolume=f (volume) Product of average depth and matches.
jpayne@69	194 printca=f Print common ancestor, if query taxID is known.
jpayne@69	195 printcal=f Print common ancestor tax level, if query taxID is known.
jpayne@69	196 recordsperlevel=0 If query TaxID is known, and this is positive, print this
jpayne@69	197 many records per common ancestor level.
jpayne@69	198
jpayne@69	199 Sorting:
jpayne@69	200 sortbyscore=t Default sort order is by score, a composite metric.
jpayne@69	201 sortbydepth=f Include depth as a factor in sort order.
jpayne@69	202 sortbydepth2=f Include depth2 as a factor in sort order.
jpayne@69	203 sortbyvolume=f Include volume as a factor in sort order.
jpayne@69	204 sortbykid=f Sort strictly by KID.
jpayne@69	205 sortbyani=f Sort strictly by ANI/AAI/WKID.
jpayne@69	206 sortbyhits=f Sort strictly by the number of kmer hits.
jpayne@69	207
jpayne@69	208 Other output parameters:
jpayne@69	209 minhits=3 (hits) Only report records with at least this many hits.
jpayne@69	210 minani=0 (ani) Only report records with at least this ANI (0-1).
jpayne@69	211 minwkid=0.0001 (wkid) Only report records with at least this WKID (0-1).
jpayne@69	212 anifromwkid=t Calculate ani from wkid. If false, use kid.
jpayne@69	213 minbases=0 Ignore ref sketches of sequences shortert than this.
jpayne@69	214 minsizeratio=0 Don't compare sketches if the smaller genome is less than
jpayne@69	215 this fraction of the size of the larger.
jpayne@69	216 records=20 Report at most this many best-matching records.
jpayne@69	217 color=family Color records at the family level. color=f will disable.
jpayne@69	218 Colors work in most terminals but may cause odd characters
jpayne@69	219 to appear in text editors. So, color defaults to f if
jpayne@69	220 writing to a file. Requires the taxtree to be loaded.
jpayne@69	221 intersect=f Print sketch intersections. delta=f is suggested.
jpayne@69	222
jpayne@69	223 Metadata flags (optional, for the query sketch header):
jpayne@69	224 taxid=-1 Set the NCBI taxid.
jpayne@69	225 imgid=-1 Set the IMG id.
jpayne@69	226 spid=-1 Set the JGI sequencing project id.
jpayne@69	227 name= Set the name (taxname).
jpayne@69	228 name0= Set name0 (normally the first sequence header).
jpayne@69	229 fname= Set fname (normally the file name).
jpayne@69	230 meta_= Set an arbitrary metadata field.
jpayne@69	231 For example, meta_Month=March.
jpayne@69	232
jpayne@69	233 Other parameters:
jpayne@69	234 requiredmeta= (rmeta) Required optional metadata values. For example:
jpayne@69	235 rmeta=subunit:ssu,source:silva
jpayne@69	236 bannedmeta= (bmeta) Forbidden optional metadata values.
jpayne@69	237
jpayne@69	238
jpayne@69	239 Java Parameters:
jpayne@69	240 -Xmx This will set Java's memory usage, overriding autodetection.
jpayne@69	241 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
jpayne@69	242 The max is typically 85% of physical memory.
jpayne@69	243 -eoom This flag will cause the process to exit if an
jpayne@69	244 out-of-memory exception occurs. Requires Java 8u92+.
jpayne@69	245 -da Disable assertions.
jpayne@69	246
jpayne@69	247 For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt.
jpayne@69	248 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
jpayne@69	249 "
jpayne@69	250 }
jpayne@69	251
jpayne@69	252 #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69	253 pushd . > /dev/null
jpayne@69	254 DIR="${BASH_SOURCE[0]}"
jpayne@69	255 while [ -h "$DIR" ]; do
jpayne@69	256 cd "$(dirname "$DIR")"
jpayne@69	257 DIR="$(readlink "$(basename "$DIR")")"
jpayne@69	258 done
jpayne@69	259 cd "$(dirname "$DIR")"
jpayne@69	260 DIR="$(pwd)/"
jpayne@69	261 popd > /dev/null
jpayne@69	262
jpayne@69	263 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69	264 CP="$DIR""current/"
jpayne@69	265
jpayne@69	266 z="-Xmx4g"
jpayne@69	267 z2="-Xms4g"
jpayne@69	268 set=0
jpayne@69	269
jpayne@69	270 if [ -z "$1" ] \|\| [[ $1 == -h ]] \|\| [[ $1 == --help ]]; then
jpayne@69	271 usage
jpayne@69	272 exit
jpayne@69	273 fi
jpayne@69	274
jpayne@69	275 calcXmx () {
jpayne@69	276 source "$DIR""/calcmem.sh"
jpayne@69	277 setEnvironment
jpayne@69	278 parseXmx "$@"
jpayne@69	279 if [[ $set == 1 ]]; then
jpayne@69	280 return
jpayne@69	281 fi
jpayne@69	282 freeRam 3200m 84
jpayne@69	283 z="-Xmx${RAM}m"
jpayne@69	284 z2="-Xms${RAM}m"
jpayne@69	285 }
jpayne@69	286 calcXmx "$@"
jpayne@69	287
jpayne@69	288 comparesketch() {
jpayne@69	289 local CMD="java $EA $EOOM $z $z2 -cp $CP sketch.CompareSketch $@"
jpayne@69	290 # echo $CMD >&2
jpayne@69	291 eval $CMD
jpayne@69	292 }
jpayne@69	293
jpayne@69	294 comparesketch "$@"

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/comparesketch.sh @ 69:33d812a61356