annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/demuxbyname.sh @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 #!/bin/bash
jpayne@69 2
jpayne@69 3 usage(){
jpayne@69 4 echo "
jpayne@69 5 Written by Brian Bushnell
jpayne@69 6 Last modified Jan 7, 2020
jpayne@69 7
jpayne@69 8 Description: Demultiplexes sequences into multiple files based on their names,
jpayne@69 9 substrings of their names, or prefixes or suffixes of their names.
jpayne@69 10 Allows unlimited output files while maintaining only a small number of open file handles.
jpayne@69 11
jpayne@69 12 Usage:
jpayne@69 13 demuxbyname.sh in=<file> in2=<file2> out=<outfile> out2=<outfile2> names=<string,string,string...>
jpayne@69 14
jpayne@69 15 Alternately:
jpayne@69 16 demuxbyname.sh in=<file> out=<outfile> delimiter=whitespace prefixmode=f
jpayne@69 17 This will demultiplex by the substring after the last whitespace.
jpayne@69 18
jpayne@69 19 demuxbyname.sh in=<file> out=<outfile> length=8 prefixmode=t
jpayne@69 20 This will demultiplex by the first 8 characters of read names.
jpayne@69 21
jpayne@69 22 demuxbyname.sh in=<file> out=<outfile> delimiter=: prefixmode=f
jpayne@69 23 This will split on colons, and use the last substring as the name; useful for
jpayne@69 24 demuxing by barcode for Illumina headers in this format:
jpayne@69 25 @A00178:73:HH7H3DSXX:4:1101:13666:1047 1:N:0:ACGTTGGT+TGACGCAT
jpayne@69 26
jpayne@69 27 in2 and out2 are for paired reads in twin files and are optional.
jpayne@69 28 If input is paired and there is only one output file, it will be written interleaved.
jpayne@69 29
jpayne@69 30 File Parameters:
jpayne@69 31 in=<file> Input file.
jpayne@69 32 in2=<file> If input reads are paired in twin files, use in2 for the second file.
jpayne@69 33 out=<file> Output files for reads with matched headers (must contain % symbol).
jpayne@69 34 For example, out=out_%.fq with names XX and YY would create out_XX.fq and out_YY.fq.
jpayne@69 35 If twin files for paired reads are desired, use the # symbol. For example,
jpayne@69 36 out=out_%_#.fq in this case would create out_XX_1.fq, out_XX_2.fq, out_YY_1.fq, etc.
jpayne@69 37 outu=<file> Output file for reads with unmatched headers.
jpayne@69 38 stats=<file> Print statistics about how many reads went to each file.
jpayne@69 39
jpayne@69 40 Processing Modes (determines how to convert a read into a name):
jpayne@69 41 prefixmode=t (pm) Match prefix of read header. If false, match suffix of read header.
jpayne@69 42 prefixmode=f is equivalent to suffixmode=t.
jpayne@69 43 barcode=f Parse barcodes from Illumina headers.
jpayne@69 44 chrom=f For mapped sam files, make one file per chromosome (scaffold) using the rname.
jpayne@69 45 header=f Use the entire sequence header.
jpayne@69 46 delimiter= For prefix or suffix mode, specifying a delimiter will allow exact matches even if the length is variable.
jpayne@69 47 This allows demultiplexing based on names that are found without specifying a list of names.
jpayne@69 48 In suffix mode, for example, everything after the last delimiter will be used.
jpayne@69 49 Normally the delimiter will be used as a literal string (a Java regular expression); for example, ':' or 'HISEQ'.
jpayne@69 50 But there are some special delimiters which will be replaced by the symbol they name,
jpayne@69 51 because they are reserved in some operating systems or cause other problems.
jpayne@69 52 These are provided for convenience due to possible OS conflicts:
jpayne@69 53 space, tab, whitespace, pound, greaterthan, lessthan, equals,
jpayne@69 54 colon, semicolon, bang, and, quote, singlequote
jpayne@69 55 These are provided because they interfere with Java regular expression syntax:
jpayne@69 56 backslash, hat, dollar, dot, pipe, questionmark, star,
jpayne@69 57 plus, openparen, closeparen, opensquare, opencurly
jpayne@69 58 In other words, to match '.', you should set 'delimiter=dot'.
jpayne@69 59 substring=f Names can be substrings of read headers. Substring mode is
jpayne@69 60 slow if the list of names is large. Requires a list of names.
jpayne@69 61
jpayne@69 62 Other Processing Parameters:
jpayne@69 63 column=-1 If positive, split the header on a delimiter and match that column (1-based).
jpayne@69 64 For example, using this header:
jpayne@69 65 NB501886:61:HL3GMAFXX:1:11101:10717:1140 1:N:0:ACTGAGC+ATTAGAC
jpayne@69 66 You could demux by tile (11101) using 'delimiter=: column=5'
jpayne@69 67 Column is 1-based (first column is 1).
jpayne@69 68 If column is omitted when a delimiter is present, prefixmode
jpayne@69 69 will use the first substring, and suffixmode will use the last substring.
jpayne@69 70 names= List of strings (or files containing strings) to parse from read names.
jpayne@69 71 If the names are in text files, there should be one name per line.
jpayne@69 72 This is optional. If a list of names is provided, files will only be created for those names.
jpayne@69 73 For example, 'prefixmode=t length=5' would create a file for every unique last 5 characters in read names,
jpayne@69 74 and every read would be written to one of those files. But if there was addionally 'names=ABCDE,FGHIJ'
jpayne@69 75 then at most 2 files would be created, and anything not matching those names would go to outu.
jpayne@69 76 length=0 If positive, use a suffix or prefix of this length from read name instead of or in addition to the list of names.
jpayne@69 77 For example, you could create files based on the first 8 characters of read names.
jpayne@69 78 hdist=0 Allow a hamming distance for demultiplexing barcodes. This requires a list of names (barcodes).
jpayne@69 79 replace= Replace some characters in the output filenames. For example, replace=+-
jpayne@69 80 would replace the + symbol in headers with the - symbol in filenames. So you could
jpayne@69 81 match the name ACTGAGC+ATTAGAC in the header, but write to a file named ACTGAGC-ATTAGAC.
jpayne@69 82
jpayne@69 83 Buffering Parameters
jpayne@69 84 streams=4 Allow at most this many active streams. The actual number of open files
jpayne@69 85 will be 1 greater than this if outu is set, and doubled if output
jpayne@69 86 is paired and written in twin files instead of interleaved.
jpayne@69 87 minreads=0 Don't create a file for fewer than this many reads; instead, send them to unknown.
jpayne@69 88 This option will incur additional memory usage.
jpayne@69 89
jpayne@69 90 Common parameters:
jpayne@69 91 ow=t (overwrite) Overwrites files that already exist.
jpayne@69 92 zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
jpayne@69 93 int=auto (interleaved) Determines whether INPUT file is considered interleaved.
jpayne@69 94 qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
jpayne@69 95 qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
jpayne@69 96
jpayne@69 97
jpayne@69 98 Java Parameters:
jpayne@69 99 -Xmx This will set Java's memory usage, overriding autodetection.
jpayne@69 100 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
jpayne@69 101 The max is typically 85% of physical memory.
jpayne@69 102 -eoom This flag will cause the process to exit if an out-of-memory
jpayne@69 103 exception occurs. Requires Java 8u92+.
jpayne@69 104 -da Disable assertions.
jpayne@69 105
jpayne@69 106 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
jpayne@69 107 "
jpayne@69 108 }
jpayne@69 109
jpayne@69 110 #This block allows symlinked shellscripts to correctly set classpath.
jpayne@69 111 pushd . > /dev/null
jpayne@69 112 DIR="${BASH_SOURCE[0]}"
jpayne@69 113 while [ -h "$DIR" ]; do
jpayne@69 114 cd "$(dirname "$DIR")"
jpayne@69 115 DIR="$(readlink "$(basename "$DIR")")"
jpayne@69 116 done
jpayne@69 117 cd "$(dirname "$DIR")"
jpayne@69 118 DIR="$(pwd)/"
jpayne@69 119 popd > /dev/null
jpayne@69 120
jpayne@69 121 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
jpayne@69 122 CP="$DIR""current/"
jpayne@69 123
jpayne@69 124 z="-Xmx2g"
jpayne@69 125 z2="-Xms2g"
jpayne@69 126 set=0
jpayne@69 127
jpayne@69 128 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
jpayne@69 129 usage
jpayne@69 130 exit
jpayne@69 131 fi
jpayne@69 132
jpayne@69 133 calcXmx () {
jpayne@69 134 source "$DIR""/calcmem.sh"
jpayne@69 135 setEnvironment
jpayne@69 136 parseXmx "$@"
jpayne@69 137 if [[ $set == 1 ]]; then
jpayne@69 138 return
jpayne@69 139 fi
jpayne@69 140 freeRam 3200m 84
jpayne@69 141 z="-Xmx${RAM}m"
jpayne@69 142 z2="-Xms${RAM}m"
jpayne@69 143 }
jpayne@69 144 calcXmx "$@"
jpayne@69 145
jpayne@69 146 function demuxbyname() {
jpayne@69 147 local CMD="java $EA $EOOM $z $z2 -cp $CP jgi.DemuxByName2 $@"
jpayne@69 148 echo $CMD >&2
jpayne@69 149 eval $CMD
jpayne@69 150 }
jpayne@69 151
jpayne@69 152 demuxbyname "$@"