jpayne@69
|
1 #!/bin/bash
|
jpayne@69
|
2
|
jpayne@69
|
3 usage(){
|
jpayne@69
|
4 echo "
|
jpayne@69
|
5 Written by Brian Bushnell
|
jpayne@69
|
6 Last modified Jan 7, 2020
|
jpayne@69
|
7
|
jpayne@69
|
8 Description: Demultiplexes sequences into multiple files based on their names,
|
jpayne@69
|
9 substrings of their names, or prefixes or suffixes of their names.
|
jpayne@69
|
10 Allows unlimited output files while maintaining only a small number of open file handles.
|
jpayne@69
|
11
|
jpayne@69
|
12 Usage:
|
jpayne@69
|
13 demuxbyname.sh in=<file> in2=<file2> out=<outfile> out2=<outfile2> names=<string,string,string...>
|
jpayne@69
|
14
|
jpayne@69
|
15 Alternately:
|
jpayne@69
|
16 demuxbyname.sh in=<file> out=<outfile> delimiter=whitespace prefixmode=f
|
jpayne@69
|
17 This will demultiplex by the substring after the last whitespace.
|
jpayne@69
|
18
|
jpayne@69
|
19 demuxbyname.sh in=<file> out=<outfile> length=8 prefixmode=t
|
jpayne@69
|
20 This will demultiplex by the first 8 characters of read names.
|
jpayne@69
|
21
|
jpayne@69
|
22 demuxbyname.sh in=<file> out=<outfile> delimiter=: prefixmode=f
|
jpayne@69
|
23 This will split on colons, and use the last substring as the name; useful for
|
jpayne@69
|
24 demuxing by barcode for Illumina headers in this format:
|
jpayne@69
|
25 @A00178:73:HH7H3DSXX:4:1101:13666:1047 1:N:0:ACGTTGGT+TGACGCAT
|
jpayne@69
|
26
|
jpayne@69
|
27 in2 and out2 are for paired reads in twin files and are optional.
|
jpayne@69
|
28 If input is paired and there is only one output file, it will be written interleaved.
|
jpayne@69
|
29
|
jpayne@69
|
30 File Parameters:
|
jpayne@69
|
31 in=<file> Input file.
|
jpayne@69
|
32 in2=<file> If input reads are paired in twin files, use in2 for the second file.
|
jpayne@69
|
33 out=<file> Output files for reads with matched headers (must contain % symbol).
|
jpayne@69
|
34 For example, out=out_%.fq with names XX and YY would create out_XX.fq and out_YY.fq.
|
jpayne@69
|
35 If twin files for paired reads are desired, use the # symbol. For example,
|
jpayne@69
|
36 out=out_%_#.fq in this case would create out_XX_1.fq, out_XX_2.fq, out_YY_1.fq, etc.
|
jpayne@69
|
37 outu=<file> Output file for reads with unmatched headers.
|
jpayne@69
|
38 stats=<file> Print statistics about how many reads went to each file.
|
jpayne@69
|
39
|
jpayne@69
|
40 Processing Modes (determines how to convert a read into a name):
|
jpayne@69
|
41 prefixmode=t (pm) Match prefix of read header. If false, match suffix of read header.
|
jpayne@69
|
42 prefixmode=f is equivalent to suffixmode=t.
|
jpayne@69
|
43 barcode=f Parse barcodes from Illumina headers.
|
jpayne@69
|
44 chrom=f For mapped sam files, make one file per chromosome (scaffold) using the rname.
|
jpayne@69
|
45 header=f Use the entire sequence header.
|
jpayne@69
|
46 delimiter= For prefix or suffix mode, specifying a delimiter will allow exact matches even if the length is variable.
|
jpayne@69
|
47 This allows demultiplexing based on names that are found without specifying a list of names.
|
jpayne@69
|
48 In suffix mode, for example, everything after the last delimiter will be used.
|
jpayne@69
|
49 Normally the delimiter will be used as a literal string (a Java regular expression); for example, ':' or 'HISEQ'.
|
jpayne@69
|
50 But there are some special delimiters which will be replaced by the symbol they name,
|
jpayne@69
|
51 because they are reserved in some operating systems or cause other problems.
|
jpayne@69
|
52 These are provided for convenience due to possible OS conflicts:
|
jpayne@69
|
53 space, tab, whitespace, pound, greaterthan, lessthan, equals,
|
jpayne@69
|
54 colon, semicolon, bang, and, quote, singlequote
|
jpayne@69
|
55 These are provided because they interfere with Java regular expression syntax:
|
jpayne@69
|
56 backslash, hat, dollar, dot, pipe, questionmark, star,
|
jpayne@69
|
57 plus, openparen, closeparen, opensquare, opencurly
|
jpayne@69
|
58 In other words, to match '.', you should set 'delimiter=dot'.
|
jpayne@69
|
59 substring=f Names can be substrings of read headers. Substring mode is
|
jpayne@69
|
60 slow if the list of names is large. Requires a list of names.
|
jpayne@69
|
61
|
jpayne@69
|
62 Other Processing Parameters:
|
jpayne@69
|
63 column=-1 If positive, split the header on a delimiter and match that column (1-based).
|
jpayne@69
|
64 For example, using this header:
|
jpayne@69
|
65 NB501886:61:HL3GMAFXX:1:11101:10717:1140 1:N:0:ACTGAGC+ATTAGAC
|
jpayne@69
|
66 You could demux by tile (11101) using 'delimiter=: column=5'
|
jpayne@69
|
67 Column is 1-based (first column is 1).
|
jpayne@69
|
68 If column is omitted when a delimiter is present, prefixmode
|
jpayne@69
|
69 will use the first substring, and suffixmode will use the last substring.
|
jpayne@69
|
70 names= List of strings (or files containing strings) to parse from read names.
|
jpayne@69
|
71 If the names are in text files, there should be one name per line.
|
jpayne@69
|
72 This is optional. If a list of names is provided, files will only be created for those names.
|
jpayne@69
|
73 For example, 'prefixmode=t length=5' would create a file for every unique last 5 characters in read names,
|
jpayne@69
|
74 and every read would be written to one of those files. But if there was addionally 'names=ABCDE,FGHIJ'
|
jpayne@69
|
75 then at most 2 files would be created, and anything not matching those names would go to outu.
|
jpayne@69
|
76 length=0 If positive, use a suffix or prefix of this length from read name instead of or in addition to the list of names.
|
jpayne@69
|
77 For example, you could create files based on the first 8 characters of read names.
|
jpayne@69
|
78 hdist=0 Allow a hamming distance for demultiplexing barcodes. This requires a list of names (barcodes).
|
jpayne@69
|
79 replace= Replace some characters in the output filenames. For example, replace=+-
|
jpayne@69
|
80 would replace the + symbol in headers with the - symbol in filenames. So you could
|
jpayne@69
|
81 match the name ACTGAGC+ATTAGAC in the header, but write to a file named ACTGAGC-ATTAGAC.
|
jpayne@69
|
82
|
jpayne@69
|
83 Buffering Parameters
|
jpayne@69
|
84 streams=4 Allow at most this many active streams. The actual number of open files
|
jpayne@69
|
85 will be 1 greater than this if outu is set, and doubled if output
|
jpayne@69
|
86 is paired and written in twin files instead of interleaved.
|
jpayne@69
|
87 minreads=0 Don't create a file for fewer than this many reads; instead, send them to unknown.
|
jpayne@69
|
88 This option will incur additional memory usage.
|
jpayne@69
|
89
|
jpayne@69
|
90 Common parameters:
|
jpayne@69
|
91 ow=t (overwrite) Overwrites files that already exist.
|
jpayne@69
|
92 zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max).
|
jpayne@69
|
93 int=auto (interleaved) Determines whether INPUT file is considered interleaved.
|
jpayne@69
|
94 qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto.
|
jpayne@69
|
95 qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input).
|
jpayne@69
|
96
|
jpayne@69
|
97
|
jpayne@69
|
98 Java Parameters:
|
jpayne@69
|
99 -Xmx This will set Java's memory usage, overriding autodetection.
|
jpayne@69
|
100 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
|
jpayne@69
|
101 The max is typically 85% of physical memory.
|
jpayne@69
|
102 -eoom This flag will cause the process to exit if an out-of-memory
|
jpayne@69
|
103 exception occurs. Requires Java 8u92+.
|
jpayne@69
|
104 -da Disable assertions.
|
jpayne@69
|
105
|
jpayne@69
|
106 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
|
jpayne@69
|
107 "
|
jpayne@69
|
108 }
|
jpayne@69
|
109
|
jpayne@69
|
110 #This block allows symlinked shellscripts to correctly set classpath.
|
jpayne@69
|
111 pushd . > /dev/null
|
jpayne@69
|
112 DIR="${BASH_SOURCE[0]}"
|
jpayne@69
|
113 while [ -h "$DIR" ]; do
|
jpayne@69
|
114 cd "$(dirname "$DIR")"
|
jpayne@69
|
115 DIR="$(readlink "$(basename "$DIR")")"
|
jpayne@69
|
116 done
|
jpayne@69
|
117 cd "$(dirname "$DIR")"
|
jpayne@69
|
118 DIR="$(pwd)/"
|
jpayne@69
|
119 popd > /dev/null
|
jpayne@69
|
120
|
jpayne@69
|
121 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
|
jpayne@69
|
122 CP="$DIR""current/"
|
jpayne@69
|
123
|
jpayne@69
|
124 z="-Xmx2g"
|
jpayne@69
|
125 z2="-Xms2g"
|
jpayne@69
|
126 set=0
|
jpayne@69
|
127
|
jpayne@69
|
128 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
|
jpayne@69
|
129 usage
|
jpayne@69
|
130 exit
|
jpayne@69
|
131 fi
|
jpayne@69
|
132
|
jpayne@69
|
133 calcXmx () {
|
jpayne@69
|
134 source "$DIR""/calcmem.sh"
|
jpayne@69
|
135 setEnvironment
|
jpayne@69
|
136 parseXmx "$@"
|
jpayne@69
|
137 if [[ $set == 1 ]]; then
|
jpayne@69
|
138 return
|
jpayne@69
|
139 fi
|
jpayne@69
|
140 freeRam 3200m 84
|
jpayne@69
|
141 z="-Xmx${RAM}m"
|
jpayne@69
|
142 z2="-Xms${RAM}m"
|
jpayne@69
|
143 }
|
jpayne@69
|
144 calcXmx "$@"
|
jpayne@69
|
145
|
jpayne@69
|
146 function demuxbyname() {
|
jpayne@69
|
147 local CMD="java $EA $EOOM $z $z2 -cp $CP jgi.DemuxByName2 $@"
|
jpayne@69
|
148 echo $CMD >&2
|
jpayne@69
|
149 eval $CMD
|
jpayne@69
|
150 }
|
jpayne@69
|
151
|
jpayne@69
|
152 demuxbyname "$@"
|