jpayne@69
|
1 #!/bin/bash
|
jpayne@69
|
2
|
jpayne@69
|
3 usage(){
|
jpayne@69
|
4 echo "
|
jpayne@69
|
5 Written by Brian Bushnell
|
jpayne@69
|
6 Last modified April 1, 2019
|
jpayne@69
|
7
|
jpayne@69
|
8 Description: Generates random synthetic reads from a reference genome. Read names indicate their genomic origin.
|
jpayne@69
|
9 Allows precise customization of things like insert size and synthetic mutation type, sizes, and rates.
|
jpayne@69
|
10 Read names generated by this program are used by MakeRocCure (samtoroc.sh) and GradeSamFile (gradesam.sh).
|
jpayne@69
|
11 They can also be used by BBMap (bbmap.sh) and BBMerge (bbmerge.sh) to automatically calculate
|
jpayne@69
|
12 true and false positive rates, if the flag 'parsecustom' is used.
|
jpayne@69
|
13
|
jpayne@69
|
14 Usage: randomreads.sh ref=<file> out=<file> length=<number> reads=<number>
|
jpayne@69
|
15
|
jpayne@69
|
16 Basic parameters:
|
jpayne@69
|
17 out=null Output file. If reads are paired and a single file name is
|
jpayne@69
|
18 given, output will be interleaved. For paired reads in twin
|
jpayne@69
|
19 files, set out1= and out2=
|
jpayne@69
|
20 ref=null Reference file. Not needed if the reference is already indexed.
|
jpayne@69
|
21 build=1 If multiple references are indexed in the same directory,
|
jpayne@69
|
22 each needs a unique build ID.
|
jpayne@69
|
23 midpad=300 Specifies space between scaffolds in packed index.
|
jpayne@69
|
24 reads=0 Generate this many reads (or pairs).
|
jpayne@69
|
25 coverage=-1 If positive, generate enough reads to hit this coverage
|
jpayne@69
|
26 target, based on the genome size.
|
jpayne@69
|
27 overwrite=t Set to false to disallow overwriting of existing files.
|
jpayne@69
|
28 replacenoref=f Set to true to replace Ns in the reference sequence
|
jpayne@69
|
29 with random letters.
|
jpayne@69
|
30 simplenames=f Set to true to generate read names that clearly indicate
|
jpayne@69
|
31 genomic origin, without BBMap internal coordinates.
|
jpayne@69
|
32 illuminanames=f Set to true to have matching names for paired reads,
|
jpayne@69
|
33 rather than naming by location.
|
jpayne@69
|
34 renamebyinsert=f Insert the insert size into the name.
|
jpayne@69
|
35 addpairnum=f Set true to add ' 1:' and ' 2:' to the end of read names.
|
jpayne@69
|
36 addslash=f Set true to add '/1' and '/2' to the end of read names.
|
jpayne@69
|
37 spaceslash=f Set true to add a space before slash read pairnum.
|
jpayne@69
|
38 prefix=null Generated reads will start with this prefix,
|
jpayne@69
|
39 rather than naming by location.
|
jpayne@69
|
40 seed=0 Use this to set the random number generator seed;
|
jpayne@69
|
41 use -1 for a random seed.
|
jpayne@69
|
42
|
jpayne@69
|
43 Length Parameters - normally only minlength and maxlength are needed.
|
jpayne@69
|
44 minlength=150 Generate reads of up to this length.
|
jpayne@69
|
45 maxlength=150 Generate reads of at least this length.
|
jpayne@69
|
46 gaussianlength=f Use a gaussian length distribution (for PacBio).
|
jpayne@69
|
47 Otherwise, the distribution is linear.
|
jpayne@69
|
48 midlength=-1 Gaussian curve peaks at this point. Must be between
|
jpayne@69
|
49 minlength and maxlength, in Gaussian mode.
|
jpayne@69
|
50 readlengthsd=-1 Standard deviation of the Gaussian curve. Note that the
|
jpayne@69
|
51 final curve is a sum of multiple curves, but this will affect
|
jpayne@69
|
52 overall curve width. By default this is set to 1/4 of range.
|
jpayne@69
|
53
|
jpayne@69
|
54 Pairing parameters:
|
jpayne@69
|
55 paired=f Set to true for paired reads.
|
jpayne@69
|
56 mininsert= Controls minimum insert length. Default depends on read length.
|
jpayne@69
|
57 maxinsert= Controls maximum insert length. Default depends on read length.
|
jpayne@69
|
58 triangle=f Make a triangular insert size distribution.
|
jpayne@69
|
59 flat=f Make a roughly flat insert size distribution..
|
jpayne@69
|
60 superflat=f Make a perfectly flat insert size distribution.
|
jpayne@69
|
61 gaussian=t Make a bell-shaped insert size distribution, with
|
jpayne@69
|
62 standard deviation of (maxinsert-mininsert)/6.
|
jpayne@69
|
63 samestrand=f Generate paired reads on the same strand.
|
jpayne@69
|
64
|
jpayne@69
|
65 Mutation parameters:
|
jpayne@69
|
66 snprate=0 Add snps to reads with this probability (0-1).
|
jpayne@69
|
67 insrate=0 Add insertions to reads with this probability (0-1).
|
jpayne@69
|
68 delrate=0 Add deletions to reads with this probability (0-1).
|
jpayne@69
|
69 subrate=0 Add contiguous substitutions to reads with this probability (0-1).
|
jpayne@69
|
70 nrate=0 Add nocalls to reads with this probability (0-1).
|
jpayne@69
|
71
|
jpayne@69
|
72 Note: With a 'rate' of X, each read has an X chance of getting at least
|
jpayne@69
|
73 1 mutation, X^2 chance of 2+ mutations, X^3 chance of 3+ mutations,
|
jpayne@69
|
74 and so forth up to the maximum allowed number of mutations of that type.
|
jpayne@69
|
75
|
jpayne@69
|
76 maxsnps=3 Add at most this many snps per read.
|
jpayne@69
|
77 maxinss=2 Add at most this many deletions per read.
|
jpayne@69
|
78 maxdels=2 Add at most this many insertions per read.
|
jpayne@69
|
79 maxsubs=2 Add at most this many contiguous substitutions per read.
|
jpayne@69
|
80 maxns=0 Add at most this many blocks of Ns per read.
|
jpayne@69
|
81
|
jpayne@69
|
82 maxinslen=12 Max length of insertions.
|
jpayne@69
|
83 maxdellen=400 Max length of deletions.
|
jpayne@69
|
84 maxsublen=12 Max length of contiguous substitutions.
|
jpayne@69
|
85 maxnlen=1 Min length of N blocks.
|
jpayne@69
|
86
|
jpayne@69
|
87 mininslen=1 Min length of insertions.
|
jpayne@69
|
88 mindellen=1 Min length of deletions.
|
jpayne@69
|
89 minsublen=2 Min length of contiguous substitutions.
|
jpayne@69
|
90 minnlen=1 Min length of N blocks.
|
jpayne@69
|
91
|
jpayne@69
|
92 Illumina quality parameters:
|
jpayne@69
|
93 maxq=36 Upper bound of quality values.
|
jpayne@69
|
94 midq=28 Approximate average of quality values.
|
jpayne@69
|
95 minq=20 Lower bound of quality values.
|
jpayne@69
|
96 q= Sets maxq, midq, and minq to the same value.
|
jpayne@69
|
97 adderrors=t Add substitution errors based on quality values,
|
jpayne@69
|
98 after mutations.
|
jpayne@69
|
99 qv=4 Vary the base quality of reads by up to this much
|
jpayne@69
|
100 to simulate tile effects.
|
jpayne@69
|
101
|
jpayne@69
|
102 PacBio quality parameters:
|
jpayne@69
|
103 pacbio=f Use a PacBio error model, rather than Illumina
|
jpayne@69
|
104 error model, and add PacBio errors after mutations.
|
jpayne@69
|
105 pbmin=0.13 Minimum rate of PacBio errors for a read.
|
jpayne@69
|
106 pbmax=0.17 Maximum rate of PacBio errors for a read.
|
jpayne@69
|
107
|
jpayne@69
|
108 Other Parameters:
|
jpayne@69
|
109 overlap=1 Require reads to overlap scaffold end by at least this much.
|
jpayne@69
|
110 banns=f Do not generate reads over reference Ns.
|
jpayne@69
|
111 metagenome=f Assign scaffolds a random exponential coverage level,
|
jpayne@69
|
112 to simulate a metagenomic or RNA coverage distribution.
|
jpayne@69
|
113 randomscaffold=f Choose random scaffolds without respect to length.
|
jpayne@69
|
114 amp=1 Simulate highly-amplified MDA single-cell data by
|
jpayne@69
|
115 setting this to a higher number like 1000.
|
jpayne@69
|
116 replacenoref=f Replace intra- and inter-scaffold Ns with random bases.
|
jpayne@69
|
117 pbadapter= Add adapter sequence to some reads using this literal string.
|
jpayne@69
|
118 fragadapter= Add this sequence to paired reads with insert size
|
jpayne@69
|
119 shorter than read length.
|
jpayne@69
|
120 fragadapter2= Use this sequence for read 2.
|
jpayne@69
|
121
|
jpayne@69
|
122 Java Parameters:
|
jpayne@69
|
123 -Xmx This will set Java's memory usage, overriding the
|
jpayne@69
|
124 program's automatic memory detection.
|
jpayne@69
|
125 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify
|
jpayne@69
|
126 200 megs.
|
jpayne@69
|
127 The max is typically 85% of physical memory.
|
jpayne@69
|
128 -eoom This flag will cause the process to exit if an out-of-memory
|
jpayne@69
|
129 exception occurs. Requires Java 8u92+.
|
jpayne@69
|
130 -da Disable assertions.
|
jpayne@69
|
131 "
|
jpayne@69
|
132 }
|
jpayne@69
|
133
|
jpayne@69
|
134 #This block allows symlinked shellscripts to correctly set classpath.
|
jpayne@69
|
135 pushd . > /dev/null
|
jpayne@69
|
136 DIR="${BASH_SOURCE[0]}"
|
jpayne@69
|
137 while [ -h "$DIR" ]; do
|
jpayne@69
|
138 cd "$(dirname "$DIR")"
|
jpayne@69
|
139 DIR="$(readlink "$(basename "$DIR")")"
|
jpayne@69
|
140 done
|
jpayne@69
|
141 cd "$(dirname "$DIR")"
|
jpayne@69
|
142 DIR="$(pwd)/"
|
jpayne@69
|
143 popd > /dev/null
|
jpayne@69
|
144
|
jpayne@69
|
145 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
|
jpayne@69
|
146 CP="$DIR""current/"
|
jpayne@69
|
147
|
jpayne@69
|
148 z="-Xmx1g"
|
jpayne@69
|
149 z2="-Xms1g"
|
jpayne@69
|
150 set=0
|
jpayne@69
|
151
|
jpayne@69
|
152 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
|
jpayne@69
|
153 usage
|
jpayne@69
|
154 exit
|
jpayne@69
|
155 fi
|
jpayne@69
|
156
|
jpayne@69
|
157 calcXmx () {
|
jpayne@69
|
158 source "$DIR""/calcmem.sh"
|
jpayne@69
|
159 setEnvironment
|
jpayne@69
|
160 parseXmx "$@"
|
jpayne@69
|
161 if [[ $set == 1 ]]; then
|
jpayne@69
|
162 return
|
jpayne@69
|
163 fi
|
jpayne@69
|
164 freeRam 3200m 84
|
jpayne@69
|
165 z="-Xmx${RAM}m"
|
jpayne@69
|
166 z2="-Xms${RAM}m"
|
jpayne@69
|
167 }
|
jpayne@69
|
168 calcXmx "$@"
|
jpayne@69
|
169
|
jpayne@69
|
170 randomreads() {
|
jpayne@69
|
171 local CMD="java $EA $EOOM $z -cp $CP align2.RandomReads3 build=1 $@"
|
jpayne@69
|
172 echo $CMD >&2
|
jpayne@69
|
173 eval $CMD
|
jpayne@69
|
174 }
|
jpayne@69
|
175
|
jpayne@69
|
176 randomreads "$@"
|