jpayne@69
|
1 #!/bin/bash
|
jpayne@69
|
2
|
jpayne@69
|
3 usage(){
|
jpayne@69
|
4 echo "
|
jpayne@69
|
5 Written by Brian Bushnell
|
jpayne@69
|
6 Last modified October 6, 2020
|
jpayne@69
|
7
|
jpayne@69
|
8 Description: Calls variants from sam or bam input.
|
jpayne@69
|
9 In default mode, all input files are combined and treated as a single sample.
|
jpayne@69
|
10 In multisample mode, each file is treated as an individual sample,
|
jpayne@69
|
11 and gets its own column in the VCF file. Unless overridden, input file
|
jpayne@69
|
12 names are used as sample names.
|
jpayne@69
|
13 Please read bbmap/docs/guides/CallVariantsGuide.txt for more information,
|
jpayne@69
|
14 or bbmap/pipelines/variantPipeline.sh for a usage example.
|
jpayne@69
|
15
|
jpayne@69
|
16 Usage: callvariants.sh in=<file,file,...> ref=<file> vcf=<file>
|
jpayne@69
|
17
|
jpayne@69
|
18 Input may be sorted or unsorted.
|
jpayne@69
|
19 The reference should be fasta.
|
jpayne@69
|
20
|
jpayne@69
|
21 I/O parameters:
|
jpayne@69
|
22 in=<file> Input; may be one file or multiple comma-delimited files.
|
jpayne@69
|
23 list=<file> Optional text file containing one input file per line.
|
jpayne@69
|
24 Use list or in, but not both.
|
jpayne@69
|
25 out=<file> Output variant list in var format. If the name ends
|
jpayne@69
|
26 with .vcf then it will be vcf format.
|
jpayne@69
|
27 vcf=<file> Output variant list in vcf format.
|
jpayne@69
|
28 outgff=<file> Output variant list in gff format.
|
jpayne@69
|
29 ref=<file> Reference fasta. Required to display ref alleles.
|
jpayne@69
|
30 Variant calling wil be more accurate with the reference.
|
jpayne@69
|
31 vcfin=<file> Force calls at these locations, even if allele count is 0.
|
jpayne@69
|
32 shist=<file> (scorehist) Output for variant score histogram.
|
jpayne@69
|
33 zhist=<file> (zygosityhist) Output for zygosity histogram.
|
jpayne@69
|
34 qhist=<file> (qualityhist) Output for variant base quality histogram.
|
jpayne@69
|
35 overwrite=f (ow) Set to false to force the program to abort rather than
|
jpayne@69
|
36 overwrite an existing file.
|
jpayne@69
|
37 extended=t Print additional variant statistics columns.
|
jpayne@69
|
38 sample= Optional comma-delimited list of sample names.
|
jpayne@69
|
39 multisample=f (multi) Set to true if there are multiple sam/bam files,
|
jpayne@69
|
40 and each should be tracked as an individual sample.
|
jpayne@69
|
41 vcf0= Optional comma-delimited list of per-sample outputs.
|
jpayne@69
|
42 Only used in multisample mode.
|
jpayne@69
|
43 bgzip=t Use bgzip for gzip compression.
|
jpayne@69
|
44 samstreamer=t (ss) Load reads multithreaded to increase speed.
|
jpayne@69
|
45 Disable to reduce the number of threads used. The number of
|
jpayne@69
|
46 streamer threads can be set with e.g. 'ss=4'; default is 6.
|
jpayne@69
|
47 streamermf=8 (ssmf) Allow multiple sam files to be read simultaneously.
|
jpayne@69
|
48 Set ssmf=X to specify the maximum number or ssmf=f
|
jpayne@69
|
49 to disable.
|
jpayne@69
|
50
|
jpayne@69
|
51 Processing Parameters:
|
jpayne@69
|
52 prefilter=f Use a Bloom filter to exclude variants seen fewer than
|
jpayne@69
|
53 minreads times. Doubles the runtime but greatly reduces
|
jpayne@69
|
54 memory usage. The results are identical.
|
jpayne@69
|
55 coverage=t (cc) Calculate coverage, to better call variants.
|
jpayne@69
|
56 ploidy=1 Set the organism's ploidy.
|
jpayne@69
|
57 rarity=1.0 Penalize the quality of variants with allele fraction
|
jpayne@69
|
58 lower than this. For example, if you are interested in
|
jpayne@69
|
59 4% frequency variants, you could set both rarity and
|
jpayne@69
|
60 minallelefraction to 0.04. This is affected by ploidy -
|
jpayne@69
|
61 a variant with frequency indicating at least one copy
|
jpayne@69
|
62 is never penalized.
|
jpayne@69
|
63 covpenalty=0.8 (lowcoveragepenalty) A lower penalty will increase the
|
jpayne@69
|
64 scores of low-coverage variants, and is useful for
|
jpayne@69
|
65 low-coverage datasets.
|
jpayne@69
|
66 useidentity=t Include average read identity in score calculation.
|
jpayne@69
|
67 usepairing=t Include pairing rate in score calculation.
|
jpayne@69
|
68 usebias=t Include strand bias in score calculation.
|
jpayne@69
|
69 useedist=t Include read-end distance in score calculation.
|
jpayne@69
|
70 homopolymer=t Penalize scores of substitutions matching adjacent bases.
|
jpayne@69
|
71 nscan=t Consider the distance of a variant from contig ends when
|
jpayne@69
|
72 calculating strand bias.
|
jpayne@69
|
73 callsub=t Call substitutions.
|
jpayne@69
|
74 calldel=t Call deletions.
|
jpayne@69
|
75 callins=t Call insertions.
|
jpayne@69
|
76 calljunct=f Call junctions (in development).
|
jpayne@69
|
77 nopassdot=f Use . as genotype for variations failing the filter.
|
jpayne@69
|
78
|
jpayne@69
|
79 Coverage Parameters (these mainly affect speed and memory use):
|
jpayne@69
|
80 32bit=f Set to true to allow coverage tracking over depth 65535,
|
jpayne@69
|
81 which increases memory use. Variant calls are impacted
|
jpayne@69
|
82 where coverage exceeds the maximum.
|
jpayne@69
|
83 atomic=auto Increases multithreaded speed; forces 32bit to true.
|
jpayne@69
|
84 Defaults to true if there are more than 8 threads.
|
jpayne@69
|
85 strandedcov=f (stranded) Tracks per-strand ref coverage to print the MCOV
|
jpayne@69
|
86 and DP4 fields. Requires more memory when enabled. Strand
|
jpayne@69
|
87 of variant reads is tracked regardless of this flag.
|
jpayne@69
|
88
|
jpayne@69
|
89 Trimming Parameters:
|
jpayne@69
|
90 border=5 Trim at least this many bases on both ends of reads.
|
jpayne@69
|
91 qtrim=r Quality-trim reads on this end
|
jpayne@69
|
92 r: right, l: left, rl: both, f: don't quality-trim.
|
jpayne@69
|
93 trimq=10 Quality-trim bases below this score.
|
jpayne@69
|
94
|
jpayne@69
|
95 Realignment Parameters:
|
jpayne@69
|
96 realign=f Realign all reads with more than a couple mismatches.
|
jpayne@69
|
97 Decreases speed. Recommended for aligners other than BBMap.
|
jpayne@69
|
98 unclip=f Convert clip symbols from exceeding the ends of the
|
jpayne@69
|
99 realignment zone into matches and substitutitions.
|
jpayne@69
|
100 repadding=70 Pad alignment by this much on each end. Typically,
|
jpayne@69
|
101 longer is more accurate for long indels, but greatly
|
jpayne@69
|
102 reduces speed.
|
jpayne@69
|
103 rerows=602 Use this many rows maximum for realignment. Reads longer
|
jpayne@69
|
104 than this cannot be realigned.
|
jpayne@69
|
105 recols=2000 Reads may not be aligned to reference seqments longer
|
jpayne@69
|
106 than this. Needs to be at least read length plus
|
jpayne@69
|
107 max deletion length plus twice padding.
|
jpayne@69
|
108 msa= Select the aligner. Options:
|
jpayne@69
|
109 MultiStateAligner11ts: Default.
|
jpayne@69
|
110 MultiStateAligner9PacBio: Use for PacBio reads, or for
|
jpayne@69
|
111 Illumina reads mapped to PacBio/Nanopore reads.
|
jpayne@69
|
112
|
jpayne@69
|
113 Sam-filtering Parameters:
|
jpayne@69
|
114 minpos= Ignore alignments not overlapping this range.
|
jpayne@69
|
115 maxpos= Ignore alignments not overlapping this range.
|
jpayne@69
|
116 minreadmapq=4 Ignore alignments with lower mapq.
|
jpayne@69
|
117 contigs= Comma-delimited list of contig names to include. These
|
jpayne@69
|
118 should have no spaces, or underscores instead of spaces.
|
jpayne@69
|
119 secondary=f Include secondary alignments.
|
jpayne@69
|
120 supplimentary=f Include supplimentary alignments.
|
jpayne@69
|
121 duplicate=f Include reads flagged as duplicates.
|
jpayne@69
|
122 invert=f Invert sam filters.
|
jpayne@69
|
123
|
jpayne@69
|
124 Variant-Calling Cutoffs:
|
jpayne@69
|
125 minreads=2 (minad) Ignore variants seen in fewer reads.
|
jpayne@69
|
126 maxreads=BIG (maxad) Ignore variants seen in more reads.
|
jpayne@69
|
127 mincov=0 Ignore variants in lower-coverage locations.
|
jpayne@69
|
128 maxcov=BIG Ignore variants in higher-coverage locations.
|
jpayne@69
|
129 minqualitymax=15 Ignore variants with lower max base quality.
|
jpayne@69
|
130 minedistmax=20 Ignore variants with lower max distance from read ends.
|
jpayne@69
|
131 minmapqmax=0 Ignore variants with lower max mapq.
|
jpayne@69
|
132 minidmax=0 Ignore variants with lower max read identity.
|
jpayne@69
|
133 minpairingrate=0.1 Ignore variants with lower pairing rate.
|
jpayne@69
|
134 minstrandratio=0.1 Ignore variants with lower plus/minus strand ratio.
|
jpayne@69
|
135 minquality=12.0 Ignore variants with lower average base quality.
|
jpayne@69
|
136 minedist=10.0 Ignore variants with lower average distance from ends.
|
jpayne@69
|
137 minavgmapq=0.0 Ignore variants with lower average mapq.
|
jpayne@69
|
138 minallelefraction=0.1 Ignore variants with lower allele fraction. This
|
jpayne@69
|
139 should be adjusted for high ploidies.
|
jpayne@69
|
140 minid=0 Ignore variants with lower average read identity.
|
jpayne@69
|
141 minscore=20.0 Ignore variants with lower Phred-scaled score.
|
jpayne@69
|
142 clearfilters Clear all filters. Filter flags placed after
|
jpayne@69
|
143 the clearfilters flag will still be applied.
|
jpayne@69
|
144
|
jpayne@69
|
145 There are additionally max filters for score, quality, mapq, allelefraction,
|
jpayne@69
|
146 and identity.
|
jpayne@69
|
147
|
jpayne@69
|
148 Other Parameters:
|
jpayne@69
|
149 minvarcopies=0 If set to 0, a genotype (vcf GT field) of 0 or 0/0
|
jpayne@69
|
150 will be called if observed allele frequency suggests
|
jpayne@69
|
151 this is a minor allele. If set to 1, GT field will
|
jpayne@69
|
152 contain at least one 1.
|
jpayne@69
|
153
|
jpayne@69
|
154 Java Parameters:
|
jpayne@69
|
155 -Xmx This will set Java's memory usage, overriding autodetection.
|
jpayne@69
|
156 -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will
|
jpayne@69
|
157 specify 200 megs. The max is typically 85% of physical memory.
|
jpayne@69
|
158 -eoom This flag will cause the process to exit if an out-of-memory
|
jpayne@69
|
159 exception occurs. Requires Java 8u92+.
|
jpayne@69
|
160 -da Disable assertions.
|
jpayne@69
|
161
|
jpayne@69
|
162 Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
|
jpayne@69
|
163 "
|
jpayne@69
|
164 }
|
jpayne@69
|
165
|
jpayne@69
|
166 #This block allows symlinked shellscripts to correctly set classpath.
|
jpayne@69
|
167 pushd . > /dev/null
|
jpayne@69
|
168 DIR="${BASH_SOURCE[0]}"
|
jpayne@69
|
169 while [ -h "$DIR" ]; do
|
jpayne@69
|
170 cd "$(dirname "$DIR")"
|
jpayne@69
|
171 DIR="$(readlink "$(basename "$DIR")")"
|
jpayne@69
|
172 done
|
jpayne@69
|
173 cd "$(dirname "$DIR")"
|
jpayne@69
|
174 DIR="$(pwd)/"
|
jpayne@69
|
175 popd > /dev/null
|
jpayne@69
|
176
|
jpayne@69
|
177 #DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
|
jpayne@69
|
178 CP="$DIR""current/"
|
jpayne@69
|
179
|
jpayne@69
|
180 z="-Xmx4g"
|
jpayne@69
|
181 z2="-Xms4g"
|
jpayne@69
|
182 set=0
|
jpayne@69
|
183
|
jpayne@69
|
184 if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
|
jpayne@69
|
185 usage
|
jpayne@69
|
186 exit
|
jpayne@69
|
187 fi
|
jpayne@69
|
188
|
jpayne@69
|
189 calcXmx () {
|
jpayne@69
|
190 source "$DIR""/calcmem.sh"
|
jpayne@69
|
191 setEnvironment
|
jpayne@69
|
192 parseXmx "$@"
|
jpayne@69
|
193 if [[ $set == 1 ]]; then
|
jpayne@69
|
194 return
|
jpayne@69
|
195 fi
|
jpayne@69
|
196 freeRam 4000m 84
|
jpayne@69
|
197 z="-Xmx${RAM}m"
|
jpayne@69
|
198 z2="-Xms${RAM}m"
|
jpayne@69
|
199 }
|
jpayne@69
|
200 calcXmx "$@"
|
jpayne@69
|
201
|
jpayne@69
|
202 callvariants() {
|
jpayne@69
|
203 local CMD="java $EA $EOOM $z $z2 -cp $CP var2.CallVariants $@"
|
jpayne@69
|
204 echo $CMD >&2
|
jpayne@69
|
205 eval $CMD
|
jpayne@69
|
206 }
|
jpayne@69
|
207
|
jpayne@69
|
208 callvariants "$@"
|