jpayne@69
|
1 #!/bin/bash
|
jpayne@69
|
2
|
jpayne@69
|
# Print the stats.sh help text to stdout.
# Arguments: none.
# Outputs:   the help text, preceded and followed by one blank line.
# NOTE(review): column alignment of the parameter table was reconstructed;
# the wording is unchanged except that the documented format range now
# includes format=8, which the text itself describes.
usage(){
cat <<'EOF'

Written by Brian Bushnell
Last modified March 3, 2020

Description: Generates basic assembly statistics such as scaffold count,
N50, L50, GC content, gap percent, etc. For multiple files, please use
statswrapper.sh. Works with fasta and fastq only (gzipped is fine).
Please read bbmap/docs/guides/StatsGuide.txt for more information.

Usage: stats.sh in=<file>

Parameters:
in=file         Specify the input fasta file, or stdin.
out=stdout      Destination of primary output; may be directed to a file.
gc=file         Writes ACGTN content per scaffold to a file.
gchist=file     Filename to output scaffold gc content histogram.
shist=file      Filename to output cumulative scaffold length histogram.
gcbins=200      Number of bins for gc histogram.
n=10            Number of contiguous Ns to signify a break between contigs.
k=13            Estimate memory usage of BBMap with this kmer length.
minscaf=0       Ignore scaffolds shorter than this.
phs=f           (printheaderstats) Set to true to print total size of headers.
n90=t           (printn90) Print the N/L90 metrics.
extended=f      Print additional metrics such as L90, logsum, and score.
pdl=f           (printduplicatelines) Set to true to print lines in the
                scaffold size table where the counts did not change.
n_=t            This flag will prefix the terms 'contigs' and 'scaffolds'
                with 'n_' in formats 3-6.
addname=f       Adds a column for input file name, for formats 3-6.

Logsum and Powsum:
logoffset=1000  Minimum length for calculating log sum.
logbase=2       Log base for calculating log sum.
logpower=1      Raise the log to a power to increase the weight
                of longer scaffolds for log sum.
powsum=0.25     Use this power of the length to increase weight
                of longer scaffolds for power sum.

Assembly Score Metric:
score=f                    Print assembly score.
aligned=0.0                Set the fraction of aligned reads (0-1).
assemblyscoreminlen=2000   Minimum length of scaffolds to include in
                           assembly score calculation.
assemblyscoremaxlen=50000  Maximum length of scaffolds to get bonus points
                           for being long.


format=<0-8>    Format of the stats information; default 1.
    format=0 prints no assembly stats.
    format=1 uses variable units like MB and KB, and is designed for compatibility with existing tools.
    format=2 uses only whole numbers of bases, with no commas in numbers, and is designed for machine parsing.
    format=3 outputs stats in 2 rows of tab-delimited columns: a header row and a data row.
    format=4 is like 3 but with scaffold data only.
    format=5 is like 3 but with contig data only.
    format=6 is like 3 but the header starts with a #.
    format=7 is like 1 but only prints contig info.
    format=8 is like 3 but in JSON. You can also just use the 'json' flag.

gcformat=<0-5>  Select GC output format; default 1.
    gcformat=0: (no base content info printed)
    gcformat=1: name length A C G T N GC
    gcformat=2: name GC
    gcformat=4: name length GC
    gcformat=5: name length GC logsum powsum
    Note that in gcformat 1, A+C+G+T=1 even when N is nonzero.

Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.

EOF
}
|
jpayne@69
|
73
|
jpayne@69
|
#This block allows symlinked shellscripts to correctly set classpath.
# It follows the chain of symlinks starting at this script's own path and
# leaves DIR holding the absolute directory of the final target, with a
# trailing slash. pushd/popd restore the caller's working directory.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
	# A link target may be relative to the link's directory, so move there
	# before reading it. A failed cd would make every later step resolve
	# against the wrong directory, so stop instead of guessing.
	cd "$(dirname "$DIR")" || { echo "stats.sh: cannot enter directory of $DIR" >&2; exit 1; }
	DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")" || { echo "stats.sh: cannot enter directory of $DIR" >&2; exit 1; }
DIR="$(pwd)/"
popd > /dev/null

#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
# Java classpath: the "current" directory next to the resolved script.
CP="$DIR""current/"

# Default JVM heap flag; it is passed to java by the stats function.
z="-Xmx120m"
# Not read anywhere in this script; presumably consumed by calcmem.sh — TODO confirm.
set=0
|
jpayne@69
|
90
|
jpayne@69
|
# Print the help text and stop when the script is run without arguments or
# when the first argument is -h or --help. The exit status is that of usage.
if [[ -z "$1" || "$1" == "-h" || "$1" == "--help" ]]; then
	usage
	exit
fi
|
jpayne@69
|
95
|
jpayne@69
|
# Load the shared memory-configuration helpers and run them on the command line.
# Globals:   DIR (read) - directory holding calcmem.sh.
# Arguments: the script's command-line arguments, forwarded to parseXmx.
# setEnvironment and parseXmx are not defined in this script; they are
# expected to come from calcmem.sh. Presumably they set EA/EOOM and may
# override z from -Xmx style arguments — verify against calcmem.sh.
calcXmx () {
	source "$DIR""/calcmem.sh"
	setEnvironment
	parseXmx "$@"
}
calcXmx "$@"
|
jpayne@69
|
102
|
jpayne@69
|
# Launch jgi.AssemblyStats2 in a JVM with the configured flags.
# Globals:   EA, EOOM, z (read) - JVM flags; CP (read) - Java classpath.
# Arguments: the script's command-line arguments, passed to Java unchanged.
# Returns:   the exit status of the java command.
stats() {
	# Build the command as an array and run it directly rather than through
	# eval: each user argument and the classpath stay a single word even when
	# they contain spaces, globs, or shell metacharacters.
	# EA, EOOM and z are left unquoted on purpose so that an empty value
	# contributes no argument at all.
	# shellcheck disable=SC2206
	local -a cmd=(java $EA $EOOM $z -cp "$CP" jgi.AssemblyStats2 "$@")
#	printf '%q ' "${cmd[@]}" >&2
	"${cmd[@]}"
}

stats "$@"
|