jpayne@68
|
1 package sketch;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.io.File;
|
jpayne@68
|
4 import java.io.PrintStream;
|
jpayne@68
|
5 import java.util.ArrayList;
|
jpayne@68
|
6 import java.util.Arrays;
|
jpayne@68
|
7
|
jpayne@68
|
8 import dna.AminoAcid;
|
jpayne@68
|
9 import fileIO.FileFormat;
|
jpayne@68
|
10 import fileIO.ReadWrite;
|
jpayne@68
|
11 import jgi.BBMerge;
|
jpayne@68
|
12 import jgi.TranslateSixFrames;
|
jpayne@68
|
13 import prok.CallGenes;
|
jpayne@68
|
14 import prok.GeneCaller;
|
jpayne@68
|
15 import prok.Orf;
|
jpayne@68
|
16 import prok.ProkObject;
|
jpayne@68
|
17 import shared.Parse;
|
jpayne@68
|
18 import shared.Tools;
|
jpayne@68
|
19 import stream.ConcurrentReadInputStream;
|
jpayne@68
|
20 import stream.Read;
|
jpayne@68
|
21 import structures.EntropyTracker;
|
jpayne@68
|
22 import structures.ListNum;
|
jpayne@68
|
23 import tax.TaxNode;
|
jpayne@68
|
24 import tax.TaxTree;
|
jpayne@68
|
25
|
jpayne@68
|
26 /**
|
jpayne@68
|
27 * Creates MinHashSketches rapidly.
|
jpayne@68
|
28 *
|
jpayne@68
|
29 * @author Brian Bushnell
|
jpayne@68
|
30 * @date July 6, 2016
|
jpayne@68
|
31 *
|
jpayne@68
|
32 */
|
jpayne@68
|
33 public class SketchMakerMini extends SketchObject {
|
jpayne@68
|
34
|
jpayne@68
|
35 /*--------------------------------------------------------------*/
|
jpayne@68
|
36 /*---------------- Initialization ----------------*/
|
jpayne@68
|
37 /*--------------------------------------------------------------*/
|
jpayne@68
|
38
|
jpayne@68
|
39 public SketchMakerMini(SketchTool tool_, int mode_, DisplayParams params){
|
jpayne@68
|
40 this(tool_, mode_, params.minEntropy, params.minProb, params.minQual);
|
jpayne@68
|
41 }
|
jpayne@68
|
42
|
jpayne@68
|
43 /**
|
jpayne@68
|
44 * Constructor.
|
jpayne@68
|
45 */
|
jpayne@68
|
46 public SketchMakerMini(SketchTool tool_, int mode_, float minEntropy_, float minProb_, byte minQual_){
|
jpayne@68
|
47
|
jpayne@68
|
48 tool=tool_;
|
jpayne@68
|
49 mode=mode_;
|
jpayne@68
|
50 minProb=minProb_;
|
jpayne@68
|
51 minQual=minQual_;
|
jpayne@68
|
52
|
jpayne@68
|
53 aminoShift=AminoAcid.AMINO_SHIFT;
|
jpayne@68
|
54 if(!aminoOrTranslate()){
|
jpayne@68
|
55 shift=2*k;
|
jpayne@68
|
56 shift2=shift-2;
|
jpayne@68
|
57 mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32
|
jpayne@68
|
58 }else{
|
jpayne@68
|
59 shift=aminoShift*k;
|
jpayne@68
|
60 shift2=shift-aminoShift;
|
jpayne@68
|
61 mask=(shift>63 ? -1L : ~((-1L)<<shift));
|
jpayne@68
|
62 }
|
jpayne@68
|
63
|
jpayne@68
|
64 if(AUTOSIZE && (mode==ONE_SKETCH || mode==PER_FILE)){
|
jpayne@68
|
65 heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(80000*Tools.mid(1, AUTOSIZE_FACTOR, 32))), tool.minKeyOccuranceCount, tool.trackCounts);
|
jpayne@68
|
66 }else if(AUTOSIZE_LINEAR && (mode==ONE_SKETCH || mode==PER_FILE)){
|
jpayne@68
|
67 heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(10000000*Tools.mid(0.1, 2*AUTOSIZE_LINEAR_DENSITY, 0.00001))),
|
jpayne@68
|
68 tool.minKeyOccuranceCount, tool.trackCounts);
|
jpayne@68
|
69 }else{
|
jpayne@68
|
70 heap=new SketchHeap(tool.stTargetSketchSize, tool.minKeyOccuranceCount, tool.trackCounts);
|
jpayne@68
|
71 }
|
jpayne@68
|
72
|
jpayne@68
|
73 if(minEntropy_>0){
|
jpayne@68
|
74 eTracker=new EntropyTracker(entropyK, k, (amino || translate), minEntropy_, true);
|
jpayne@68
|
75 }else{
|
jpayne@68
|
76 eTracker=null;
|
jpayne@68
|
77 }
|
jpayne@68
|
78
|
jpayne@68
|
79 if(translate || processSSU){
|
jpayne@68
|
80 gCaller=CallGenes.makeGeneCaller(pgm);
|
jpayne@68
|
81 }else{
|
jpayne@68
|
82 gCaller=null;
|
jpayne@68
|
83 }
|
jpayne@68
|
84 }
|
jpayne@68
|
85
|
jpayne@68
|
86 /*--------------------------------------------------------------*/
|
jpayne@68
|
87 /*---------------- Outer Methods ----------------*/
|
jpayne@68
|
88 /*--------------------------------------------------------------*/
|
jpayne@68
|
89
|
jpayne@68
|
90 /** Create read streams and process all data */
|
jpayne@68
|
91 public ArrayList<Sketch> toSketches(final String fname, float samplerate, long reads){
|
jpayne@68
|
92 heap.clear(false); //123
|
jpayne@68
|
93 final String simpleName;
|
jpayne@68
|
94
|
jpayne@68
|
95 final FileFormat ffin1, ffin2;
|
jpayne@68
|
96 if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){
|
jpayne@68
|
97 ffin1=FileFormat.testInput(fname.replaceFirst("#", "1"), FileFormat.FASTQ, null, true, true);
|
jpayne@68
|
98 ffin2=FileFormat.testInput(fname.replaceFirst("#", "2"), FileFormat.FASTQ, null, true, true);
|
jpayne@68
|
99 }else{
|
jpayne@68
|
100 ffin1=FileFormat.testInput(fname, FileFormat.FASTA, null, true, true);
|
jpayne@68
|
101 ffin2=null;
|
jpayne@68
|
102 }
|
jpayne@68
|
103
|
jpayne@68
|
104 //Create a read input stream
|
jpayne@68
|
105 final ConcurrentReadInputStream cris;
|
jpayne@68
|
106 {
|
jpayne@68
|
107 simpleName=ffin1.simpleName();
|
jpayne@68
|
108 heap.setFname(simpleName);
|
jpayne@68
|
109 cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null);
|
jpayne@68
|
110 if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);}
|
jpayne@68
|
111 cris.start(); //Start the stream
|
jpayne@68
|
112 if(verbose){outstream.println("Started cris");}
|
jpayne@68
|
113 }
|
jpayne@68
|
114 if(mode==ONE_SKETCH || mode==PER_FILE){
|
jpayne@68
|
115 if(heap.name0()==null){heap.setName0(simpleName);}
|
jpayne@68
|
116 }
|
jpayne@68
|
117 ArrayList<Sketch> sketches=processInner(cris);
|
jpayne@68
|
118
|
jpayne@68
|
119 errorState|=ReadWrite.closeStream(cris);
|
jpayne@68
|
120 sketchesMade++;
|
jpayne@68
|
121 return sketches;
|
jpayne@68
|
122 }
|
jpayne@68
|
123
|
jpayne@68
|
124 /*--------------------------------------------------------------*/
|
jpayne@68
|
125 /*---------------- Inner Methods ----------------*/
|
jpayne@68
|
126 /*--------------------------------------------------------------*/
|
jpayne@68
|
127
|
jpayne@68
|
128 /** Iterate through the reads */
|
jpayne@68
|
129 ArrayList<Sketch> processInner(ConcurrentReadInputStream cris){
|
jpayne@68
|
130 assert(heap.size()==0);
|
jpayne@68
|
131 ArrayList<Sketch> sketches=new ArrayList<Sketch>(mode==ONE_SKETCH || mode==PER_FILE ? 1 : 8);
|
jpayne@68
|
132
|
jpayne@68
|
133 //Grab the first ListNum of reads
|
jpayne@68
|
134 ListNum<Read> ln=cris.nextList();
|
jpayne@68
|
135 //Grab the actual read list from the ListNum
|
jpayne@68
|
136 ArrayList<Read> reads=(ln!=null ? ln.list : null);
|
jpayne@68
|
137
|
jpayne@68
|
138 //As long as there is a nonempty read list...
|
jpayne@68
|
139 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
|
jpayne@68
|
140 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
|
jpayne@68
|
141
|
jpayne@68
|
142 //Loop through each read in the list
|
jpayne@68
|
143 for(int idx=0; idx<reads.size(); idx++){
|
jpayne@68
|
144 final Read r1=reads.get(idx);
|
jpayne@68
|
145 final Read r2=r1.mate;
|
jpayne@68
|
146
|
jpayne@68
|
147 processReadPair(r1, r2);
|
jpayne@68
|
148 if(mode!=ONE_SKETCH && mode!=PER_FILE){
|
jpayne@68
|
149 if(heap!=null && heap.size()>0 && heap.maxLen()>=Tools.max(1, minSketchSize)){
|
jpayne@68
|
150 int size=heap.size();
|
jpayne@68
|
151 Sketch sketch=new Sketch(heap, false, tool.trackCounts, null);
|
jpayne@68
|
152 assert(sketch.keys.length>0) : sketch.keys.length+", "+size;
|
jpayne@68
|
153 sketch.loadSSU();
|
jpayne@68
|
154 sketches.add(sketch);
|
jpayne@68
|
155 sketchesMade++;
|
jpayne@68
|
156 }
|
jpayne@68
|
157 if(heap!=null){heap.clear(false);}
|
jpayne@68
|
158 }
|
jpayne@68
|
159 }
|
jpayne@68
|
160
|
jpayne@68
|
161 //Notify the input stream that the list was used
|
jpayne@68
|
162 cris.returnList(ln);
|
jpayne@68
|
163 // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
|
jpayne@68
|
164
|
jpayne@68
|
165 //Fetch a new list
|
jpayne@68
|
166 ln=cris.nextList();
|
jpayne@68
|
167 reads=(ln!=null ? ln.list : null);
|
jpayne@68
|
168 }
|
jpayne@68
|
169
|
jpayne@68
|
170 //Notify the input stream that the final list was used
|
jpayne@68
|
171 if(ln!=null){
|
jpayne@68
|
172 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
|
jpayne@68
|
173 }
|
jpayne@68
|
174
|
jpayne@68
|
175 if(mode==ONE_SKETCH || mode==PER_FILE){
|
jpayne@68
|
176 Sketch sketch=new Sketch(heap, false, tool.trackCounts, null);
|
jpayne@68
|
177 sketch.loadSSU();
|
jpayne@68
|
178 sketches.add(sketch);
|
jpayne@68
|
179 sketchesMade++;
|
jpayne@68
|
180 }
|
jpayne@68
|
181 heap.clear(true);
|
jpayne@68
|
182 return sketches;
|
jpayne@68
|
183 }
|
jpayne@68
|
184
|
jpayne@68
|
185 void processReadPair(Read r1, Read r2){
|
jpayne@68
|
186 //Track the initial length for statistics
|
jpayne@68
|
187 final int initialLength1=r1.length();
|
jpayne@68
|
188 final int initialLength2=r1.mateLength();
|
jpayne@68
|
189
|
jpayne@68
|
190 //Increment counters
|
jpayne@68
|
191 readsProcessed+=r1.pairCount();
|
jpayne@68
|
192 basesProcessed+=initialLength1+initialLength2;
|
jpayne@68
|
193
|
jpayne@68
|
194 if(mode!=ONE_SKETCH && mode!=PER_FILE){
|
jpayne@68
|
195 int expectedSize=toSketchSize(initialLength1+initialLength2, -1, -1, targetSketchSize);
|
jpayne@68
|
196 if(heap==null || heap.capacity()<expectedSize){heap=new SketchHeap(expectedSize, tool.minKeyOccuranceCount, tool.trackCounts);}
|
jpayne@68
|
197 }
|
jpayne@68
|
198
|
jpayne@68
|
199 if(tool.mergePairs && r2!=null){
|
jpayne@68
|
200 final int insert=BBMerge.findOverlapStrict(r1, r2, false);
|
jpayne@68
|
201 if(insert>0){
|
jpayne@68
|
202 heap.genomeSequences++;
|
jpayne@68
|
203 r2.reverseComplement();
|
jpayne@68
|
204 r1=r1.joinRead(insert);
|
jpayne@68
|
205 r2=null;
|
jpayne@68
|
206 }
|
jpayne@68
|
207 }
|
jpayne@68
|
208
|
jpayne@68
|
209 processRead(r1);
|
jpayne@68
|
210 if(r2!=null){processRead(r2);}
|
jpayne@68
|
211
|
jpayne@68
|
212 if(heap.name0()==null){
|
jpayne@68
|
213 heap.setName0(r1.id);
|
jpayne@68
|
214 }
|
jpayne@68
|
215
|
jpayne@68
|
216 TaxNode tn=null;
|
jpayne@68
|
217 if(heap.taxID<0 && r1.length()>800){
|
jpayne@68
|
218 if(taxtree!=null){
|
jpayne@68
|
219 try {
|
jpayne@68
|
220 tn=taxtree.parseNodeFromHeader(r1.id, true);
|
jpayne@68
|
221 } catch (Throwable e) {}
|
jpayne@68
|
222 if(tn!=null){
|
jpayne@68
|
223 heap.taxID=tn.id;
|
jpayne@68
|
224 if(heap.taxName()==null){
|
jpayne@68
|
225 heap.setTaxName(tn.name);
|
jpayne@68
|
226 }
|
jpayne@68
|
227 }
|
jpayne@68
|
228 // System.err.println("A) "+heap.taxID+r1.id);
|
jpayne@68
|
229 }else{
|
jpayne@68
|
230 heap.taxID=TaxTree.parseHeaderStatic(r1.id);
|
jpayne@68
|
231 // System.err.println("B) "+heap.taxID+r1.id);
|
jpayne@68
|
232 }
|
jpayne@68
|
233 }
|
jpayne@68
|
234 assert(heap.taxID<0 || heap.taxName()!=null || taxtree==null) : heap.taxID+", "+heap.taxName()+", "+heap.name()+", "+tn;
|
jpayne@68
|
235 }
|
jpayne@68
|
236
|
jpayne@68
|
237 public void processRead(final Read r){
|
jpayne@68
|
238 if(amino){
|
jpayne@68
|
239 processReadAmino(r);
|
jpayne@68
|
240 }else if(translate){
|
jpayne@68
|
241 processReadTranslated(r);
|
jpayne@68
|
242 }else{
|
jpayne@68
|
243 processReadNucleotide(r);
|
jpayne@68
|
244 }
|
jpayne@68
|
245 }
|
jpayne@68
|
246
|
jpayne@68
|
247 public void processReadTranslated(final Read r){
|
jpayne@68
|
248 assert(!r.aminoacid());
|
jpayne@68
|
249 final ArrayList<Read> prots;
|
jpayne@68
|
250 if(sixframes){
|
jpayne@68
|
251 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
|
jpayne@68
|
252 Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: allow 18S also
|
jpayne@68
|
253 if(orf!=null && orf.length()>=min_SSU_len){
|
jpayne@68
|
254 assert(orf.is16S());
|
jpayne@68
|
255 if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);}
|
jpayne@68
|
256 }
|
jpayne@68
|
257 //TODO: Add 18S.
|
jpayne@68
|
258 }
|
jpayne@68
|
259 prots=TranslateSixFrames.toFrames(r, true, false, 6);
|
jpayne@68
|
260 }else{
|
jpayne@68
|
261 ArrayList<Orf> list;
|
jpayne@68
|
262 list=gCaller.callGenes(r);
|
jpayne@68
|
263 prots=CallGenes.translate(r, list);
|
jpayne@68
|
264 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
|
jpayne@68
|
265 for(Orf orf : list){
|
jpayne@68
|
266 if(orf.is16S() && orf.length()>=min_SSU_len && orf.length()>=heap.r16SLen()){
|
jpayne@68
|
267 heap.set16S(CallGenes.fetch(orf, r).bases);
|
jpayne@68
|
268 break;
|
jpayne@68
|
269 }
|
jpayne@68
|
270 }
|
jpayne@68
|
271 }
|
jpayne@68
|
272 }
|
jpayne@68
|
273 if(prots!=null){
|
jpayne@68
|
274 for(Read p : prots){
|
jpayne@68
|
275 processReadAmino(p);
|
jpayne@68
|
276 }
|
jpayne@68
|
277 }
|
jpayne@68
|
278 }
|
jpayne@68
|
279
|
jpayne@68
|
280 void processReadNucleotide(final Read r){
|
jpayne@68
|
281 if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
|
jpayne@68
|
282 Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: 18S
|
jpayne@68
|
283 if(orf!=null && orf.length()>=min_SSU_len){
|
jpayne@68
|
284 assert(orf.start>=0 && orf.stop<r.length()) : r.length()+"\n"+orf;
|
jpayne@68
|
285 assert(orf.is16S());
|
jpayne@68
|
286 if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);}
|
jpayne@68
|
287 }
|
jpayne@68
|
288 //TODO: Add 18S.
|
jpayne@68
|
289 }
|
jpayne@68
|
290
|
jpayne@68
|
291 final byte[] bases=r.bases;
|
jpayne@68
|
292 final byte[] quals=r.quality;
|
jpayne@68
|
293 final long[] baseCounts=heap.baseCounts(true);
|
jpayne@68
|
294 long kmer=0;
|
jpayne@68
|
295 long rkmer=0;
|
jpayne@68
|
296 int len=0;
|
jpayne@68
|
297 assert(!r.aminoacid());
|
jpayne@68
|
298
|
jpayne@68
|
299 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
|
jpayne@68
|
300 final long min=minHashValue;
|
jpayne@68
|
301 heap.genomeSizeBases+=r.length();
|
jpayne@68
|
302 heap.genomeSequences++;
|
jpayne@68
|
303 if(eTracker!=null){eTracker.clear();}
|
jpayne@68
|
304
|
jpayne@68
|
305 // assert(false) : minProb+", "+minQual+", "+(quals==null);
|
jpayne@68
|
306
|
jpayne@68
|
307 if(quals==null || (minProb<=0 && minQual<2)){
|
jpayne@68
|
308 // System.err.println("A");
|
jpayne@68
|
309 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
310 // System.err.println("B: len="+len);
|
jpayne@68
|
311 byte b=bases[i];
|
jpayne@68
|
312 long x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
313 long x2=AminoAcid.baseToComplementNumber[b];
|
jpayne@68
|
314
|
jpayne@68
|
315 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
316 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
|
jpayne@68
|
317 if(eTracker!=null){eTracker.add(b);}
|
jpayne@68
|
318 if(x<0){
|
jpayne@68
|
319 len=0;
|
jpayne@68
|
320 rkmer=0;
|
jpayne@68
|
321 }else{
|
jpayne@68
|
322 len++;
|
jpayne@68
|
323 baseCounts[(int)x]++;
|
jpayne@68
|
324 }
|
jpayne@68
|
325
|
jpayne@68
|
326 // System.err.println("\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"
|
jpayne@68
|
327 // +AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"
|
jpayne@68
|
328 // +len+", "+(char)b+", "+x+", "+x2+"\n");
|
jpayne@68
|
329
|
jpayne@68
|
330 if(len>=k){
|
jpayne@68
|
331 kmersProcessed++;
|
jpayne@68
|
332 heap.genomeSizeKmers++;
|
jpayne@68
|
333 // heap.probSum++; //Note really necessary for fasta data
|
jpayne@68
|
334 if(eTracker==null || eTracker.passes()){
|
jpayne@68
|
335 // System.err.println("Pass.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString());
|
jpayne@68
|
336
|
jpayne@68
|
337 // assert(kmer==AminoAcid.reverseComplementBinaryFast(rkmer, k)) :
|
jpayne@68
|
338 // "\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"
|
jpayne@68
|
339 // +AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"
|
jpayne@68
|
340 // +len+", "+(char)b+", "+x+", "+x2;
|
jpayne@68
|
341
|
jpayne@68
|
342 final long hashcode=hash(kmer, rkmer);
|
jpayne@68
|
343 // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
|
jpayne@68
|
344 if(hashcode>min){
|
jpayne@68
|
345 if(noBlacklist){
|
jpayne@68
|
346 heap.add(hashcode);
|
jpayne@68
|
347 }else{
|
jpayne@68
|
348 heap.checkAndAdd(hashcode);
|
jpayne@68
|
349 }
|
jpayne@68
|
350 }
|
jpayne@68
|
351 }else{
|
jpayne@68
|
352 // System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()+"\n"+r.toFastq()+"\n"+eTracker);
|
jpayne@68
|
353 // assert(false);
|
jpayne@68
|
354 }
|
jpayne@68
|
355 }
|
jpayne@68
|
356 }
|
jpayne@68
|
357 }else{
|
jpayne@68
|
358 int zeroQualityKmers=0;
|
jpayne@68
|
359 int positiveQualityKmers=0;
|
jpayne@68
|
360
|
jpayne@68
|
361 float prob=1;
|
jpayne@68
|
362 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
363 final byte b=bases[i];
|
jpayne@68
|
364 final long x=AminoAcid.baseToNumber[b];
|
jpayne@68
|
365 final long x2=AminoAcid.baseToComplementNumber[b];
|
jpayne@68
|
366
|
jpayne@68
|
367 kmer=((kmer<<2)|x)&mask;
|
jpayne@68
|
368 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
|
jpayne@68
|
369 if(eTracker!=null){eTracker.add(b);}
|
jpayne@68
|
370
|
jpayne@68
|
371 final byte q=quals[i];
|
jpayne@68
|
372 {//Quality-related stuff
|
jpayne@68
|
373 assert(q>=0) : Arrays.toString(quals)+"\n"+minProb+", "+minQual;
|
jpayne@68
|
374 // if(x>=0){
|
jpayne@68
|
375 // if(q>0){
|
jpayne@68
|
376 // positiveQualityBases++;
|
jpayne@68
|
377 // }else{
|
jpayne@68
|
378 // zeroQualityBases++;
|
jpayne@68
|
379 // }
|
jpayne@68
|
380 // }
|
jpayne@68
|
381 prob=prob*align2.QualityTools.PROB_CORRECT[q];
|
jpayne@68
|
382 if(len>k){
|
jpayne@68
|
383 byte oldq=quals[i-k];
|
jpayne@68
|
384 prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
|
jpayne@68
|
385 }
|
jpayne@68
|
386 if(x<0 || q<minQual){
|
jpayne@68
|
387 len=0;
|
jpayne@68
|
388 kmer=rkmer=0;
|
jpayne@68
|
389 prob=1;
|
jpayne@68
|
390 }else{
|
jpayne@68
|
391 len++;
|
jpayne@68
|
392 baseCounts[(int)x]++;
|
jpayne@68
|
393 }
|
jpayne@68
|
394 }
|
jpayne@68
|
395
|
jpayne@68
|
396 if(len>=k){
|
jpayne@68
|
397 kmersProcessed++;
|
jpayne@68
|
398 if(prob>=minProb){
|
jpayne@68
|
399 heap.genomeSizeKmers++;
|
jpayne@68
|
400 heap.probSum+=prob;
|
jpayne@68
|
401 if(eTracker==null || eTracker.passes()){
|
jpayne@68
|
402 final long hashcode=hash(kmer, rkmer);
|
jpayne@68
|
403 // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
|
jpayne@68
|
404 if(hashcode>min){
|
jpayne@68
|
405 if(noBlacklist){
|
jpayne@68
|
406 heap.add(hashcode);
|
jpayne@68
|
407 }else{
|
jpayne@68
|
408 heap.checkAndAdd(hashcode);
|
jpayne@68
|
409 }
|
jpayne@68
|
410 }
|
jpayne@68
|
411 }else{
|
jpayne@68
|
412 // System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString());
|
jpayne@68
|
413 }
|
jpayne@68
|
414 positiveQualityKmers++;
|
jpayne@68
|
415 }else if(q<=2){
|
jpayne@68
|
416 zeroQualityKmers++;
|
jpayne@68
|
417 }
|
jpayne@68
|
418 }
|
jpayne@68
|
419
|
jpayne@68
|
420 //This version is slow but calculates depth better.
|
jpayne@68
|
421 // if(len>=k){
|
jpayne@68
|
422 // kmersProcessed++;
|
jpayne@68
|
423 // heap.genomeSizeKmers++;
|
jpayne@68
|
424 // final long hash=hash(kmer, rkmer);
|
jpayne@68
|
425 // // System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
|
jpayne@68
|
426 // if(hash>min){
|
jpayne@68
|
427 // if(prob>=minProb || (!heap.setMode && heap.contains(hash))){
|
jpayne@68
|
428 // if(noBlacklist){
|
jpayne@68
|
429 // heap.add(hash);
|
jpayne@68
|
430 // }else{
|
jpayne@68
|
431 // heap.checkAndAdd(hash);
|
jpayne@68
|
432 // }
|
jpayne@68
|
433 // }
|
jpayne@68
|
434 // }
|
jpayne@68
|
435 // }
|
jpayne@68
|
436 }
|
jpayne@68
|
437 if(minProb>0 && zeroQualityKmers>100 && positiveQualityKmers==0){
|
jpayne@68
|
438 if(looksLikePacBio(r)){
|
jpayne@68
|
439 synchronized(this){
|
jpayne@68
|
440 minProb=0;
|
jpayne@68
|
441 }
|
jpayne@68
|
442 processReadNucleotide(r);
|
jpayne@68
|
443 }
|
jpayne@68
|
444 }
|
jpayne@68
|
445 }
|
jpayne@68
|
446 // assert(false);
|
jpayne@68
|
447 }
|
jpayne@68
|
448
|
jpayne@68
|
449 boolean looksLikePacBio(Read r){
|
jpayne@68
|
450 if(r.length()<302 || r.mate!=null){return false;}
|
jpayne@68
|
451 if(r.quality==null){
|
jpayne@68
|
452 int x=Parse.parseZmw(r.id);
|
jpayne@68
|
453 return x>=0;
|
jpayne@68
|
454 }
|
jpayne@68
|
455 int positive=0;
|
jpayne@68
|
456 int zero=0;
|
jpayne@68
|
457 int ns=0;
|
jpayne@68
|
458 for(int i=0; i<r.bases.length; i++){
|
jpayne@68
|
459 byte b=r.bases[i];
|
jpayne@68
|
460 byte q=r.quality[i];
|
jpayne@68
|
461 if(b=='N'){ns++;}
|
jpayne@68
|
462 else if(q==0 || q==2){
|
jpayne@68
|
463 zero++;
|
jpayne@68
|
464 }else{
|
jpayne@68
|
465 positive++;
|
jpayne@68
|
466 }
|
jpayne@68
|
467 }
|
jpayne@68
|
468 return zero>=r.length()/2 && positive==0;
|
jpayne@68
|
469 }
|
jpayne@68
|
470
|
jpayne@68
|
471 void processReadAmino(final Read r){
|
jpayne@68
|
472 final byte[] bases=r.bases;
|
jpayne@68
|
473 long kmer=0;
|
jpayne@68
|
474 int len=0;
|
jpayne@68
|
475 assert(r.aminoacid());
|
jpayne@68
|
476
|
jpayne@68
|
477 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
|
jpayne@68
|
478 final long min=minHashValue;
|
jpayne@68
|
479 heap.genomeSizeBases+=r.length();
|
jpayne@68
|
480 heap.genomeSequences++;
|
jpayne@68
|
481
|
jpayne@68
|
482 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
483 byte b=bases[i];
|
jpayne@68
|
484 long x=AminoAcid.acidToNumberNoStops[b];
|
jpayne@68
|
485 kmer=((kmer<<aminoShift)|x)&mask;
|
jpayne@68
|
486 // if(eTracker!=null){eTracker.add(b);}
|
jpayne@68
|
487
|
jpayne@68
|
488 if(x<0){len=0;}else{len++;}
|
jpayne@68
|
489 if(len>=k){
|
jpayne@68
|
490 kmersProcessed++;
|
jpayne@68
|
491 heap.genomeSizeKmers++;
|
jpayne@68
|
492 // if(eTracker==null || eTracker.passes()){
|
jpayne@68
|
493 // assert(false) : (eTracker==null)+", "+eTracker.cutoff()+", "+eTracker.calcEntropy()+", "+r;
|
jpayne@68
|
494 long hashcode=hash(kmer, kmer);
|
jpayne@68
|
495 if(hashcode>min){
|
jpayne@68
|
496 if(noBlacklist){
|
jpayne@68
|
497 heap.add(hashcode);
|
jpayne@68
|
498 }else{
|
jpayne@68
|
499 heap.checkAndAdd(hashcode);
|
jpayne@68
|
500 }
|
jpayne@68
|
501 }
|
jpayne@68
|
502 // }
|
jpayne@68
|
503 }
|
jpayne@68
|
504 }
|
jpayne@68
|
505 }
|
jpayne@68
|
506
|
jpayne@68
|
507 void processReadAmino_old_no_entropy(final Read r){
|
jpayne@68
|
508 final byte[] bases=r.bases;
|
jpayne@68
|
509 long kmer=0;
|
jpayne@68
|
510 int len=0;
|
jpayne@68
|
511 assert(r.aminoacid());
|
jpayne@68
|
512
|
jpayne@68
|
513 final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
|
jpayne@68
|
514 final long min=minHashValue;
|
jpayne@68
|
515 heap.genomeSizeBases+=r.length();
|
jpayne@68
|
516 heap.genomeSequences++;
|
jpayne@68
|
517
|
jpayne@68
|
518 for(int i=0; i<bases.length; i++){
|
jpayne@68
|
519 byte b=bases[i];
|
jpayne@68
|
520 long x=AminoAcid.acidToNumberNoStops[b];
|
jpayne@68
|
521 kmer=((kmer<<aminoShift)|x)&mask;
|
jpayne@68
|
522 if(x<0){len=0;}else{len++;}
|
jpayne@68
|
523 if(len>=k){
|
jpayne@68
|
524 kmersProcessed++;
|
jpayne@68
|
525 heap.genomeSizeKmers++;
|
jpayne@68
|
526 long hashcode=hash(kmer, kmer);
|
jpayne@68
|
527 if(hashcode>min){
|
jpayne@68
|
528 if(noBlacklist){
|
jpayne@68
|
529 heap.add(hashcode);
|
jpayne@68
|
530 }else{
|
jpayne@68
|
531 heap.checkAndAdd(hashcode);
|
jpayne@68
|
532 }
|
jpayne@68
|
533 }
|
jpayne@68
|
534 }
|
jpayne@68
|
535 }
|
jpayne@68
|
536 }
|
jpayne@68
|
537
|
jpayne@68
|
538 public Sketch toSketch(int minCount){
|
jpayne@68
|
539 Sketch sketch=null;
|
jpayne@68
|
540 if(heap!=null && heap.size()>0){
|
jpayne@68
|
541 try {
|
jpayne@68
|
542 sketch=new Sketch(heap, false, tool.trackCounts, null, minCount);
|
jpayne@68
|
543 } catch (Throwable e) {
|
jpayne@68
|
544 // TODO Auto-generated catch block
|
jpayne@68
|
545 e.printStackTrace();
|
jpayne@68
|
546 }
|
jpayne@68
|
547 heap.clear(false);
|
jpayne@68
|
548 }
|
jpayne@68
|
549 return sketch;
|
jpayne@68
|
550 }
|
jpayne@68
|
551
|
jpayne@68
|
552 public void add(SketchMakerMini smm){
|
jpayne@68
|
553 heap.add(smm.heap);
|
jpayne@68
|
554 readsProcessed+=smm.readsProcessed;
|
jpayne@68
|
555 basesProcessed+=smm.basesProcessed;
|
jpayne@68
|
556 kmersProcessed+=smm.kmersProcessed;
|
jpayne@68
|
557 sketchesMade+=smm.sketchesMade;
|
jpayne@68
|
558 pacBioDetected|=smm.pacBioDetected;
|
jpayne@68
|
559 }
|
jpayne@68
|
560
|
jpayne@68
|
561 /** True only if this thread has completed successfully */
|
jpayne@68
|
562 boolean success=false;
|
jpayne@68
|
563
|
jpayne@68
|
564 SketchHeap heap;
|
jpayne@68
|
565
|
jpayne@68
|
566 final int aminoShift;
|
jpayne@68
|
567 final int shift;
|
jpayne@68
|
568 final int shift2;
|
jpayne@68
|
569 final long mask;
|
jpayne@68
|
570 final EntropyTracker eTracker;
|
jpayne@68
|
571 final GeneCaller gCaller;
|
jpayne@68
|
572
|
jpayne@68
|
573 public float minEntropy() {
|
jpayne@68
|
574 // TODO Auto-generated method stub
|
jpayne@68
|
575 return eTracker==null ? -1 : eTracker.cutoff();
|
jpayne@68
|
576 }
|
jpayne@68
|
577
|
jpayne@68
|
578 /*--------------------------------------------------------------*/
|
jpayne@68
|
579 /*---------------- Fields ----------------*/
|
jpayne@68
|
580 /*--------------------------------------------------------------*/
|
jpayne@68
|
581
|
jpayne@68
|
582 /** Number of reads processed */
|
jpayne@68
|
583 protected long readsProcessed=0;
|
jpayne@68
|
584 /** Number of bases processed */
|
jpayne@68
|
585 protected long basesProcessed=0;
|
jpayne@68
|
586 /** Number of bases processed */
|
jpayne@68
|
587 protected long kmersProcessed=0;
|
jpayne@68
|
588 /** Number of sketches started */
|
jpayne@68
|
589 protected long sketchesMade=0;
|
jpayne@68
|
590
|
jpayne@68
|
591 float minProb() {return minProb;}
|
jpayne@68
|
592 byte minQual() {return minQual;}
|
jpayne@68
|
593 public boolean pacBioDetected=false;
|
jpayne@68
|
594 private float minProb;
|
jpayne@68
|
595 private byte minQual;
|
jpayne@68
|
596
|
jpayne@68
|
597 /*--------------------------------------------------------------*/
|
jpayne@68
|
598 /*---------------- Final Fields ----------------*/
|
jpayne@68
|
599 /*--------------------------------------------------------------*/
|
jpayne@68
|
600
|
jpayne@68
|
601 private final SketchTool tool;
|
jpayne@68
|
602 final int mode;
|
jpayne@68
|
603
|
jpayne@68
|
604 /*--------------------------------------------------------------*/
|
jpayne@68
|
605 /*---------------- Common Fields ----------------*/
|
jpayne@68
|
606 /*--------------------------------------------------------------*/
|
jpayne@68
|
607
|
jpayne@68
|
608 /** Print status messages to this output stream */
|
jpayne@68
|
609 private PrintStream outstream=System.err;
|
jpayne@68
|
610 /** Print verbose messages */
|
jpayne@68
|
611 public static boolean verbose=false;
|
jpayne@68
|
612 /** True if an error was encountered */
|
jpayne@68
|
613 public boolean errorState=false;
|
jpayne@68
|
614
|
jpayne@68
|
615 }
|