Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/RenameGiToTaxid.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package tax; | |
2 | |
3 import java.io.File; | |
4 import java.io.PrintStream; | |
5 import java.util.ArrayList; | |
6 import java.util.LinkedHashSet; | |
7 | |
8 import fileIO.ByteFile; | |
9 import fileIO.ByteFile1; | |
10 import fileIO.ByteFile2; | |
11 import fileIO.ByteStreamWriter; | |
12 import fileIO.FileFormat; | |
13 import fileIO.ReadWrite; | |
14 import kmer.HashArray1D; | |
15 import shared.KillSwitch; | |
16 import shared.Parse; | |
17 import shared.Parser; | |
18 import shared.PreParser; | |
19 import shared.ReadStats; | |
20 import shared.Shared; | |
21 import shared.Timer; | |
22 import shared.Tools; | |
23 import stream.ConcurrentGenericReadInputStream; | |
24 import stream.FASTQ; | |
25 import stream.FastaReadInputStream; | |
26 import structures.ByteBuilder; | |
27 import structures.IntList; | |
28 | |
29 /** | |
30 * @author Brian Bushnell | |
31 * @date Mar 10, 2015 | |
32 * | |
33 */ | |
34 public class RenameGiToTaxid { | |
35 | |
36 public static void main(String[] args){ | |
37 Timer t=new Timer(); | |
38 RenameGiToTaxid x=new RenameGiToTaxid(args); | |
39 x.process(t); | |
40 | |
41 //Close the print stream if it was redirected | |
42 Shared.closeStream(x.outstream); | |
43 } | |
44 | |
45 public RenameGiToTaxid(String[] args){ | |
46 | |
47 {//Preparse block for help, config files, and outstream | |
48 PreParser pp=new PreParser(args, getClass(), false); | |
49 args=pp.args; | |
50 outstream=pp.outstream; | |
51 } | |
52 | |
53 Shared.capBuffers(4); | |
54 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; | |
55 ReadWrite.USE_BGZIP=ReadWrite.USE_UNBGZIP=ReadWrite.PREFER_BGZIP=true; | |
56 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); | |
57 FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false; | |
58 | |
59 Parser parser=new Parser(); | |
60 for(int i=0; i<args.length; i++){ | |
61 String arg=args[i]; | |
62 String[] split=arg.split("="); | |
63 String a=split[0].toLowerCase(); | |
64 String b=split.length>1 ? split[1] : null; | |
65 | |
66 if(a.equals("prefix")){ | |
67 prefix=Parse.parseBoolean(b); | |
68 | |
69 }else if(a.equals("server") || a.equals("useserver")){ | |
70 if(b!=null && b.startsWith("http")){ | |
71 useServer=true; | |
72 String path=b; | |
73 if(!path.endsWith("/")){path+="/";} | |
74 Shared.setTaxServer(path); | |
75 }else{ | |
76 useServer=Parse.parseBoolean(b); | |
77 } | |
78 }else if(a.equals("title")){ | |
79 title=(b==null ? ">" : (">"+b+"|")).getBytes(); | |
80 }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){ | |
81 giTableFile=b; | |
82 }else if(a.equals("accession")){ | |
83 accessionFile=b; | |
84 }else if(a.equals("pattern")){ | |
85 patternFile=b; | |
86 }else if(a.equals("tree") || a.equals("taxtree")){ | |
87 taxTreeFile=b; | |
88 }else if(a.equals("invalid")){ | |
89 outInvalid=b; | |
90 }else if(a.equals("deleteinvalid")){ | |
91 deleteInvalid=Parse.parseBoolean(b); | |
92 }else if(a.equals("badheaders")){ | |
93 badHeaders=b; | |
94 }else if(a.equals("maxbadheaders") || a.equals("maxinvalidheaders")){ | |
95 maxInvalidHeaders=Parse.parseKMG(b); | |
96 }else if(a.equals("keepall")){ | |
97 keepAll=Parse.parseBoolean(b); | |
98 }else if(a.equals("shrinknames")){ | |
99 shrinkNames=Parse.parseBoolean(b); | |
100 }else if(a.equals("warn")){ | |
101 warnBadHeaders=Parse.parseBoolean(b); | |
102 } | |
103 | |
104 else if(a.equals("maxpigzprocesses")){ | |
105 AccessionToTaxid.maxPigzProcesses=Integer.parseInt(b); | |
106 }else if(a.equals("skipparse")){ | |
107 AccessionToTaxid.skipParse=Parse.parseBoolean(b); | |
108 }else if(a.equals("skiphash")){ | |
109 AccessionToTaxid.skipHash=Parse.parseBoolean(b); | |
110 } | |
111 | |
112 else if(a.equals("mode")){ | |
113 if(b!=null && Character.isDigit(b.charAt(0))){ | |
114 mode=Integer.parseInt(b); | |
115 }else if("accession".equalsIgnoreCase(b)){ | |
116 mode=ACCESSION_MODE; | |
117 }else if("unite".equalsIgnoreCase(b)){ | |
118 mode=UNITE_MODE; | |
119 TaxTree.UNITE_MODE=true; | |
120 }else if("gi".equalsIgnoreCase(b)){ | |
121 mode=GI_MODE; | |
122 }else if("header".equalsIgnoreCase(b)){ | |
123 mode=HEADER_MODE; | |
124 }else{ | |
125 assert(false) : "Bad mode: "+b; | |
126 } | |
127 } | |
128 | |
129 else if(a.equals("verbose")){ | |
130 verbose=Parse.parseBoolean(b); | |
131 ByteFile1.verbose=verbose; | |
132 ByteFile2.verbose=verbose; | |
133 stream.FastaReadInputStream.verbose=verbose; | |
134 ConcurrentGenericReadInputStream.verbose=verbose; | |
135 stream.FastqReadInputStream.verbose=verbose; | |
136 ReadWrite.verbose=verbose; | |
137 }else if(a.equals("in") || a.equals("in1")){ | |
138 assert(b!=null) : "Bad parameter: "+arg; | |
139 if(new File(b).exists()){ | |
140 in1.add(b); | |
141 }else{ | |
142 for(String bb : b.split(",")){ | |
143 in1.add(bb); | |
144 } | |
145 } | |
146 }else if(new File(arg).exists()){ //For asterisk expansion | |
147 in1.add(arg); | |
148 }else if(parser.parse(arg, a, b)){ | |
149 //do nothing | |
150 }else{ | |
151 outstream.println("Unknown parameter "+args[i]); | |
152 assert(false) : "Unknown parameter "+args[i]; | |
153 // throw new RuntimeException("Unknown parameter "+args[i]); | |
154 } | |
155 } | |
156 | |
157 if(useServer){ | |
158 giTableFile=null; | |
159 accessionFile=null; | |
160 patternFile=null; | |
161 if(mode!=UNITE_MODE){taxTreeFile=null;} | |
162 }//else if taxpath!=null... set them | |
163 | |
164 {//Process parser fields | |
165 Parser.processQuality(); | |
166 | |
167 maxReads=parser.maxReads; | |
168 | |
169 overwrite=ReadStats.overwrite=parser.overwrite; | |
170 append=ReadStats.append=parser.append; | |
171 | |
172 out1=parser.out1; | |
173 } | |
174 | |
175 if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=TaxTree.defaultTreeFile();} | |
176 if("auto".equalsIgnoreCase(giTableFile)){giTableFile=TaxTree.defaultTableFile();} | |
177 if("auto".equalsIgnoreCase(accessionFile)){accessionFile=TaxTree.defaultAccessionFile();} | |
178 if("auto".equalsIgnoreCase(patternFile)){patternFile=TaxTree.defaultPatternFile();} | |
179 | |
180 assert(FastaReadInputStream.settingsOK()); | |
181 | |
182 if(in1==null || in1.isEmpty()){throw new RuntimeException("Error - at least one input file is required.");} | |
183 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2){ | |
184 ByteFile.FORCE_MODE_BF2=false; | |
185 ByteFile.FORCE_MODE_BF1=true; | |
186 } | |
187 | |
188 if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;} | |
189 assert(out1!=null) : "This program requires an output file."; | |
190 | |
191 if(!Tools.testOutputFiles(overwrite, append, false, out1)){ | |
192 outstream.println((out1==null)+", "+out1); | |
193 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n"); | |
194 } | |
195 if(!Tools.testInputFiles(false, true, in1.toArray(new String[0]))){ | |
196 throw new RuntimeException("\nCan't read some input files.\n"); | |
197 } | |
198 | |
199 ffout1=FileFormat.testOutput(out1, FileFormat.FA, null, true, overwrite, append, false); | |
200 ffoutInvalid=FileFormat.testOutput(outInvalid, FileFormat.FA, null, true, overwrite, append, false); | |
201 ffin1=new ArrayList<FileFormat>(in1.size()); | |
202 for(String s : in1){ | |
203 FileFormat ff=FileFormat.testInput(s, FileFormat.FA, null, true, true); | |
204 ffin1.add(ff); | |
205 } | |
206 | |
207 if(ffoutInvalid!=null){keepAll=false;} | |
208 | |
209 assert(giTableFile!=null || accessionFile!=null || TaxTree.SILVA_MODE || useServer) : "No gi or accession information loaded."; | |
210 | |
211 if(taxTreeFile!=null){ | |
212 tree=TaxTree.loadTaxTree(taxTreeFile, outstream, true, false); | |
213 assert(tree.nameMap!=null); | |
214 }else{ | |
215 tree=null; | |
216 if(!useServer){throw new RuntimeException("No tree specified.");} | |
217 } | |
218 | |
219 if(giTableFile!=null){ | |
220 GiToTaxid.initialize(giTableFile); | |
221 } | |
222 | |
223 if(patternFile!=null){ | |
224 Timer t=new Timer(); | |
225 AnalyzeAccession.loadCodeMap(patternFile); | |
226 outstream.println("Loading pattern table."); | |
227 t.stopAndPrint(); | |
228 } | |
229 | |
230 if(accessionFile!=null){ | |
231 AccessionToTaxid.tree=tree; | |
232 outstream.println("Loading accession table."); | |
233 AccessionToTaxid.load(accessionFile); | |
234 // System.gc(); | |
235 } | |
236 } | |
237 | |
238 void process(Timer t){ | |
239 | |
240 ByteStreamWriter bsw=(ffout1==null ? null : new ByteStreamWriter(ffout1)); //Actually, this is required. | |
241 if(bsw!=null){bsw.start();} | |
242 | |
243 ByteStreamWriter bswInvalid=null; | |
244 if(ffoutInvalid!=null){ | |
245 bswInvalid=new ByteStreamWriter(ffoutInvalid); | |
246 bswInvalid.start(); | |
247 } | |
248 | |
249 ByteStreamWriter bswBadHeaders=null; | |
250 if(badHeaders!=null) { | |
251 bswBadHeaders=new ByteStreamWriter(badHeaders, overwrite, append, false); | |
252 bswBadHeaders.start(); | |
253 } | |
254 | |
255 final HashArray1D counts=(countTable && !prefix) ? new HashArray1D(256000, -1L, true) : null; | |
256 | |
257 gffIn=false; | |
258 for(FileFormat ffin : ffin1){ | |
259 gffIn=gffIn||ffin.gff(); | |
260 ByteFile bf=ByteFile.makeByteFile(ffin); | |
261 if(useServer){ | |
262 processInner_server(bf, bsw, bswInvalid, bswBadHeaders, counts, ffin.format()); | |
263 }else{ | |
264 // IntList list=(useServer ? getIds(bf) : null); | |
265 processInner(bf, bsw, bswInvalid, bswBadHeaders, counts, null); | |
266 } | |
267 } | |
268 | |
269 if(bsw!=null){ | |
270 errorState|=bsw.poisonAndWait(); | |
271 if(deleteInvalid && invalidReads>0 && !ffout1.stdio()){ | |
272 try { | |
273 System.err.println("Deleting "+out1); | |
274 new File(out1).delete(); | |
275 } catch (Exception e) { | |
276 System.err.println("An error occured while attempting to delete "+out1); | |
277 e.printStackTrace(); | |
278 } | |
279 } | |
280 } | |
281 if(bswInvalid!=null){errorState|=bswInvalid.poisonAndWait();} | |
282 if(bswBadHeaders!=null){errorState|=bswBadHeaders.poisonAndWait();} | |
283 | |
284 t.stop(); | |
285 if(!gffIn) { | |
286 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); | |
287 | |
288 outstream.println(); | |
289 outstream.println("Valid Sequences: \t"+validReads); | |
290 outstream.println("Valid Bases: \t"+validBases); | |
291 outstream.println("Invalid Sequences: \t"+invalidReads); | |
292 outstream.println("Invalid Bases: \t"+invalidBases); | |
293 }else{ | |
294 outstream.println(Tools.timeLinesBytesProcessed(t, linesIn, basesProcessed, 8)); | |
295 | |
296 outstream.println(); | |
297 outstream.println("Valid Lines: \t"+validLines); | |
298 outstream.println("Valid Bytes: \t"+validBases); | |
299 outstream.println("Invalid Lines: \t"+invalidLines); | |
300 outstream.println("Invalid Bytes: \t"+invalidBases); | |
301 } | |
302 if(counts!=null){ | |
303 outstream.println("Unique Taxa: \t"+taxaCounted); | |
304 } | |
305 | |
306 if(errorState){ | |
307 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); | |
308 } | |
309 } | |
310 | |
311 //Unused; not efficient | |
312 // public IntList getIds(ByteFile bf){ | |
313 // IntList ids=new IntList(); | |
314 // | |
315 // int readsProcessedInner=0; | |
316 // | |
317 // byte[] line=bf.nextLine(); | |
318 // ByteBuilder bb=new ByteBuilder(); | |
319 // while(line!=null){ | |
320 // if(line.length>0 && line[0]=='>'){ | |
321 // readsProcessedInner++; | |
322 // if(maxReads>0 && readsProcessedInner>maxReads){break;} | |
323 // | |
324 // for(int i=1; i<line.length; i++){ | |
325 // byte b=line[i]; | |
326 // if(b==' ' || b=='.'){break;} | |
327 // else{bb.append(b);} | |
328 // } | |
329 // bb.append(','); | |
330 // if(bb.length()>100000){ | |
331 // bb.setLength(bb.length()-1); | |
332 // int[] ret; | |
333 // if(mode==ACCESSION_MODE){ | |
334 // ret=TaxClient.accessionToTaxidArray(bb.toString()); | |
335 // }else if(mode==GI_MODE){ | |
336 // ret=TaxClient.giToTaxidArray(bb.toString()); | |
337 // }else{ | |
338 // ret=TaxClient.headerToTaxidArray(bb.toString()); | |
339 // } | |
340 // assert(ret!=null) : bb.toString(); | |
341 // for(int i : ret){ids.add(i);} | |
342 // bb.clear(); | |
343 // } | |
344 // } | |
345 // line=bf.nextLine(); | |
346 // } | |
347 // if(bb.length()>0){ | |
348 // bb.setLength(bb.length()-1); | |
349 // int[] ret; | |
350 // if(mode==ACCESSION_MODE){ | |
351 // ret=TaxClient.accessionToTaxidArray(bb.toString()); | |
352 // }else if(mode==GI_MODE){ | |
353 // ret=TaxClient.giToTaxidArray(bb.toString()); | |
354 // }else{ | |
355 // ret=TaxClient.headerToTaxidArray(bb.toString()); | |
356 // } | |
357 // assert(ret!=null) : bb.toString(); | |
358 // for(int i : ret){ids.add(i);} | |
359 // bb.clear(); | |
360 // } | |
361 // | |
362 // bf.reset(); | |
363 // return ids; | |
364 // } | |
365 | |
366 private void processInner(ByteFile bf, ByteStreamWriter bsw, ByteStreamWriter bswInvalid, ByteStreamWriter bswBadHeaders, HashArray1D counts, IntList ids){ | |
367 | |
368 int readsProcessedInner=0; | |
369 | |
370 byte[] line=bf.nextLine(); | |
371 boolean valid=false; | |
372 while(line!=null){ | |
373 if(line.length>0 && line[0]=='>'){ | |
374 readsProcessedInner++; | |
375 readsProcessed++; | |
376 if(maxReads>0 && readsProcessed>maxReads){break;} | |
377 int initial=1, terminal=line.length; | |
378 final int number; | |
379 if(ids==null){ | |
380 final TaxNode tn; | |
381 | |
382 { | |
383 { | |
384 // Handles renumbering when the format is correct but the number is wrong. | |
385 if(Tools.startsWith(line, ">tid|")){ | |
386 initial=6; | |
387 while(initial<=line.length && line[initial-1]!='|'){initial++;} | |
388 }else if(Tools.startsWith(line, ">ncbi|")){ | |
389 initial=7; | |
390 while(initial<=line.length && line[initial-1]!='|'){initial++;} | |
391 } | |
392 } | |
393 | |
394 if(shrinkNames){//This is for nr/nt | |
395 for(int i=initial; i<terminal; i++){ | |
396 if(line[i]==1){//SOH | |
397 terminal=i; | |
398 } | |
399 } | |
400 } | |
401 | |
402 String s=new String(line, initial, terminal-initial); | |
403 | |
404 tn=tree.parseNodeFromHeader(s, true); | |
405 } | |
406 number=(tn==null ? -1 : tn.id); | |
407 }else{ | |
408 number=ids.get((int)(readsProcessedInner-1)); | |
409 | |
410 if(shrinkNames){//This is for nr/nt | |
411 for(int i=initial; i<terminal; i++){ | |
412 if(line[i]==1){//SOH | |
413 terminal=i; | |
414 } | |
415 } | |
416 } | |
417 } | |
418 | |
419 valid=(number>=0); | |
420 if(valid){ | |
421 validReads++; | |
422 bsw.print(title); | |
423 bsw.print(number); | |
424 if(prefix){ | |
425 bsw.print('|'); | |
426 for(int i=initial; i<terminal; i++){ | |
427 bsw.print(line[i]); | |
428 } | |
429 }else if(counts!=null){ | |
430 bsw.print('|'); | |
431 int count=counts.increment(number, 1); | |
432 bsw.print(count); | |
433 if(count==1){taxaCounted++;} | |
434 } | |
435 bsw.println(); | |
436 }else{ | |
437 invalidReads++; | |
438 if(deleteInvalid){ | |
439 System.err.println("Invalid sequence detected; aborting.\n"); | |
440 break; | |
441 } | |
442 if(bswBadHeaders!=null){bswBadHeaders.println(line);} | |
443 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){ | |
444 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders+"\n"+new String(line)); | |
445 } | |
446 if(keepAll){ | |
447 if(shrinkNames){ | |
448 for(int i=0; i<terminal; i++){ | |
449 bsw.print(line[i]); | |
450 } | |
451 bsw.println(); | |
452 }else{ | |
453 bsw.println(line); | |
454 } | |
455 }else if(bswInvalid!=null){ | |
456 if(shrinkNames){ | |
457 for(int i=0; i<terminal; i++){ | |
458 bswInvalid.print(line[i]); | |
459 } | |
460 bswInvalid.println(); | |
461 }else{ | |
462 bswInvalid.println(line); | |
463 } | |
464 } | |
465 } | |
466 }else{ | |
467 basesProcessed+=line.length; | |
468 if(valid || keepAll){ | |
469 if(valid){validBases+=line.length;} | |
470 else{invalidBases+=line.length;} | |
471 bsw.println(line); | |
472 }else{ | |
473 invalidBases+=line.length; | |
474 if(bswInvalid!=null){ | |
475 bswInvalid.println(line); | |
476 } | |
477 } | |
478 } | |
479 line=bf.nextLine(); | |
480 } | |
481 | |
482 errorState|=bf.close(); | |
483 } | |
484 | |
485 private static boolean looksLikeRealAccession(byte[] line){ | |
486 int space=Tools.indexOf(line, ' '); | |
487 if(space<0){space=line.length;} | |
488 if(space>18 || space<4){return false;} | |
489 //... hmm... this is a pretty short list for false cases! | |
490 int dot=-1; | |
491 for(int i=0; i<space; i++){ | |
492 if(line[i]=='.'){ | |
493 if(dot>=0){return false;}//Only 1 dot allowed | |
494 dot=i; | |
495 } | |
496 } | |
497 if(dot>0){ | |
498 if(dot!=space-2){return false;} | |
499 } | |
500 for(int i=0; i<space; i++){ | |
501 byte b=line[i]; | |
502 if(b!='_' && b!='-' && b!='.' && !Tools.isLetterOrDigit(b)){return false;} | |
503 } | |
504 return true; | |
505 } | |
506 | |
507 void appendHeaderLine(byte[] line, ByteBuilder bb){ | |
508 assert(line[0]=='>' || line[0]=='@') : new String(line); | |
509 | |
510 if(mode==ACCESSION_MODE){ | |
511 for(int i=1; i<line.length; i++){ | |
512 byte b=line[i]; | |
513 if(b==' ' || b=='.'){break;} | |
514 else{bb.append(b);} | |
515 } | |
516 }else if(mode==GI_MODE){ | |
517 for(int i=1; i<line.length; i++){ | |
518 byte b=line[i]; | |
519 if(b==' ' || b=='|'){break;} | |
520 else{bb.append(b);} | |
521 } | |
522 }else if(mode==UNITE_MODE){ | |
523 int initial=Tools.indexOf(line, '|'); | |
524 for(int i=initial+1; i<line.length; i++){ | |
525 byte b=line[i]; | |
526 if(b==' ' || b=='.' || b=='|'){break;} | |
527 else{bb.append(b);} | |
528 } | |
529 }else{ | |
530 for(int i=1; i<line.length; i++){ | |
531 byte b=line[i]; | |
532 bb.append(b); | |
533 } | |
534 } | |
535 bb.append(','); | |
536 } | |
537 | |
538 private void updateHeadersFromServer(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders, int format){ | |
539 if(format==FileFormat.FA){ | |
540 updateHeadersFromServer_fasta(lines, counts, bswBadHeaders); | |
541 }else if(format==FileFormat.GFF){ | |
542 updateHeadersFromServer_gff(lines, counts, bswBadHeaders); | |
543 }else{ | |
544 assert(false) : "Unsupported type: "+format; | |
545 } | |
546 } | |
547 | |
548 private void updateHeadersFromServer_fasta(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders){ | |
549 ByteBuilder bb=new ByteBuilder(); | |
550 ArrayList<String> names=new ArrayList<String>(); | |
551 for(byte[] line : lines){ | |
552 if(line[0]=='>' && !Tools.startsWith(line, ">tid")){ | |
553 appendHeaderLine(line, bb); | |
554 if(mode==UNITE_MODE){ | |
555 int bar=Tools.indexOf(line, '|'); | |
556 names.add(new String(line, 1, bar-1)); | |
557 } | |
558 } | |
559 } | |
560 if(bb.length()<1){return;} | |
561 | |
562 assert(bb.endsWith(',')); | |
563 bb.length--; | |
564 | |
565 // System.err.println("Sending '"+bb+"'"); | |
566 | |
567 final int[] serverIds; | |
568 if(mode==ACCESSION_MODE || mode==UNITE_MODE){ | |
569 serverIds=TaxClient.accessionToTaxidArray(bb.toString()); | |
570 }else if(mode==GI_MODE){ | |
571 serverIds=TaxClient.giToTaxidArray(bb.toString()); | |
572 }else{ | |
573 serverIds=TaxClient.headerToTaxidArray(bb.toString()); | |
574 } | |
575 assert(serverIds!=null) : "Null response for '"+bb.toString()+"'"; | |
576 bb.clear(); | |
577 | |
578 if(!names.isEmpty()){ | |
579 assert(tree!=null) : "Need to load a TaxTree."; | |
580 assert(names.size()==serverIds.length); | |
581 for(int i=0; i<serverIds.length; i++){ | |
582 final String name=names.get(i); | |
583 if(serverIds[i]<0){ | |
584 TaxNode tn=tree.getNodeByName(name); | |
585 if(tn!=null){serverIds[i]=tn.id;} | |
586 // else { | |
587 // assert(false) : names.get(i); | |
588 // } | |
589 }else{ | |
590 //Sometimes the species gets renamed. | |
591 // TaxNode tn=tree.getNodeByName(name); | |
592 // if(tn==null || tn.id==serverIds[i]) {System.err.println(name+", "+serverIds[i]+", "+tn+", "+tree.getNodesByName(name));} | |
593 } | |
594 } | |
595 } | |
596 | |
597 for(int lineNum=0, serverNum=0; lineNum<=lines.size(); lineNum++){ | |
598 byte[] line=lines.get(lineNum); | |
599 if(line[0]=='>' && !Tools.startsWith(line, ">tid")){ | |
600 bb.clear(); | |
601 final int tid=serverIds[serverNum]; | |
602 if(tid<0){ | |
603 //WARN | |
604 if(bswBadHeaders!=null){ | |
605 bswBadHeaders.print(tid).tab(); | |
606 bswBadHeaders.print(looksLikeRealAccession(line)).tab(); | |
607 bswBadHeaders.println(line); | |
608 }else if(warnBadHeaders){ | |
609 System.err.println(tid+"\t"+looksLikeRealAccession(line)+"\t"+new String(line)); | |
610 } | |
611 } | |
612 int initial=1, terminal=line.length; | |
613 if(shrinkNames){//This is for nr/nt | |
614 for(int i=initial; i<terminal; i++){ | |
615 if(line[i]==1){//SOH | |
616 terminal=i; | |
617 } | |
618 } | |
619 } | |
620 | |
621 bb.append(title); | |
622 bb.append(tid); | |
623 if(prefix){ | |
624 bb.append('|'); | |
625 for(int i=initial; i<terminal; i++){ | |
626 bb.append(line[i]); | |
627 } | |
628 }else if(counts!=null && tid>=0){ | |
629 bb.append('|'); | |
630 int count=counts.increment(tid, 1); | |
631 bb.append(count); | |
632 if(count==1){taxaCounted++;} | |
633 } | |
634 | |
635 lines.set(lineNum, bb.toBytes()); | |
636 | |
637 serverNum++; | |
638 if(serverNum>=serverIds.length){break;} | |
639 } | |
640 } | |
641 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){ | |
642 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders); | |
643 } | |
644 } | |
645 | |
646 private void updateHeadersFromServer_gff(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders){ | |
647 ByteBuilder bb=new ByteBuilder(); | |
648 ArrayList<String> names=new ArrayList<String>(); | |
649 for(byte[] line : lines){ | |
650 if(line[0]!='#' && !Tools.startsWith(line, "tid")){ | |
651 if(bb.length()>0){bb.append(',');} | |
652 for(byte b : line){ | |
653 if(b=='\t'){break;} | |
654 bb.append(b); | |
655 } | |
656 } | |
657 } | |
658 if(bb.length()<1){return;} | |
659 | |
660 // assert(false) : bb; | |
661 | |
662 // System.err.println("Sending '"+bb+"'"); | |
663 | |
664 int[] serverIds; | |
665 if(mode==ACCESSION_MODE || mode==UNITE_MODE){ | |
666 serverIds=TaxClient.accessionToTaxidArray(bb.toString()); | |
667 }else if(mode==GI_MODE){ | |
668 serverIds=TaxClient.giToTaxidArray(bb.toString()); | |
669 }else{ | |
670 serverIds=TaxClient.headerToTaxidArray(bb.toString()); | |
671 } | |
672 if(serverIds==null){ | |
673 KillSwitch.kill("Null response for '"+bb.toString()+"'"); | |
674 } | |
675 // assert(serverIds!=null) : "Null response for '"+bb.toString()+"'"; | |
676 bb.clear(); | |
677 | |
678 if(!names.isEmpty()){ | |
679 assert(tree!=null) : "Need to load a TaxTree."; | |
680 assert(names.size()==serverIds.length); | |
681 for(int i=0; i<serverIds.length; i++){ | |
682 final String name=names.get(i); | |
683 if(serverIds[i]<0){ | |
684 TaxNode tn=tree.getNodeByName(name); | |
685 if(tn!=null){serverIds[i]=tn.id;} | |
686 // else { | |
687 // assert(false) : names.get(i); | |
688 // } | |
689 }else{ | |
690 //Sometimes the species gets renamed. | |
691 // TaxNode tn=tree.getNodeByName(name); | |
692 // if(tn==null || tn.id==serverIds[i]) {System.err.println(name+", "+serverIds[i]+", "+tn+", "+tree.getNodesByName(name));} | |
693 } | |
694 } | |
695 } | |
696 | |
697 for(int lineNum=0, serverNum=0; lineNum<=lines.size(); lineNum++){ | |
698 byte[] line=lines.get(lineNum); | |
699 if(line[0]!='#' && !Tools.startsWith(line, "tid")){ | |
700 bb.clear(); | |
701 final int tid=serverIds[serverNum]; | |
702 if(tid<0){ | |
703 //WARN | |
704 if(bswBadHeaders!=null){ | |
705 bswBadHeaders.print(tid).tab(); | |
706 bswBadHeaders.print(looksLikeRealAccession(line)).tab(); | |
707 bswBadHeaders.println(line); | |
708 }else if(warnBadHeaders){ | |
709 System.err.println(tid+"\t"+looksLikeRealAccession(line)+"\t"+new String(line)); | |
710 } | |
711 } | |
712 | |
713 bb.append("tid|"); | |
714 bb.append(tid); | |
715 if(prefix){ | |
716 bb.append('|'); | |
717 bb.append(line); | |
718 }else if(counts!=null && tid>=0){ | |
719 bb.append('|'); | |
720 int count=counts.increment(tid, 1); | |
721 bb.append(count); | |
722 if(count==1){taxaCounted++;} | |
723 } | |
724 | |
725 lines.set(lineNum, bb.toBytes()); | |
726 | |
727 serverNum++; | |
728 if(serverNum>=serverIds.length){break;} | |
729 } | |
730 } | |
731 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){ | |
732 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders); | |
733 } | |
734 } | |
735 | |
736 private void processInner_server(ByteFile bf, ByteStreamWriter bsw, ByteStreamWriter bswInvalid, ByteStreamWriter bswBadHeaders, HashArray1D counts, int format){ | |
737 | |
738 ArrayList<byte[]> lines=new ArrayList<byte[]>(); | |
739 byte[] line=bf.nextLine(); | |
740 boolean valid=false; | |
741 long storedBytes=0; | |
742 | |
743 while(line!=null){ | |
744 | |
745 if(line.length>0){ | |
746 linesIn++; | |
747 lines.add(line); | |
748 storedBytes+=line.length; | |
749 if(storedBytes>=maxStoredBytes){ | |
750 updateHeadersFromServer(lines, counts, bswBadHeaders, format); | |
751 valid=dumpBuffer(lines, valid, bsw, bswInvalid); | |
752 lines=new ArrayList<byte[]>(); | |
753 storedBytes=0; | |
754 if(deleteInvalid && invalidReads>0){ | |
755 System.err.println("Invalid sequence detected; aborting.\n" | |
756 + "Input file: \t"+bf.name()+"\n" | |
757 + "Output file: \t"+(bsw==null ? "null" : bsw.fname)+"\n" | |
758 + "Line: \t"+new String(line)+"\n"); | |
759 break; | |
760 } | |
761 } | |
762 } | |
763 line=bf.nextLine(); | |
764 } | |
765 | |
766 if(storedBytes>0){ | |
767 updateHeadersFromServer(lines, counts, bswBadHeaders, format); | |
768 valid=dumpBuffer(lines, valid, bsw, bswInvalid); | |
769 lines=new ArrayList<byte[]>(); | |
770 storedBytes=0; | |
771 } | |
772 | |
773 errorState|=bf.close(); | |
774 } | |
775 | |
776 private boolean dumpBuffer(ArrayList<byte[]> lines, boolean valid, ByteStreamWriter bsw, ByteStreamWriter bswInvalid){ | |
777 | |
778 for(byte[] line : lines){ | |
779 | |
780 if(line.length>0 && line[0]=='>'){ | |
781 readsProcessed++; | |
782 if(maxReads>0 && readsProcessed>maxReads){break;} | |
783 | |
784 if(Tools.startsWith(line, invalidTitle)){ | |
785 valid=false; | |
786 invalidReads++; | |
787 invalidLines++; | |
788 if(deleteInvalid){break;} | |
789 }else{ | |
790 assert(Tools.startsWith(line, title)); | |
791 valid=true; | |
792 validReads++; | |
793 validLines++; | |
794 } | |
795 }else if(gffIn){ | |
796 basesProcessed+=line.length; | |
797 valid=!Tools.startsWith(line, invalidGffTitle); | |
798 if(valid){ | |
799 validBases+=line.length; | |
800 validLines++; | |
801 }else{ | |
802 invalidBases+=line.length; | |
803 invalidLines++; | |
804 } | |
805 }else{ | |
806 basesProcessed+=line.length; | |
807 if(valid){ | |
808 validBases+=line.length; | |
809 validLines++; | |
810 }else{ | |
811 invalidBases+=line.length; | |
812 invalidLines++; | |
813 } | |
814 } | |
815 | |
816 if(valid || keepAll){ | |
817 if(bsw!=null){bsw.println(line);} | |
818 }else{ | |
819 if(bswInvalid!=null){bswInvalid.println(line);} | |
820 } | |
821 } | |
822 return valid; | |
823 } | |
824 | |
825 /*--------------------------------------------------------------*/ | |
826 | |
827 | |
828 /*--------------------------------------------------------------*/ | |
829 | |
830 private LinkedHashSet<String> in1=new LinkedHashSet<String>(); | |
831 private String out1=null; | |
832 private String outInvalid=null; | |
833 private String badHeaders=null; | |
834 | |
835 private String taxTreeFile=null; | |
836 private String giTableFile=null; | |
837 private String accessionFile=null; | |
838 private String patternFile=null; | |
839 | |
840 /*--------------------------------------------------------------*/ | |
841 | |
842 private long maxReads=-1; | |
843 | |
844 private long validReads=0; | |
845 private long validBases=0; | |
846 private long invalidReads=0; | |
847 private long invalidBases=0; | |
848 private long taxaCounted=0; | |
849 | |
850 private long linesIn=0; | |
851 private long validLines=0; | |
852 private long invalidLines=0; | |
853 | |
854 private long maxStoredBytes=10000000; | |
855 | |
856 private long readsProcessed=0, basesProcessed=0; | |
857 | |
858 private boolean prefix=true; | |
859 private boolean countTable=true; | |
860 private boolean keepAll=true; | |
861 private boolean shrinkNames=false; | |
862 private boolean warnBadHeaders=true; | |
863 private boolean useServer=false; | |
864 /** Crash if the number of invalid headers exceeds this */ | |
865 private long maxInvalidHeaders=-1; | |
866 /** Delete the output file if there are any invalid headers */ | |
867 private boolean deleteInvalid=false; | |
868 | |
869 private int mode; | |
870 private static final int ACCESSION_MODE=0, GI_MODE=1, HEADER_MODE=2, UNITE_MODE=3; | |
871 | |
872 private boolean gffIn=false; | |
873 | |
874 /*--------------------------------------------------------------*/ | |
875 | |
876 private final ArrayList<FileFormat> ffin1; | |
877 private final FileFormat ffout1; | |
878 private final FileFormat ffoutInvalid; | |
879 private final TaxTree tree; | |
880 | |
881 /*--------------------------------------------------------------*/ | |
882 | |
883 private PrintStream outstream=System.err; | |
884 public static boolean verbose=false; | |
885 public boolean errorState=false; | |
886 private boolean overwrite=false; | |
887 private boolean append=false; | |
888 | |
889 private static byte[] title=">tid|".getBytes(); | |
890 private static byte[] invalidTitle=">tid|-1".getBytes(); | |
891 private static byte[] invalidGffTitle="tid|-1".getBytes(); | |
892 | |
893 } |