comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/RenameGiToTaxid.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package tax;
2
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.LinkedHashSet;
7
8 import fileIO.ByteFile;
9 import fileIO.ByteFile1;
10 import fileIO.ByteFile2;
11 import fileIO.ByteStreamWriter;
12 import fileIO.FileFormat;
13 import fileIO.ReadWrite;
14 import kmer.HashArray1D;
15 import shared.KillSwitch;
16 import shared.Parse;
17 import shared.Parser;
18 import shared.PreParser;
19 import shared.ReadStats;
20 import shared.Shared;
21 import shared.Timer;
22 import shared.Tools;
23 import stream.ConcurrentGenericReadInputStream;
24 import stream.FASTQ;
25 import stream.FastaReadInputStream;
26 import structures.ByteBuilder;
27 import structures.IntList;
28
29 /**
30 * @author Brian Bushnell
31 * @date Mar 10, 2015
32 *
33 */
34 public class RenameGiToTaxid {
35
36 public static void main(String[] args){
37 Timer t=new Timer();
38 RenameGiToTaxid x=new RenameGiToTaxid(args);
39 x.process(t);
40
41 //Close the print stream if it was redirected
42 Shared.closeStream(x.outstream);
43 }
44
45 public RenameGiToTaxid(String[] args){
46
47 {//Preparse block for help, config files, and outstream
48 PreParser pp=new PreParser(args, getClass(), false);
49 args=pp.args;
50 outstream=pp.outstream;
51 }
52
53 Shared.capBuffers(4);
54 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
55 ReadWrite.USE_BGZIP=ReadWrite.USE_UNBGZIP=ReadWrite.PREFER_BGZIP=true;
56 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
57 FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
58
59 Parser parser=new Parser();
60 for(int i=0; i<args.length; i++){
61 String arg=args[i];
62 String[] split=arg.split("=");
63 String a=split[0].toLowerCase();
64 String b=split.length>1 ? split[1] : null;
65
66 if(a.equals("prefix")){
67 prefix=Parse.parseBoolean(b);
68
69 }else if(a.equals("server") || a.equals("useserver")){
70 if(b!=null && b.startsWith("http")){
71 useServer=true;
72 String path=b;
73 if(!path.endsWith("/")){path+="/";}
74 Shared.setTaxServer(path);
75 }else{
76 useServer=Parse.parseBoolean(b);
77 }
78 }else if(a.equals("title")){
79 title=(b==null ? ">" : (">"+b+"|")).getBytes();
80 }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
81 giTableFile=b;
82 }else if(a.equals("accession")){
83 accessionFile=b;
84 }else if(a.equals("pattern")){
85 patternFile=b;
86 }else if(a.equals("tree") || a.equals("taxtree")){
87 taxTreeFile=b;
88 }else if(a.equals("invalid")){
89 outInvalid=b;
90 }else if(a.equals("deleteinvalid")){
91 deleteInvalid=Parse.parseBoolean(b);
92 }else if(a.equals("badheaders")){
93 badHeaders=b;
94 }else if(a.equals("maxbadheaders") || a.equals("maxinvalidheaders")){
95 maxInvalidHeaders=Parse.parseKMG(b);
96 }else if(a.equals("keepall")){
97 keepAll=Parse.parseBoolean(b);
98 }else if(a.equals("shrinknames")){
99 shrinkNames=Parse.parseBoolean(b);
100 }else if(a.equals("warn")){
101 warnBadHeaders=Parse.parseBoolean(b);
102 }
103
104 else if(a.equals("maxpigzprocesses")){
105 AccessionToTaxid.maxPigzProcesses=Integer.parseInt(b);
106 }else if(a.equals("skipparse")){
107 AccessionToTaxid.skipParse=Parse.parseBoolean(b);
108 }else if(a.equals("skiphash")){
109 AccessionToTaxid.skipHash=Parse.parseBoolean(b);
110 }
111
112 else if(a.equals("mode")){
113 if(b!=null && Character.isDigit(b.charAt(0))){
114 mode=Integer.parseInt(b);
115 }else if("accession".equalsIgnoreCase(b)){
116 mode=ACCESSION_MODE;
117 }else if("unite".equalsIgnoreCase(b)){
118 mode=UNITE_MODE;
119 TaxTree.UNITE_MODE=true;
120 }else if("gi".equalsIgnoreCase(b)){
121 mode=GI_MODE;
122 }else if("header".equalsIgnoreCase(b)){
123 mode=HEADER_MODE;
124 }else{
125 assert(false) : "Bad mode: "+b;
126 }
127 }
128
129 else if(a.equals("verbose")){
130 verbose=Parse.parseBoolean(b);
131 ByteFile1.verbose=verbose;
132 ByteFile2.verbose=verbose;
133 stream.FastaReadInputStream.verbose=verbose;
134 ConcurrentGenericReadInputStream.verbose=verbose;
135 stream.FastqReadInputStream.verbose=verbose;
136 ReadWrite.verbose=verbose;
137 }else if(a.equals("in") || a.equals("in1")){
138 assert(b!=null) : "Bad parameter: "+arg;
139 if(new File(b).exists()){
140 in1.add(b);
141 }else{
142 for(String bb : b.split(",")){
143 in1.add(bb);
144 }
145 }
146 }else if(new File(arg).exists()){ //For asterisk expansion
147 in1.add(arg);
148 }else if(parser.parse(arg, a, b)){
149 //do nothing
150 }else{
151 outstream.println("Unknown parameter "+args[i]);
152 assert(false) : "Unknown parameter "+args[i];
153 // throw new RuntimeException("Unknown parameter "+args[i]);
154 }
155 }
156
157 if(useServer){
158 giTableFile=null;
159 accessionFile=null;
160 patternFile=null;
161 if(mode!=UNITE_MODE){taxTreeFile=null;}
162 }//else if taxpath!=null... set them
163
164 {//Process parser fields
165 Parser.processQuality();
166
167 maxReads=parser.maxReads;
168
169 overwrite=ReadStats.overwrite=parser.overwrite;
170 append=ReadStats.append=parser.append;
171
172 out1=parser.out1;
173 }
174
175 if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=TaxTree.defaultTreeFile();}
176 if("auto".equalsIgnoreCase(giTableFile)){giTableFile=TaxTree.defaultTableFile();}
177 if("auto".equalsIgnoreCase(accessionFile)){accessionFile=TaxTree.defaultAccessionFile();}
178 if("auto".equalsIgnoreCase(patternFile)){patternFile=TaxTree.defaultPatternFile();}
179
180 assert(FastaReadInputStream.settingsOK());
181
182 if(in1==null || in1.isEmpty()){throw new RuntimeException("Error - at least one input file is required.");}
183 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2){
184 ByteFile.FORCE_MODE_BF2=false;
185 ByteFile.FORCE_MODE_BF1=true;
186 }
187
188 if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
189 assert(out1!=null) : "This program requires an output file.";
190
191 if(!Tools.testOutputFiles(overwrite, append, false, out1)){
192 outstream.println((out1==null)+", "+out1);
193 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
194 }
195 if(!Tools.testInputFiles(false, true, in1.toArray(new String[0]))){
196 throw new RuntimeException("\nCan't read some input files.\n");
197 }
198
199 ffout1=FileFormat.testOutput(out1, FileFormat.FA, null, true, overwrite, append, false);
200 ffoutInvalid=FileFormat.testOutput(outInvalid, FileFormat.FA, null, true, overwrite, append, false);
201 ffin1=new ArrayList<FileFormat>(in1.size());
202 for(String s : in1){
203 FileFormat ff=FileFormat.testInput(s, FileFormat.FA, null, true, true);
204 ffin1.add(ff);
205 }
206
207 if(ffoutInvalid!=null){keepAll=false;}
208
209 assert(giTableFile!=null || accessionFile!=null || TaxTree.SILVA_MODE || useServer) : "No gi or accession information loaded.";
210
211 if(taxTreeFile!=null){
212 tree=TaxTree.loadTaxTree(taxTreeFile, outstream, true, false);
213 assert(tree.nameMap!=null);
214 }else{
215 tree=null;
216 if(!useServer){throw new RuntimeException("No tree specified.");}
217 }
218
219 if(giTableFile!=null){
220 GiToTaxid.initialize(giTableFile);
221 }
222
223 if(patternFile!=null){
224 Timer t=new Timer();
225 AnalyzeAccession.loadCodeMap(patternFile);
226 outstream.println("Loading pattern table.");
227 t.stopAndPrint();
228 }
229
230 if(accessionFile!=null){
231 AccessionToTaxid.tree=tree;
232 outstream.println("Loading accession table.");
233 AccessionToTaxid.load(accessionFile);
234 // System.gc();
235 }
236 }
237
238 void process(Timer t){
239
240 ByteStreamWriter bsw=(ffout1==null ? null : new ByteStreamWriter(ffout1)); //Actually, this is required.
241 if(bsw!=null){bsw.start();}
242
243 ByteStreamWriter bswInvalid=null;
244 if(ffoutInvalid!=null){
245 bswInvalid=new ByteStreamWriter(ffoutInvalid);
246 bswInvalid.start();
247 }
248
249 ByteStreamWriter bswBadHeaders=null;
250 if(badHeaders!=null) {
251 bswBadHeaders=new ByteStreamWriter(badHeaders, overwrite, append, false);
252 bswBadHeaders.start();
253 }
254
255 final HashArray1D counts=(countTable && !prefix) ? new HashArray1D(256000, -1L, true) : null;
256
257 gffIn=false;
258 for(FileFormat ffin : ffin1){
259 gffIn=gffIn||ffin.gff();
260 ByteFile bf=ByteFile.makeByteFile(ffin);
261 if(useServer){
262 processInner_server(bf, bsw, bswInvalid, bswBadHeaders, counts, ffin.format());
263 }else{
264 // IntList list=(useServer ? getIds(bf) : null);
265 processInner(bf, bsw, bswInvalid, bswBadHeaders, counts, null);
266 }
267 }
268
269 if(bsw!=null){
270 errorState|=bsw.poisonAndWait();
271 if(deleteInvalid && invalidReads>0 && !ffout1.stdio()){
272 try {
273 System.err.println("Deleting "+out1);
274 new File(out1).delete();
275 } catch (Exception e) {
276 System.err.println("An error occured while attempting to delete "+out1);
277 e.printStackTrace();
278 }
279 }
280 }
281 if(bswInvalid!=null){errorState|=bswInvalid.poisonAndWait();}
282 if(bswBadHeaders!=null){errorState|=bswBadHeaders.poisonAndWait();}
283
284 t.stop();
285 if(!gffIn) {
286 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
287
288 outstream.println();
289 outstream.println("Valid Sequences: \t"+validReads);
290 outstream.println("Valid Bases: \t"+validBases);
291 outstream.println("Invalid Sequences: \t"+invalidReads);
292 outstream.println("Invalid Bases: \t"+invalidBases);
293 }else{
294 outstream.println(Tools.timeLinesBytesProcessed(t, linesIn, basesProcessed, 8));
295
296 outstream.println();
297 outstream.println("Valid Lines: \t"+validLines);
298 outstream.println("Valid Bytes: \t"+validBases);
299 outstream.println("Invalid Lines: \t"+invalidLines);
300 outstream.println("Invalid Bytes: \t"+invalidBases);
301 }
302 if(counts!=null){
303 outstream.println("Unique Taxa: \t"+taxaCounted);
304 }
305
306 if(errorState){
307 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
308 }
309 }
310
311 //Unused; not efficient
312 // public IntList getIds(ByteFile bf){
313 // IntList ids=new IntList();
314 //
315 // int readsProcessedInner=0;
316 //
317 // byte[] line=bf.nextLine();
318 // ByteBuilder bb=new ByteBuilder();
319 // while(line!=null){
320 // if(line.length>0 && line[0]=='>'){
321 // readsProcessedInner++;
322 // if(maxReads>0 && readsProcessedInner>maxReads){break;}
323 //
324 // for(int i=1; i<line.length; i++){
325 // byte b=line[i];
326 // if(b==' ' || b=='.'){break;}
327 // else{bb.append(b);}
328 // }
329 // bb.append(',');
330 // if(bb.length()>100000){
331 // bb.setLength(bb.length()-1);
332 // int[] ret;
333 // if(mode==ACCESSION_MODE){
334 // ret=TaxClient.accessionToTaxidArray(bb.toString());
335 // }else if(mode==GI_MODE){
336 // ret=TaxClient.giToTaxidArray(bb.toString());
337 // }else{
338 // ret=TaxClient.headerToTaxidArray(bb.toString());
339 // }
340 // assert(ret!=null) : bb.toString();
341 // for(int i : ret){ids.add(i);}
342 // bb.clear();
343 // }
344 // }
345 // line=bf.nextLine();
346 // }
347 // if(bb.length()>0){
348 // bb.setLength(bb.length()-1);
349 // int[] ret;
350 // if(mode==ACCESSION_MODE){
351 // ret=TaxClient.accessionToTaxidArray(bb.toString());
352 // }else if(mode==GI_MODE){
353 // ret=TaxClient.giToTaxidArray(bb.toString());
354 // }else{
355 // ret=TaxClient.headerToTaxidArray(bb.toString());
356 // }
357 // assert(ret!=null) : bb.toString();
358 // for(int i : ret){ids.add(i);}
359 // bb.clear();
360 // }
361 //
362 // bf.reset();
363 // return ids;
364 // }
365
366 private void processInner(ByteFile bf, ByteStreamWriter bsw, ByteStreamWriter bswInvalid, ByteStreamWriter bswBadHeaders, HashArray1D counts, IntList ids){
367
368 int readsProcessedInner=0;
369
370 byte[] line=bf.nextLine();
371 boolean valid=false;
372 while(line!=null){
373 if(line.length>0 && line[0]=='>'){
374 readsProcessedInner++;
375 readsProcessed++;
376 if(maxReads>0 && readsProcessed>maxReads){break;}
377 int initial=1, terminal=line.length;
378 final int number;
379 if(ids==null){
380 final TaxNode tn;
381
382 {
383 {
384 // Handles renumbering when the format is correct but the number is wrong.
385 if(Tools.startsWith(line, ">tid|")){
386 initial=6;
387 while(initial<=line.length && line[initial-1]!='|'){initial++;}
388 }else if(Tools.startsWith(line, ">ncbi|")){
389 initial=7;
390 while(initial<=line.length && line[initial-1]!='|'){initial++;}
391 }
392 }
393
394 if(shrinkNames){//This is for nr/nt
395 for(int i=initial; i<terminal; i++){
396 if(line[i]==1){//SOH
397 terminal=i;
398 }
399 }
400 }
401
402 String s=new String(line, initial, terminal-initial);
403
404 tn=tree.parseNodeFromHeader(s, true);
405 }
406 number=(tn==null ? -1 : tn.id);
407 }else{
408 number=ids.get((int)(readsProcessedInner-1));
409
410 if(shrinkNames){//This is for nr/nt
411 for(int i=initial; i<terminal; i++){
412 if(line[i]==1){//SOH
413 terminal=i;
414 }
415 }
416 }
417 }
418
419 valid=(number>=0);
420 if(valid){
421 validReads++;
422 bsw.print(title);
423 bsw.print(number);
424 if(prefix){
425 bsw.print('|');
426 for(int i=initial; i<terminal; i++){
427 bsw.print(line[i]);
428 }
429 }else if(counts!=null){
430 bsw.print('|');
431 int count=counts.increment(number, 1);
432 bsw.print(count);
433 if(count==1){taxaCounted++;}
434 }
435 bsw.println();
436 }else{
437 invalidReads++;
438 if(deleteInvalid){
439 System.err.println("Invalid sequence detected; aborting.\n");
440 break;
441 }
442 if(bswBadHeaders!=null){bswBadHeaders.println(line);}
443 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){
444 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders+"\n"+new String(line));
445 }
446 if(keepAll){
447 if(shrinkNames){
448 for(int i=0; i<terminal; i++){
449 bsw.print(line[i]);
450 }
451 bsw.println();
452 }else{
453 bsw.println(line);
454 }
455 }else if(bswInvalid!=null){
456 if(shrinkNames){
457 for(int i=0; i<terminal; i++){
458 bswInvalid.print(line[i]);
459 }
460 bswInvalid.println();
461 }else{
462 bswInvalid.println(line);
463 }
464 }
465 }
466 }else{
467 basesProcessed+=line.length;
468 if(valid || keepAll){
469 if(valid){validBases+=line.length;}
470 else{invalidBases+=line.length;}
471 bsw.println(line);
472 }else{
473 invalidBases+=line.length;
474 if(bswInvalid!=null){
475 bswInvalid.println(line);
476 }
477 }
478 }
479 line=bf.nextLine();
480 }
481
482 errorState|=bf.close();
483 }
484
485 private static boolean looksLikeRealAccession(byte[] line){
486 int space=Tools.indexOf(line, ' ');
487 if(space<0){space=line.length;}
488 if(space>18 || space<4){return false;}
489 //... hmm... this is a pretty short list for false cases!
490 int dot=-1;
491 for(int i=0; i<space; i++){
492 if(line[i]=='.'){
493 if(dot>=0){return false;}//Only 1 dot allowed
494 dot=i;
495 }
496 }
497 if(dot>0){
498 if(dot!=space-2){return false;}
499 }
500 for(int i=0; i<space; i++){
501 byte b=line[i];
502 if(b!='_' && b!='-' && b!='.' && !Tools.isLetterOrDigit(b)){return false;}
503 }
504 return true;
505 }
506
507 void appendHeaderLine(byte[] line, ByteBuilder bb){
508 assert(line[0]=='>' || line[0]=='@') : new String(line);
509
510 if(mode==ACCESSION_MODE){
511 for(int i=1; i<line.length; i++){
512 byte b=line[i];
513 if(b==' ' || b=='.'){break;}
514 else{bb.append(b);}
515 }
516 }else if(mode==GI_MODE){
517 for(int i=1; i<line.length; i++){
518 byte b=line[i];
519 if(b==' ' || b=='|'){break;}
520 else{bb.append(b);}
521 }
522 }else if(mode==UNITE_MODE){
523 int initial=Tools.indexOf(line, '|');
524 for(int i=initial+1; i<line.length; i++){
525 byte b=line[i];
526 if(b==' ' || b=='.' || b=='|'){break;}
527 else{bb.append(b);}
528 }
529 }else{
530 for(int i=1; i<line.length; i++){
531 byte b=line[i];
532 bb.append(b);
533 }
534 }
535 bb.append(',');
536 }
537
538 private void updateHeadersFromServer(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders, int format){
539 if(format==FileFormat.FA){
540 updateHeadersFromServer_fasta(lines, counts, bswBadHeaders);
541 }else if(format==FileFormat.GFF){
542 updateHeadersFromServer_gff(lines, counts, bswBadHeaders);
543 }else{
544 assert(false) : "Unsupported type: "+format;
545 }
546 }
547
548 private void updateHeadersFromServer_fasta(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders){
549 ByteBuilder bb=new ByteBuilder();
550 ArrayList<String> names=new ArrayList<String>();
551 for(byte[] line : lines){
552 if(line[0]=='>' && !Tools.startsWith(line, ">tid")){
553 appendHeaderLine(line, bb);
554 if(mode==UNITE_MODE){
555 int bar=Tools.indexOf(line, '|');
556 names.add(new String(line, 1, bar-1));
557 }
558 }
559 }
560 if(bb.length()<1){return;}
561
562 assert(bb.endsWith(','));
563 bb.length--;
564
565 // System.err.println("Sending '"+bb+"'");
566
567 final int[] serverIds;
568 if(mode==ACCESSION_MODE || mode==UNITE_MODE){
569 serverIds=TaxClient.accessionToTaxidArray(bb.toString());
570 }else if(mode==GI_MODE){
571 serverIds=TaxClient.giToTaxidArray(bb.toString());
572 }else{
573 serverIds=TaxClient.headerToTaxidArray(bb.toString());
574 }
575 assert(serverIds!=null) : "Null response for '"+bb.toString()+"'";
576 bb.clear();
577
578 if(!names.isEmpty()){
579 assert(tree!=null) : "Need to load a TaxTree.";
580 assert(names.size()==serverIds.length);
581 for(int i=0; i<serverIds.length; i++){
582 final String name=names.get(i);
583 if(serverIds[i]<0){
584 TaxNode tn=tree.getNodeByName(name);
585 if(tn!=null){serverIds[i]=tn.id;}
586 // else {
587 // assert(false) : names.get(i);
588 // }
589 }else{
590 //Sometimes the species gets renamed.
591 // TaxNode tn=tree.getNodeByName(name);
592 // if(tn==null || tn.id==serverIds[i]) {System.err.println(name+", "+serverIds[i]+", "+tn+", "+tree.getNodesByName(name));}
593 }
594 }
595 }
596
597 for(int lineNum=0, serverNum=0; lineNum<=lines.size(); lineNum++){
598 byte[] line=lines.get(lineNum);
599 if(line[0]=='>' && !Tools.startsWith(line, ">tid")){
600 bb.clear();
601 final int tid=serverIds[serverNum];
602 if(tid<0){
603 //WARN
604 if(bswBadHeaders!=null){
605 bswBadHeaders.print(tid).tab();
606 bswBadHeaders.print(looksLikeRealAccession(line)).tab();
607 bswBadHeaders.println(line);
608 }else if(warnBadHeaders){
609 System.err.println(tid+"\t"+looksLikeRealAccession(line)+"\t"+new String(line));
610 }
611 }
612 int initial=1, terminal=line.length;
613 if(shrinkNames){//This is for nr/nt
614 for(int i=initial; i<terminal; i++){
615 if(line[i]==1){//SOH
616 terminal=i;
617 }
618 }
619 }
620
621 bb.append(title);
622 bb.append(tid);
623 if(prefix){
624 bb.append('|');
625 for(int i=initial; i<terminal; i++){
626 bb.append(line[i]);
627 }
628 }else if(counts!=null && tid>=0){
629 bb.append('|');
630 int count=counts.increment(tid, 1);
631 bb.append(count);
632 if(count==1){taxaCounted++;}
633 }
634
635 lines.set(lineNum, bb.toBytes());
636
637 serverNum++;
638 if(serverNum>=serverIds.length){break;}
639 }
640 }
641 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){
642 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders);
643 }
644 }
645
646 private void updateHeadersFromServer_gff(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders){
647 ByteBuilder bb=new ByteBuilder();
648 ArrayList<String> names=new ArrayList<String>();
649 for(byte[] line : lines){
650 if(line[0]!='#' && !Tools.startsWith(line, "tid")){
651 if(bb.length()>0){bb.append(',');}
652 for(byte b : line){
653 if(b=='\t'){break;}
654 bb.append(b);
655 }
656 }
657 }
658 if(bb.length()<1){return;}
659
660 // assert(false) : bb;
661
662 // System.err.println("Sending '"+bb+"'");
663
664 int[] serverIds;
665 if(mode==ACCESSION_MODE || mode==UNITE_MODE){
666 serverIds=TaxClient.accessionToTaxidArray(bb.toString());
667 }else if(mode==GI_MODE){
668 serverIds=TaxClient.giToTaxidArray(bb.toString());
669 }else{
670 serverIds=TaxClient.headerToTaxidArray(bb.toString());
671 }
672 if(serverIds==null){
673 KillSwitch.kill("Null response for '"+bb.toString()+"'");
674 }
675 // assert(serverIds!=null) : "Null response for '"+bb.toString()+"'";
676 bb.clear();
677
678 if(!names.isEmpty()){
679 assert(tree!=null) : "Need to load a TaxTree.";
680 assert(names.size()==serverIds.length);
681 for(int i=0; i<serverIds.length; i++){
682 final String name=names.get(i);
683 if(serverIds[i]<0){
684 TaxNode tn=tree.getNodeByName(name);
685 if(tn!=null){serverIds[i]=tn.id;}
686 // else {
687 // assert(false) : names.get(i);
688 // }
689 }else{
690 //Sometimes the species gets renamed.
691 // TaxNode tn=tree.getNodeByName(name);
692 // if(tn==null || tn.id==serverIds[i]) {System.err.println(name+", "+serverIds[i]+", "+tn+", "+tree.getNodesByName(name));}
693 }
694 }
695 }
696
697 for(int lineNum=0, serverNum=0; lineNum<=lines.size(); lineNum++){
698 byte[] line=lines.get(lineNum);
699 if(line[0]!='#' && !Tools.startsWith(line, "tid")){
700 bb.clear();
701 final int tid=serverIds[serverNum];
702 if(tid<0){
703 //WARN
704 if(bswBadHeaders!=null){
705 bswBadHeaders.print(tid).tab();
706 bswBadHeaders.print(looksLikeRealAccession(line)).tab();
707 bswBadHeaders.println(line);
708 }else if(warnBadHeaders){
709 System.err.println(tid+"\t"+looksLikeRealAccession(line)+"\t"+new String(line));
710 }
711 }
712
713 bb.append("tid|");
714 bb.append(tid);
715 if(prefix){
716 bb.append('|');
717 bb.append(line);
718 }else if(counts!=null && tid>=0){
719 bb.append('|');
720 int count=counts.increment(tid, 1);
721 bb.append(count);
722 if(count==1){taxaCounted++;}
723 }
724
725 lines.set(lineNum, bb.toBytes());
726
727 serverNum++;
728 if(serverNum>=serverIds.length){break;}
729 }
730 }
731 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){
732 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders);
733 }
734 }
735
736 private void processInner_server(ByteFile bf, ByteStreamWriter bsw, ByteStreamWriter bswInvalid, ByteStreamWriter bswBadHeaders, HashArray1D counts, int format){
737
738 ArrayList<byte[]> lines=new ArrayList<byte[]>();
739 byte[] line=bf.nextLine();
740 boolean valid=false;
741 long storedBytes=0;
742
743 while(line!=null){
744
745 if(line.length>0){
746 linesIn++;
747 lines.add(line);
748 storedBytes+=line.length;
749 if(storedBytes>=maxStoredBytes){
750 updateHeadersFromServer(lines, counts, bswBadHeaders, format);
751 valid=dumpBuffer(lines, valid, bsw, bswInvalid);
752 lines=new ArrayList<byte[]>();
753 storedBytes=0;
754 if(deleteInvalid && invalidReads>0){
755 System.err.println("Invalid sequence detected; aborting.\n"
756 + "Input file: \t"+bf.name()+"\n"
757 + "Output file: \t"+(bsw==null ? "null" : bsw.fname)+"\n"
758 + "Line: \t"+new String(line)+"\n");
759 break;
760 }
761 }
762 }
763 line=bf.nextLine();
764 }
765
766 if(storedBytes>0){
767 updateHeadersFromServer(lines, counts, bswBadHeaders, format);
768 valid=dumpBuffer(lines, valid, bsw, bswInvalid);
769 lines=new ArrayList<byte[]>();
770 storedBytes=0;
771 }
772
773 errorState|=bf.close();
774 }
775
776 private boolean dumpBuffer(ArrayList<byte[]> lines, boolean valid, ByteStreamWriter bsw, ByteStreamWriter bswInvalid){
777
778 for(byte[] line : lines){
779
780 if(line.length>0 && line[0]=='>'){
781 readsProcessed++;
782 if(maxReads>0 && readsProcessed>maxReads){break;}
783
784 if(Tools.startsWith(line, invalidTitle)){
785 valid=false;
786 invalidReads++;
787 invalidLines++;
788 if(deleteInvalid){break;}
789 }else{
790 assert(Tools.startsWith(line, title));
791 valid=true;
792 validReads++;
793 validLines++;
794 }
795 }else if(gffIn){
796 basesProcessed+=line.length;
797 valid=!Tools.startsWith(line, invalidGffTitle);
798 if(valid){
799 validBases+=line.length;
800 validLines++;
801 }else{
802 invalidBases+=line.length;
803 invalidLines++;
804 }
805 }else{
806 basesProcessed+=line.length;
807 if(valid){
808 validBases+=line.length;
809 validLines++;
810 }else{
811 invalidBases+=line.length;
812 invalidLines++;
813 }
814 }
815
816 if(valid || keepAll){
817 if(bsw!=null){bsw.println(line);}
818 }else{
819 if(bswInvalid!=null){bswInvalid.println(line);}
820 }
821 }
822 return valid;
823 }
824
825 /*--------------------------------------------------------------*/
826
827
828 /*--------------------------------------------------------------*/
829
830 private LinkedHashSet<String> in1=new LinkedHashSet<String>();
831 private String out1=null;
832 private String outInvalid=null;
833 private String badHeaders=null;
834
835 private String taxTreeFile=null;
836 private String giTableFile=null;
837 private String accessionFile=null;
838 private String patternFile=null;
839
840 /*--------------------------------------------------------------*/
841
842 private long maxReads=-1;
843
844 private long validReads=0;
845 private long validBases=0;
846 private long invalidReads=0;
847 private long invalidBases=0;
848 private long taxaCounted=0;
849
850 private long linesIn=0;
851 private long validLines=0;
852 private long invalidLines=0;
853
854 private long maxStoredBytes=10000000;
855
856 private long readsProcessed=0, basesProcessed=0;
857
858 private boolean prefix=true;
859 private boolean countTable=true;
860 private boolean keepAll=true;
861 private boolean shrinkNames=false;
862 private boolean warnBadHeaders=true;
863 private boolean useServer=false;
864 /** Crash if the number of invalid headers exceeds this */
865 private long maxInvalidHeaders=-1;
866 /** Delete the output file if there are any invalid headers */
867 private boolean deleteInvalid=false;
868
869 private int mode;
870 private static final int ACCESSION_MODE=0, GI_MODE=1, HEADER_MODE=2, UNITE_MODE=3;
871
872 private boolean gffIn=false;
873
874 /*--------------------------------------------------------------*/
875
876 private final ArrayList<FileFormat> ffin1;
877 private final FileFormat ffout1;
878 private final FileFormat ffoutInvalid;
879 private final TaxTree tree;
880
881 /*--------------------------------------------------------------*/
882
883 private PrintStream outstream=System.err;
884 public static boolean verbose=false;
885 public boolean errorState=false;
886 private boolean overwrite=false;
887 private boolean append=false;
888
889 private static byte[] title=">tid|".getBytes();
890 private static byte[] invalidTitle=">tid|-1".getBytes();
891 private static byte[] invalidGffTitle="tid|-1".getBytes();
892
893 }