annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/RenameGiToTaxid.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package tax;
jpayne@68 2
jpayne@68 3 import java.io.File;
jpayne@68 4 import java.io.PrintStream;
jpayne@68 5 import java.util.ArrayList;
jpayne@68 6 import java.util.LinkedHashSet;
jpayne@68 7
jpayne@68 8 import fileIO.ByteFile;
jpayne@68 9 import fileIO.ByteFile1;
jpayne@68 10 import fileIO.ByteFile2;
jpayne@68 11 import fileIO.ByteStreamWriter;
jpayne@68 12 import fileIO.FileFormat;
jpayne@68 13 import fileIO.ReadWrite;
jpayne@68 14 import kmer.HashArray1D;
jpayne@68 15 import shared.KillSwitch;
jpayne@68 16 import shared.Parse;
jpayne@68 17 import shared.Parser;
jpayne@68 18 import shared.PreParser;
jpayne@68 19 import shared.ReadStats;
jpayne@68 20 import shared.Shared;
jpayne@68 21 import shared.Timer;
jpayne@68 22 import shared.Tools;
jpayne@68 23 import stream.ConcurrentGenericReadInputStream;
jpayne@68 24 import stream.FASTQ;
jpayne@68 25 import stream.FastaReadInputStream;
jpayne@68 26 import structures.ByteBuilder;
jpayne@68 27 import structures.IntList;
jpayne@68 28
jpayne@68 29 /**
jpayne@68 30 * @author Brian Bushnell
jpayne@68 31 * @date Mar 10, 2015
jpayne@68 32 *
jpayne@68 33 */
jpayne@68 34 public class RenameGiToTaxid {
jpayne@68 35
jpayne@68 36 public static void main(String[] args){
jpayne@68 37 Timer t=new Timer();
jpayne@68 38 RenameGiToTaxid x=new RenameGiToTaxid(args);
jpayne@68 39 x.process(t);
jpayne@68 40
jpayne@68 41 //Close the print stream if it was redirected
jpayne@68 42 Shared.closeStream(x.outstream);
jpayne@68 43 }
jpayne@68 44
jpayne@68 45 public RenameGiToTaxid(String[] args){
jpayne@68 46
jpayne@68 47 {//Preparse block for help, config files, and outstream
jpayne@68 48 PreParser pp=new PreParser(args, getClass(), false);
jpayne@68 49 args=pp.args;
jpayne@68 50 outstream=pp.outstream;
jpayne@68 51 }
jpayne@68 52
jpayne@68 53 Shared.capBuffers(4);
jpayne@68 54 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
jpayne@68 55 ReadWrite.USE_BGZIP=ReadWrite.USE_UNBGZIP=ReadWrite.PREFER_BGZIP=true;
jpayne@68 56 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
jpayne@68 57 FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
jpayne@68 58
jpayne@68 59 Parser parser=new Parser();
jpayne@68 60 for(int i=0; i<args.length; i++){
jpayne@68 61 String arg=args[i];
jpayne@68 62 String[] split=arg.split("=");
jpayne@68 63 String a=split[0].toLowerCase();
jpayne@68 64 String b=split.length>1 ? split[1] : null;
jpayne@68 65
jpayne@68 66 if(a.equals("prefix")){
jpayne@68 67 prefix=Parse.parseBoolean(b);
jpayne@68 68
jpayne@68 69 }else if(a.equals("server") || a.equals("useserver")){
jpayne@68 70 if(b!=null && b.startsWith("http")){
jpayne@68 71 useServer=true;
jpayne@68 72 String path=b;
jpayne@68 73 if(!path.endsWith("/")){path+="/";}
jpayne@68 74 Shared.setTaxServer(path);
jpayne@68 75 }else{
jpayne@68 76 useServer=Parse.parseBoolean(b);
jpayne@68 77 }
jpayne@68 78 }else if(a.equals("title")){
jpayne@68 79 title=(b==null ? ">" : (">"+b+"|")).getBytes();
jpayne@68 80 }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
jpayne@68 81 giTableFile=b;
jpayne@68 82 }else if(a.equals("accession")){
jpayne@68 83 accessionFile=b;
jpayne@68 84 }else if(a.equals("pattern")){
jpayne@68 85 patternFile=b;
jpayne@68 86 }else if(a.equals("tree") || a.equals("taxtree")){
jpayne@68 87 taxTreeFile=b;
jpayne@68 88 }else if(a.equals("invalid")){
jpayne@68 89 outInvalid=b;
jpayne@68 90 }else if(a.equals("deleteinvalid")){
jpayne@68 91 deleteInvalid=Parse.parseBoolean(b);
jpayne@68 92 }else if(a.equals("badheaders")){
jpayne@68 93 badHeaders=b;
jpayne@68 94 }else if(a.equals("maxbadheaders") || a.equals("maxinvalidheaders")){
jpayne@68 95 maxInvalidHeaders=Parse.parseKMG(b);
jpayne@68 96 }else if(a.equals("keepall")){
jpayne@68 97 keepAll=Parse.parseBoolean(b);
jpayne@68 98 }else if(a.equals("shrinknames")){
jpayne@68 99 shrinkNames=Parse.parseBoolean(b);
jpayne@68 100 }else if(a.equals("warn")){
jpayne@68 101 warnBadHeaders=Parse.parseBoolean(b);
jpayne@68 102 }
jpayne@68 103
jpayne@68 104 else if(a.equals("maxpigzprocesses")){
jpayne@68 105 AccessionToTaxid.maxPigzProcesses=Integer.parseInt(b);
jpayne@68 106 }else if(a.equals("skipparse")){
jpayne@68 107 AccessionToTaxid.skipParse=Parse.parseBoolean(b);
jpayne@68 108 }else if(a.equals("skiphash")){
jpayne@68 109 AccessionToTaxid.skipHash=Parse.parseBoolean(b);
jpayne@68 110 }
jpayne@68 111
jpayne@68 112 else if(a.equals("mode")){
jpayne@68 113 if(b!=null && Character.isDigit(b.charAt(0))){
jpayne@68 114 mode=Integer.parseInt(b);
jpayne@68 115 }else if("accession".equalsIgnoreCase(b)){
jpayne@68 116 mode=ACCESSION_MODE;
jpayne@68 117 }else if("unite".equalsIgnoreCase(b)){
jpayne@68 118 mode=UNITE_MODE;
jpayne@68 119 TaxTree.UNITE_MODE=true;
jpayne@68 120 }else if("gi".equalsIgnoreCase(b)){
jpayne@68 121 mode=GI_MODE;
jpayne@68 122 }else if("header".equalsIgnoreCase(b)){
jpayne@68 123 mode=HEADER_MODE;
jpayne@68 124 }else{
jpayne@68 125 assert(false) : "Bad mode: "+b;
jpayne@68 126 }
jpayne@68 127 }
jpayne@68 128
jpayne@68 129 else if(a.equals("verbose")){
jpayne@68 130 verbose=Parse.parseBoolean(b);
jpayne@68 131 ByteFile1.verbose=verbose;
jpayne@68 132 ByteFile2.verbose=verbose;
jpayne@68 133 stream.FastaReadInputStream.verbose=verbose;
jpayne@68 134 ConcurrentGenericReadInputStream.verbose=verbose;
jpayne@68 135 stream.FastqReadInputStream.verbose=verbose;
jpayne@68 136 ReadWrite.verbose=verbose;
jpayne@68 137 }else if(a.equals("in") || a.equals("in1")){
jpayne@68 138 assert(b!=null) : "Bad parameter: "+arg;
jpayne@68 139 if(new File(b).exists()){
jpayne@68 140 in1.add(b);
jpayne@68 141 }else{
jpayne@68 142 for(String bb : b.split(",")){
jpayne@68 143 in1.add(bb);
jpayne@68 144 }
jpayne@68 145 }
jpayne@68 146 }else if(new File(arg).exists()){ //For asterisk expansion
jpayne@68 147 in1.add(arg);
jpayne@68 148 }else if(parser.parse(arg, a, b)){
jpayne@68 149 //do nothing
jpayne@68 150 }else{
jpayne@68 151 outstream.println("Unknown parameter "+args[i]);
jpayne@68 152 assert(false) : "Unknown parameter "+args[i];
jpayne@68 153 // throw new RuntimeException("Unknown parameter "+args[i]);
jpayne@68 154 }
jpayne@68 155 }
jpayne@68 156
jpayne@68 157 if(useServer){
jpayne@68 158 giTableFile=null;
jpayne@68 159 accessionFile=null;
jpayne@68 160 patternFile=null;
jpayne@68 161 if(mode!=UNITE_MODE){taxTreeFile=null;}
jpayne@68 162 }//else if taxpath!=null... set them
jpayne@68 163
jpayne@68 164 {//Process parser fields
jpayne@68 165 Parser.processQuality();
jpayne@68 166
jpayne@68 167 maxReads=parser.maxReads;
jpayne@68 168
jpayne@68 169 overwrite=ReadStats.overwrite=parser.overwrite;
jpayne@68 170 append=ReadStats.append=parser.append;
jpayne@68 171
jpayne@68 172 out1=parser.out1;
jpayne@68 173 }
jpayne@68 174
jpayne@68 175 if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=TaxTree.defaultTreeFile();}
jpayne@68 176 if("auto".equalsIgnoreCase(giTableFile)){giTableFile=TaxTree.defaultTableFile();}
jpayne@68 177 if("auto".equalsIgnoreCase(accessionFile)){accessionFile=TaxTree.defaultAccessionFile();}
jpayne@68 178 if("auto".equalsIgnoreCase(patternFile)){patternFile=TaxTree.defaultPatternFile();}
jpayne@68 179
jpayne@68 180 assert(FastaReadInputStream.settingsOK());
jpayne@68 181
jpayne@68 182 if(in1==null || in1.isEmpty()){throw new RuntimeException("Error - at least one input file is required.");}
jpayne@68 183 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2){
jpayne@68 184 ByteFile.FORCE_MODE_BF2=false;
jpayne@68 185 ByteFile.FORCE_MODE_BF1=true;
jpayne@68 186 }
jpayne@68 187
jpayne@68 188 if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
jpayne@68 189 assert(out1!=null) : "This program requires an output file.";
jpayne@68 190
jpayne@68 191 if(!Tools.testOutputFiles(overwrite, append, false, out1)){
jpayne@68 192 outstream.println((out1==null)+", "+out1);
jpayne@68 193 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
jpayne@68 194 }
jpayne@68 195 if(!Tools.testInputFiles(false, true, in1.toArray(new String[0]))){
jpayne@68 196 throw new RuntimeException("\nCan't read some input files.\n");
jpayne@68 197 }
jpayne@68 198
jpayne@68 199 ffout1=FileFormat.testOutput(out1, FileFormat.FA, null, true, overwrite, append, false);
jpayne@68 200 ffoutInvalid=FileFormat.testOutput(outInvalid, FileFormat.FA, null, true, overwrite, append, false);
jpayne@68 201 ffin1=new ArrayList<FileFormat>(in1.size());
jpayne@68 202 for(String s : in1){
jpayne@68 203 FileFormat ff=FileFormat.testInput(s, FileFormat.FA, null, true, true);
jpayne@68 204 ffin1.add(ff);
jpayne@68 205 }
jpayne@68 206
jpayne@68 207 if(ffoutInvalid!=null){keepAll=false;}
jpayne@68 208
jpayne@68 209 assert(giTableFile!=null || accessionFile!=null || TaxTree.SILVA_MODE || useServer) : "No gi or accession information loaded.";
jpayne@68 210
jpayne@68 211 if(taxTreeFile!=null){
jpayne@68 212 tree=TaxTree.loadTaxTree(taxTreeFile, outstream, true, false);
jpayne@68 213 assert(tree.nameMap!=null);
jpayne@68 214 }else{
jpayne@68 215 tree=null;
jpayne@68 216 if(!useServer){throw new RuntimeException("No tree specified.");}
jpayne@68 217 }
jpayne@68 218
jpayne@68 219 if(giTableFile!=null){
jpayne@68 220 GiToTaxid.initialize(giTableFile);
jpayne@68 221 }
jpayne@68 222
jpayne@68 223 if(patternFile!=null){
jpayne@68 224 Timer t=new Timer();
jpayne@68 225 AnalyzeAccession.loadCodeMap(patternFile);
jpayne@68 226 outstream.println("Loading pattern table.");
jpayne@68 227 t.stopAndPrint();
jpayne@68 228 }
jpayne@68 229
jpayne@68 230 if(accessionFile!=null){
jpayne@68 231 AccessionToTaxid.tree=tree;
jpayne@68 232 outstream.println("Loading accession table.");
jpayne@68 233 AccessionToTaxid.load(accessionFile);
jpayne@68 234 // System.gc();
jpayne@68 235 }
jpayne@68 236 }
jpayne@68 237
jpayne@68 238 void process(Timer t){
jpayne@68 239
jpayne@68 240 ByteStreamWriter bsw=(ffout1==null ? null : new ByteStreamWriter(ffout1)); //Actually, this is required.
jpayne@68 241 if(bsw!=null){bsw.start();}
jpayne@68 242
jpayne@68 243 ByteStreamWriter bswInvalid=null;
jpayne@68 244 if(ffoutInvalid!=null){
jpayne@68 245 bswInvalid=new ByteStreamWriter(ffoutInvalid);
jpayne@68 246 bswInvalid.start();
jpayne@68 247 }
jpayne@68 248
jpayne@68 249 ByteStreamWriter bswBadHeaders=null;
jpayne@68 250 if(badHeaders!=null) {
jpayne@68 251 bswBadHeaders=new ByteStreamWriter(badHeaders, overwrite, append, false);
jpayne@68 252 bswBadHeaders.start();
jpayne@68 253 }
jpayne@68 254
jpayne@68 255 final HashArray1D counts=(countTable && !prefix) ? new HashArray1D(256000, -1L, true) : null;
jpayne@68 256
jpayne@68 257 gffIn=false;
jpayne@68 258 for(FileFormat ffin : ffin1){
jpayne@68 259 gffIn=gffIn||ffin.gff();
jpayne@68 260 ByteFile bf=ByteFile.makeByteFile(ffin);
jpayne@68 261 if(useServer){
jpayne@68 262 processInner_server(bf, bsw, bswInvalid, bswBadHeaders, counts, ffin.format());
jpayne@68 263 }else{
jpayne@68 264 // IntList list=(useServer ? getIds(bf) : null);
jpayne@68 265 processInner(bf, bsw, bswInvalid, bswBadHeaders, counts, null);
jpayne@68 266 }
jpayne@68 267 }
jpayne@68 268
jpayne@68 269 if(bsw!=null){
jpayne@68 270 errorState|=bsw.poisonAndWait();
jpayne@68 271 if(deleteInvalid && invalidReads>0 && !ffout1.stdio()){
jpayne@68 272 try {
jpayne@68 273 System.err.println("Deleting "+out1);
jpayne@68 274 new File(out1).delete();
jpayne@68 275 } catch (Exception e) {
jpayne@68 276 System.err.println("An error occured while attempting to delete "+out1);
jpayne@68 277 e.printStackTrace();
jpayne@68 278 }
jpayne@68 279 }
jpayne@68 280 }
jpayne@68 281 if(bswInvalid!=null){errorState|=bswInvalid.poisonAndWait();}
jpayne@68 282 if(bswBadHeaders!=null){errorState|=bswBadHeaders.poisonAndWait();}
jpayne@68 283
jpayne@68 284 t.stop();
jpayne@68 285 if(!gffIn) {
jpayne@68 286 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
jpayne@68 287
jpayne@68 288 outstream.println();
jpayne@68 289 outstream.println("Valid Sequences: \t"+validReads);
jpayne@68 290 outstream.println("Valid Bases: \t"+validBases);
jpayne@68 291 outstream.println("Invalid Sequences: \t"+invalidReads);
jpayne@68 292 outstream.println("Invalid Bases: \t"+invalidBases);
jpayne@68 293 }else{
jpayne@68 294 outstream.println(Tools.timeLinesBytesProcessed(t, linesIn, basesProcessed, 8));
jpayne@68 295
jpayne@68 296 outstream.println();
jpayne@68 297 outstream.println("Valid Lines: \t"+validLines);
jpayne@68 298 outstream.println("Valid Bytes: \t"+validBases);
jpayne@68 299 outstream.println("Invalid Lines: \t"+invalidLines);
jpayne@68 300 outstream.println("Invalid Bytes: \t"+invalidBases);
jpayne@68 301 }
jpayne@68 302 if(counts!=null){
jpayne@68 303 outstream.println("Unique Taxa: \t"+taxaCounted);
jpayne@68 304 }
jpayne@68 305
jpayne@68 306 if(errorState){
jpayne@68 307 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
jpayne@68 308 }
jpayne@68 309 }
jpayne@68 310
jpayne@68 311 //Unused; not efficient
jpayne@68 312 // public IntList getIds(ByteFile bf){
jpayne@68 313 // IntList ids=new IntList();
jpayne@68 314 //
jpayne@68 315 // int readsProcessedInner=0;
jpayne@68 316 //
jpayne@68 317 // byte[] line=bf.nextLine();
jpayne@68 318 // ByteBuilder bb=new ByteBuilder();
jpayne@68 319 // while(line!=null){
jpayne@68 320 // if(line.length>0 && line[0]=='>'){
jpayne@68 321 // readsProcessedInner++;
jpayne@68 322 // if(maxReads>0 && readsProcessedInner>maxReads){break;}
jpayne@68 323 //
jpayne@68 324 // for(int i=1; i<line.length; i++){
jpayne@68 325 // byte b=line[i];
jpayne@68 326 // if(b==' ' || b=='.'){break;}
jpayne@68 327 // else{bb.append(b);}
jpayne@68 328 // }
jpayne@68 329 // bb.append(',');
jpayne@68 330 // if(bb.length()>100000){
jpayne@68 331 // bb.setLength(bb.length()-1);
jpayne@68 332 // int[] ret;
jpayne@68 333 // if(mode==ACCESSION_MODE){
jpayne@68 334 // ret=TaxClient.accessionToTaxidArray(bb.toString());
jpayne@68 335 // }else if(mode==GI_MODE){
jpayne@68 336 // ret=TaxClient.giToTaxidArray(bb.toString());
jpayne@68 337 // }else{
jpayne@68 338 // ret=TaxClient.headerToTaxidArray(bb.toString());
jpayne@68 339 // }
jpayne@68 340 // assert(ret!=null) : bb.toString();
jpayne@68 341 // for(int i : ret){ids.add(i);}
jpayne@68 342 // bb.clear();
jpayne@68 343 // }
jpayne@68 344 // }
jpayne@68 345 // line=bf.nextLine();
jpayne@68 346 // }
jpayne@68 347 // if(bb.length()>0){
jpayne@68 348 // bb.setLength(bb.length()-1);
jpayne@68 349 // int[] ret;
jpayne@68 350 // if(mode==ACCESSION_MODE){
jpayne@68 351 // ret=TaxClient.accessionToTaxidArray(bb.toString());
jpayne@68 352 // }else if(mode==GI_MODE){
jpayne@68 353 // ret=TaxClient.giToTaxidArray(bb.toString());
jpayne@68 354 // }else{
jpayne@68 355 // ret=TaxClient.headerToTaxidArray(bb.toString());
jpayne@68 356 // }
jpayne@68 357 // assert(ret!=null) : bb.toString();
jpayne@68 358 // for(int i : ret){ids.add(i);}
jpayne@68 359 // bb.clear();
jpayne@68 360 // }
jpayne@68 361 //
jpayne@68 362 // bf.reset();
jpayne@68 363 // return ids;
jpayne@68 364 // }
jpayne@68 365
jpayne@68 366 private void processInner(ByteFile bf, ByteStreamWriter bsw, ByteStreamWriter bswInvalid, ByteStreamWriter bswBadHeaders, HashArray1D counts, IntList ids){
jpayne@68 367
jpayne@68 368 int readsProcessedInner=0;
jpayne@68 369
jpayne@68 370 byte[] line=bf.nextLine();
jpayne@68 371 boolean valid=false;
jpayne@68 372 while(line!=null){
jpayne@68 373 if(line.length>0 && line[0]=='>'){
jpayne@68 374 readsProcessedInner++;
jpayne@68 375 readsProcessed++;
jpayne@68 376 if(maxReads>0 && readsProcessed>maxReads){break;}
jpayne@68 377 int initial=1, terminal=line.length;
jpayne@68 378 final int number;
jpayne@68 379 if(ids==null){
jpayne@68 380 final TaxNode tn;
jpayne@68 381
jpayne@68 382 {
jpayne@68 383 {
jpayne@68 384 // Handles renumbering when the format is correct but the number is wrong.
jpayne@68 385 if(Tools.startsWith(line, ">tid|")){
jpayne@68 386 initial=6;
jpayne@68 387 while(initial<=line.length && line[initial-1]!='|'){initial++;}
jpayne@68 388 }else if(Tools.startsWith(line, ">ncbi|")){
jpayne@68 389 initial=7;
jpayne@68 390 while(initial<=line.length && line[initial-1]!='|'){initial++;}
jpayne@68 391 }
jpayne@68 392 }
jpayne@68 393
jpayne@68 394 if(shrinkNames){//This is for nr/nt
jpayne@68 395 for(int i=initial; i<terminal; i++){
jpayne@68 396 if(line[i]==1){//SOH
jpayne@68 397 terminal=i;
jpayne@68 398 }
jpayne@68 399 }
jpayne@68 400 }
jpayne@68 401
jpayne@68 402 String s=new String(line, initial, terminal-initial);
jpayne@68 403
jpayne@68 404 tn=tree.parseNodeFromHeader(s, true);
jpayne@68 405 }
jpayne@68 406 number=(tn==null ? -1 : tn.id);
jpayne@68 407 }else{
jpayne@68 408 number=ids.get((int)(readsProcessedInner-1));
jpayne@68 409
jpayne@68 410 if(shrinkNames){//This is for nr/nt
jpayne@68 411 for(int i=initial; i<terminal; i++){
jpayne@68 412 if(line[i]==1){//SOH
jpayne@68 413 terminal=i;
jpayne@68 414 }
jpayne@68 415 }
jpayne@68 416 }
jpayne@68 417 }
jpayne@68 418
jpayne@68 419 valid=(number>=0);
jpayne@68 420 if(valid){
jpayne@68 421 validReads++;
jpayne@68 422 bsw.print(title);
jpayne@68 423 bsw.print(number);
jpayne@68 424 if(prefix){
jpayne@68 425 bsw.print('|');
jpayne@68 426 for(int i=initial; i<terminal; i++){
jpayne@68 427 bsw.print(line[i]);
jpayne@68 428 }
jpayne@68 429 }else if(counts!=null){
jpayne@68 430 bsw.print('|');
jpayne@68 431 int count=counts.increment(number, 1);
jpayne@68 432 bsw.print(count);
jpayne@68 433 if(count==1){taxaCounted++;}
jpayne@68 434 }
jpayne@68 435 bsw.println();
jpayne@68 436 }else{
jpayne@68 437 invalidReads++;
jpayne@68 438 if(deleteInvalid){
jpayne@68 439 System.err.println("Invalid sequence detected; aborting.\n");
jpayne@68 440 break;
jpayne@68 441 }
jpayne@68 442 if(bswBadHeaders!=null){bswBadHeaders.println(line);}
jpayne@68 443 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){
jpayne@68 444 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders+"\n"+new String(line));
jpayne@68 445 }
jpayne@68 446 if(keepAll){
jpayne@68 447 if(shrinkNames){
jpayne@68 448 for(int i=0; i<terminal; i++){
jpayne@68 449 bsw.print(line[i]);
jpayne@68 450 }
jpayne@68 451 bsw.println();
jpayne@68 452 }else{
jpayne@68 453 bsw.println(line);
jpayne@68 454 }
jpayne@68 455 }else if(bswInvalid!=null){
jpayne@68 456 if(shrinkNames){
jpayne@68 457 for(int i=0; i<terminal; i++){
jpayne@68 458 bswInvalid.print(line[i]);
jpayne@68 459 }
jpayne@68 460 bswInvalid.println();
jpayne@68 461 }else{
jpayne@68 462 bswInvalid.println(line);
jpayne@68 463 }
jpayne@68 464 }
jpayne@68 465 }
jpayne@68 466 }else{
jpayne@68 467 basesProcessed+=line.length;
jpayne@68 468 if(valid || keepAll){
jpayne@68 469 if(valid){validBases+=line.length;}
jpayne@68 470 else{invalidBases+=line.length;}
jpayne@68 471 bsw.println(line);
jpayne@68 472 }else{
jpayne@68 473 invalidBases+=line.length;
jpayne@68 474 if(bswInvalid!=null){
jpayne@68 475 bswInvalid.println(line);
jpayne@68 476 }
jpayne@68 477 }
jpayne@68 478 }
jpayne@68 479 line=bf.nextLine();
jpayne@68 480 }
jpayne@68 481
jpayne@68 482 errorState|=bf.close();
jpayne@68 483 }
jpayne@68 484
jpayne@68 485 private static boolean looksLikeRealAccession(byte[] line){
jpayne@68 486 int space=Tools.indexOf(line, ' ');
jpayne@68 487 if(space<0){space=line.length;}
jpayne@68 488 if(space>18 || space<4){return false;}
jpayne@68 489 //... hmm... this is a pretty short list for false cases!
jpayne@68 490 int dot=-1;
jpayne@68 491 for(int i=0; i<space; i++){
jpayne@68 492 if(line[i]=='.'){
jpayne@68 493 if(dot>=0){return false;}//Only 1 dot allowed
jpayne@68 494 dot=i;
jpayne@68 495 }
jpayne@68 496 }
jpayne@68 497 if(dot>0){
jpayne@68 498 if(dot!=space-2){return false;}
jpayne@68 499 }
jpayne@68 500 for(int i=0; i<space; i++){
jpayne@68 501 byte b=line[i];
jpayne@68 502 if(b!='_' && b!='-' && b!='.' && !Tools.isLetterOrDigit(b)){return false;}
jpayne@68 503 }
jpayne@68 504 return true;
jpayne@68 505 }
jpayne@68 506
jpayne@68 507 void appendHeaderLine(byte[] line, ByteBuilder bb){
jpayne@68 508 assert(line[0]=='>' || line[0]=='@') : new String(line);
jpayne@68 509
jpayne@68 510 if(mode==ACCESSION_MODE){
jpayne@68 511 for(int i=1; i<line.length; i++){
jpayne@68 512 byte b=line[i];
jpayne@68 513 if(b==' ' || b=='.'){break;}
jpayne@68 514 else{bb.append(b);}
jpayne@68 515 }
jpayne@68 516 }else if(mode==GI_MODE){
jpayne@68 517 for(int i=1; i<line.length; i++){
jpayne@68 518 byte b=line[i];
jpayne@68 519 if(b==' ' || b=='|'){break;}
jpayne@68 520 else{bb.append(b);}
jpayne@68 521 }
jpayne@68 522 }else if(mode==UNITE_MODE){
jpayne@68 523 int initial=Tools.indexOf(line, '|');
jpayne@68 524 for(int i=initial+1; i<line.length; i++){
jpayne@68 525 byte b=line[i];
jpayne@68 526 if(b==' ' || b=='.' || b=='|'){break;}
jpayne@68 527 else{bb.append(b);}
jpayne@68 528 }
jpayne@68 529 }else{
jpayne@68 530 for(int i=1; i<line.length; i++){
jpayne@68 531 byte b=line[i];
jpayne@68 532 bb.append(b);
jpayne@68 533 }
jpayne@68 534 }
jpayne@68 535 bb.append(',');
jpayne@68 536 }
jpayne@68 537
jpayne@68 538 private void updateHeadersFromServer(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders, int format){
jpayne@68 539 if(format==FileFormat.FA){
jpayne@68 540 updateHeadersFromServer_fasta(lines, counts, bswBadHeaders);
jpayne@68 541 }else if(format==FileFormat.GFF){
jpayne@68 542 updateHeadersFromServer_gff(lines, counts, bswBadHeaders);
jpayne@68 543 }else{
jpayne@68 544 assert(false) : "Unsupported type: "+format;
jpayne@68 545 }
jpayne@68 546 }
jpayne@68 547
jpayne@68 548 private void updateHeadersFromServer_fasta(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders){
jpayne@68 549 ByteBuilder bb=new ByteBuilder();
jpayne@68 550 ArrayList<String> names=new ArrayList<String>();
jpayne@68 551 for(byte[] line : lines){
jpayne@68 552 if(line[0]=='>' && !Tools.startsWith(line, ">tid")){
jpayne@68 553 appendHeaderLine(line, bb);
jpayne@68 554 if(mode==UNITE_MODE){
jpayne@68 555 int bar=Tools.indexOf(line, '|');
jpayne@68 556 names.add(new String(line, 1, bar-1));
jpayne@68 557 }
jpayne@68 558 }
jpayne@68 559 }
jpayne@68 560 if(bb.length()<1){return;}
jpayne@68 561
jpayne@68 562 assert(bb.endsWith(','));
jpayne@68 563 bb.length--;
jpayne@68 564
jpayne@68 565 // System.err.println("Sending '"+bb+"'");
jpayne@68 566
jpayne@68 567 final int[] serverIds;
jpayne@68 568 if(mode==ACCESSION_MODE || mode==UNITE_MODE){
jpayne@68 569 serverIds=TaxClient.accessionToTaxidArray(bb.toString());
jpayne@68 570 }else if(mode==GI_MODE){
jpayne@68 571 serverIds=TaxClient.giToTaxidArray(bb.toString());
jpayne@68 572 }else{
jpayne@68 573 serverIds=TaxClient.headerToTaxidArray(bb.toString());
jpayne@68 574 }
jpayne@68 575 assert(serverIds!=null) : "Null response for '"+bb.toString()+"'";
jpayne@68 576 bb.clear();
jpayne@68 577
jpayne@68 578 if(!names.isEmpty()){
jpayne@68 579 assert(tree!=null) : "Need to load a TaxTree.";
jpayne@68 580 assert(names.size()==serverIds.length);
jpayne@68 581 for(int i=0; i<serverIds.length; i++){
jpayne@68 582 final String name=names.get(i);
jpayne@68 583 if(serverIds[i]<0){
jpayne@68 584 TaxNode tn=tree.getNodeByName(name);
jpayne@68 585 if(tn!=null){serverIds[i]=tn.id;}
jpayne@68 586 // else {
jpayne@68 587 // assert(false) : names.get(i);
jpayne@68 588 // }
jpayne@68 589 }else{
jpayne@68 590 //Sometimes the species gets renamed.
jpayne@68 591 // TaxNode tn=tree.getNodeByName(name);
jpayne@68 592 // if(tn==null || tn.id==serverIds[i]) {System.err.println(name+", "+serverIds[i]+", "+tn+", "+tree.getNodesByName(name));}
jpayne@68 593 }
jpayne@68 594 }
jpayne@68 595 }
jpayne@68 596
jpayne@68 597 for(int lineNum=0, serverNum=0; lineNum<=lines.size(); lineNum++){
jpayne@68 598 byte[] line=lines.get(lineNum);
jpayne@68 599 if(line[0]=='>' && !Tools.startsWith(line, ">tid")){
jpayne@68 600 bb.clear();
jpayne@68 601 final int tid=serverIds[serverNum];
jpayne@68 602 if(tid<0){
jpayne@68 603 //WARN
jpayne@68 604 if(bswBadHeaders!=null){
jpayne@68 605 bswBadHeaders.print(tid).tab();
jpayne@68 606 bswBadHeaders.print(looksLikeRealAccession(line)).tab();
jpayne@68 607 bswBadHeaders.println(line);
jpayne@68 608 }else if(warnBadHeaders){
jpayne@68 609 System.err.println(tid+"\t"+looksLikeRealAccession(line)+"\t"+new String(line));
jpayne@68 610 }
jpayne@68 611 }
jpayne@68 612 int initial=1, terminal=line.length;
jpayne@68 613 if(shrinkNames){//This is for nr/nt
jpayne@68 614 for(int i=initial; i<terminal; i++){
jpayne@68 615 if(line[i]==1){//SOH
jpayne@68 616 terminal=i;
jpayne@68 617 }
jpayne@68 618 }
jpayne@68 619 }
jpayne@68 620
jpayne@68 621 bb.append(title);
jpayne@68 622 bb.append(tid);
jpayne@68 623 if(prefix){
jpayne@68 624 bb.append('|');
jpayne@68 625 for(int i=initial; i<terminal; i++){
jpayne@68 626 bb.append(line[i]);
jpayne@68 627 }
jpayne@68 628 }else if(counts!=null && tid>=0){
jpayne@68 629 bb.append('|');
jpayne@68 630 int count=counts.increment(tid, 1);
jpayne@68 631 bb.append(count);
jpayne@68 632 if(count==1){taxaCounted++;}
jpayne@68 633 }
jpayne@68 634
jpayne@68 635 lines.set(lineNum, bb.toBytes());
jpayne@68 636
jpayne@68 637 serverNum++;
jpayne@68 638 if(serverNum>=serverIds.length){break;}
jpayne@68 639 }
jpayne@68 640 }
jpayne@68 641 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){
jpayne@68 642 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders);
jpayne@68 643 }
jpayne@68 644 }
jpayne@68 645
jpayne@68 646 private void updateHeadersFromServer_gff(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders){
jpayne@68 647 ByteBuilder bb=new ByteBuilder();
jpayne@68 648 ArrayList<String> names=new ArrayList<String>();
jpayne@68 649 for(byte[] line : lines){
jpayne@68 650 if(line[0]!='#' && !Tools.startsWith(line, "tid")){
jpayne@68 651 if(bb.length()>0){bb.append(',');}
jpayne@68 652 for(byte b : line){
jpayne@68 653 if(b=='\t'){break;}
jpayne@68 654 bb.append(b);
jpayne@68 655 }
jpayne@68 656 }
jpayne@68 657 }
jpayne@68 658 if(bb.length()<1){return;}
jpayne@68 659
jpayne@68 660 // assert(false) : bb;
jpayne@68 661
jpayne@68 662 // System.err.println("Sending '"+bb+"'");
jpayne@68 663
jpayne@68 664 int[] serverIds;
jpayne@68 665 if(mode==ACCESSION_MODE || mode==UNITE_MODE){
jpayne@68 666 serverIds=TaxClient.accessionToTaxidArray(bb.toString());
jpayne@68 667 }else if(mode==GI_MODE){
jpayne@68 668 serverIds=TaxClient.giToTaxidArray(bb.toString());
jpayne@68 669 }else{
jpayne@68 670 serverIds=TaxClient.headerToTaxidArray(bb.toString());
jpayne@68 671 }
jpayne@68 672 if(serverIds==null){
jpayne@68 673 KillSwitch.kill("Null response for '"+bb.toString()+"'");
jpayne@68 674 }
jpayne@68 675 // assert(serverIds!=null) : "Null response for '"+bb.toString()+"'";
jpayne@68 676 bb.clear();
jpayne@68 677
jpayne@68 678 if(!names.isEmpty()){
jpayne@68 679 assert(tree!=null) : "Need to load a TaxTree.";
jpayne@68 680 assert(names.size()==serverIds.length);
jpayne@68 681 for(int i=0; i<serverIds.length; i++){
jpayne@68 682 final String name=names.get(i);
jpayne@68 683 if(serverIds[i]<0){
jpayne@68 684 TaxNode tn=tree.getNodeByName(name);
jpayne@68 685 if(tn!=null){serverIds[i]=tn.id;}
jpayne@68 686 // else {
jpayne@68 687 // assert(false) : names.get(i);
jpayne@68 688 // }
jpayne@68 689 }else{
jpayne@68 690 //Sometimes the species gets renamed.
jpayne@68 691 // TaxNode tn=tree.getNodeByName(name);
jpayne@68 692 // if(tn==null || tn.id==serverIds[i]) {System.err.println(name+", "+serverIds[i]+", "+tn+", "+tree.getNodesByName(name));}
jpayne@68 693 }
jpayne@68 694 }
jpayne@68 695 }
jpayne@68 696
jpayne@68 697 for(int lineNum=0, serverNum=0; lineNum<=lines.size(); lineNum++){
jpayne@68 698 byte[] line=lines.get(lineNum);
jpayne@68 699 if(line[0]!='#' && !Tools.startsWith(line, "tid")){
jpayne@68 700 bb.clear();
jpayne@68 701 final int tid=serverIds[serverNum];
jpayne@68 702 if(tid<0){
jpayne@68 703 //WARN
jpayne@68 704 if(bswBadHeaders!=null){
jpayne@68 705 bswBadHeaders.print(tid).tab();
jpayne@68 706 bswBadHeaders.print(looksLikeRealAccession(line)).tab();
jpayne@68 707 bswBadHeaders.println(line);
jpayne@68 708 }else if(warnBadHeaders){
jpayne@68 709 System.err.println(tid+"\t"+looksLikeRealAccession(line)+"\t"+new String(line));
jpayne@68 710 }
jpayne@68 711 }
jpayne@68 712
jpayne@68 713 bb.append("tid|");
jpayne@68 714 bb.append(tid);
jpayne@68 715 if(prefix){
jpayne@68 716 bb.append('|');
jpayne@68 717 bb.append(line);
jpayne@68 718 }else if(counts!=null && tid>=0){
jpayne@68 719 bb.append('|');
jpayne@68 720 int count=counts.increment(tid, 1);
jpayne@68 721 bb.append(count);
jpayne@68 722 if(count==1){taxaCounted++;}
jpayne@68 723 }
jpayne@68 724
jpayne@68 725 lines.set(lineNum, bb.toBytes());
jpayne@68 726
jpayne@68 727 serverNum++;
jpayne@68 728 if(serverNum>=serverIds.length){break;}
jpayne@68 729 }
jpayne@68 730 }
jpayne@68 731 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){
jpayne@68 732 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders);
jpayne@68 733 }
jpayne@68 734 }
jpayne@68 735
jpayne@68 736 private void processInner_server(ByteFile bf, ByteStreamWriter bsw, ByteStreamWriter bswInvalid, ByteStreamWriter bswBadHeaders, HashArray1D counts, int format){
jpayne@68 737
jpayne@68 738 ArrayList<byte[]> lines=new ArrayList<byte[]>();
jpayne@68 739 byte[] line=bf.nextLine();
jpayne@68 740 boolean valid=false;
jpayne@68 741 long storedBytes=0;
jpayne@68 742
jpayne@68 743 while(line!=null){
jpayne@68 744
jpayne@68 745 if(line.length>0){
jpayne@68 746 linesIn++;
jpayne@68 747 lines.add(line);
jpayne@68 748 storedBytes+=line.length;
jpayne@68 749 if(storedBytes>=maxStoredBytes){
jpayne@68 750 updateHeadersFromServer(lines, counts, bswBadHeaders, format);
jpayne@68 751 valid=dumpBuffer(lines, valid, bsw, bswInvalid);
jpayne@68 752 lines=new ArrayList<byte[]>();
jpayne@68 753 storedBytes=0;
jpayne@68 754 if(deleteInvalid && invalidReads>0){
jpayne@68 755 System.err.println("Invalid sequence detected; aborting.\n"
jpayne@68 756 + "Input file: \t"+bf.name()+"\n"
jpayne@68 757 + "Output file: \t"+(bsw==null ? "null" : bsw.fname)+"\n"
jpayne@68 758 + "Line: \t"+new String(line)+"\n");
jpayne@68 759 break;
jpayne@68 760 }
jpayne@68 761 }
jpayne@68 762 }
jpayne@68 763 line=bf.nextLine();
jpayne@68 764 }
jpayne@68 765
jpayne@68 766 if(storedBytes>0){
jpayne@68 767 updateHeadersFromServer(lines, counts, bswBadHeaders, format);
jpayne@68 768 valid=dumpBuffer(lines, valid, bsw, bswInvalid);
jpayne@68 769 lines=new ArrayList<byte[]>();
jpayne@68 770 storedBytes=0;
jpayne@68 771 }
jpayne@68 772
jpayne@68 773 errorState|=bf.close();
jpayne@68 774 }
jpayne@68 775
jpayne@68 776 private boolean dumpBuffer(ArrayList<byte[]> lines, boolean valid, ByteStreamWriter bsw, ByteStreamWriter bswInvalid){
jpayne@68 777
jpayne@68 778 for(byte[] line : lines){
jpayne@68 779
jpayne@68 780 if(line.length>0 && line[0]=='>'){
jpayne@68 781 readsProcessed++;
jpayne@68 782 if(maxReads>0 && readsProcessed>maxReads){break;}
jpayne@68 783
jpayne@68 784 if(Tools.startsWith(line, invalidTitle)){
jpayne@68 785 valid=false;
jpayne@68 786 invalidReads++;
jpayne@68 787 invalidLines++;
jpayne@68 788 if(deleteInvalid){break;}
jpayne@68 789 }else{
jpayne@68 790 assert(Tools.startsWith(line, title));
jpayne@68 791 valid=true;
jpayne@68 792 validReads++;
jpayne@68 793 validLines++;
jpayne@68 794 }
jpayne@68 795 }else if(gffIn){
jpayne@68 796 basesProcessed+=line.length;
jpayne@68 797 valid=!Tools.startsWith(line, invalidGffTitle);
jpayne@68 798 if(valid){
jpayne@68 799 validBases+=line.length;
jpayne@68 800 validLines++;
jpayne@68 801 }else{
jpayne@68 802 invalidBases+=line.length;
jpayne@68 803 invalidLines++;
jpayne@68 804 }
jpayne@68 805 }else{
jpayne@68 806 basesProcessed+=line.length;
jpayne@68 807 if(valid){
jpayne@68 808 validBases+=line.length;
jpayne@68 809 validLines++;
jpayne@68 810 }else{
jpayne@68 811 invalidBases+=line.length;
jpayne@68 812 invalidLines++;
jpayne@68 813 }
jpayne@68 814 }
jpayne@68 815
jpayne@68 816 if(valid || keepAll){
jpayne@68 817 if(bsw!=null){bsw.println(line);}
jpayne@68 818 }else{
jpayne@68 819 if(bswInvalid!=null){bswInvalid.println(line);}
jpayne@68 820 }
jpayne@68 821 }
jpayne@68 822 return valid;
jpayne@68 823 }
jpayne@68 824
jpayne@68 825 /*--------------------------------------------------------------*/
jpayne@68 826
jpayne@68 827
jpayne@68 828 /*--------------------------------------------------------------*/
jpayne@68 829
jpayne@68 830 private LinkedHashSet<String> in1=new LinkedHashSet<String>();
jpayne@68 831 private String out1=null;
jpayne@68 832 private String outInvalid=null;
jpayne@68 833 private String badHeaders=null;
jpayne@68 834
jpayne@68 835 private String taxTreeFile=null;
jpayne@68 836 private String giTableFile=null;
jpayne@68 837 private String accessionFile=null;
jpayne@68 838 private String patternFile=null;
jpayne@68 839
jpayne@68 840 /*--------------------------------------------------------------*/
jpayne@68 841
jpayne@68 842 private long maxReads=-1;
jpayne@68 843
jpayne@68 844 private long validReads=0;
jpayne@68 845 private long validBases=0;
jpayne@68 846 private long invalidReads=0;
jpayne@68 847 private long invalidBases=0;
jpayne@68 848 private long taxaCounted=0;
jpayne@68 849
jpayne@68 850 private long linesIn=0;
jpayne@68 851 private long validLines=0;
jpayne@68 852 private long invalidLines=0;
jpayne@68 853
jpayne@68 854 private long maxStoredBytes=10000000;
jpayne@68 855
jpayne@68 856 private long readsProcessed=0, basesProcessed=0;
jpayne@68 857
jpayne@68 858 private boolean prefix=true;
jpayne@68 859 private boolean countTable=true;
jpayne@68 860 private boolean keepAll=true;
jpayne@68 861 private boolean shrinkNames=false;
jpayne@68 862 private boolean warnBadHeaders=true;
jpayne@68 863 private boolean useServer=false;
jpayne@68 864 /** Crash if the number of invalid headers exceeds this */
jpayne@68 865 private long maxInvalidHeaders=-1;
jpayne@68 866 /** Delete the output file if there are any invalid headers */
jpayne@68 867 private boolean deleteInvalid=false;
jpayne@68 868
jpayne@68 869 private int mode;
jpayne@68 870 private static final int ACCESSION_MODE=0, GI_MODE=1, HEADER_MODE=2, UNITE_MODE=3;
jpayne@68 871
jpayne@68 872 private boolean gffIn=false;
jpayne@68 873
jpayne@68 874 /*--------------------------------------------------------------*/
jpayne@68 875
jpayne@68 876 private final ArrayList<FileFormat> ffin1;
jpayne@68 877 private final FileFormat ffout1;
jpayne@68 878 private final FileFormat ffoutInvalid;
jpayne@68 879 private final TaxTree tree;
jpayne@68 880
jpayne@68 881 /*--------------------------------------------------------------*/
jpayne@68 882
jpayne@68 883 private PrintStream outstream=System.err;
jpayne@68 884 public static boolean verbose=false;
jpayne@68 885 public boolean errorState=false;
jpayne@68 886 private boolean overwrite=false;
jpayne@68 887 private boolean append=false;
jpayne@68 888
jpayne@68 889 private static byte[] title=">tid|".getBytes();
jpayne@68 890 private static byte[] invalidTitle=">tid|-1".getBytes();
jpayne@68 891 private static byte[] invalidGffTitle="tid|-1".getBytes();
jpayne@68 892
jpayne@68 893 }