jpayne@68
|
1 package tax;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.io.File;
|
jpayne@68
|
4 import java.io.PrintStream;
|
jpayne@68
|
5 import java.util.ArrayList;
|
jpayne@68
|
6 import java.util.LinkedHashSet;
|
jpayne@68
|
7
|
jpayne@68
|
8 import fileIO.ByteFile;
|
jpayne@68
|
9 import fileIO.ByteFile1;
|
jpayne@68
|
10 import fileIO.ByteFile2;
|
jpayne@68
|
11 import fileIO.ByteStreamWriter;
|
jpayne@68
|
12 import fileIO.FileFormat;
|
jpayne@68
|
13 import fileIO.ReadWrite;
|
jpayne@68
|
14 import kmer.HashArray1D;
|
jpayne@68
|
15 import shared.KillSwitch;
|
jpayne@68
|
16 import shared.Parse;
|
jpayne@68
|
17 import shared.Parser;
|
jpayne@68
|
18 import shared.PreParser;
|
jpayne@68
|
19 import shared.ReadStats;
|
jpayne@68
|
20 import shared.Shared;
|
jpayne@68
|
21 import shared.Timer;
|
jpayne@68
|
22 import shared.Tools;
|
jpayne@68
|
23 import stream.ConcurrentGenericReadInputStream;
|
jpayne@68
|
24 import stream.FASTQ;
|
jpayne@68
|
25 import stream.FastaReadInputStream;
|
jpayne@68
|
26 import structures.ByteBuilder;
|
jpayne@68
|
27 import structures.IntList;
|
jpayne@68
|
28
|
jpayne@68
|
29 /**
|
jpayne@68
|
30 * @author Brian Bushnell
|
jpayne@68
|
31 * @date Mar 10, 2015
|
jpayne@68
|
32 *
|
jpayne@68
|
33 */
|
jpayne@68
|
34 public class RenameGiToTaxid {
|
jpayne@68
|
35
|
jpayne@68
|
36 public static void main(String[] args){
|
jpayne@68
|
37 Timer t=new Timer();
|
jpayne@68
|
38 RenameGiToTaxid x=new RenameGiToTaxid(args);
|
jpayne@68
|
39 x.process(t);
|
jpayne@68
|
40
|
jpayne@68
|
41 //Close the print stream if it was redirected
|
jpayne@68
|
42 Shared.closeStream(x.outstream);
|
jpayne@68
|
43 }
|
jpayne@68
|
44
|
jpayne@68
|
45 public RenameGiToTaxid(String[] args){
|
jpayne@68
|
46
|
jpayne@68
|
47 {//Preparse block for help, config files, and outstream
|
jpayne@68
|
48 PreParser pp=new PreParser(args, getClass(), false);
|
jpayne@68
|
49 args=pp.args;
|
jpayne@68
|
50 outstream=pp.outstream;
|
jpayne@68
|
51 }
|
jpayne@68
|
52
|
jpayne@68
|
53 Shared.capBuffers(4);
|
jpayne@68
|
54 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
|
jpayne@68
|
55 ReadWrite.USE_BGZIP=ReadWrite.USE_UNBGZIP=ReadWrite.PREFER_BGZIP=true;
|
jpayne@68
|
56 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
|
jpayne@68
|
57 FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
|
jpayne@68
|
58
|
jpayne@68
|
59 Parser parser=new Parser();
|
jpayne@68
|
60 for(int i=0; i<args.length; i++){
|
jpayne@68
|
61 String arg=args[i];
|
jpayne@68
|
62 String[] split=arg.split("=");
|
jpayne@68
|
63 String a=split[0].toLowerCase();
|
jpayne@68
|
64 String b=split.length>1 ? split[1] : null;
|
jpayne@68
|
65
|
jpayne@68
|
66 if(a.equals("prefix")){
|
jpayne@68
|
67 prefix=Parse.parseBoolean(b);
|
jpayne@68
|
68
|
jpayne@68
|
69 }else if(a.equals("server") || a.equals("useserver")){
|
jpayne@68
|
70 if(b!=null && b.startsWith("http")){
|
jpayne@68
|
71 useServer=true;
|
jpayne@68
|
72 String path=b;
|
jpayne@68
|
73 if(!path.endsWith("/")){path+="/";}
|
jpayne@68
|
74 Shared.setTaxServer(path);
|
jpayne@68
|
75 }else{
|
jpayne@68
|
76 useServer=Parse.parseBoolean(b);
|
jpayne@68
|
77 }
|
jpayne@68
|
78 }else if(a.equals("title")){
|
jpayne@68
|
79 title=(b==null ? ">" : (">"+b+"|")).getBytes();
|
jpayne@68
|
80 }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
|
jpayne@68
|
81 giTableFile=b;
|
jpayne@68
|
82 }else if(a.equals("accession")){
|
jpayne@68
|
83 accessionFile=b;
|
jpayne@68
|
84 }else if(a.equals("pattern")){
|
jpayne@68
|
85 patternFile=b;
|
jpayne@68
|
86 }else if(a.equals("tree") || a.equals("taxtree")){
|
jpayne@68
|
87 taxTreeFile=b;
|
jpayne@68
|
88 }else if(a.equals("invalid")){
|
jpayne@68
|
89 outInvalid=b;
|
jpayne@68
|
90 }else if(a.equals("deleteinvalid")){
|
jpayne@68
|
91 deleteInvalid=Parse.parseBoolean(b);
|
jpayne@68
|
92 }else if(a.equals("badheaders")){
|
jpayne@68
|
93 badHeaders=b;
|
jpayne@68
|
94 }else if(a.equals("maxbadheaders") || a.equals("maxinvalidheaders")){
|
jpayne@68
|
95 maxInvalidHeaders=Parse.parseKMG(b);
|
jpayne@68
|
96 }else if(a.equals("keepall")){
|
jpayne@68
|
97 keepAll=Parse.parseBoolean(b);
|
jpayne@68
|
98 }else if(a.equals("shrinknames")){
|
jpayne@68
|
99 shrinkNames=Parse.parseBoolean(b);
|
jpayne@68
|
100 }else if(a.equals("warn")){
|
jpayne@68
|
101 warnBadHeaders=Parse.parseBoolean(b);
|
jpayne@68
|
102 }
|
jpayne@68
|
103
|
jpayne@68
|
104 else if(a.equals("maxpigzprocesses")){
|
jpayne@68
|
105 AccessionToTaxid.maxPigzProcesses=Integer.parseInt(b);
|
jpayne@68
|
106 }else if(a.equals("skipparse")){
|
jpayne@68
|
107 AccessionToTaxid.skipParse=Parse.parseBoolean(b);
|
jpayne@68
|
108 }else if(a.equals("skiphash")){
|
jpayne@68
|
109 AccessionToTaxid.skipHash=Parse.parseBoolean(b);
|
jpayne@68
|
110 }
|
jpayne@68
|
111
|
jpayne@68
|
112 else if(a.equals("mode")){
|
jpayne@68
|
113 if(b!=null && Character.isDigit(b.charAt(0))){
|
jpayne@68
|
114 mode=Integer.parseInt(b);
|
jpayne@68
|
115 }else if("accession".equalsIgnoreCase(b)){
|
jpayne@68
|
116 mode=ACCESSION_MODE;
|
jpayne@68
|
117 }else if("unite".equalsIgnoreCase(b)){
|
jpayne@68
|
118 mode=UNITE_MODE;
|
jpayne@68
|
119 TaxTree.UNITE_MODE=true;
|
jpayne@68
|
120 }else if("gi".equalsIgnoreCase(b)){
|
jpayne@68
|
121 mode=GI_MODE;
|
jpayne@68
|
122 }else if("header".equalsIgnoreCase(b)){
|
jpayne@68
|
123 mode=HEADER_MODE;
|
jpayne@68
|
124 }else{
|
jpayne@68
|
125 assert(false) : "Bad mode: "+b;
|
jpayne@68
|
126 }
|
jpayne@68
|
127 }
|
jpayne@68
|
128
|
jpayne@68
|
129 else if(a.equals("verbose")){
|
jpayne@68
|
130 verbose=Parse.parseBoolean(b);
|
jpayne@68
|
131 ByteFile1.verbose=verbose;
|
jpayne@68
|
132 ByteFile2.verbose=verbose;
|
jpayne@68
|
133 stream.FastaReadInputStream.verbose=verbose;
|
jpayne@68
|
134 ConcurrentGenericReadInputStream.verbose=verbose;
|
jpayne@68
|
135 stream.FastqReadInputStream.verbose=verbose;
|
jpayne@68
|
136 ReadWrite.verbose=verbose;
|
jpayne@68
|
137 }else if(a.equals("in") || a.equals("in1")){
|
jpayne@68
|
138 assert(b!=null) : "Bad parameter: "+arg;
|
jpayne@68
|
139 if(new File(b).exists()){
|
jpayne@68
|
140 in1.add(b);
|
jpayne@68
|
141 }else{
|
jpayne@68
|
142 for(String bb : b.split(",")){
|
jpayne@68
|
143 in1.add(bb);
|
jpayne@68
|
144 }
|
jpayne@68
|
145 }
|
jpayne@68
|
146 }else if(new File(arg).exists()){ //For asterisk expansion
|
jpayne@68
|
147 in1.add(arg);
|
jpayne@68
|
148 }else if(parser.parse(arg, a, b)){
|
jpayne@68
|
149 //do nothing
|
jpayne@68
|
150 }else{
|
jpayne@68
|
151 outstream.println("Unknown parameter "+args[i]);
|
jpayne@68
|
152 assert(false) : "Unknown parameter "+args[i];
|
jpayne@68
|
153 // throw new RuntimeException("Unknown parameter "+args[i]);
|
jpayne@68
|
154 }
|
jpayne@68
|
155 }
|
jpayne@68
|
156
|
jpayne@68
|
157 if(useServer){
|
jpayne@68
|
158 giTableFile=null;
|
jpayne@68
|
159 accessionFile=null;
|
jpayne@68
|
160 patternFile=null;
|
jpayne@68
|
161 if(mode!=UNITE_MODE){taxTreeFile=null;}
|
jpayne@68
|
162 }//else if taxpath!=null... set them
|
jpayne@68
|
163
|
jpayne@68
|
164 {//Process parser fields
|
jpayne@68
|
165 Parser.processQuality();
|
jpayne@68
|
166
|
jpayne@68
|
167 maxReads=parser.maxReads;
|
jpayne@68
|
168
|
jpayne@68
|
169 overwrite=ReadStats.overwrite=parser.overwrite;
|
jpayne@68
|
170 append=ReadStats.append=parser.append;
|
jpayne@68
|
171
|
jpayne@68
|
172 out1=parser.out1;
|
jpayne@68
|
173 }
|
jpayne@68
|
174
|
jpayne@68
|
175 if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=TaxTree.defaultTreeFile();}
|
jpayne@68
|
176 if("auto".equalsIgnoreCase(giTableFile)){giTableFile=TaxTree.defaultTableFile();}
|
jpayne@68
|
177 if("auto".equalsIgnoreCase(accessionFile)){accessionFile=TaxTree.defaultAccessionFile();}
|
jpayne@68
|
178 if("auto".equalsIgnoreCase(patternFile)){patternFile=TaxTree.defaultPatternFile();}
|
jpayne@68
|
179
|
jpayne@68
|
180 assert(FastaReadInputStream.settingsOK());
|
jpayne@68
|
181
|
jpayne@68
|
182 if(in1==null || in1.isEmpty()){throw new RuntimeException("Error - at least one input file is required.");}
|
jpayne@68
|
183 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2){
|
jpayne@68
|
184 ByteFile.FORCE_MODE_BF2=false;
|
jpayne@68
|
185 ByteFile.FORCE_MODE_BF1=true;
|
jpayne@68
|
186 }
|
jpayne@68
|
187
|
jpayne@68
|
188 if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
|
jpayne@68
|
189 assert(out1!=null) : "This program requires an output file.";
|
jpayne@68
|
190
|
jpayne@68
|
191 if(!Tools.testOutputFiles(overwrite, append, false, out1)){
|
jpayne@68
|
192 outstream.println((out1==null)+", "+out1);
|
jpayne@68
|
193 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
|
jpayne@68
|
194 }
|
jpayne@68
|
195 if(!Tools.testInputFiles(false, true, in1.toArray(new String[0]))){
|
jpayne@68
|
196 throw new RuntimeException("\nCan't read some input files.\n");
|
jpayne@68
|
197 }
|
jpayne@68
|
198
|
jpayne@68
|
199 ffout1=FileFormat.testOutput(out1, FileFormat.FA, null, true, overwrite, append, false);
|
jpayne@68
|
200 ffoutInvalid=FileFormat.testOutput(outInvalid, FileFormat.FA, null, true, overwrite, append, false);
|
jpayne@68
|
201 ffin1=new ArrayList<FileFormat>(in1.size());
|
jpayne@68
|
202 for(String s : in1){
|
jpayne@68
|
203 FileFormat ff=FileFormat.testInput(s, FileFormat.FA, null, true, true);
|
jpayne@68
|
204 ffin1.add(ff);
|
jpayne@68
|
205 }
|
jpayne@68
|
206
|
jpayne@68
|
207 if(ffoutInvalid!=null){keepAll=false;}
|
jpayne@68
|
208
|
jpayne@68
|
209 assert(giTableFile!=null || accessionFile!=null || TaxTree.SILVA_MODE || useServer) : "No gi or accession information loaded.";
|
jpayne@68
|
210
|
jpayne@68
|
211 if(taxTreeFile!=null){
|
jpayne@68
|
212 tree=TaxTree.loadTaxTree(taxTreeFile, outstream, true, false);
|
jpayne@68
|
213 assert(tree.nameMap!=null);
|
jpayne@68
|
214 }else{
|
jpayne@68
|
215 tree=null;
|
jpayne@68
|
216 if(!useServer){throw new RuntimeException("No tree specified.");}
|
jpayne@68
|
217 }
|
jpayne@68
|
218
|
jpayne@68
|
219 if(giTableFile!=null){
|
jpayne@68
|
220 GiToTaxid.initialize(giTableFile);
|
jpayne@68
|
221 }
|
jpayne@68
|
222
|
jpayne@68
|
223 if(patternFile!=null){
|
jpayne@68
|
224 Timer t=new Timer();
|
jpayne@68
|
225 AnalyzeAccession.loadCodeMap(patternFile);
|
jpayne@68
|
226 outstream.println("Loading pattern table.");
|
jpayne@68
|
227 t.stopAndPrint();
|
jpayne@68
|
228 }
|
jpayne@68
|
229
|
jpayne@68
|
230 if(accessionFile!=null){
|
jpayne@68
|
231 AccessionToTaxid.tree=tree;
|
jpayne@68
|
232 outstream.println("Loading accession table.");
|
jpayne@68
|
233 AccessionToTaxid.load(accessionFile);
|
jpayne@68
|
234 // System.gc();
|
jpayne@68
|
235 }
|
jpayne@68
|
236 }
|
jpayne@68
|
237
|
jpayne@68
|
238 void process(Timer t){
|
jpayne@68
|
239
|
jpayne@68
|
240 ByteStreamWriter bsw=(ffout1==null ? null : new ByteStreamWriter(ffout1)); //Actually, this is required.
|
jpayne@68
|
241 if(bsw!=null){bsw.start();}
|
jpayne@68
|
242
|
jpayne@68
|
243 ByteStreamWriter bswInvalid=null;
|
jpayne@68
|
244 if(ffoutInvalid!=null){
|
jpayne@68
|
245 bswInvalid=new ByteStreamWriter(ffoutInvalid);
|
jpayne@68
|
246 bswInvalid.start();
|
jpayne@68
|
247 }
|
jpayne@68
|
248
|
jpayne@68
|
249 ByteStreamWriter bswBadHeaders=null;
|
jpayne@68
|
250 if(badHeaders!=null) {
|
jpayne@68
|
251 bswBadHeaders=new ByteStreamWriter(badHeaders, overwrite, append, false);
|
jpayne@68
|
252 bswBadHeaders.start();
|
jpayne@68
|
253 }
|
jpayne@68
|
254
|
jpayne@68
|
255 final HashArray1D counts=(countTable && !prefix) ? new HashArray1D(256000, -1L, true) : null;
|
jpayne@68
|
256
|
jpayne@68
|
257 gffIn=false;
|
jpayne@68
|
258 for(FileFormat ffin : ffin1){
|
jpayne@68
|
259 gffIn=gffIn||ffin.gff();
|
jpayne@68
|
260 ByteFile bf=ByteFile.makeByteFile(ffin);
|
jpayne@68
|
261 if(useServer){
|
jpayne@68
|
262 processInner_server(bf, bsw, bswInvalid, bswBadHeaders, counts, ffin.format());
|
jpayne@68
|
263 }else{
|
jpayne@68
|
264 // IntList list=(useServer ? getIds(bf) : null);
|
jpayne@68
|
265 processInner(bf, bsw, bswInvalid, bswBadHeaders, counts, null);
|
jpayne@68
|
266 }
|
jpayne@68
|
267 }
|
jpayne@68
|
268
|
jpayne@68
|
269 if(bsw!=null){
|
jpayne@68
|
270 errorState|=bsw.poisonAndWait();
|
jpayne@68
|
271 if(deleteInvalid && invalidReads>0 && !ffout1.stdio()){
|
jpayne@68
|
272 try {
|
jpayne@68
|
273 System.err.println("Deleting "+out1);
|
jpayne@68
|
274 new File(out1).delete();
|
jpayne@68
|
275 } catch (Exception e) {
|
jpayne@68
|
276 System.err.println("An error occured while attempting to delete "+out1);
|
jpayne@68
|
277 e.printStackTrace();
|
jpayne@68
|
278 }
|
jpayne@68
|
279 }
|
jpayne@68
|
280 }
|
jpayne@68
|
281 if(bswInvalid!=null){errorState|=bswInvalid.poisonAndWait();}
|
jpayne@68
|
282 if(bswBadHeaders!=null){errorState|=bswBadHeaders.poisonAndWait();}
|
jpayne@68
|
283
|
jpayne@68
|
284 t.stop();
|
jpayne@68
|
285 if(!gffIn) {
|
jpayne@68
|
286 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
|
jpayne@68
|
287
|
jpayne@68
|
288 outstream.println();
|
jpayne@68
|
289 outstream.println("Valid Sequences: \t"+validReads);
|
jpayne@68
|
290 outstream.println("Valid Bases: \t"+validBases);
|
jpayne@68
|
291 outstream.println("Invalid Sequences: \t"+invalidReads);
|
jpayne@68
|
292 outstream.println("Invalid Bases: \t"+invalidBases);
|
jpayne@68
|
293 }else{
|
jpayne@68
|
294 outstream.println(Tools.timeLinesBytesProcessed(t, linesIn, basesProcessed, 8));
|
jpayne@68
|
295
|
jpayne@68
|
296 outstream.println();
|
jpayne@68
|
297 outstream.println("Valid Lines: \t"+validLines);
|
jpayne@68
|
298 outstream.println("Valid Bytes: \t"+validBases);
|
jpayne@68
|
299 outstream.println("Invalid Lines: \t"+invalidLines);
|
jpayne@68
|
300 outstream.println("Invalid Bytes: \t"+invalidBases);
|
jpayne@68
|
301 }
|
jpayne@68
|
302 if(counts!=null){
|
jpayne@68
|
303 outstream.println("Unique Taxa: \t"+taxaCounted);
|
jpayne@68
|
304 }
|
jpayne@68
|
305
|
jpayne@68
|
306 if(errorState){
|
jpayne@68
|
307 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
|
jpayne@68
|
308 }
|
jpayne@68
|
309 }
|
jpayne@68
|
310
|
jpayne@68
|
311 //Unused; not efficient
|
jpayne@68
|
312 // public IntList getIds(ByteFile bf){
|
jpayne@68
|
313 // IntList ids=new IntList();
|
jpayne@68
|
314 //
|
jpayne@68
|
315 // int readsProcessedInner=0;
|
jpayne@68
|
316 //
|
jpayne@68
|
317 // byte[] line=bf.nextLine();
|
jpayne@68
|
318 // ByteBuilder bb=new ByteBuilder();
|
jpayne@68
|
319 // while(line!=null){
|
jpayne@68
|
320 // if(line.length>0 && line[0]=='>'){
|
jpayne@68
|
321 // readsProcessedInner++;
|
jpayne@68
|
322 // if(maxReads>0 && readsProcessedInner>maxReads){break;}
|
jpayne@68
|
323 //
|
jpayne@68
|
324 // for(int i=1; i<line.length; i++){
|
jpayne@68
|
325 // byte b=line[i];
|
jpayne@68
|
326 // if(b==' ' || b=='.'){break;}
|
jpayne@68
|
327 // else{bb.append(b);}
|
jpayne@68
|
328 // }
|
jpayne@68
|
329 // bb.append(',');
|
jpayne@68
|
330 // if(bb.length()>100000){
|
jpayne@68
|
331 // bb.setLength(bb.length()-1);
|
jpayne@68
|
332 // int[] ret;
|
jpayne@68
|
333 // if(mode==ACCESSION_MODE){
|
jpayne@68
|
334 // ret=TaxClient.accessionToTaxidArray(bb.toString());
|
jpayne@68
|
335 // }else if(mode==GI_MODE){
|
jpayne@68
|
336 // ret=TaxClient.giToTaxidArray(bb.toString());
|
jpayne@68
|
337 // }else{
|
jpayne@68
|
338 // ret=TaxClient.headerToTaxidArray(bb.toString());
|
jpayne@68
|
339 // }
|
jpayne@68
|
340 // assert(ret!=null) : bb.toString();
|
jpayne@68
|
341 // for(int i : ret){ids.add(i);}
|
jpayne@68
|
342 // bb.clear();
|
jpayne@68
|
343 // }
|
jpayne@68
|
344 // }
|
jpayne@68
|
345 // line=bf.nextLine();
|
jpayne@68
|
346 // }
|
jpayne@68
|
347 // if(bb.length()>0){
|
jpayne@68
|
348 // bb.setLength(bb.length()-1);
|
jpayne@68
|
349 // int[] ret;
|
jpayne@68
|
350 // if(mode==ACCESSION_MODE){
|
jpayne@68
|
351 // ret=TaxClient.accessionToTaxidArray(bb.toString());
|
jpayne@68
|
352 // }else if(mode==GI_MODE){
|
jpayne@68
|
353 // ret=TaxClient.giToTaxidArray(bb.toString());
|
jpayne@68
|
354 // }else{
|
jpayne@68
|
355 // ret=TaxClient.headerToTaxidArray(bb.toString());
|
jpayne@68
|
356 // }
|
jpayne@68
|
357 // assert(ret!=null) : bb.toString();
|
jpayne@68
|
358 // for(int i : ret){ids.add(i);}
|
jpayne@68
|
359 // bb.clear();
|
jpayne@68
|
360 // }
|
jpayne@68
|
361 //
|
jpayne@68
|
362 // bf.reset();
|
jpayne@68
|
363 // return ids;
|
jpayne@68
|
364 // }
|
jpayne@68
|
365
|
jpayne@68
|
366 private void processInner(ByteFile bf, ByteStreamWriter bsw, ByteStreamWriter bswInvalid, ByteStreamWriter bswBadHeaders, HashArray1D counts, IntList ids){
|
jpayne@68
|
367
|
jpayne@68
|
368 int readsProcessedInner=0;
|
jpayne@68
|
369
|
jpayne@68
|
370 byte[] line=bf.nextLine();
|
jpayne@68
|
371 boolean valid=false;
|
jpayne@68
|
372 while(line!=null){
|
jpayne@68
|
373 if(line.length>0 && line[0]=='>'){
|
jpayne@68
|
374 readsProcessedInner++;
|
jpayne@68
|
375 readsProcessed++;
|
jpayne@68
|
376 if(maxReads>0 && readsProcessed>maxReads){break;}
|
jpayne@68
|
377 int initial=1, terminal=line.length;
|
jpayne@68
|
378 final int number;
|
jpayne@68
|
379 if(ids==null){
|
jpayne@68
|
380 final TaxNode tn;
|
jpayne@68
|
381
|
jpayne@68
|
382 {
|
jpayne@68
|
383 {
|
jpayne@68
|
384 // Handles renumbering when the format is correct but the number is wrong.
|
jpayne@68
|
385 if(Tools.startsWith(line, ">tid|")){
|
jpayne@68
|
386 initial=6;
|
jpayne@68
|
387 while(initial<=line.length && line[initial-1]!='|'){initial++;}
|
jpayne@68
|
388 }else if(Tools.startsWith(line, ">ncbi|")){
|
jpayne@68
|
389 initial=7;
|
jpayne@68
|
390 while(initial<=line.length && line[initial-1]!='|'){initial++;}
|
jpayne@68
|
391 }
|
jpayne@68
|
392 }
|
jpayne@68
|
393
|
jpayne@68
|
394 if(shrinkNames){//This is for nr/nt
|
jpayne@68
|
395 for(int i=initial; i<terminal; i++){
|
jpayne@68
|
396 if(line[i]==1){//SOH
|
jpayne@68
|
397 terminal=i;
|
jpayne@68
|
398 }
|
jpayne@68
|
399 }
|
jpayne@68
|
400 }
|
jpayne@68
|
401
|
jpayne@68
|
402 String s=new String(line, initial, terminal-initial);
|
jpayne@68
|
403
|
jpayne@68
|
404 tn=tree.parseNodeFromHeader(s, true);
|
jpayne@68
|
405 }
|
jpayne@68
|
406 number=(tn==null ? -1 : tn.id);
|
jpayne@68
|
407 }else{
|
jpayne@68
|
408 number=ids.get((int)(readsProcessedInner-1));
|
jpayne@68
|
409
|
jpayne@68
|
410 if(shrinkNames){//This is for nr/nt
|
jpayne@68
|
411 for(int i=initial; i<terminal; i++){
|
jpayne@68
|
412 if(line[i]==1){//SOH
|
jpayne@68
|
413 terminal=i;
|
jpayne@68
|
414 }
|
jpayne@68
|
415 }
|
jpayne@68
|
416 }
|
jpayne@68
|
417 }
|
jpayne@68
|
418
|
jpayne@68
|
419 valid=(number>=0);
|
jpayne@68
|
420 if(valid){
|
jpayne@68
|
421 validReads++;
|
jpayne@68
|
422 bsw.print(title);
|
jpayne@68
|
423 bsw.print(number);
|
jpayne@68
|
424 if(prefix){
|
jpayne@68
|
425 bsw.print('|');
|
jpayne@68
|
426 for(int i=initial; i<terminal; i++){
|
jpayne@68
|
427 bsw.print(line[i]);
|
jpayne@68
|
428 }
|
jpayne@68
|
429 }else if(counts!=null){
|
jpayne@68
|
430 bsw.print('|');
|
jpayne@68
|
431 int count=counts.increment(number, 1);
|
jpayne@68
|
432 bsw.print(count);
|
jpayne@68
|
433 if(count==1){taxaCounted++;}
|
jpayne@68
|
434 }
|
jpayne@68
|
435 bsw.println();
|
jpayne@68
|
436 }else{
|
jpayne@68
|
437 invalidReads++;
|
jpayne@68
|
438 if(deleteInvalid){
|
jpayne@68
|
439 System.err.println("Invalid sequence detected; aborting.\n");
|
jpayne@68
|
440 break;
|
jpayne@68
|
441 }
|
jpayne@68
|
442 if(bswBadHeaders!=null){bswBadHeaders.println(line);}
|
jpayne@68
|
443 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){
|
jpayne@68
|
444 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders+"\n"+new String(line));
|
jpayne@68
|
445 }
|
jpayne@68
|
446 if(keepAll){
|
jpayne@68
|
447 if(shrinkNames){
|
jpayne@68
|
448 for(int i=0; i<terminal; i++){
|
jpayne@68
|
449 bsw.print(line[i]);
|
jpayne@68
|
450 }
|
jpayne@68
|
451 bsw.println();
|
jpayne@68
|
452 }else{
|
jpayne@68
|
453 bsw.println(line);
|
jpayne@68
|
454 }
|
jpayne@68
|
455 }else if(bswInvalid!=null){
|
jpayne@68
|
456 if(shrinkNames){
|
jpayne@68
|
457 for(int i=0; i<terminal; i++){
|
jpayne@68
|
458 bswInvalid.print(line[i]);
|
jpayne@68
|
459 }
|
jpayne@68
|
460 bswInvalid.println();
|
jpayne@68
|
461 }else{
|
jpayne@68
|
462 bswInvalid.println(line);
|
jpayne@68
|
463 }
|
jpayne@68
|
464 }
|
jpayne@68
|
465 }
|
jpayne@68
|
466 }else{
|
jpayne@68
|
467 basesProcessed+=line.length;
|
jpayne@68
|
468 if(valid || keepAll){
|
jpayne@68
|
469 if(valid){validBases+=line.length;}
|
jpayne@68
|
470 else{invalidBases+=line.length;}
|
jpayne@68
|
471 bsw.println(line);
|
jpayne@68
|
472 }else{
|
jpayne@68
|
473 invalidBases+=line.length;
|
jpayne@68
|
474 if(bswInvalid!=null){
|
jpayne@68
|
475 bswInvalid.println(line);
|
jpayne@68
|
476 }
|
jpayne@68
|
477 }
|
jpayne@68
|
478 }
|
jpayne@68
|
479 line=bf.nextLine();
|
jpayne@68
|
480 }
|
jpayne@68
|
481
|
jpayne@68
|
482 errorState|=bf.close();
|
jpayne@68
|
483 }
|
jpayne@68
|
484
|
jpayne@68
|
485 private static boolean looksLikeRealAccession(byte[] line){
|
jpayne@68
|
486 int space=Tools.indexOf(line, ' ');
|
jpayne@68
|
487 if(space<0){space=line.length;}
|
jpayne@68
|
488 if(space>18 || space<4){return false;}
|
jpayne@68
|
489 //... hmm... this is a pretty short list for false cases!
|
jpayne@68
|
490 int dot=-1;
|
jpayne@68
|
491 for(int i=0; i<space; i++){
|
jpayne@68
|
492 if(line[i]=='.'){
|
jpayne@68
|
493 if(dot>=0){return false;}//Only 1 dot allowed
|
jpayne@68
|
494 dot=i;
|
jpayne@68
|
495 }
|
jpayne@68
|
496 }
|
jpayne@68
|
497 if(dot>0){
|
jpayne@68
|
498 if(dot!=space-2){return false;}
|
jpayne@68
|
499 }
|
jpayne@68
|
500 for(int i=0; i<space; i++){
|
jpayne@68
|
501 byte b=line[i];
|
jpayne@68
|
502 if(b!='_' && b!='-' && b!='.' && !Tools.isLetterOrDigit(b)){return false;}
|
jpayne@68
|
503 }
|
jpayne@68
|
504 return true;
|
jpayne@68
|
505 }
|
jpayne@68
|
506
|
jpayne@68
|
507 void appendHeaderLine(byte[] line, ByteBuilder bb){
|
jpayne@68
|
508 assert(line[0]=='>' || line[0]=='@') : new String(line);
|
jpayne@68
|
509
|
jpayne@68
|
510 if(mode==ACCESSION_MODE){
|
jpayne@68
|
511 for(int i=1; i<line.length; i++){
|
jpayne@68
|
512 byte b=line[i];
|
jpayne@68
|
513 if(b==' ' || b=='.'){break;}
|
jpayne@68
|
514 else{bb.append(b);}
|
jpayne@68
|
515 }
|
jpayne@68
|
516 }else if(mode==GI_MODE){
|
jpayne@68
|
517 for(int i=1; i<line.length; i++){
|
jpayne@68
|
518 byte b=line[i];
|
jpayne@68
|
519 if(b==' ' || b=='|'){break;}
|
jpayne@68
|
520 else{bb.append(b);}
|
jpayne@68
|
521 }
|
jpayne@68
|
522 }else if(mode==UNITE_MODE){
|
jpayne@68
|
523 int initial=Tools.indexOf(line, '|');
|
jpayne@68
|
524 for(int i=initial+1; i<line.length; i++){
|
jpayne@68
|
525 byte b=line[i];
|
jpayne@68
|
526 if(b==' ' || b=='.' || b=='|'){break;}
|
jpayne@68
|
527 else{bb.append(b);}
|
jpayne@68
|
528 }
|
jpayne@68
|
529 }else{
|
jpayne@68
|
530 for(int i=1; i<line.length; i++){
|
jpayne@68
|
531 byte b=line[i];
|
jpayne@68
|
532 bb.append(b);
|
jpayne@68
|
533 }
|
jpayne@68
|
534 }
|
jpayne@68
|
535 bb.append(',');
|
jpayne@68
|
536 }
|
jpayne@68
|
537
|
jpayne@68
|
538 private void updateHeadersFromServer(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders, int format){
|
jpayne@68
|
539 if(format==FileFormat.FA){
|
jpayne@68
|
540 updateHeadersFromServer_fasta(lines, counts, bswBadHeaders);
|
jpayne@68
|
541 }else if(format==FileFormat.GFF){
|
jpayne@68
|
542 updateHeadersFromServer_gff(lines, counts, bswBadHeaders);
|
jpayne@68
|
543 }else{
|
jpayne@68
|
544 assert(false) : "Unsupported type: "+format;
|
jpayne@68
|
545 }
|
jpayne@68
|
546 }
|
jpayne@68
|
547
|
jpayne@68
|
548 private void updateHeadersFromServer_fasta(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders){
|
jpayne@68
|
549 ByteBuilder bb=new ByteBuilder();
|
jpayne@68
|
550 ArrayList<String> names=new ArrayList<String>();
|
jpayne@68
|
551 for(byte[] line : lines){
|
jpayne@68
|
552 if(line[0]=='>' && !Tools.startsWith(line, ">tid")){
|
jpayne@68
|
553 appendHeaderLine(line, bb);
|
jpayne@68
|
554 if(mode==UNITE_MODE){
|
jpayne@68
|
555 int bar=Tools.indexOf(line, '|');
|
jpayne@68
|
556 names.add(new String(line, 1, bar-1));
|
jpayne@68
|
557 }
|
jpayne@68
|
558 }
|
jpayne@68
|
559 }
|
jpayne@68
|
560 if(bb.length()<1){return;}
|
jpayne@68
|
561
|
jpayne@68
|
562 assert(bb.endsWith(','));
|
jpayne@68
|
563 bb.length--;
|
jpayne@68
|
564
|
jpayne@68
|
565 // System.err.println("Sending '"+bb+"'");
|
jpayne@68
|
566
|
jpayne@68
|
567 final int[] serverIds;
|
jpayne@68
|
568 if(mode==ACCESSION_MODE || mode==UNITE_MODE){
|
jpayne@68
|
569 serverIds=TaxClient.accessionToTaxidArray(bb.toString());
|
jpayne@68
|
570 }else if(mode==GI_MODE){
|
jpayne@68
|
571 serverIds=TaxClient.giToTaxidArray(bb.toString());
|
jpayne@68
|
572 }else{
|
jpayne@68
|
573 serverIds=TaxClient.headerToTaxidArray(bb.toString());
|
jpayne@68
|
574 }
|
jpayne@68
|
575 assert(serverIds!=null) : "Null response for '"+bb.toString()+"'";
|
jpayne@68
|
576 bb.clear();
|
jpayne@68
|
577
|
jpayne@68
|
578 if(!names.isEmpty()){
|
jpayne@68
|
579 assert(tree!=null) : "Need to load a TaxTree.";
|
jpayne@68
|
580 assert(names.size()==serverIds.length);
|
jpayne@68
|
581 for(int i=0; i<serverIds.length; i++){
|
jpayne@68
|
582 final String name=names.get(i);
|
jpayne@68
|
583 if(serverIds[i]<0){
|
jpayne@68
|
584 TaxNode tn=tree.getNodeByName(name);
|
jpayne@68
|
585 if(tn!=null){serverIds[i]=tn.id;}
|
jpayne@68
|
586 // else {
|
jpayne@68
|
587 // assert(false) : names.get(i);
|
jpayne@68
|
588 // }
|
jpayne@68
|
589 }else{
|
jpayne@68
|
590 //Sometimes the species gets renamed.
|
jpayne@68
|
591 // TaxNode tn=tree.getNodeByName(name);
|
jpayne@68
|
592 // if(tn==null || tn.id==serverIds[i]) {System.err.println(name+", "+serverIds[i]+", "+tn+", "+tree.getNodesByName(name));}
|
jpayne@68
|
593 }
|
jpayne@68
|
594 }
|
jpayne@68
|
595 }
|
jpayne@68
|
596
|
jpayne@68
|
597 for(int lineNum=0, serverNum=0; lineNum<=lines.size(); lineNum++){
|
jpayne@68
|
598 byte[] line=lines.get(lineNum);
|
jpayne@68
|
599 if(line[0]=='>' && !Tools.startsWith(line, ">tid")){
|
jpayne@68
|
600 bb.clear();
|
jpayne@68
|
601 final int tid=serverIds[serverNum];
|
jpayne@68
|
602 if(tid<0){
|
jpayne@68
|
603 //WARN
|
jpayne@68
|
604 if(bswBadHeaders!=null){
|
jpayne@68
|
605 bswBadHeaders.print(tid).tab();
|
jpayne@68
|
606 bswBadHeaders.print(looksLikeRealAccession(line)).tab();
|
jpayne@68
|
607 bswBadHeaders.println(line);
|
jpayne@68
|
608 }else if(warnBadHeaders){
|
jpayne@68
|
609 System.err.println(tid+"\t"+looksLikeRealAccession(line)+"\t"+new String(line));
|
jpayne@68
|
610 }
|
jpayne@68
|
611 }
|
jpayne@68
|
612 int initial=1, terminal=line.length;
|
jpayne@68
|
613 if(shrinkNames){//This is for nr/nt
|
jpayne@68
|
614 for(int i=initial; i<terminal; i++){
|
jpayne@68
|
615 if(line[i]==1){//SOH
|
jpayne@68
|
616 terminal=i;
|
jpayne@68
|
617 }
|
jpayne@68
|
618 }
|
jpayne@68
|
619 }
|
jpayne@68
|
620
|
jpayne@68
|
621 bb.append(title);
|
jpayne@68
|
622 bb.append(tid);
|
jpayne@68
|
623 if(prefix){
|
jpayne@68
|
624 bb.append('|');
|
jpayne@68
|
625 for(int i=initial; i<terminal; i++){
|
jpayne@68
|
626 bb.append(line[i]);
|
jpayne@68
|
627 }
|
jpayne@68
|
628 }else if(counts!=null && tid>=0){
|
jpayne@68
|
629 bb.append('|');
|
jpayne@68
|
630 int count=counts.increment(tid, 1);
|
jpayne@68
|
631 bb.append(count);
|
jpayne@68
|
632 if(count==1){taxaCounted++;}
|
jpayne@68
|
633 }
|
jpayne@68
|
634
|
jpayne@68
|
635 lines.set(lineNum, bb.toBytes());
|
jpayne@68
|
636
|
jpayne@68
|
637 serverNum++;
|
jpayne@68
|
638 if(serverNum>=serverIds.length){break;}
|
jpayne@68
|
639 }
|
jpayne@68
|
640 }
|
jpayne@68
|
641 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){
|
jpayne@68
|
642 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders);
|
jpayne@68
|
643 }
|
jpayne@68
|
644 }
|
jpayne@68
|
645
|
jpayne@68
|
646 private void updateHeadersFromServer_gff(ArrayList<byte[]> lines, HashArray1D counts, ByteStreamWriter bswBadHeaders){
|
jpayne@68
|
647 ByteBuilder bb=new ByteBuilder();
|
jpayne@68
|
648 ArrayList<String> names=new ArrayList<String>();
|
jpayne@68
|
649 for(byte[] line : lines){
|
jpayne@68
|
650 if(line[0]!='#' && !Tools.startsWith(line, "tid")){
|
jpayne@68
|
651 if(bb.length()>0){bb.append(',');}
|
jpayne@68
|
652 for(byte b : line){
|
jpayne@68
|
653 if(b=='\t'){break;}
|
jpayne@68
|
654 bb.append(b);
|
jpayne@68
|
655 }
|
jpayne@68
|
656 }
|
jpayne@68
|
657 }
|
jpayne@68
|
658 if(bb.length()<1){return;}
|
jpayne@68
|
659
|
jpayne@68
|
660 // assert(false) : bb;
|
jpayne@68
|
661
|
jpayne@68
|
662 // System.err.println("Sending '"+bb+"'");
|
jpayne@68
|
663
|
jpayne@68
|
664 int[] serverIds;
|
jpayne@68
|
665 if(mode==ACCESSION_MODE || mode==UNITE_MODE){
|
jpayne@68
|
666 serverIds=TaxClient.accessionToTaxidArray(bb.toString());
|
jpayne@68
|
667 }else if(mode==GI_MODE){
|
jpayne@68
|
668 serverIds=TaxClient.giToTaxidArray(bb.toString());
|
jpayne@68
|
669 }else{
|
jpayne@68
|
670 serverIds=TaxClient.headerToTaxidArray(bb.toString());
|
jpayne@68
|
671 }
|
jpayne@68
|
672 if(serverIds==null){
|
jpayne@68
|
673 KillSwitch.kill("Null response for '"+bb.toString()+"'");
|
jpayne@68
|
674 }
|
jpayne@68
|
675 // assert(serverIds!=null) : "Null response for '"+bb.toString()+"'";
|
jpayne@68
|
676 bb.clear();
|
jpayne@68
|
677
|
jpayne@68
|
678 if(!names.isEmpty()){
|
jpayne@68
|
679 assert(tree!=null) : "Need to load a TaxTree.";
|
jpayne@68
|
680 assert(names.size()==serverIds.length);
|
jpayne@68
|
681 for(int i=0; i<serverIds.length; i++){
|
jpayne@68
|
682 final String name=names.get(i);
|
jpayne@68
|
683 if(serverIds[i]<0){
|
jpayne@68
|
684 TaxNode tn=tree.getNodeByName(name);
|
jpayne@68
|
685 if(tn!=null){serverIds[i]=tn.id;}
|
jpayne@68
|
686 // else {
|
jpayne@68
|
687 // assert(false) : names.get(i);
|
jpayne@68
|
688 // }
|
jpayne@68
|
689 }else{
|
jpayne@68
|
690 //Sometimes the species gets renamed.
|
jpayne@68
|
691 // TaxNode tn=tree.getNodeByName(name);
|
jpayne@68
|
692 // if(tn==null || tn.id==serverIds[i]) {System.err.println(name+", "+serverIds[i]+", "+tn+", "+tree.getNodesByName(name));}
|
jpayne@68
|
693 }
|
jpayne@68
|
694 }
|
jpayne@68
|
695 }
|
jpayne@68
|
696
|
jpayne@68
|
697 for(int lineNum=0, serverNum=0; lineNum<=lines.size(); lineNum++){
|
jpayne@68
|
698 byte[] line=lines.get(lineNum);
|
jpayne@68
|
699 if(line[0]!='#' && !Tools.startsWith(line, "tid")){
|
jpayne@68
|
700 bb.clear();
|
jpayne@68
|
701 final int tid=serverIds[serverNum];
|
jpayne@68
|
702 if(tid<0){
|
jpayne@68
|
703 //WARN
|
jpayne@68
|
704 if(bswBadHeaders!=null){
|
jpayne@68
|
705 bswBadHeaders.print(tid).tab();
|
jpayne@68
|
706 bswBadHeaders.print(looksLikeRealAccession(line)).tab();
|
jpayne@68
|
707 bswBadHeaders.println(line);
|
jpayne@68
|
708 }else if(warnBadHeaders){
|
jpayne@68
|
709 System.err.println(tid+"\t"+looksLikeRealAccession(line)+"\t"+new String(line));
|
jpayne@68
|
710 }
|
jpayne@68
|
711 }
|
jpayne@68
|
712
|
jpayne@68
|
713 bb.append("tid|");
|
jpayne@68
|
714 bb.append(tid);
|
jpayne@68
|
715 if(prefix){
|
jpayne@68
|
716 bb.append('|');
|
jpayne@68
|
717 bb.append(line);
|
jpayne@68
|
718 }else if(counts!=null && tid>=0){
|
jpayne@68
|
719 bb.append('|');
|
jpayne@68
|
720 int count=counts.increment(tid, 1);
|
jpayne@68
|
721 bb.append(count);
|
jpayne@68
|
722 if(count==1){taxaCounted++;}
|
jpayne@68
|
723 }
|
jpayne@68
|
724
|
jpayne@68
|
725 lines.set(lineNum, bb.toBytes());
|
jpayne@68
|
726
|
jpayne@68
|
727 serverNum++;
|
jpayne@68
|
728 if(serverNum>=serverIds.length){break;}
|
jpayne@68
|
729 }
|
jpayne@68
|
730 }
|
jpayne@68
|
731 if(maxInvalidHeaders>=0 && invalidReads>maxInvalidHeaders){
|
jpayne@68
|
732 KillSwitch.kill("Maximum bad headers exceeded: "+maxInvalidHeaders);
|
jpayne@68
|
733 }
|
jpayne@68
|
734 }
|
jpayne@68
|
735
|
jpayne@68
|
736 private void processInner_server(ByteFile bf, ByteStreamWriter bsw, ByteStreamWriter bswInvalid, ByteStreamWriter bswBadHeaders, HashArray1D counts, int format){
|
jpayne@68
|
737
|
jpayne@68
|
738 ArrayList<byte[]> lines=new ArrayList<byte[]>();
|
jpayne@68
|
739 byte[] line=bf.nextLine();
|
jpayne@68
|
740 boolean valid=false;
|
jpayne@68
|
741 long storedBytes=0;
|
jpayne@68
|
742
|
jpayne@68
|
743 while(line!=null){
|
jpayne@68
|
744
|
jpayne@68
|
745 if(line.length>0){
|
jpayne@68
|
746 linesIn++;
|
jpayne@68
|
747 lines.add(line);
|
jpayne@68
|
748 storedBytes+=line.length;
|
jpayne@68
|
749 if(storedBytes>=maxStoredBytes){
|
jpayne@68
|
750 updateHeadersFromServer(lines, counts, bswBadHeaders, format);
|
jpayne@68
|
751 valid=dumpBuffer(lines, valid, bsw, bswInvalid);
|
jpayne@68
|
752 lines=new ArrayList<byte[]>();
|
jpayne@68
|
753 storedBytes=0;
|
jpayne@68
|
754 if(deleteInvalid && invalidReads>0){
|
jpayne@68
|
755 System.err.println("Invalid sequence detected; aborting.\n"
|
jpayne@68
|
756 + "Input file: \t"+bf.name()+"\n"
|
jpayne@68
|
757 + "Output file: \t"+(bsw==null ? "null" : bsw.fname)+"\n"
|
jpayne@68
|
758 + "Line: \t"+new String(line)+"\n");
|
jpayne@68
|
759 break;
|
jpayne@68
|
760 }
|
jpayne@68
|
761 }
|
jpayne@68
|
762 }
|
jpayne@68
|
763 line=bf.nextLine();
|
jpayne@68
|
764 }
|
jpayne@68
|
765
|
jpayne@68
|
766 if(storedBytes>0){
|
jpayne@68
|
767 updateHeadersFromServer(lines, counts, bswBadHeaders, format);
|
jpayne@68
|
768 valid=dumpBuffer(lines, valid, bsw, bswInvalid);
|
jpayne@68
|
769 lines=new ArrayList<byte[]>();
|
jpayne@68
|
770 storedBytes=0;
|
jpayne@68
|
771 }
|
jpayne@68
|
772
|
jpayne@68
|
773 errorState|=bf.close();
|
jpayne@68
|
774 }
|
jpayne@68
|
775
|
jpayne@68
|
776 private boolean dumpBuffer(ArrayList<byte[]> lines, boolean valid, ByteStreamWriter bsw, ByteStreamWriter bswInvalid){
|
jpayne@68
|
777
|
jpayne@68
|
778 for(byte[] line : lines){
|
jpayne@68
|
779
|
jpayne@68
|
780 if(line.length>0 && line[0]=='>'){
|
jpayne@68
|
781 readsProcessed++;
|
jpayne@68
|
782 if(maxReads>0 && readsProcessed>maxReads){break;}
|
jpayne@68
|
783
|
jpayne@68
|
784 if(Tools.startsWith(line, invalidTitle)){
|
jpayne@68
|
785 valid=false;
|
jpayne@68
|
786 invalidReads++;
|
jpayne@68
|
787 invalidLines++;
|
jpayne@68
|
788 if(deleteInvalid){break;}
|
jpayne@68
|
789 }else{
|
jpayne@68
|
790 assert(Tools.startsWith(line, title));
|
jpayne@68
|
791 valid=true;
|
jpayne@68
|
792 validReads++;
|
jpayne@68
|
793 validLines++;
|
jpayne@68
|
794 }
|
jpayne@68
|
795 }else if(gffIn){
|
jpayne@68
|
796 basesProcessed+=line.length;
|
jpayne@68
|
797 valid=!Tools.startsWith(line, invalidGffTitle);
|
jpayne@68
|
798 if(valid){
|
jpayne@68
|
799 validBases+=line.length;
|
jpayne@68
|
800 validLines++;
|
jpayne@68
|
801 }else{
|
jpayne@68
|
802 invalidBases+=line.length;
|
jpayne@68
|
803 invalidLines++;
|
jpayne@68
|
804 }
|
jpayne@68
|
805 }else{
|
jpayne@68
|
806 basesProcessed+=line.length;
|
jpayne@68
|
807 if(valid){
|
jpayne@68
|
808 validBases+=line.length;
|
jpayne@68
|
809 validLines++;
|
jpayne@68
|
810 }else{
|
jpayne@68
|
811 invalidBases+=line.length;
|
jpayne@68
|
812 invalidLines++;
|
jpayne@68
|
813 }
|
jpayne@68
|
814 }
|
jpayne@68
|
815
|
jpayne@68
|
816 if(valid || keepAll){
|
jpayne@68
|
817 if(bsw!=null){bsw.println(line);}
|
jpayne@68
|
818 }else{
|
jpayne@68
|
819 if(bswInvalid!=null){bswInvalid.println(line);}
|
jpayne@68
|
820 }
|
jpayne@68
|
821 }
|
jpayne@68
|
822 return valid;
|
jpayne@68
|
823 }
|
jpayne@68
|
824
|
jpayne@68
|
825 /*--------------------------------------------------------------*/
|
jpayne@68
|
826
|
jpayne@68
|
827
|
jpayne@68
|
828 /*--------------------------------------------------------------*/
|
jpayne@68
|
829
|
jpayne@68
|
830 private LinkedHashSet<String> in1=new LinkedHashSet<String>();
|
jpayne@68
|
831 private String out1=null;
|
jpayne@68
|
832 private String outInvalid=null;
|
jpayne@68
|
833 private String badHeaders=null;
|
jpayne@68
|
834
|
jpayne@68
|
835 private String taxTreeFile=null;
|
jpayne@68
|
836 private String giTableFile=null;
|
jpayne@68
|
837 private String accessionFile=null;
|
jpayne@68
|
838 private String patternFile=null;
|
jpayne@68
|
839
|
jpayne@68
|
840 /*--------------------------------------------------------------*/
|
jpayne@68
|
841
|
jpayne@68
|
842 private long maxReads=-1;
|
jpayne@68
|
843
|
jpayne@68
|
844 private long validReads=0;
|
jpayne@68
|
845 private long validBases=0;
|
jpayne@68
|
846 private long invalidReads=0;
|
jpayne@68
|
847 private long invalidBases=0;
|
jpayne@68
|
848 private long taxaCounted=0;
|
jpayne@68
|
849
|
jpayne@68
|
850 private long linesIn=0;
|
jpayne@68
|
851 private long validLines=0;
|
jpayne@68
|
852 private long invalidLines=0;
|
jpayne@68
|
853
|
jpayne@68
|
854 private long maxStoredBytes=10000000;
|
jpayne@68
|
855
|
jpayne@68
|
856 private long readsProcessed=0, basesProcessed=0;
|
jpayne@68
|
857
|
jpayne@68
|
858 private boolean prefix=true;
|
jpayne@68
|
859 private boolean countTable=true;
|
jpayne@68
|
860 private boolean keepAll=true;
|
jpayne@68
|
861 private boolean shrinkNames=false;
|
jpayne@68
|
862 private boolean warnBadHeaders=true;
|
jpayne@68
|
863 private boolean useServer=false;
|
jpayne@68
|
864 /** Crash if the number of invalid headers exceeds this */
|
jpayne@68
|
865 private long maxInvalidHeaders=-1;
|
jpayne@68
|
866 /** Delete the output file if there are any invalid headers */
|
jpayne@68
|
867 private boolean deleteInvalid=false;
|
jpayne@68
|
868
|
jpayne@68
|
869 private int mode;
|
jpayne@68
|
870 private static final int ACCESSION_MODE=0, GI_MODE=1, HEADER_MODE=2, UNITE_MODE=3;
|
jpayne@68
|
871
|
jpayne@68
|
872 private boolean gffIn=false;
|
jpayne@68
|
873
|
jpayne@68
|
874 /*--------------------------------------------------------------*/
|
jpayne@68
|
875
|
jpayne@68
|
876 private final ArrayList<FileFormat> ffin1;
|
jpayne@68
|
877 private final FileFormat ffout1;
|
jpayne@68
|
878 private final FileFormat ffoutInvalid;
|
jpayne@68
|
879 private final TaxTree tree;
|
jpayne@68
|
880
|
jpayne@68
|
881 /*--------------------------------------------------------------*/
|
jpayne@68
|
882
|
jpayne@68
|
883 private PrintStream outstream=System.err;
|
jpayne@68
|
884 public static boolean verbose=false;
|
jpayne@68
|
885 public boolean errorState=false;
|
jpayne@68
|
886 private boolean overwrite=false;
|
jpayne@68
|
887 private boolean append=false;
|
jpayne@68
|
888
|
jpayne@68
|
889 private static byte[] title=">tid|".getBytes();
|
jpayne@68
|
890 private static byte[] invalidTitle=">tid|-1".getBytes();
|
jpayne@68
|
891 private static byte[] invalidGffTitle="tid|-1".getBytes();
|
jpayne@68
|
892
|
jpayne@68
|
893 }
|