Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/AnalyzeAccession.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package tax; | |
2 | |
3 import java.io.File; | |
4 import java.io.PrintStream; | |
5 import java.util.ArrayList; | |
6 import java.util.Arrays; | |
7 import java.util.Collections; | |
8 import java.util.HashMap; | |
9 import java.util.Locale; | |
10 import java.util.Map.Entry; | |
11 | |
12 import fileIO.ByteFile; | |
13 import fileIO.ByteFile1; | |
14 import fileIO.ByteFile2; | |
15 import fileIO.ByteStreamWriter; | |
16 import fileIO.FileFormat; | |
17 import fileIO.ReadWrite; | |
18 import fileIO.TextFile; | |
19 import shared.Parse; | |
20 import shared.Parser; | |
21 import shared.PreParser; | |
22 import shared.Shared; | |
23 import shared.Timer; | |
24 import shared.Tools; | |
25 import stream.ConcurrentGenericReadInputStream; | |
26 import stream.FastaReadInputStream; | |
27 import structures.ByteBuilder; | |
28 import structures.ListNum; | |
29 import structures.StringNum; | |
30 import template.Accumulator; | |
31 import template.ThreadWaiter; | |
32 | |
33 /** | |
34 * Counts patterns in Accessions. | |
35 * Handles hashing for Accession to TaxID lookups. | |
36 * @author Brian Bushnell | |
37 * @date May 9, 2018 | |
38 * | |
39 */ | |
40 public class AnalyzeAccession implements Accumulator<AnalyzeAccession.ProcessThread> { | |
41 | |
42 public static void main(String[] args){ | |
43 //Start a timer immediately upon code entrance. | |
44 Timer t=new Timer(); | |
45 | |
46 //Create an instance of this class | |
47 AnalyzeAccession x=new AnalyzeAccession(args); | |
48 | |
49 //Run the object | |
50 x.process(t); | |
51 | |
52 //Close the print stream if it was redirected | |
53 Shared.closeStream(x.outstream); | |
54 } | |
55 | |
56 public AnalyzeAccession(String[] args){ | |
57 | |
58 {//Preparse block for help, config files, and outstream | |
59 PreParser pp=new PreParser(args, getClass(), false); | |
60 args=pp.args; | |
61 outstream=pp.outstream; | |
62 } | |
63 | |
64 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; | |
65 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); | |
66 | |
67 Parser parser=new Parser(); | |
68 for(int i=0; i<args.length; i++){ | |
69 String arg=args[i]; | |
70 String[] split=arg.split("="); | |
71 String a=split[0].toLowerCase(); | |
72 String b=split.length>1 ? split[1] : null; | |
73 | |
74 if(a.equals("verbose")){ | |
75 verbose=Parse.parseBoolean(b); | |
76 ByteFile1.verbose=verbose; | |
77 ByteFile2.verbose=verbose; | |
78 stream.FastaReadInputStream.verbose=verbose; | |
79 ConcurrentGenericReadInputStream.verbose=verbose; | |
80 stream.FastqReadInputStream.verbose=verbose; | |
81 ReadWrite.verbose=verbose; | |
82 }else if(a.equals("in")){ | |
83 if(b==null){in.clear();} | |
84 else{ | |
85 String[] split2=b.split(","); | |
86 for(String s2 : split2){ | |
87 in.add(s2); | |
88 } | |
89 } | |
90 }else if(a.equals("perfile")){ | |
91 perFile=Parse.parseBoolean(b); | |
92 }else if(b==null && new File(arg).exists()){ | |
93 in.add(arg); | |
94 }else if(parser.parse(arg, a, b)){ | |
95 //do nothing | |
96 }else{ | |
97 outstream.println("Unknown parameter "+args[i]); | |
98 assert(false) : "Unknown parameter "+args[i]; | |
99 // throw new RuntimeException("Unknown parameter "+args[i]); | |
100 } | |
101 } | |
102 | |
103 {//Process parser fields | |
104 overwrite=parser.overwrite; | |
105 append=parser.append; | |
106 | |
107 out=parser.out1; | |
108 } | |
109 | |
110 assert(FastaReadInputStream.settingsOK()); | |
111 | |
112 if(in==null){throw new RuntimeException("Error - at least one input file is required.");} | |
113 | |
114 // if(!ByteFile.FORCE_MODE_BF2){ | |
115 // ByteFile.FORCE_MODE_BF2=false; | |
116 // ByteFile.FORCE_MODE_BF1=true; | |
117 // } | |
118 | |
119 if(out!=null && out.equalsIgnoreCase("null")){out=null;} | |
120 | |
121 if(!Tools.testOutputFiles(overwrite, append, false, out)){ | |
122 outstream.println((out==null)+", "+out); | |
123 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out+"\n"); | |
124 } | |
125 | |
126 ffout=FileFormat.testOutput(out, FileFormat.TXT, null, true, overwrite, append, false); | |
127 ffina=new FileFormat[in.size()]; | |
128 for(int i=0; i<in.size(); i++){ | |
129 ffina[i]=FileFormat.testInput(in.get(i), FileFormat.TXT, null, true, false); | |
130 } | |
131 } | |
132 | |
133 void process(Timer t){ | |
134 | |
135 if(perFile) { | |
136 process_perFile(); | |
137 }else{ | |
138 for(FileFormat ffin : ffina){ | |
139 process_inner(ffin); | |
140 } | |
141 } | |
142 | |
143 if(ffout!=null){ | |
144 ByteStreamWriter bsw=new ByteStreamWriter(ffout); | |
145 bsw.println("#Pattern\tCount\tCombos\tBits"); | |
146 ArrayList<StringNum> list=new ArrayList<StringNum>(); | |
147 list.addAll(countMap.values()); | |
148 Collections.sort(list); | |
149 Collections.reverse(list); | |
150 for(StringNum sn : list){ | |
151 double combos=1; | |
152 for(int i=0; i<sn.s.length(); i++){ | |
153 char c=sn.s.charAt(i); | |
154 if(c=='D'){combos*=10;} | |
155 else if(c=='L'){combos*=26;} | |
156 } | |
157 bsw.print(sn.toString().getBytes()); | |
158 bsw.println("\t"+(long)combos+"\t"+String.format(Locale.ROOT, "%.2f", Tools.log2(combos))); | |
159 } | |
160 bsw.start(); | |
161 errorState|=bsw.poisonAndWait(); | |
162 } | |
163 | |
164 t.stop(); | |
165 | |
166 outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8)); | |
167 | |
168 outstream.println(); | |
169 outstream.println("Valid Lines: \t"+linesOut); | |
170 outstream.println("Invalid Lines: \t"+(linesProcessed-linesOut)); | |
171 | |
172 if(errorState){ | |
173 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); | |
174 } | |
175 } | |
176 | |
177 void process_inner(FileFormat ffin){ | |
178 | |
179 ByteFile bf=ByteFile.makeByteFile(ffin); | |
180 | |
181 final int threads=Tools.min(8, Shared.threads()); | |
182 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); | |
183 for(int i=0; i<threads; i++){alpt.add(new ProcessThread(bf));} | |
184 boolean success=ThreadWaiter.startAndWait(alpt, this); | |
185 errorState|=!success; | |
186 } | |
187 | |
188 | |
189 void process_perFile(){ | |
190 ArrayList<ArrayList<ProcessThread>> perFileList=new ArrayList<ArrayList<ProcessThread>>(ffina.length); | |
191 for(FileFormat ffin : ffina) { | |
192 ByteFile bf=ByteFile.makeByteFile(ffin); | |
193 | |
194 final int threads=Tools.min(16, Shared.threads()); | |
195 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); | |
196 for(int i=0; i<threads; i++){alpt.add(new ProcessThread(bf));} | |
197 perFileList.add(alpt); | |
198 ThreadWaiter.startThreads(alpt); | |
199 } | |
200 for(ArrayList<ProcessThread> alpt : perFileList){ | |
201 boolean success=ThreadWaiter.waitForThreads(alpt, this); | |
202 errorState|=!success; | |
203 } | |
204 } | |
205 | |
206 /*--------------------------------------------------------------*/ | |
207 | |
208 static class ProcessThread extends Thread { | |
209 | |
210 ProcessThread(ByteFile bf_){ | |
211 bf=bf_; | |
212 } | |
213 | |
214 @Override | |
215 public void run() { | |
216 final StringBuilder buffer=new StringBuilder(128); | |
217 for(ListNum<byte[]> lines=bf.nextList(); lines!=null; lines=bf.nextList()){ | |
218 assert(lines.size()>0); | |
219 if(lines.id==0){ | |
220 //This one is not really important; the header could be missing. | |
221 assert(Tools.startsWith(lines.get(0), "accession")) : bf.name()+"[0]: "+new String(lines.get(0)); | |
222 }else{ | |
223 assert(!Tools.startsWith(lines.get(0), "accession")) : bf.name()+"["+lines.id+"]: "+new String(lines.get(0)); | |
224 } | |
225 for(byte[] line : lines){ | |
226 if(line.length>0){ | |
227 linesProcessedT++; | |
228 bytesProcessedT+=(line.length+1); | |
229 | |
230 boolean valid=lines.id>0 || !(Tools.startsWith(line, "accession")); //Skips test for most lines | |
231 | |
232 if(valid){ | |
233 linesOutT++; | |
234 increment(line, buffer); | |
235 } | |
236 } | |
237 } | |
238 } | |
239 } | |
240 | |
241 void increment(byte[] line, StringBuilder buffer){ | |
242 buffer.setLength(0); | |
243 for(int i=0; i<line.length; i++){ | |
244 final byte b=line[i]; | |
245 if(b==' ' || b=='\t' || b=='.' || b==':'){break;} | |
246 final char b2=(char)remap[b]; | |
247 assert(b2!='?' || b=='+') : "unprocessed symbol in "+new String(line)+"\n"+"'"+(char)b+"'"; | |
248 buffer.append(b2); | |
249 } | |
250 String key=buffer.toString(); | |
251 StringNum value=countMapT.get(key); | |
252 if(value!=null){value.increment();} | |
253 else{countMapT.put(key, new StringNum(key, 1));} | |
254 } | |
255 | |
256 private HashMap<String, StringNum> countMapT=new HashMap<String, StringNum>(); | |
257 private final ByteFile bf; | |
258 long linesProcessedT=0; | |
259 long linesOutT=0; | |
260 long bytesProcessedT=0; | |
261 | |
262 } | |
263 | |
264 /*--------------------------------------------------------------*/ | |
265 | |
266 @Override | |
267 public void accumulate(ProcessThread t) { | |
268 linesProcessed+=t.linesProcessedT; | |
269 linesOut+=t.linesOutT; | |
270 bytesProcessed+=t.bytesProcessedT; | |
271 for(Entry<String, StringNum> e : t.countMapT.entrySet()){ | |
272 StringNum value=e.getValue(); | |
273 final String key=e.getKey(); | |
274 StringNum old=countMap.get(key); | |
275 if(old==null){countMap.put(key, value);} | |
276 else{old.add(value);} | |
277 } | |
278 } | |
279 | |
280 @Override | |
281 public boolean success() { | |
282 return !errorState; | |
283 } | |
284 | |
285 /*--------------------------------------------------------------*/ | |
286 | |
287 public static long combos(String s){ | |
288 double combos=1; | |
289 for(int i=0; i<s.length(); i++){ | |
290 char c=s.charAt(i); | |
291 if(c=='D'){combos*=10;} | |
292 else if(c=='L'){combos*=26;} | |
293 } | |
294 return (combos>=Long.MAX_VALUE ? Long.MAX_VALUE : (long)Math.ceil(combos)); | |
295 } | |
296 | |
297 public static long combos(byte[] s){ | |
298 double combos=1; | |
299 for(int i=0; i<s.length; i++){ | |
300 byte c=s[i]; | |
301 if(c=='D'){combos*=10;} | |
302 else if(c=='L'){combos*=26;} | |
303 } | |
304 return (combos>=Long.MAX_VALUE ? -1 : (long)Math.ceil(combos)); | |
305 } | |
306 | |
307 /*--------------------------------------------------------------*/ | |
308 | |
309 public static HashMap<String, Integer> loadCodeMap(String fname){ | |
310 assert(codeMap==null); | |
311 TextFile tf=new TextFile(fname); | |
312 ArrayList<String> list=new ArrayList<String>(); | |
313 for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){ | |
314 if(!line.startsWith("#")){ | |
315 String[] split=line.split("\t"); | |
316 list.add(split[0]); | |
317 } | |
318 } | |
319 HashMap<String, Integer> map=new HashMap<String, Integer>(list.size()*3); | |
320 codeBits=(int)Math.ceil(Tools.log2(list.size())); | |
321 final int patternBits=63-codeBits; | |
322 final long maxCombos=((1L<<(patternBits-1))-1); | |
323 for(int i=0; i<list.size(); i++){ | |
324 String s=list.get(i); | |
325 longestPattern=Tools.max(longestPattern, s.length()); | |
326 long combos=combos(s); | |
327 if(combos<0 || combos>=maxCombos){map.put(s, -1);} | |
328 else{map.put(s, i);} | |
329 } | |
330 codeMap=map; | |
331 return map; | |
332 } | |
333 | |
334 public static long digitize(String s){ | |
335 String pattern=remap(s); | |
336 Integer code=codeMap.get(pattern); | |
337 if(code==null){return -2;} | |
338 if(code.intValue()<0){return -1;} | |
339 | |
340 long number=0; | |
341 for(int i=0; i<pattern.length(); i++){ | |
342 char c=s.charAt(i); | |
343 char p=pattern.charAt(i); | |
344 if(p=='-' || p=='?'){ | |
345 //do nothing | |
346 }else if(p=='D'){ | |
347 number=(number*10)+(c-'0'); | |
348 }else if(p=='L'){ | |
349 number=(number*26)+(Tools.toUpperCase(c)-'A'); | |
350 }else{ | |
351 assert(false) : s; | |
352 } | |
353 } | |
354 number=(number<<codeBits)+code; | |
355 return number; | |
356 } | |
357 | |
358 public static long digitize(byte[] s){ | |
359 String pattern=remap(s); | |
360 Integer code=codeMap.get(pattern); | |
361 if(code==null){return -2;} | |
362 if(code.intValue()<0){return -1;} | |
363 | |
364 long number=0; | |
365 for(int i=0; i<pattern.length(); i++){ | |
366 byte c=s[i]; | |
367 char p=pattern.charAt(i); | |
368 if(p=='-' || p=='?'){ | |
369 //do nothing | |
370 }else if(p=='D'){ | |
371 number=(number*10)+(c-'0'); | |
372 }else if(p=='L'){ | |
373 number=(number*26)+(Tools.toUpperCase(c)-'A'); | |
374 }else{ | |
375 assert(false) : new String(s); | |
376 } | |
377 } | |
378 number=(number<<codeBits)+code; | |
379 return number; | |
380 } | |
381 | |
382 public static String remap(String s){ | |
383 if(s==null || s.length()<1){return "";} | |
384 ByteBuilder buffer=new ByteBuilder(s.length()); | |
385 for(int i=0; i<s.length(); i++){ | |
386 final char b=s.charAt(i); | |
387 if(b==' ' || b=='\t' || b=='.' || b==':'){break;} | |
388 buffer.append((char)remap[b]); | |
389 } | |
390 return buffer.toString(); | |
391 } | |
392 | |
393 public static String remap(byte[] s){ | |
394 ByteBuilder buffer=new ByteBuilder(s.length); | |
395 for(int i=0; i<s.length; i++){ | |
396 final byte b=s[i]; | |
397 if(b==' ' || b=='\t' || b=='.' || b==':'){break;} | |
398 buffer.append((char)remap[b]); | |
399 } | |
400 return buffer.toString(); | |
401 } | |
402 | |
403 /*--------------------------------------------------------------*/ | |
404 | |
405 private ArrayList<String> in=new ArrayList<String>(); | |
406 private String out=null; | |
407 private boolean perFile=true; | |
408 | |
409 /*--------------------------------------------------------------*/ | |
410 | |
411 private HashMap<String, StringNum> countMap=new HashMap<String, StringNum>(); | |
412 public static HashMap<String, Integer> codeMap; | |
413 private static int codeBits=-1; | |
414 private static int longestPattern=-1; | |
415 | |
416 private long linesProcessed=0; | |
417 private long linesOut=0; | |
418 private long bytesProcessed=0; | |
419 private long bytesOut=0; | |
420 | |
421 /*--------------------------------------------------------------*/ | |
422 | |
423 private final FileFormat[] ffina; | |
424 private final FileFormat ffout; | |
425 | |
426 private static final byte[] remap=makeRemap(); | |
427 | |
428 private static byte[] makeRemap(){ | |
429 byte[] array=new byte[128]; | |
430 Arrays.fill(array, (byte)'?'); | |
431 for(int i='A'; i<='Z'; i++){array[i]='L';} | |
432 for(int i='a'; i<='z'; i++){array[i]='L';} | |
433 for(int i='0'; i<='9'; i++){array[i]='D';} | |
434 array['_']=array['-']='-'; | |
435 return array; | |
436 } | |
437 | |
438 /*--------------------------------------------------------------*/ | |
439 | |
440 private PrintStream outstream=System.err; | |
441 public static boolean verbose=false; | |
442 public boolean errorState=false; | |
443 private boolean overwrite=false; | |
444 private boolean append=false; | |
445 | |
446 } |