comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/AnalyzeAccession.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package tax;
2
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.Collections;
8 import java.util.HashMap;
9 import java.util.Locale;
10 import java.util.Map.Entry;
11
12 import fileIO.ByteFile;
13 import fileIO.ByteFile1;
14 import fileIO.ByteFile2;
15 import fileIO.ByteStreamWriter;
16 import fileIO.FileFormat;
17 import fileIO.ReadWrite;
18 import fileIO.TextFile;
19 import shared.Parse;
20 import shared.Parser;
21 import shared.PreParser;
22 import shared.Shared;
23 import shared.Timer;
24 import shared.Tools;
25 import stream.ConcurrentGenericReadInputStream;
26 import stream.FastaReadInputStream;
27 import structures.ByteBuilder;
28 import structures.ListNum;
29 import structures.StringNum;
30 import template.Accumulator;
31 import template.ThreadWaiter;
32
33 /**
34 * Counts patterns in Accessions.
35 * Handles hashing for Accession to TaxID lookups.
36 * @author Brian Bushnell
37 * @date May 9, 2018
38 *
39 */
40 public class AnalyzeAccession implements Accumulator<AnalyzeAccession.ProcessThread> {
41
42 public static void main(String[] args){
43 //Start a timer immediately upon code entrance.
44 Timer t=new Timer();
45
46 //Create an instance of this class
47 AnalyzeAccession x=new AnalyzeAccession(args);
48
49 //Run the object
50 x.process(t);
51
52 //Close the print stream if it was redirected
53 Shared.closeStream(x.outstream);
54 }
55
56 public AnalyzeAccession(String[] args){
57
58 {//Preparse block for help, config files, and outstream
59 PreParser pp=new PreParser(args, getClass(), false);
60 args=pp.args;
61 outstream=pp.outstream;
62 }
63
64 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
65 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
66
67 Parser parser=new Parser();
68 for(int i=0; i<args.length; i++){
69 String arg=args[i];
70 String[] split=arg.split("=");
71 String a=split[0].toLowerCase();
72 String b=split.length>1 ? split[1] : null;
73
74 if(a.equals("verbose")){
75 verbose=Parse.parseBoolean(b);
76 ByteFile1.verbose=verbose;
77 ByteFile2.verbose=verbose;
78 stream.FastaReadInputStream.verbose=verbose;
79 ConcurrentGenericReadInputStream.verbose=verbose;
80 stream.FastqReadInputStream.verbose=verbose;
81 ReadWrite.verbose=verbose;
82 }else if(a.equals("in")){
83 if(b==null){in.clear();}
84 else{
85 String[] split2=b.split(",");
86 for(String s2 : split2){
87 in.add(s2);
88 }
89 }
90 }else if(a.equals("perfile")){
91 perFile=Parse.parseBoolean(b);
92 }else if(b==null && new File(arg).exists()){
93 in.add(arg);
94 }else if(parser.parse(arg, a, b)){
95 //do nothing
96 }else{
97 outstream.println("Unknown parameter "+args[i]);
98 assert(false) : "Unknown parameter "+args[i];
99 // throw new RuntimeException("Unknown parameter "+args[i]);
100 }
101 }
102
103 {//Process parser fields
104 overwrite=parser.overwrite;
105 append=parser.append;
106
107 out=parser.out1;
108 }
109
110 assert(FastaReadInputStream.settingsOK());
111
112 if(in==null){throw new RuntimeException("Error - at least one input file is required.");}
113
114 // if(!ByteFile.FORCE_MODE_BF2){
115 // ByteFile.FORCE_MODE_BF2=false;
116 // ByteFile.FORCE_MODE_BF1=true;
117 // }
118
119 if(out!=null && out.equalsIgnoreCase("null")){out=null;}
120
121 if(!Tools.testOutputFiles(overwrite, append, false, out)){
122 outstream.println((out==null)+", "+out);
123 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out+"\n");
124 }
125
126 ffout=FileFormat.testOutput(out, FileFormat.TXT, null, true, overwrite, append, false);
127 ffina=new FileFormat[in.size()];
128 for(int i=0; i<in.size(); i++){
129 ffina[i]=FileFormat.testInput(in.get(i), FileFormat.TXT, null, true, false);
130 }
131 }
132
133 void process(Timer t){
134
135 if(perFile) {
136 process_perFile();
137 }else{
138 for(FileFormat ffin : ffina){
139 process_inner(ffin);
140 }
141 }
142
143 if(ffout!=null){
144 ByteStreamWriter bsw=new ByteStreamWriter(ffout);
145 bsw.println("#Pattern\tCount\tCombos\tBits");
146 ArrayList<StringNum> list=new ArrayList<StringNum>();
147 list.addAll(countMap.values());
148 Collections.sort(list);
149 Collections.reverse(list);
150 for(StringNum sn : list){
151 double combos=1;
152 for(int i=0; i<sn.s.length(); i++){
153 char c=sn.s.charAt(i);
154 if(c=='D'){combos*=10;}
155 else if(c=='L'){combos*=26;}
156 }
157 bsw.print(sn.toString().getBytes());
158 bsw.println("\t"+(long)combos+"\t"+String.format(Locale.ROOT, "%.2f", Tools.log2(combos)));
159 }
160 bsw.start();
161 errorState|=bsw.poisonAndWait();
162 }
163
164 t.stop();
165
166 outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8));
167
168 outstream.println();
169 outstream.println("Valid Lines: \t"+linesOut);
170 outstream.println("Invalid Lines: \t"+(linesProcessed-linesOut));
171
172 if(errorState){
173 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
174 }
175 }
176
177 void process_inner(FileFormat ffin){
178
179 ByteFile bf=ByteFile.makeByteFile(ffin);
180
181 final int threads=Tools.min(8, Shared.threads());
182 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
183 for(int i=0; i<threads; i++){alpt.add(new ProcessThread(bf));}
184 boolean success=ThreadWaiter.startAndWait(alpt, this);
185 errorState|=!success;
186 }
187
188
189 void process_perFile(){
190 ArrayList<ArrayList<ProcessThread>> perFileList=new ArrayList<ArrayList<ProcessThread>>(ffina.length);
191 for(FileFormat ffin : ffina) {
192 ByteFile bf=ByteFile.makeByteFile(ffin);
193
194 final int threads=Tools.min(16, Shared.threads());
195 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
196 for(int i=0; i<threads; i++){alpt.add(new ProcessThread(bf));}
197 perFileList.add(alpt);
198 ThreadWaiter.startThreads(alpt);
199 }
200 for(ArrayList<ProcessThread> alpt : perFileList){
201 boolean success=ThreadWaiter.waitForThreads(alpt, this);
202 errorState|=!success;
203 }
204 }
205
206 /*--------------------------------------------------------------*/
207
208 static class ProcessThread extends Thread {
209
210 ProcessThread(ByteFile bf_){
211 bf=bf_;
212 }
213
214 @Override
215 public void run() {
216 final StringBuilder buffer=new StringBuilder(128);
217 for(ListNum<byte[]> lines=bf.nextList(); lines!=null; lines=bf.nextList()){
218 assert(lines.size()>0);
219 if(lines.id==0){
220 //This one is not really important; the header could be missing.
221 assert(Tools.startsWith(lines.get(0), "accession")) : bf.name()+"[0]: "+new String(lines.get(0));
222 }else{
223 assert(!Tools.startsWith(lines.get(0), "accession")) : bf.name()+"["+lines.id+"]: "+new String(lines.get(0));
224 }
225 for(byte[] line : lines){
226 if(line.length>0){
227 linesProcessedT++;
228 bytesProcessedT+=(line.length+1);
229
230 boolean valid=lines.id>0 || !(Tools.startsWith(line, "accession")); //Skips test for most lines
231
232 if(valid){
233 linesOutT++;
234 increment(line, buffer);
235 }
236 }
237 }
238 }
239 }
240
241 void increment(byte[] line, StringBuilder buffer){
242 buffer.setLength(0);
243 for(int i=0; i<line.length; i++){
244 final byte b=line[i];
245 if(b==' ' || b=='\t' || b=='.' || b==':'){break;}
246 final char b2=(char)remap[b];
247 assert(b2!='?' || b=='+') : "unprocessed symbol in "+new String(line)+"\n"+"'"+(char)b+"'";
248 buffer.append(b2);
249 }
250 String key=buffer.toString();
251 StringNum value=countMapT.get(key);
252 if(value!=null){value.increment();}
253 else{countMapT.put(key, new StringNum(key, 1));}
254 }
255
256 private HashMap<String, StringNum> countMapT=new HashMap<String, StringNum>();
257 private final ByteFile bf;
258 long linesProcessedT=0;
259 long linesOutT=0;
260 long bytesProcessedT=0;
261
262 }
263
264 /*--------------------------------------------------------------*/
265
266 @Override
267 public void accumulate(ProcessThread t) {
268 linesProcessed+=t.linesProcessedT;
269 linesOut+=t.linesOutT;
270 bytesProcessed+=t.bytesProcessedT;
271 for(Entry<String, StringNum> e : t.countMapT.entrySet()){
272 StringNum value=e.getValue();
273 final String key=e.getKey();
274 StringNum old=countMap.get(key);
275 if(old==null){countMap.put(key, value);}
276 else{old.add(value);}
277 }
278 }
279
280 @Override
281 public boolean success() {
282 return !errorState;
283 }
284
285 /*--------------------------------------------------------------*/
286
287 public static long combos(String s){
288 double combos=1;
289 for(int i=0; i<s.length(); i++){
290 char c=s.charAt(i);
291 if(c=='D'){combos*=10;}
292 else if(c=='L'){combos*=26;}
293 }
294 return (combos>=Long.MAX_VALUE ? Long.MAX_VALUE : (long)Math.ceil(combos));
295 }
296
297 public static long combos(byte[] s){
298 double combos=1;
299 for(int i=0; i<s.length; i++){
300 byte c=s[i];
301 if(c=='D'){combos*=10;}
302 else if(c=='L'){combos*=26;}
303 }
304 return (combos>=Long.MAX_VALUE ? -1 : (long)Math.ceil(combos));
305 }
306
307 /*--------------------------------------------------------------*/
308
309 public static HashMap<String, Integer> loadCodeMap(String fname){
310 assert(codeMap==null);
311 TextFile tf=new TextFile(fname);
312 ArrayList<String> list=new ArrayList<String>();
313 for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
314 if(!line.startsWith("#")){
315 String[] split=line.split("\t");
316 list.add(split[0]);
317 }
318 }
319 HashMap<String, Integer> map=new HashMap<String, Integer>(list.size()*3);
320 codeBits=(int)Math.ceil(Tools.log2(list.size()));
321 final int patternBits=63-codeBits;
322 final long maxCombos=((1L<<(patternBits-1))-1);
323 for(int i=0; i<list.size(); i++){
324 String s=list.get(i);
325 longestPattern=Tools.max(longestPattern, s.length());
326 long combos=combos(s);
327 if(combos<0 || combos>=maxCombos){map.put(s, -1);}
328 else{map.put(s, i);}
329 }
330 codeMap=map;
331 return map;
332 }
333
334 public static long digitize(String s){
335 String pattern=remap(s);
336 Integer code=codeMap.get(pattern);
337 if(code==null){return -2;}
338 if(code.intValue()<0){return -1;}
339
340 long number=0;
341 for(int i=0; i<pattern.length(); i++){
342 char c=s.charAt(i);
343 char p=pattern.charAt(i);
344 if(p=='-' || p=='?'){
345 //do nothing
346 }else if(p=='D'){
347 number=(number*10)+(c-'0');
348 }else if(p=='L'){
349 number=(number*26)+(Tools.toUpperCase(c)-'A');
350 }else{
351 assert(false) : s;
352 }
353 }
354 number=(number<<codeBits)+code;
355 return number;
356 }
357
358 public static long digitize(byte[] s){
359 String pattern=remap(s);
360 Integer code=codeMap.get(pattern);
361 if(code==null){return -2;}
362 if(code.intValue()<0){return -1;}
363
364 long number=0;
365 for(int i=0; i<pattern.length(); i++){
366 byte c=s[i];
367 char p=pattern.charAt(i);
368 if(p=='-' || p=='?'){
369 //do nothing
370 }else if(p=='D'){
371 number=(number*10)+(c-'0');
372 }else if(p=='L'){
373 number=(number*26)+(Tools.toUpperCase(c)-'A');
374 }else{
375 assert(false) : new String(s);
376 }
377 }
378 number=(number<<codeBits)+code;
379 return number;
380 }
381
382 public static String remap(String s){
383 if(s==null || s.length()<1){return "";}
384 ByteBuilder buffer=new ByteBuilder(s.length());
385 for(int i=0; i<s.length(); i++){
386 final char b=s.charAt(i);
387 if(b==' ' || b=='\t' || b=='.' || b==':'){break;}
388 buffer.append((char)remap[b]);
389 }
390 return buffer.toString();
391 }
392
393 public static String remap(byte[] s){
394 ByteBuilder buffer=new ByteBuilder(s.length);
395 for(int i=0; i<s.length; i++){
396 final byte b=s[i];
397 if(b==' ' || b=='\t' || b=='.' || b==':'){break;}
398 buffer.append((char)remap[b]);
399 }
400 return buffer.toString();
401 }
402
403 /*--------------------------------------------------------------*/
404
405 private ArrayList<String> in=new ArrayList<String>(); //Input file names, from "in=" or from bare arguments that name existing files
406 private String out=null; //Output file name taken from the Parser; the string "null" is converted to null
407 private boolean perFile=true; //When true, process() starts threads for every file before waiting on any of them
408
409 /*--------------------------------------------------------------*/
410
411 private HashMap<String, StringNum> countMap=new HashMap<String, StringNum>(); //Pattern -> count, merged from worker threads in accumulate()
412 public static HashMap<String, Integer> codeMap; //Pattern -> code, or -1 for patterns with too many combinations; assigned by loadCodeMap()
413 private static int codeBits=-1; //Number of low bits digitize() reserves for the code; assigned by loadCodeMap()
414 private static int longestPattern=-1; //Length of the longest pattern seen by loadCodeMap()
415
416 private long linesProcessed=0; //Non-empty lines read, summed over worker threads
417 private long linesOut=0; //Lines whose pattern was counted, summed over worker threads
418 private long bytesProcessed=0; //Sum of (line length + 1) over non-empty lines
419 private long bytesOut=0; //NOTE(review): not referenced anywhere in the visible code
420
421 /*--------------------------------------------------------------*/
422
423 private final FileFormat[] ffina; //One input format per entry of 'in', built in the constructor
424 private final FileFormat ffout; //Output format built in the constructor; process() skips writing when it is null
425
426 private static final byte[] remap=makeRemap();
427
428 private static byte[] makeRemap(){
429 byte[] array=new byte[128];
430 Arrays.fill(array, (byte)'?');
431 for(int i='A'; i<='Z'; i++){array[i]='L';}
432 for(int i='a'; i<='z'; i++){array[i]='L';}
433 for(int i='0'; i<='9'; i++){array[i]='D';}
434 array['_']=array['-']='-';
435 return array;
436 }
437
438 /*--------------------------------------------------------------*/
439
440 private PrintStream outstream=System.err; //Destination for status messages; replaced in the constructor by the PreParser's stream
441 public static boolean verbose=false; //Set by the "verbose" flag, which also copies the value to several file and stream classes
442 public boolean errorState=false; //Set when a thread batch reports failure; also ORed with the writer's poisonAndWait() result
443 private boolean overwrite=false; //Copied from the Parser in the constructor
444 private boolean append=false; //Copied from the Parser in the constructor
445
446 }