annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/AnalyzeAccession.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package tax;
jpayne@68 2
jpayne@68 3 import java.io.File;
jpayne@68 4 import java.io.PrintStream;
jpayne@68 5 import java.util.ArrayList;
jpayne@68 6 import java.util.Arrays;
jpayne@68 7 import java.util.Collections;
jpayne@68 8 import java.util.HashMap;
jpayne@68 9 import java.util.Locale;
jpayne@68 10 import java.util.Map.Entry;
jpayne@68 11
jpayne@68 12 import fileIO.ByteFile;
jpayne@68 13 import fileIO.ByteFile1;
jpayne@68 14 import fileIO.ByteFile2;
jpayne@68 15 import fileIO.ByteStreamWriter;
jpayne@68 16 import fileIO.FileFormat;
jpayne@68 17 import fileIO.ReadWrite;
jpayne@68 18 import fileIO.TextFile;
jpayne@68 19 import shared.Parse;
jpayne@68 20 import shared.Parser;
jpayne@68 21 import shared.PreParser;
jpayne@68 22 import shared.Shared;
jpayne@68 23 import shared.Timer;
jpayne@68 24 import shared.Tools;
jpayne@68 25 import stream.ConcurrentGenericReadInputStream;
jpayne@68 26 import stream.FastaReadInputStream;
jpayne@68 27 import structures.ByteBuilder;
jpayne@68 28 import structures.ListNum;
jpayne@68 29 import structures.StringNum;
jpayne@68 30 import template.Accumulator;
jpayne@68 31 import template.ThreadWaiter;
jpayne@68 32
jpayne@68 33 /**
jpayne@68 34 * Counts patterns in Accessions.
jpayne@68 35 * Handles hashing for Accession to TaxID lookups.
jpayne@68 36 * @author Brian Bushnell
jpayne@68 37 * @date May 9, 2018
jpayne@68 38 *
jpayne@68 39 */
jpayne@68 40 public class AnalyzeAccession implements Accumulator<AnalyzeAccession.ProcessThread> {
jpayne@68 41
jpayne@68 42 public static void main(String[] args){
jpayne@68 43 //Start a timer immediately upon code entrance.
jpayne@68 44 Timer t=new Timer();
jpayne@68 45
jpayne@68 46 //Create an instance of this class
jpayne@68 47 AnalyzeAccession x=new AnalyzeAccession(args);
jpayne@68 48
jpayne@68 49 //Run the object
jpayne@68 50 x.process(t);
jpayne@68 51
jpayne@68 52 //Close the print stream if it was redirected
jpayne@68 53 Shared.closeStream(x.outstream);
jpayne@68 54 }
jpayne@68 55
jpayne@68 56 public AnalyzeAccession(String[] args){
jpayne@68 57
jpayne@68 58 {//Preparse block for help, config files, and outstream
jpayne@68 59 PreParser pp=new PreParser(args, getClass(), false);
jpayne@68 60 args=pp.args;
jpayne@68 61 outstream=pp.outstream;
jpayne@68 62 }
jpayne@68 63
jpayne@68 64 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
jpayne@68 65 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
jpayne@68 66
jpayne@68 67 Parser parser=new Parser();
jpayne@68 68 for(int i=0; i<args.length; i++){
jpayne@68 69 String arg=args[i];
jpayne@68 70 String[] split=arg.split("=");
jpayne@68 71 String a=split[0].toLowerCase();
jpayne@68 72 String b=split.length>1 ? split[1] : null;
jpayne@68 73
jpayne@68 74 if(a.equals("verbose")){
jpayne@68 75 verbose=Parse.parseBoolean(b);
jpayne@68 76 ByteFile1.verbose=verbose;
jpayne@68 77 ByteFile2.verbose=verbose;
jpayne@68 78 stream.FastaReadInputStream.verbose=verbose;
jpayne@68 79 ConcurrentGenericReadInputStream.verbose=verbose;
jpayne@68 80 stream.FastqReadInputStream.verbose=verbose;
jpayne@68 81 ReadWrite.verbose=verbose;
jpayne@68 82 }else if(a.equals("in")){
jpayne@68 83 if(b==null){in.clear();}
jpayne@68 84 else{
jpayne@68 85 String[] split2=b.split(",");
jpayne@68 86 for(String s2 : split2){
jpayne@68 87 in.add(s2);
jpayne@68 88 }
jpayne@68 89 }
jpayne@68 90 }else if(a.equals("perfile")){
jpayne@68 91 perFile=Parse.parseBoolean(b);
jpayne@68 92 }else if(b==null && new File(arg).exists()){
jpayne@68 93 in.add(arg);
jpayne@68 94 }else if(parser.parse(arg, a, b)){
jpayne@68 95 //do nothing
jpayne@68 96 }else{
jpayne@68 97 outstream.println("Unknown parameter "+args[i]);
jpayne@68 98 assert(false) : "Unknown parameter "+args[i];
jpayne@68 99 // throw new RuntimeException("Unknown parameter "+args[i]);
jpayne@68 100 }
jpayne@68 101 }
jpayne@68 102
jpayne@68 103 {//Process parser fields
jpayne@68 104 overwrite=parser.overwrite;
jpayne@68 105 append=parser.append;
jpayne@68 106
jpayne@68 107 out=parser.out1;
jpayne@68 108 }
jpayne@68 109
jpayne@68 110 assert(FastaReadInputStream.settingsOK());
jpayne@68 111
jpayne@68 112 if(in==null){throw new RuntimeException("Error - at least one input file is required.");}
jpayne@68 113
jpayne@68 114 // if(!ByteFile.FORCE_MODE_BF2){
jpayne@68 115 // ByteFile.FORCE_MODE_BF2=false;
jpayne@68 116 // ByteFile.FORCE_MODE_BF1=true;
jpayne@68 117 // }
jpayne@68 118
jpayne@68 119 if(out!=null && out.equalsIgnoreCase("null")){out=null;}
jpayne@68 120
jpayne@68 121 if(!Tools.testOutputFiles(overwrite, append, false, out)){
jpayne@68 122 outstream.println((out==null)+", "+out);
jpayne@68 123 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out+"\n");
jpayne@68 124 }
jpayne@68 125
jpayne@68 126 ffout=FileFormat.testOutput(out, FileFormat.TXT, null, true, overwrite, append, false);
jpayne@68 127 ffina=new FileFormat[in.size()];
jpayne@68 128 for(int i=0; i<in.size(); i++){
jpayne@68 129 ffina[i]=FileFormat.testInput(in.get(i), FileFormat.TXT, null, true, false);
jpayne@68 130 }
jpayne@68 131 }
jpayne@68 132
jpayne@68 133 void process(Timer t){
jpayne@68 134
jpayne@68 135 if(perFile) {
jpayne@68 136 process_perFile();
jpayne@68 137 }else{
jpayne@68 138 for(FileFormat ffin : ffina){
jpayne@68 139 process_inner(ffin);
jpayne@68 140 }
jpayne@68 141 }
jpayne@68 142
jpayne@68 143 if(ffout!=null){
jpayne@68 144 ByteStreamWriter bsw=new ByteStreamWriter(ffout);
jpayne@68 145 bsw.println("#Pattern\tCount\tCombos\tBits");
jpayne@68 146 ArrayList<StringNum> list=new ArrayList<StringNum>();
jpayne@68 147 list.addAll(countMap.values());
jpayne@68 148 Collections.sort(list);
jpayne@68 149 Collections.reverse(list);
jpayne@68 150 for(StringNum sn : list){
jpayne@68 151 double combos=1;
jpayne@68 152 for(int i=0; i<sn.s.length(); i++){
jpayne@68 153 char c=sn.s.charAt(i);
jpayne@68 154 if(c=='D'){combos*=10;}
jpayne@68 155 else if(c=='L'){combos*=26;}
jpayne@68 156 }
jpayne@68 157 bsw.print(sn.toString().getBytes());
jpayne@68 158 bsw.println("\t"+(long)combos+"\t"+String.format(Locale.ROOT, "%.2f", Tools.log2(combos)));
jpayne@68 159 }
jpayne@68 160 bsw.start();
jpayne@68 161 errorState|=bsw.poisonAndWait();
jpayne@68 162 }
jpayne@68 163
jpayne@68 164 t.stop();
jpayne@68 165
jpayne@68 166 outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8));
jpayne@68 167
jpayne@68 168 outstream.println();
jpayne@68 169 outstream.println("Valid Lines: \t"+linesOut);
jpayne@68 170 outstream.println("Invalid Lines: \t"+(linesProcessed-linesOut));
jpayne@68 171
jpayne@68 172 if(errorState){
jpayne@68 173 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
jpayne@68 174 }
jpayne@68 175 }
jpayne@68 176
jpayne@68 177 void process_inner(FileFormat ffin){
jpayne@68 178
jpayne@68 179 ByteFile bf=ByteFile.makeByteFile(ffin);
jpayne@68 180
jpayne@68 181 final int threads=Tools.min(8, Shared.threads());
jpayne@68 182 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
jpayne@68 183 for(int i=0; i<threads; i++){alpt.add(new ProcessThread(bf));}
jpayne@68 184 boolean success=ThreadWaiter.startAndWait(alpt, this);
jpayne@68 185 errorState|=!success;
jpayne@68 186 }
jpayne@68 187
jpayne@68 188
jpayne@68 189 void process_perFile(){
jpayne@68 190 ArrayList<ArrayList<ProcessThread>> perFileList=new ArrayList<ArrayList<ProcessThread>>(ffina.length);
jpayne@68 191 for(FileFormat ffin : ffina) {
jpayne@68 192 ByteFile bf=ByteFile.makeByteFile(ffin);
jpayne@68 193
jpayne@68 194 final int threads=Tools.min(16, Shared.threads());
jpayne@68 195 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
jpayne@68 196 for(int i=0; i<threads; i++){alpt.add(new ProcessThread(bf));}
jpayne@68 197 perFileList.add(alpt);
jpayne@68 198 ThreadWaiter.startThreads(alpt);
jpayne@68 199 }
jpayne@68 200 for(ArrayList<ProcessThread> alpt : perFileList){
jpayne@68 201 boolean success=ThreadWaiter.waitForThreads(alpt, this);
jpayne@68 202 errorState|=!success;
jpayne@68 203 }
jpayne@68 204 }
jpayne@68 205
jpayne@68 206 /*--------------------------------------------------------------*/
jpayne@68 207
jpayne@68 208 static class ProcessThread extends Thread {
jpayne@68 209
jpayne@68 210 ProcessThread(ByteFile bf_){
jpayne@68 211 bf=bf_;
jpayne@68 212 }
jpayne@68 213
jpayne@68 214 @Override
jpayne@68 215 public void run() {
jpayne@68 216 final StringBuilder buffer=new StringBuilder(128);
jpayne@68 217 for(ListNum<byte[]> lines=bf.nextList(); lines!=null; lines=bf.nextList()){
jpayne@68 218 assert(lines.size()>0);
jpayne@68 219 if(lines.id==0){
jpayne@68 220 //This one is not really important; the header could be missing.
jpayne@68 221 assert(Tools.startsWith(lines.get(0), "accession")) : bf.name()+"[0]: "+new String(lines.get(0));
jpayne@68 222 }else{
jpayne@68 223 assert(!Tools.startsWith(lines.get(0), "accession")) : bf.name()+"["+lines.id+"]: "+new String(lines.get(0));
jpayne@68 224 }
jpayne@68 225 for(byte[] line : lines){
jpayne@68 226 if(line.length>0){
jpayne@68 227 linesProcessedT++;
jpayne@68 228 bytesProcessedT+=(line.length+1);
jpayne@68 229
jpayne@68 230 boolean valid=lines.id>0 || !(Tools.startsWith(line, "accession")); //Skips test for most lines
jpayne@68 231
jpayne@68 232 if(valid){
jpayne@68 233 linesOutT++;
jpayne@68 234 increment(line, buffer);
jpayne@68 235 }
jpayne@68 236 }
jpayne@68 237 }
jpayne@68 238 }
jpayne@68 239 }
jpayne@68 240
jpayne@68 241 void increment(byte[] line, StringBuilder buffer){
jpayne@68 242 buffer.setLength(0);
jpayne@68 243 for(int i=0; i<line.length; i++){
jpayne@68 244 final byte b=line[i];
jpayne@68 245 if(b==' ' || b=='\t' || b=='.' || b==':'){break;}
jpayne@68 246 final char b2=(char)remap[b];
jpayne@68 247 assert(b2!='?' || b=='+') : "unprocessed symbol in "+new String(line)+"\n"+"'"+(char)b+"'";
jpayne@68 248 buffer.append(b2);
jpayne@68 249 }
jpayne@68 250 String key=buffer.toString();
jpayne@68 251 StringNum value=countMapT.get(key);
jpayne@68 252 if(value!=null){value.increment();}
jpayne@68 253 else{countMapT.put(key, new StringNum(key, 1));}
jpayne@68 254 }
jpayne@68 255
jpayne@68 256 private HashMap<String, StringNum> countMapT=new HashMap<String, StringNum>();
jpayne@68 257 private final ByteFile bf;
jpayne@68 258 long linesProcessedT=0;
jpayne@68 259 long linesOutT=0;
jpayne@68 260 long bytesProcessedT=0;
jpayne@68 261
jpayne@68 262 }
jpayne@68 263
jpayne@68 264 /*--------------------------------------------------------------*/
jpayne@68 265
jpayne@68 266 @Override
jpayne@68 267 public void accumulate(ProcessThread t) {
jpayne@68 268 linesProcessed+=t.linesProcessedT;
jpayne@68 269 linesOut+=t.linesOutT;
jpayne@68 270 bytesProcessed+=t.bytesProcessedT;
jpayne@68 271 for(Entry<String, StringNum> e : t.countMapT.entrySet()){
jpayne@68 272 StringNum value=e.getValue();
jpayne@68 273 final String key=e.getKey();
jpayne@68 274 StringNum old=countMap.get(key);
jpayne@68 275 if(old==null){countMap.put(key, value);}
jpayne@68 276 else{old.add(value);}
jpayne@68 277 }
jpayne@68 278 }
jpayne@68 279
jpayne@68 280 @Override
jpayne@68 281 public boolean success() {
jpayne@68 282 return !errorState;
jpayne@68 283 }
jpayne@68 284
jpayne@68 285 /*--------------------------------------------------------------*/
jpayne@68 286
jpayne@68 287 public static long combos(String s){
jpayne@68 288 double combos=1;
jpayne@68 289 for(int i=0; i<s.length(); i++){
jpayne@68 290 char c=s.charAt(i);
jpayne@68 291 if(c=='D'){combos*=10;}
jpayne@68 292 else if(c=='L'){combos*=26;}
jpayne@68 293 }
jpayne@68 294 return (combos>=Long.MAX_VALUE ? Long.MAX_VALUE : (long)Math.ceil(combos));
jpayne@68 295 }
jpayne@68 296
jpayne@68 297 public static long combos(byte[] s){
jpayne@68 298 double combos=1;
jpayne@68 299 for(int i=0; i<s.length; i++){
jpayne@68 300 byte c=s[i];
jpayne@68 301 if(c=='D'){combos*=10;}
jpayne@68 302 else if(c=='L'){combos*=26;}
jpayne@68 303 }
jpayne@68 304 return (combos>=Long.MAX_VALUE ? -1 : (long)Math.ceil(combos));
jpayne@68 305 }
jpayne@68 306
jpayne@68 307 /*--------------------------------------------------------------*/
jpayne@68 308
jpayne@68 309 public static HashMap<String, Integer> loadCodeMap(String fname){
jpayne@68 310 assert(codeMap==null);
jpayne@68 311 TextFile tf=new TextFile(fname);
jpayne@68 312 ArrayList<String> list=new ArrayList<String>();
jpayne@68 313 for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
jpayne@68 314 if(!line.startsWith("#")){
jpayne@68 315 String[] split=line.split("\t");
jpayne@68 316 list.add(split[0]);
jpayne@68 317 }
jpayne@68 318 }
jpayne@68 319 HashMap<String, Integer> map=new HashMap<String, Integer>(list.size()*3);
jpayne@68 320 codeBits=(int)Math.ceil(Tools.log2(list.size()));
jpayne@68 321 final int patternBits=63-codeBits;
jpayne@68 322 final long maxCombos=((1L<<(patternBits-1))-1);
jpayne@68 323 for(int i=0; i<list.size(); i++){
jpayne@68 324 String s=list.get(i);
jpayne@68 325 longestPattern=Tools.max(longestPattern, s.length());
jpayne@68 326 long combos=combos(s);
jpayne@68 327 if(combos<0 || combos>=maxCombos){map.put(s, -1);}
jpayne@68 328 else{map.put(s, i);}
jpayne@68 329 }
jpayne@68 330 codeMap=map;
jpayne@68 331 return map;
jpayne@68 332 }
jpayne@68 333
jpayne@68 334 public static long digitize(String s){
jpayne@68 335 String pattern=remap(s);
jpayne@68 336 Integer code=codeMap.get(pattern);
jpayne@68 337 if(code==null){return -2;}
jpayne@68 338 if(code.intValue()<0){return -1;}
jpayne@68 339
jpayne@68 340 long number=0;
jpayne@68 341 for(int i=0; i<pattern.length(); i++){
jpayne@68 342 char c=s.charAt(i);
jpayne@68 343 char p=pattern.charAt(i);
jpayne@68 344 if(p=='-' || p=='?'){
jpayne@68 345 //do nothing
jpayne@68 346 }else if(p=='D'){
jpayne@68 347 number=(number*10)+(c-'0');
jpayne@68 348 }else if(p=='L'){
jpayne@68 349 number=(number*26)+(Tools.toUpperCase(c)-'A');
jpayne@68 350 }else{
jpayne@68 351 assert(false) : s;
jpayne@68 352 }
jpayne@68 353 }
jpayne@68 354 number=(number<<codeBits)+code;
jpayne@68 355 return number;
jpayne@68 356 }
jpayne@68 357
jpayne@68 358 public static long digitize(byte[] s){
jpayne@68 359 String pattern=remap(s);
jpayne@68 360 Integer code=codeMap.get(pattern);
jpayne@68 361 if(code==null){return -2;}
jpayne@68 362 if(code.intValue()<0){return -1;}
jpayne@68 363
jpayne@68 364 long number=0;
jpayne@68 365 for(int i=0; i<pattern.length(); i++){
jpayne@68 366 byte c=s[i];
jpayne@68 367 char p=pattern.charAt(i);
jpayne@68 368 if(p=='-' || p=='?'){
jpayne@68 369 //do nothing
jpayne@68 370 }else if(p=='D'){
jpayne@68 371 number=(number*10)+(c-'0');
jpayne@68 372 }else if(p=='L'){
jpayne@68 373 number=(number*26)+(Tools.toUpperCase(c)-'A');
jpayne@68 374 }else{
jpayne@68 375 assert(false) : new String(s);
jpayne@68 376 }
jpayne@68 377 }
jpayne@68 378 number=(number<<codeBits)+code;
jpayne@68 379 return number;
jpayne@68 380 }
jpayne@68 381
jpayne@68 382 public static String remap(String s){
jpayne@68 383 if(s==null || s.length()<1){return "";}
jpayne@68 384 ByteBuilder buffer=new ByteBuilder(s.length());
jpayne@68 385 for(int i=0; i<s.length(); i++){
jpayne@68 386 final char b=s.charAt(i);
jpayne@68 387 if(b==' ' || b=='\t' || b=='.' || b==':'){break;}
jpayne@68 388 buffer.append((char)remap[b]);
jpayne@68 389 }
jpayne@68 390 return buffer.toString();
jpayne@68 391 }
jpayne@68 392
jpayne@68 393 public static String remap(byte[] s){
jpayne@68 394 ByteBuilder buffer=new ByteBuilder(s.length);
jpayne@68 395 for(int i=0; i<s.length; i++){
jpayne@68 396 final byte b=s[i];
jpayne@68 397 if(b==' ' || b=='\t' || b=='.' || b==':'){break;}
jpayne@68 398 buffer.append((char)remap[b]);
jpayne@68 399 }
jpayne@68 400 return buffer.toString();
jpayne@68 401 }
jpayne@68 402
jpayne@68 403 /*--------------------------------------------------------------*/
jpayne@68 404
jpayne@68 405 private ArrayList<String> in=new ArrayList<String>();
jpayne@68 406 private String out=null;
jpayne@68 407 private boolean perFile=true;
jpayne@68 408
jpayne@68 409 /*--------------------------------------------------------------*/
jpayne@68 410
jpayne@68 411 private HashMap<String, StringNum> countMap=new HashMap<String, StringNum>();
jpayne@68 412 public static HashMap<String, Integer> codeMap;
jpayne@68 413 private static int codeBits=-1;
jpayne@68 414 private static int longestPattern=-1;
jpayne@68 415
jpayne@68 416 private long linesProcessed=0;
jpayne@68 417 private long linesOut=0;
jpayne@68 418 private long bytesProcessed=0;
jpayne@68 419 private long bytesOut=0;
jpayne@68 420
jpayne@68 421 /*--------------------------------------------------------------*/
jpayne@68 422
jpayne@68 423 private final FileFormat[] ffina;
jpayne@68 424 private final FileFormat ffout;
jpayne@68 425
jpayne@68 426 private static final byte[] remap=makeRemap();
jpayne@68 427
jpayne@68 428 private static byte[] makeRemap(){
jpayne@68 429 byte[] array=new byte[128];
jpayne@68 430 Arrays.fill(array, (byte)'?');
jpayne@68 431 for(int i='A'; i<='Z'; i++){array[i]='L';}
jpayne@68 432 for(int i='a'; i<='z'; i++){array[i]='L';}
jpayne@68 433 for(int i='0'; i<='9'; i++){array[i]='D';}
jpayne@68 434 array['_']=array['-']='-';
jpayne@68 435 return array;
jpayne@68 436 }
jpayne@68 437
jpayne@68 438 /*--------------------------------------------------------------*/
jpayne@68 439
jpayne@68 440 private PrintStream outstream=System.err;
jpayne@68 441 public static boolean verbose=false;
jpayne@68 442 public boolean errorState=false;
jpayne@68 443 private boolean overwrite=false;
jpayne@68 444 private boolean append=false;
jpayne@68 445
jpayne@68 446 }