annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/gff/CutGff.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package gff;
jpayne@68 2
jpayne@68 3 import java.io.File;
jpayne@68 4 import java.io.PrintStream;
jpayne@68 5 import java.util.ArrayList;
jpayne@68 6 import java.util.Arrays;
jpayne@68 7 import java.util.HashMap;
jpayne@68 8 import java.util.concurrent.atomic.AtomicInteger;
jpayne@68 9 import java.util.concurrent.atomic.AtomicLong;
jpayne@68 10
jpayne@68 11 import aligner.Alignment;
jpayne@68 12 import fileIO.ByteFile;
jpayne@68 13 import fileIO.ByteStreamWriter;
jpayne@68 14 import fileIO.FileFormat;
jpayne@68 15 import fileIO.ReadWrite;
jpayne@68 16 import prok.PGMTools;
jpayne@68 17 import prok.ProkObject;
jpayne@68 18 import shared.Parse;
jpayne@68 19 import shared.Parser;
jpayne@68 20 import shared.PreParser;
jpayne@68 21 import shared.ReadStats;
jpayne@68 22 import shared.Shared;
jpayne@68 23 import shared.Timer;
jpayne@68 24 import shared.Tools;
jpayne@68 25 import stream.ConcurrentReadOutputStream;
jpayne@68 26 import stream.Read;
jpayne@68 27 import stream.ReadInputStream;
jpayne@68 28 import structures.ByteBuilder;
jpayne@68 29 import tax.GiToTaxid;
jpayne@68 30 import tax.TaxClient;
jpayne@68 31 import tax.TaxTree;
jpayne@68 32 import template.Accumulator;
jpayne@68 33 import template.ThreadWaiter;
jpayne@68 34
jpayne@68 35 public class CutGff implements Accumulator<CutGff.ProcessThread> {
jpayne@68 36
jpayne@68 37 /*--------------------------------------------------------------*/
jpayne@68 38 /*---------------- Initialization ----------------*/
jpayne@68 39 /*--------------------------------------------------------------*/
jpayne@68 40
jpayne@68 41 /**
jpayne@68 42 * Code entrance from the command line.
jpayne@68 43 * @param args Command line arguments
jpayne@68 44 */
jpayne@68 45 public static void main(String[] args){
jpayne@68 46 //Start a timer immediately upon code entrance.
jpayne@68 47 Timer t=new Timer();
jpayne@68 48
jpayne@68 49 //Create an instance of this class
jpayne@68 50 CutGff x=new CutGff(args);
jpayne@68 51
jpayne@68 52 //Run the object
jpayne@68 53 x.process(t);
jpayne@68 54
jpayne@68 55 //Close the print stream if it was redirected
jpayne@68 56 Shared.closeStream(x.outstream);
jpayne@68 57 }
jpayne@68 58
jpayne@68 59 /**
jpayne@68 60 * Constructor.
jpayne@68 61 * @param args Command line arguments
jpayne@68 62 */
jpayne@68 63 public CutGff(String[] args){
jpayne@68 64 {//Preparse block for help, config files, and outstream
jpayne@68 65 PreParser pp=new PreParser(args, null/*getClass()*/, false);
jpayne@68 66 args=pp.args;
jpayne@68 67 outstream=pp.outstream;
jpayne@68 68 }
jpayne@68 69
jpayne@68 70 //Set shared static variables prior to parsing
jpayne@68 71 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
jpayne@68 72 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
jpayne@68 73
jpayne@68 74 Shared.TRIM_READ_COMMENTS=Shared.TRIM_RNAME=true;
jpayne@68 75 Read.TO_UPPER_CASE=true;
jpayne@68 76 Read.VALIDATE_IN_CONSTRUCTOR=true;
jpayne@68 77 GffLine.parseAttributes=true;
jpayne@68 78
jpayne@68 79 {//Parse the arguments
jpayne@68 80 final Parser parser=parse(args);
jpayne@68 81 overwrite=parser.overwrite;
jpayne@68 82 append=parser.append;
jpayne@68 83
jpayne@68 84 out=parser.out1;
jpayne@68 85 }
jpayne@68 86
jpayne@68 87 if(alignRibo){
jpayne@68 88 //Load sequences
jpayne@68 89 ProkObject.loadConsensusSequenceFromFile(false, false);
jpayne@68 90 }
jpayne@68 91
jpayne@68 92 fixExtensions(); //Add or remove .gz or .bz2 as needed
jpayne@68 93 checkFileExistence(); //Ensure files can be read and written
jpayne@68 94 checkStatics(); //Adjust file-related static fields as needed for this program
jpayne@68 95
jpayne@68 96 ffout=FileFormat.testOutput(out, FileFormat.PGM, null, true, overwrite, append, false);
jpayne@68 97 }
jpayne@68 98
jpayne@68 99 /*--------------------------------------------------------------*/
jpayne@68 100 /*---------------- Initialization Helpers ----------------*/
jpayne@68 101 /*--------------------------------------------------------------*/
jpayne@68 102
jpayne@68 103 /** Parse arguments from the command line */
jpayne@68 104 private Parser parse(String[] args){
jpayne@68 105
jpayne@68 106 Parser parser=new Parser();
jpayne@68 107 parser.overwrite=overwrite;
jpayne@68 108 for(int i=0; i<args.length; i++){
jpayne@68 109 String arg=args[i];
jpayne@68 110 String[] split=arg.split("=");
jpayne@68 111 String a=split[0].toLowerCase();
jpayne@68 112 String b=split.length>1 ? split[1] : null;
jpayne@68 113 if(b!=null && b.equalsIgnoreCase("null")){b=null;}
jpayne@68 114
jpayne@68 115 // outstream.println(arg+", "+a+", "+b);
jpayne@68 116 if(PGMTools.parseStatic(arg, a, b)){
jpayne@68 117 //do nothing
jpayne@68 118 }else if(a.equals("in") || a.equals("infna") || a.equals("fnain") || a.equals("fna") || a.equals("ref")){
jpayne@68 119 assert(b!=null);
jpayne@68 120 Tools.addFiles(b, fnaList);
jpayne@68 121 }else if(a.equals("gff") || a.equals("ingff") || a.equals("gffin")){
jpayne@68 122 assert(b!=null);
jpayne@68 123 Tools.addFiles(b, gffList);
jpayne@68 124 }else if(a.equals("verbose")){
jpayne@68 125 verbose=Parse.parseBoolean(b);
jpayne@68 126 ReadWrite.verbose=verbose;
jpayne@68 127 }else if(a.equals("alignribo") || a.equals("align")){
jpayne@68 128 alignRibo=Parse.parseBoolean(b);
jpayne@68 129 }else if(a.equals("adjustendpoints")){
jpayne@68 130 adjustEndpoints=Parse.parseBoolean(b);
jpayne@68 131 }else if(a.equalsIgnoreCase("slop16s") || a.equalsIgnoreCase("16sslop") || a.equalsIgnoreCase("ssuslop")){
jpayne@68 132 ssuSlop=Integer.parseInt(b);
jpayne@68 133 }else if(a.equalsIgnoreCase("slop23s") || a.equalsIgnoreCase("23sslop") || a.equalsIgnoreCase("lsuslop")){
jpayne@68 134 lsuSlop=Integer.parseInt(b);
jpayne@68 135 }else if(a.equalsIgnoreCase("maxns") || a.equalsIgnoreCase("maxundefined")){
jpayne@68 136 maxNs=Integer.parseInt(b);
jpayne@68 137 }else if(a.equalsIgnoreCase("maxnrate") || a.equalsIgnoreCase("maxnfraction")){
jpayne@68 138 maxNFraction=Integer.parseInt(b);
jpayne@68 139 }else if(a.equals("invert")){
jpayne@68 140 invert=Parse.parseBoolean(b);
jpayne@68 141 }else if(a.equals("type") || a.equals("types")){
jpayne@68 142 types=b;
jpayne@68 143 }else if(a.equals("attributes") || a.equals("requiredattributes")){
jpayne@68 144 requiredAttributes=b.split(",");
jpayne@68 145 }else if(a.equals("banattributes") || a.equals("bannedattributes")){
jpayne@68 146 bannedAttributes=b.split(",");
jpayne@68 147 }else if(a.equals("banpartial")){
jpayne@68 148 banPartial=Parse.parseBoolean(b);
jpayne@68 149 }
jpayne@68 150
jpayne@68 151 else if(a.equalsIgnoreCase("renameByTaxID")){
jpayne@68 152 renameByTaxID=Parse.parseBoolean(b);
jpayne@68 153 }else if(a.equals("taxmode")){
jpayne@68 154 if("accession".equalsIgnoreCase(b)){
jpayne@68 155 taxMode=ACCESSION_MODE;
jpayne@68 156 }else if("header".equalsIgnoreCase(b)){
jpayne@68 157 taxMode=HEADER_MODE;
jpayne@68 158 }else if("gi".equalsIgnoreCase(b)){
jpayne@68 159 taxMode=GI_MODE;
jpayne@68 160 }else if("taxid".equalsIgnoreCase(b)){
jpayne@68 161 taxMode=TAXID_MODE;
jpayne@68 162 }else{
jpayne@68 163 assert(false) : "Bad tax mode: "+b;
jpayne@68 164 }
jpayne@68 165 }else if(a.equals("requirepresent")){
jpayne@68 166 requirePresent=Parse.parseBoolean(b);
jpayne@68 167 }else if(a.equalsIgnoreCase("onePerFile")){
jpayne@68 168 onePerFile=Parse.parseBoolean(b);
jpayne@68 169 }else if(a.equalsIgnoreCase("pickBest") || a.equalsIgnoreCase("findBest") || a.equalsIgnoreCase("keepBest")){
jpayne@68 170 pickBest=Parse.parseBoolean(b);
jpayne@68 171 }
jpayne@68 172
jpayne@68 173 else if(a.equals("minlen")){
jpayne@68 174 minLen=Integer.parseInt(b);
jpayne@68 175 }else if(a.equals("maxlen")){
jpayne@68 176 maxLen=Integer.parseInt(b);
jpayne@68 177 }
jpayne@68 178
jpayne@68 179 else if(ProkObject.parse(arg, a, b)){
jpayne@68 180 //do nothing
jpayne@68 181 }else if(parser.parse(arg, a, b)){
jpayne@68 182 //do nothing
jpayne@68 183 }else if(arg.indexOf('=')<0 && new File(arg).exists() && FileFormat.isFastaFile(arg)){
jpayne@68 184 fnaList.add(arg);
jpayne@68 185 }else{
jpayne@68 186 outstream.println("Unknown parameter "+args[i]);
jpayne@68 187 assert(false) : "Unknown parameter "+args[i];
jpayne@68 188 // throw new RuntimeException("Unknown parameter "+args[i]);
jpayne@68 189 }
jpayne@68 190 }
jpayne@68 191
jpayne@68 192 ArrayList<String> banned=new ArrayList<String>();
jpayne@68 193 if(banPartial){banned.add("partial=true");}
jpayne@68 194 if(bannedAttributes!=null){
jpayne@68 195 for(String s : bannedAttributes){banned.add(s);}
jpayne@68 196 }
jpayne@68 197 bannedAttributes=banned.isEmpty() ? null : banned.toArray(new String[0]);
jpayne@68 198
jpayne@68 199 if(gffList.isEmpty()){
jpayne@68 200 for(String s : fnaList){
jpayne@68 201 String prefix=ReadWrite.stripExtension(s);
jpayne@68 202 String gff=prefix+".gff";
jpayne@68 203 File f=new File(gff);
jpayne@68 204 if(!f.exists()){
jpayne@68 205 String gz=gff+".gz";
jpayne@68 206 f=new File(gz);
jpayne@68 207 assert(f.exists() && f.canRead()) : "Can't read file "+gff;
jpayne@68 208 gff=gz;
jpayne@68 209 }
jpayne@68 210 gffList.add(gff);
jpayne@68 211 }
jpayne@68 212 }
jpayne@68 213 assert(gffList.size()==fnaList.size()) : "Number of fna and gff files do not match: "+fnaList.size()+", "+gffList.size();
jpayne@68 214 return parser;
jpayne@68 215 }
jpayne@68 216
jpayne@68 217 /** Add or remove .gz or .bz2 as needed */
jpayne@68 218 private void fixExtensions(){
jpayne@68 219 fnaList=Tools.fixExtension(fnaList);
jpayne@68 220 gffList=Tools.fixExtension(gffList);
jpayne@68 221 if(fnaList.isEmpty()){throw new RuntimeException("Error - at least one input file is required.");}
jpayne@68 222 }
jpayne@68 223
jpayne@68 224 /** Ensure files can be read and written */
jpayne@68 225 private void checkFileExistence(){
jpayne@68 226 //Ensure output files can be written
jpayne@68 227 if(!Tools.testOutputFiles(overwrite, append, false, out)){
jpayne@68 228 outstream.println((out==null)+", "+out);
jpayne@68 229 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out+"\n");
jpayne@68 230 }
jpayne@68 231
jpayne@68 232 //Ensure input files can be read
jpayne@68 233 ArrayList<String> foo=new ArrayList<String>();
jpayne@68 234 foo.addAll(fnaList);
jpayne@68 235 foo.addAll(gffList);
jpayne@68 236 if(!Tools.testInputFiles(false, true, foo.toArray(new String[0]))){
jpayne@68 237 throw new RuntimeException("\nCan't read some input files.\n");
jpayne@68 238 }
jpayne@68 239
jpayne@68 240 //Ensure that no file was specified multiple times
jpayne@68 241 foo.add(out);
jpayne@68 242 if(!Tools.testForDuplicateFiles(true, foo.toArray(new String[0]))){
jpayne@68 243 throw new RuntimeException("\nSome file names were specified multiple times.\n");
jpayne@68 244 }
jpayne@68 245 }
jpayne@68 246
jpayne@68 247 /** Adjust file-related static fields as needed for this program */
jpayne@68 248 private static void checkStatics(){
jpayne@68 249 //Adjust the number of threads for input file reading
jpayne@68 250 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
jpayne@68 251 ByteFile.FORCE_MODE_BF2=true;
jpayne@68 252 }
jpayne@68 253 }
jpayne@68 254
jpayne@68 255 /*--------------------------------------------------------------*/
jpayne@68 256 /*---------------- Actual Code ----------------*/
jpayne@68 257 /*--------------------------------------------------------------*/
jpayne@68 258
jpayne@68 259
jpayne@68 260
jpayne@68 261 /*--------------------------------------------------------------*/
jpayne@68 262
jpayne@68 263 public void process(Timer t){
jpayne@68 264 if(Shared.threads()>2 && fnaList.size()>1){
jpayne@68 265 processMT(t);
jpayne@68 266 }else{
jpayne@68 267 processST(t);
jpayne@68 268 }
jpayne@68 269 }
jpayne@68 270
jpayne@68 271 public void processST(Timer t){
jpayne@68 272 ByteStreamWriter bsw=(ffout==null ? null : new ByteStreamWriter(ffout));
jpayne@68 273 if(bsw!=null){bsw.start();}
jpayne@68 274
jpayne@68 275 for(int i=0; i<fnaList.size(); i++){
jpayne@68 276 processFileST(fnaList.get(i), gffList.get(i), types, bsw);
jpayne@68 277 }
jpayne@68 278
jpayne@68 279 if(bsw!=null){bsw.poisonAndWait();}
jpayne@68 280 t.stop();
jpayne@68 281 if(ffout!=null){outstream.println("Wrote "+out);}
jpayne@68 282 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
jpayne@68 283 outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false));
jpayne@68 284 if(alignRibo){
jpayne@68 285 outstream.println(Tools.number("Flipped: ", flipped.get(), 8));
jpayne@68 286 outstream.println(Tools.number("Failed Alignment: ", failed.get(), 8));
jpayne@68 287 }
jpayne@68 288 }
jpayne@68 289
jpayne@68 290 public void processMT(Timer t){
jpayne@68 291
jpayne@68 292 //Optionally create a read output stream
jpayne@68 293 final ConcurrentReadOutputStream ros=makeCros();
jpayne@68 294
jpayne@68 295 //Reset counters
jpayne@68 296 readsProcessed=readsOut=0;
jpayne@68 297 basesProcessed=basesOut=0;
jpayne@68 298
jpayne@68 299 //Process the reads in separate threads
jpayne@68 300 spawnThreads(ros);
jpayne@68 301
jpayne@68 302 if(verbose){outstream.println("Finished; closing streams.");}
jpayne@68 303
jpayne@68 304 //Write anything that was accumulated by ReadStats
jpayne@68 305 errorState|=ReadStats.writeAll();
jpayne@68 306 //Close the read streams
jpayne@68 307 errorState|=ReadWrite.closeStream(ros);
jpayne@68 308
jpayne@68 309 //Report timing and results
jpayne@68 310 t.stop();
jpayne@68 311 if(ffout!=null){outstream.println("Wrote "+out);}
jpayne@68 312 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
jpayne@68 313 outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false));
jpayne@68 314 if(alignRibo){
jpayne@68 315 outstream.println(Tools.number("Flipped: ", flipped.get(), 8));
jpayne@68 316 outstream.println(Tools.number("Failed Alignment: ", failed.get(), 8));
jpayne@68 317 }
jpayne@68 318
jpayne@68 319 //Throw an exception of there was an error in a thread
jpayne@68 320 if(errorState){
jpayne@68 321 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
jpayne@68 322 }
jpayne@68 323 }
jpayne@68 324
jpayne@68 325 /*--------------------------------------------------------------*/
jpayne@68 326
jpayne@68 327 private boolean hasAttributes(GffLine gline){
jpayne@68 328 int len=gline.length();
jpayne@68 329 if(len<minLen || len>maxLen){return false;}
jpayne@68 330 if(hasAttributes(gline, bannedAttributes)){return false;}
jpayne@68 331 return requiredAttributes==null || hasAttributes(gline, requiredAttributes);
jpayne@68 332 }
jpayne@68 333
jpayne@68 334 private static boolean hasAttributes(GffLine gline, String[] attributes){
jpayne@68 335 if(attributes==null){return false;}
jpayne@68 336 for(String s : attributes){
jpayne@68 337 if(gline.attributes.contains(s)){
jpayne@68 338 return true;
jpayne@68 339 }
jpayne@68 340 }
jpayne@68 341 return false;
jpayne@68 342 }
jpayne@68 343
jpayne@68 344 private void processFileST(String fna, String gff, String types, ByteStreamWriter bsw){
jpayne@68 345 ArrayList<Read> reads=processFile(fna, gff, types);
jpayne@68 346 if(reads!=null){
jpayne@68 347 for(Read r : reads){
jpayne@68 348 if(bsw!=null){bsw.println(r);}
jpayne@68 349 }
jpayne@68 350 }
jpayne@68 351 }
jpayne@68 352
jpayne@68 353 private ArrayList<Read> processFile(String fna, String gff, String types){
jpayne@68 354 ArrayList<GffLine> lines=GffLine.loadGffFile(gff, types, false);
jpayne@68 355
jpayne@68 356 ArrayList<Read> list=ReadInputStream.toReads(fna, FileFormat.FA, -1);
jpayne@68 357 HashMap<String, Read> map=new HashMap<String, Read>();
jpayne@68 358
jpayne@68 359 for(Read r : list){
jpayne@68 360 readsProcessed++;
jpayne@68 361 basesProcessed+=r.length();
jpayne@68 362 map.put(r.id, r);
jpayne@68 363 }
jpayne@68 364
jpayne@68 365 if(renameByTaxID){//Note this must be AFTER adding to the hashmap.
jpayne@68 366 renameByTaxID(list);
jpayne@68 367 }
jpayne@68 368
jpayne@68 369 ArrayList<Read> outList=processLines(lines, map, invert);
jpayne@68 370
jpayne@68 371 if(invert){
jpayne@68 372 for(Read r : list){
jpayne@68 373 readsOut++;
jpayne@68 374 basesOut+=r.length();
jpayne@68 375 }
jpayne@68 376 return list;
jpayne@68 377 }else{
jpayne@68 378 if(outList!=null){
jpayne@68 379 for(Read r : outList){
jpayne@68 380 readsOut++;
jpayne@68 381 basesOut+=r.length();
jpayne@68 382 }
jpayne@68 383 }
jpayne@68 384 return outList;
jpayne@68 385 }
jpayne@68 386 }
jpayne@68 387
jpayne@68 388 private void renameByTaxID(ArrayList<Read> list){
jpayne@68 389 ByteBuilder bb=new ByteBuilder();
jpayne@68 390 for(Read r : list){
jpayne@68 391 if(bb.length>0){bb.comma();}
jpayne@68 392 bb.append(taxMode==HEADER_MODE ? r.id : taxMode==ACCESSION_MODE ? parseAccession(r.id) : parseGi(r.id));
jpayne@68 393 }
jpayne@68 394 final int[] ids;
jpayne@68 395 if(taxMode==ACCESSION_MODE){
jpayne@68 396 ids=TaxClient.accessionToTaxidArray(bb.toString());
jpayne@68 397 }else if(taxMode==GI_MODE){
jpayne@68 398 ids=TaxClient.giToTaxidArray(bb.toString());
jpayne@68 399 }else if(taxMode==HEADER_MODE){
jpayne@68 400 ids=TaxClient.headerToTaxidArray(bb.toString());
jpayne@68 401 }else if(taxMode==TAXID_MODE){
jpayne@68 402 ids=parseTaxIds(list);
jpayne@68 403 }else{
jpayne@68 404 throw new RuntimeException("Bad mode: "+TAXID_MODE);
jpayne@68 405 }
jpayne@68 406 assert(ids.length==list.size()) : ids.length+", "+list.size();
jpayne@68 407 for(int i=0; i<ids.length; i++){
jpayne@68 408 Read r=list.get(i);
jpayne@68 409 int id=ids[i];
jpayne@68 410
jpayne@68 411 if(r.id!=null && r.id.startsWith("tid|")){
jpayne@68 412 id=TaxTree.parseHeaderStatic(r.id);
jpayne@68 413 r.obj=id;
jpayne@68 414 }else {
jpayne@68 415 assert(id>=0 || !requirePresent) : "Can't find taxID for header: "+id+", "+r.name();
jpayne@68 416
jpayne@68 417 r.obj=id;
jpayne@68 418 r.id="tid|"+id+"|"+id;
jpayne@68 419 }
jpayne@68 420 }
jpayne@68 421 }
jpayne@68 422
jpayne@68 423 private int[] parseTaxIds(ArrayList<Read> list){
jpayne@68 424 int[] ids=new int[list.size()];
jpayne@68 425 for(int i=0; i<list.size(); i++){
jpayne@68 426 Read r=list.get(i);
jpayne@68 427 int x=GiToTaxid.parseTaxidNumber(r.id, '|');
jpayne@68 428 ids[i]=x;
jpayne@68 429 }
jpayne@68 430 return ids;
jpayne@68 431 }
jpayne@68 432
jpayne@68 433 private String parseAccession(String id){
jpayne@68 434 int dot=id.indexOf('.');
jpayne@68 435 int space=id.indexOf(' ');
jpayne@68 436 // assert(dot>0 && space>0) : "Unexpected header format: "+id+"\nTry header mode instead of accession mode.";
jpayne@68 437 int limit=Tools.min((dot<0 ? id.length() : dot), (space<0 ? id.length() : space));
jpayne@68 438 return id.substring(0, limit);
jpayne@68 439 }
jpayne@68 440
jpayne@68 441 private String parseGi(String id){
jpayne@68 442 assert(id.startsWith("gi|"));
jpayne@68 443 String[] split=id.split("|");
jpayne@68 444 return split[1];
jpayne@68 445 }
jpayne@68 446
jpayne@68 447 private ArrayList<Read> processLines(ArrayList<GffLine> lines, HashMap<String, Read> map, boolean invertSelection){
jpayne@68 448 ArrayList<Read> list=null;
jpayne@68 449 for(GffLine gline : lines){
jpayne@68 450 if(hasAttributes(gline)){
jpayne@68 451 Read scaf=map.get(gline.seqid);
jpayne@68 452 assert(scaf!=null) : "Can't find "+gline.seqid+" in "+map.keySet();
jpayne@68 453
jpayne@68 454 boolean pass=true;
jpayne@68 455 Float identity=null;
jpayne@68 456 if(alignRibo && gline.inbounds(scaf.length())){
jpayne@68 457 int type=gline.prokType();
jpayne@68 458 identity=align(gline, scaf.bases, type);
jpayne@68 459 if(identity==null){pass=false;}
jpayne@68 460 }
jpayne@68 461
jpayne@68 462 if(pass){
jpayne@68 463 final int start=gline.start-1;
jpayne@68 464 final int stop=gline.stop-1;
jpayne@68 465
jpayne@68 466 if(invertSelection){
jpayne@68 467 byte[] bases=scaf.bases;
jpayne@68 468 for(int i=start; i<=stop; i++){
jpayne@68 469 if(i>=0 && i<bases.length){
jpayne@68 470 bases[i]='N';
jpayne@68 471 }
jpayne@68 472 }
jpayne@68 473 }else{
jpayne@68 474 if(start>=0 && stop<scaf.length()){
jpayne@68 475 String id=gline.attributes;
jpayne@68 476 if(renameByTaxID){
jpayne@68 477 id="tid|"+scaf.obj+"|"+id;
jpayne@68 478 }
jpayne@68 479 Read r=new Read(Arrays.copyOfRange(scaf.bases, start, stop+1), null, id, 1);
jpayne@68 480 r.obj=identity;
jpayne@68 481
jpayne@68 482 assert(!r.containsLowercase()) : r.toFasta()+"\n"
jpayne@68 483 + "validated="+r.validated()+", scaf.validated="+scaf.validated()+", tuc="+Read.TO_UPPER_CASE+", vic="+Read.VALIDATE_IN_CONSTRUCTOR;
jpayne@68 484 if(maxNs>=0 || maxNFraction>=0){
jpayne@68 485 long allowed=Tools.min(maxNs>=0 ? maxNs : r.length(), (long)(r.length()*(maxNFraction>=0 ? maxNFraction : 1)));
jpayne@68 486 if(r.countUndefined()>allowed){r=null;}
jpayne@68 487 }
jpayne@68 488
jpayne@68 489 if(r!=null){
jpayne@68 490 if(gline.strand==1){r.reverseComplement();}
jpayne@68 491 if(list==null){list=new ArrayList<Read>(8);}
jpayne@68 492 list.add(r);
jpayne@68 493 if(onePerFile && !pickBest){break;}
jpayne@68 494 }
jpayne@68 495 }
jpayne@68 496 }
jpayne@68 497 }
jpayne@68 498 }
jpayne@68 499 }
jpayne@68 500 if(pickBest && list!=null && list.size()>1){
jpayne@68 501 Read best=null;
jpayne@68 502 float bestID=0;
jpayne@68 503 for(Read r : list){
jpayne@68 504 float identity=(r.obj==null ? 0.001f : (Float)r.obj);
jpayne@68 505 if(identity>bestID){
jpayne@68 506 bestID=identity;
jpayne@68 507 best=r;
jpayne@68 508 }
jpayne@68 509 }
jpayne@68 510 assert(best!=null);
jpayne@68 511 list.clear();
jpayne@68 512 list.add(best);
jpayne@68 513 }
jpayne@68 514 return list;
jpayne@68 515 }
jpayne@68 516
jpayne@68 517 private Float align(GffLine gline, byte[] scaf, int type){
jpayne@68 518 Read[] consensusReads=ProkObject.consensusReads(type);
jpayne@68 519 if(consensusReads==null || consensusReads.length==0){
jpayne@68 520 assert(false) : type+"\n"+gline.toString();
jpayne@68 521 return null;
jpayne@68 522 }
jpayne@68 523 byte[] universal=consensusReads[0].bases;
jpayne@68 524 float minIdentity=ProkObject.minID(type)*ID_MULT;
jpayne@68 525 if(universal==null){assert(false); return 1F;}
jpayne@68 526
jpayne@68 527 int start=gline.start-1;
jpayne@68 528 int stop=gline.stop-1;
jpayne@68 529 assert(start<=stop) : start+", "+stop+", "+scaf.length;
jpayne@68 530 assert(start>=0 && start<scaf.length) : start+", "+stop+", "+scaf.length;
jpayne@68 531 assert(stop>=0 && stop<scaf.length) : start+", "+stop+", "+scaf.length;
jpayne@68 532 // final int a=Tools.max(0, start-(adjustEndpoints ? 100 : 20));
jpayne@68 533 // final int b=Tools.min(scaf.length-1, stop+(adjustEndpoints ? 100 : 20));
jpayne@68 534 final int a=Tools.max(0, start);
jpayne@68 535 final int b=Tools.min(scaf.length-1, stop);
jpayne@68 536
jpayne@68 537 byte[] ref=Arrays.copyOfRange(scaf, a, b+1);
jpayne@68 538 Read r=new Read(ref, null, 0);
jpayne@68 539 if(gline.strand==GffLine.MINUS){r.reverseComplement();}
jpayne@68 540
jpayne@68 541 Alignment plus=new Alignment(r);
jpayne@68 542 plus.align(universal);
jpayne@68 543 // if(plus.id>=minIdentity){return true;}
jpayne@68 544
jpayne@68 545 r.reverseComplement();
jpayne@68 546 Alignment minus=new Alignment(r);
jpayne@68 547 minus.align(universal);
jpayne@68 548
jpayne@68 549 Alignment best=null;
jpayne@68 550 if(plus.id>=minus.id){
jpayne@68 551 best=plus;
jpayne@68 552 // System.err.println("Kept: "+plus.id+" \t"+minus.id);
jpayne@68 553 }else{
jpayne@68 554 best=minus;
jpayne@68 555 if(minus.id>=minIdentity){
jpayne@68 556 if(verbose) {System.err.println("Flipped: "+plus.id+" \t"+minus.id+"");}
jpayne@68 557 flipped.incrementAndGet();
jpayne@68 558 gline.strand=Shared.MINUS;
jpayne@68 559 }
jpayne@68 560 }
jpayne@68 561 if(best.id>=minIdentity){
jpayne@68 562 return best.id;
jpayne@68 563 }else{
jpayne@68 564 if(verbose) {System.err.println("Failed alignment: "+plus.id+" \t"+minus.id);}
jpayne@68 565 failed.incrementAndGet();
jpayne@68 566 return null;
jpayne@68 567 }
jpayne@68 568 //
jpayne@68 569 // int[] coords=KillSwitch.allocInt1D(2);
jpayne@68 570 // float id1=align(universal, scaf, a, b, minIdentity, coords);
jpayne@68 571 // final int rstart=coords[0], rstop=coords[1];
jpayne@68 572 // // assert(false) : rstart+", "+rstop+", "+(rstop-rstart+1)+", "+start+", "+stop;
jpayne@68 573 // if(id1<minIdentity){
jpayne@68 574 // System.err.println("Low identity: "+String.format("%.2s", 100*id1));
jpayne@68 575 // return false;
jpayne@68 576 // }
jpayne@68 577 // if(adjustEndpoints){
jpayne@68 578 // int slop=(flag==4 ? AnalyzeGenes.lsuSlop : AnalyzeGenes.ssuSlop);
jpayne@68 579 // if(Tools.absdif(start, rstart)>slop){
jpayne@68 580 // // System.err.println("rstart:\t"+start+" -> "+rstart);
jpayne@68 581 // start=rstart;
jpayne@68 582 // }
jpayne@68 583 // if(Tools.absdif(stop, rstop)>slop){
jpayne@68 584 // // System.err.println("rstop: \t"+stop+" -> "+rstop);
jpayne@68 585 // stop=rstop;
jpayne@68 586 // }
jpayne@68 587 // }
jpayne@68 588 }
jpayne@68 589
jpayne@68 590 /*--------------------------------------------------------------*/
jpayne@68 591 /*---------------- Thread Management ----------------*/
jpayne@68 592 /*--------------------------------------------------------------*/
jpayne@68 593
jpayne@68 594 private ConcurrentReadOutputStream makeCros(){
jpayne@68 595 if(ffout==null){return null;}
jpayne@68 596
jpayne@68 597 //Select output buffer size based on whether it needs to be ordered
jpayne@68 598 final int buff=(ordered ? Tools.mid(16, 128, (Shared.threads()*2)/3) : 8);
jpayne@68 599
jpayne@68 600 final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ffout, null, buff, null, false);
jpayne@68 601 ros.start(); //Start the stream
jpayne@68 602 return ros;
jpayne@68 603 }
jpayne@68 604
jpayne@68 605 /** Spawn process threads */
jpayne@68 606 private void spawnThreads(ConcurrentReadOutputStream ros){
jpayne@68 607 //Do anything necessary prior to processing
jpayne@68 608
jpayne@68 609 //Determine how many threads may be used
jpayne@68 610 final int threads=Tools.min(Shared.threads(), fnaList.size());
jpayne@68 611
jpayne@68 612 //Controls access to input files
jpayne@68 613 AtomicInteger atom=new AtomicInteger(0);
jpayne@68 614
jpayne@68 615 //Fill a list with ProcessThreads
jpayne@68 616 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
jpayne@68 617 for(int i=0; i<threads; i++){
jpayne@68 618 alpt.add(new ProcessThread(atom, ros, i));
jpayne@68 619 }
jpayne@68 620
jpayne@68 621 //Start the threads and wait for them to finish
jpayne@68 622 boolean success=ThreadWaiter.startAndWait(alpt, this);
jpayne@68 623 errorState&=!success;
jpayne@68 624
jpayne@68 625 //Do anything necessary after processing
jpayne@68 626
jpayne@68 627 }
jpayne@68 628
jpayne@68 629 @Override
jpayne@68 630 public final void accumulate(ProcessThread pt){
jpayne@68 631 readsProcessed+=pt.readsProcessedT;
jpayne@68 632 basesProcessed+=pt.basesProcessedT;
jpayne@68 633 readsOut+=pt.readsOutT;
jpayne@68 634 basesOut+=pt.basesOutT;
jpayne@68 635 errorState|=(!pt.success);
jpayne@68 636 }
jpayne@68 637
jpayne@68 638 @Override
jpayne@68 639 public final boolean success(){return !errorState;}
jpayne@68 640
jpayne@68 641 /*--------------------------------------------------------------*/
jpayne@68 642 /*---------------- Inner Classes ----------------*/
jpayne@68 643 /*--------------------------------------------------------------*/
jpayne@68 644
jpayne@68 645 /** This class is static to prevent accidental writing to shared variables.
jpayne@68 646 * It is safe to remove the static modifier. */
jpayne@68 647 class ProcessThread extends Thread {
jpayne@68 648
jpayne@68 649 //Constructor
jpayne@68 650 ProcessThread(final AtomicInteger atom_, ConcurrentReadOutputStream ros_, final int tid_){
jpayne@68 651 atom=atom_;
jpayne@68 652 ros=ros_;
jpayne@68 653 tid=tid_;
jpayne@68 654 }
jpayne@68 655
jpayne@68 656 //Called by start()
jpayne@68 657 @Override
jpayne@68 658 public void run(){
jpayne@68 659 //Do anything necessary prior to processing
jpayne@68 660
jpayne@68 661 for(int fnum=atom.getAndIncrement(), lim=fnaList.size(); fnum<lim; fnum=atom.getAndIncrement()) {
jpayne@68 662 String fna=fnaList.get(fnum);
jpayne@68 663 String gff=gffList.get(fnum);
jpayne@68 664 //Process the reads
jpayne@68 665 ArrayList<Read> list=processFileT(fna, gff, types);
jpayne@68 666 if(ros!=null){
jpayne@68 667 if(list==null){list=dummy;}
jpayne@68 668 ros.add(list, fnum);
jpayne@68 669 }
jpayne@68 670 }
jpayne@68 671
jpayne@68 672 //Do anything necessary after processing
jpayne@68 673
jpayne@68 674 //Indicate successful exit status
jpayne@68 675 success=true;
jpayne@68 676 }
jpayne@68 677
jpayne@68 678 //Duplicated
jpayne@68 679 private ArrayList<Read> processFileT(String fna, String gff, String types){
jpayne@68 680 ArrayList<GffLine> lines=GffLine.loadGffFile(gff, types, false);
jpayne@68 681
jpayne@68 682 ArrayList<Read> list=ReadInputStream.toReads(fna, FileFormat.FA, -1);
jpayne@68 683 HashMap<String, Read> map=new HashMap<String, Read>();
jpayne@68 684
jpayne@68 685 for(Read r : list){
jpayne@68 686 readsProcessedT++;
jpayne@68 687 basesProcessedT+=r.length();
jpayne@68 688 map.put(r.id, r);
jpayne@68 689 }
jpayne@68 690
jpayne@68 691 if(renameByTaxID){//Note this must be AFTER adding to the hashmap.
jpayne@68 692 renameByTaxID(list);
jpayne@68 693 }
jpayne@68 694 // assert(false) : renameByTaxID+", "+list.size();
jpayne@68 695
jpayne@68 696 ArrayList<Read> outList=processLines(lines, map, invert);
jpayne@68 697
jpayne@68 698 if(invert){
jpayne@68 699 for(Read r : list){
jpayne@68 700 readsOutT++;
jpayne@68 701 basesOutT+=r.length();
jpayne@68 702 }
jpayne@68 703 return list;
jpayne@68 704 }else{
jpayne@68 705 if(outList!=null){
jpayne@68 706 for(Read r : outList){
jpayne@68 707 readsOutT++;
jpayne@68 708 basesOutT+=r.length();
jpayne@68 709 }
jpayne@68 710 }
jpayne@68 711 return outList;
jpayne@68 712 }
jpayne@68 713 }
jpayne@68 714
jpayne@68 715 /** Number of reads processed by this thread */
jpayne@68 716 protected long readsProcessedT=0;
jpayne@68 717 /** Number of bases processed by this thread */
jpayne@68 718 protected long basesProcessedT=0;
jpayne@68 719
jpayne@68 720 /** Number of reads retained by this thread */
jpayne@68 721 protected long readsOutT=0;
jpayne@68 722 /** Number of bases retained by this thread */
jpayne@68 723 protected long basesOutT=0;
jpayne@68 724
jpayne@68 725 /** True only if this thread has completed successfully */
jpayne@68 726 boolean success=false;
jpayne@68 727
jpayne@68 728 /** Shared output stream */
jpayne@68 729 private final ConcurrentReadOutputStream ros;
jpayne@68 730 /** Thread ID */
jpayne@68 731 final int tid;
jpayne@68 732 /** Next file ID */
jpayne@68 733 final AtomicInteger atom;
jpayne@68 734 }
jpayne@68 735
jpayne@68 736 /*--------------------------------------------------------------*/
jpayne@68 737 /*---------------- Fields ----------------*/
jpayne@68 738 /*--------------------------------------------------------------*/
jpayne@68 739
jpayne@68 740 private ArrayList<String> fnaList=new ArrayList<String>();
jpayne@68 741 private ArrayList<String> gffList=new ArrayList<String>();
jpayne@68 742 private String out=null;
jpayne@68 743 private String types="CDS";
jpayne@68 744 private boolean invert=false;
jpayne@68 745 private boolean banPartial=true;
jpayne@68 746 private int minLen=1;
jpayne@68 747 private int maxLen=Integer.MAX_VALUE;
jpayne@68 748
jpayne@68 749 private String[] requiredAttributes;
jpayne@68 750 private String[] bannedAttributes;
jpayne@68 751
jpayne@68 752 /*--------------------------------------------------------------*/
jpayne@68 753
jpayne@68 754 private long bytesOut=0;
jpayne@68 755 private boolean renameByTaxID=false;
jpayne@68 756 private int taxMode=ACCESSION_MODE;
jpayne@68 757 private boolean requirePresent=false;
jpayne@68 758 private boolean alignRibo=false;
jpayne@68 759 private boolean adjustEndpoints=false;
jpayne@68 760 private boolean onePerFile=false;
jpayne@68 761 private boolean pickBest=false;
jpayne@68 762 private int ssuSlop=999;
jpayne@68 763 private int lsuSlop=999;
jpayne@68 764
jpayne@68 765 private float ID_MULT=0.96f;
jpayne@68 766
jpayne@68 767 private int maxNs=-1;
jpayne@68 768 private double maxNFraction=-1;
jpayne@68 769
jpayne@68 770 private static int ACCESSION_MODE=0, GI_MODE=1, HEADER_MODE=2, TAXID_MODE=3;
jpayne@68 771
jpayne@68 772 /*--------------------------------------------------------------*/
jpayne@68 773
jpayne@68 774 /** Number of reads processed */
jpayne@68 775 protected long readsProcessed=0;
jpayne@68 776 /** Number of bases processed */
jpayne@68 777 protected long basesProcessed=0;
jpayne@68 778
jpayne@68 779 /** Number of reads retained */
jpayne@68 780 protected long readsOut=0;
jpayne@68 781 /** Number of bases retained */
jpayne@68 782 protected long basesOut=0;
jpayne@68 783
jpayne@68 784 protected AtomicLong flipped=new AtomicLong(0);
jpayne@68 785 protected AtomicLong failed=new AtomicLong(0);
jpayne@68 786
jpayne@68 787 /** Quit after processing this many input reads; -1 means no limit */
jpayne@68 788 private long maxReads=-1;
jpayne@68 789
jpayne@68 790 /*--------------------------------------------------------------*/
jpayne@68 791 /*---------------- Final Fields ----------------*/
jpayne@68 792 /*--------------------------------------------------------------*/
jpayne@68 793
jpayne@68 794 private final FileFormat ffout;
jpayne@68 795 private final ArrayList<Read> dummy=new ArrayList<Read>();
jpayne@68 796
jpayne@68 797 /*--------------------------------------------------------------*/
jpayne@68 798 /*---------------- Common Fields ----------------*/
jpayne@68 799 /*--------------------------------------------------------------*/
jpayne@68 800
jpayne@68 801 private PrintStream outstream=System.err;
jpayne@68 802 public static boolean verbose=false;
jpayne@68 803 public boolean errorState=false;
jpayne@68 804 public boolean ordered=true;
jpayne@68 805 private boolean overwrite=true;
jpayne@68 806 private boolean append=false;
jpayne@68 807
jpayne@68 808 }