annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/gff/GffLine.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package gff;
jpayne@68 2
jpayne@68 3 import java.util.ArrayList;
jpayne@68 4 import java.util.HashSet;
jpayne@68 5 import java.util.Locale;
jpayne@68 6
jpayne@68 7 import dna.Data;
jpayne@68 8 import fileIO.ByteFile;
jpayne@68 9 import fileIO.FileFormat;
jpayne@68 10 import prok.ProkObject;
jpayne@68 11 import shared.Parse;
jpayne@68 12 import shared.Shared;
jpayne@68 13 import shared.Tools;
jpayne@68 14 import structures.ByteBuilder;
jpayne@68 15 import var2.ScafMap;
jpayne@68 16 import var2.VCFLine;
jpayne@68 17 import var2.Var;
jpayne@68 18
jpayne@68 19 /**
jpayne@68 20 * Used by both the var2 and prok packages for processing gff files.
jpayne@68 21 * @author Brian Bushnell
jpayne@68 22 * @date Sep 12, 2018
jpayne@68 23 *
jpayne@68 24 */
jpayne@68 25 public class GffLine {
jpayne@68 26
jpayne@68 27 //#seqid source type start end score strand phase attributes
jpayne@68 28 public GffLine(byte[] line){
jpayne@68 29 int a=0, b=0;
jpayne@68 30
jpayne@68 31 while(b<line.length && line[b]!='\t'){b++;}
jpayne@68 32 assert(b>a) : "Missing field 0: "+new String(line);
jpayne@68 33 seqid=parseSeqid ? intern(new String(line, a, b-a)) : null;
jpayne@68 34 // assert(seqid==null || seqid.equals(new String(line, a, b-a)));
jpayne@68 35 // assert(seqid!=null) : new String(line, a, b-a)+", "+a+", "+b+"\n"+line;
jpayne@68 36 b++;
jpayne@68 37 a=b;
jpayne@68 38
jpayne@68 39 while(b<line.length && line[b]!='\t'){b++;}
jpayne@68 40 assert(b>a) : "Missing field 1: "+new String(line);
jpayne@68 41 if(b==a+1 && line[a]=='.'){source=DOTS;}
jpayne@68 42 else{source=paseSource ? intern(new String(line, a, b-a)) : null;}
jpayne@68 43 b++;
jpayne@68 44 a=b;
jpayne@68 45
jpayne@68 46 while(b<line.length && line[b]!='\t'){b++;}
jpayne@68 47 assert(b>a) : "Missing field 2: "+new String(line);
jpayne@68 48 if(b==a+1 && line[a]=='.'){type=DOTS;}
jpayne@68 49 else{
jpayne@68 50 try {//This was to catch a probably intermittent hardware error; can't replicate.
jpayne@68 51 type=(parseType ? intern(new String(line, a, b-a)) : null);
jpayne@68 52 } catch (Exception e) {
jpayne@68 53 // TODO Auto-generated catch block
jpayne@68 54 e.printStackTrace();
jpayne@68 55 System.err.println("\n"+new String(line)+"\n"+a+", "+b+", "+(b-a));
jpayne@68 56 assert(false);
jpayne@68 57 }
jpayne@68 58 }
jpayne@68 59 b++;
jpayne@68 60 a=b;
jpayne@68 61
jpayne@68 62 while(b<line.length && line[b]!='\t'){b++;}
jpayne@68 63 assert(b>a) : "Missing field 3: "+new String(line);
jpayne@68 64 start=Parse.parseInt(line, a, b);
jpayne@68 65 b++;
jpayne@68 66 a=b;
jpayne@68 67
jpayne@68 68 while(b<line.length && line[b]!='\t'){b++;}
jpayne@68 69 assert(b>a) : "Missing field 4: "+new String(line);
jpayne@68 70 stop=Parse.parseInt(line, a, b);
jpayne@68 71 b++;
jpayne@68 72 a=b;
jpayne@68 73
jpayne@68 74 while(b<line.length && line[b]!='\t'){b++;}
jpayne@68 75 if(b<=a){
jpayne@68 76 //Badly formatted line; common in IMG
jpayne@68 77 return;
jpayne@68 78 }
jpayne@68 79 assert(b>a) : "Missing field 5: "+new String(line);
jpayne@68 80 if(b==a+1 && line[a]=='.'){score=-1;}
jpayne@68 81 else{score=Parse.parseFloat(line, a, b);}
jpayne@68 82 b++;
jpayne@68 83 a=b;
jpayne@68 84
jpayne@68 85 while(b<line.length && line[b]!='\t'){b++;}
jpayne@68 86 assert(b>a) : "Missing field 6: "+new String(line);
jpayne@68 87 assert(b==a+1);
jpayne@68 88 strand=find(line[a], STRANDS);
jpayne@68 89 // assert(strand>0) : line[a]+", "+Arrays.toString(STRANDS)+", "+(char)line[b];
jpayne@68 90 b++;
jpayne@68 91 a=b;
jpayne@68 92
jpayne@68 93 while(b<line.length && line[b]!='\t'){b++;}
jpayne@68 94 assert(b>a) : "Missing field 7: "+new String(line);
jpayne@68 95 assert(b==a+1);
jpayne@68 96 if(line[a]=='.'){phase=-1;}
jpayne@68 97 else{phase=Parse.parseInt(line, a, b);}
jpayne@68 98 b++;
jpayne@68 99 a=b;
jpayne@68 100
jpayne@68 101 while(b<line.length && line[b]!='\t'){b++;}
jpayne@68 102 assert(b>a) : "Missing field 8: "+new String(line);
jpayne@68 103 if(b==a+1 && line[a]=='.'){attributes=DOTS;}
jpayne@68 104 else{attributes=parseAttributes ? new String(line, a, b-a) : null;}
jpayne@68 105 b++;
jpayne@68 106 a=b;
jpayne@68 107
jpayne@68 108 // assert(strand>=0) : "\n"+this.toString()+"\n"+new String(line);
jpayne@68 109 }
jpayne@68 110
jpayne@68 111 public GffLine(VCFLine vcf){
jpayne@68 112 seqid=vcf.scaf;
jpayne@68 113 source=DOTS;
jpayne@68 114 type="sequence_variant_obs";
jpayne@68 115 start=vcf.start()+1;
jpayne@68 116 stop=vcf.stop()+1;
jpayne@68 117 score=(float)vcf.qual;
jpayne@68 118 strand=PLUS;
jpayne@68 119 phase=-1;
jpayne@68 120 final int vtype=vcf.type();
jpayne@68 121 ByteBuilder bb=new ByteBuilder(16);
jpayne@68 122 bb.append("ID=").append(Var.typeArray[vtype]).append(' ');
jpayne@68 123 if(vtype==Var.SUB){
jpayne@68 124 bb.append(vcf.ref).append('>').append(vcf.alt);
jpayne@68 125 }else if(vtype==Var.DEL){
jpayne@68 126 bb.append("length ").append(vcf.reflen()-vcf.readlen());
jpayne@68 127 }else if(vtype==Var.INS){
jpayne@68 128 int offset=vcf.reflen();
jpayne@68 129 int length=vcf.readlen()-offset;
jpayne@68 130 bb.append(vcf.alt, offset, length);
jpayne@68 131 }else if(vtype==Var.NOCALL){
jpayne@68 132 bb.append("length ").append(vcf.reflen());
jpayne@68 133 }
jpayne@68 134 attributes=bb.toString();
jpayne@68 135 bb.clear();
jpayne@68 136 }
jpayne@68 137
jpayne@68 138 public GffLine(Var v, double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){
jpayne@68 139 seqid=v.scafName();
jpayne@68 140 source=DOTS;
jpayne@68 141 type="sequence_variant_obs";
jpayne@68 142 start=v.start+1;
jpayne@68 143 stop=Tools.max(v.start+1, v.stop);
jpayne@68 144 score=(float)v.score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map);
jpayne@68 145 strand=PLUS;
jpayne@68 146 phase=-1;
jpayne@68 147 final int vtype=v.type();
jpayne@68 148 ByteBuilder bb=new ByteBuilder(16);
jpayne@68 149 bb.append("ID=").append(Var.typeArray[vtype]);
jpayne@68 150 if(vtype==Var.SUB || vtype==Var.INS){
jpayne@68 151 bb.append(' ').append(v.allele);
jpayne@68 152 }else if(vtype==Var.DEL || vtype==Var.NOCALL){
jpayne@68 153 bb.append(" length ").append(v.reflen());
jpayne@68 154 }else{assert(false) : vtype+"\n"+v;}
jpayne@68 155 attributes=bb.toString();
jpayne@68 156 bb.clear();
jpayne@68 157 }
jpayne@68 158
jpayne@68 159 public GffLine(Var v){
jpayne@68 160 seqid=v.scafName();
jpayne@68 161 source="BBTools";
jpayne@68 162 type="sequence_variant_obs";
jpayne@68 163 start=v.start+1;
jpayne@68 164 stop=Tools.max(v.start+1, v.stop);
jpayne@68 165 score=-1;
jpayne@68 166 strand=PLUS;
jpayne@68 167 phase=-1;
jpayne@68 168 final int vtype=v.type();
jpayne@68 169 ByteBuilder bb=new ByteBuilder(16);
jpayne@68 170 bb.append("ID=").append(Var.typeArray[vtype]);
jpayne@68 171 if(vtype==Var.SUB || vtype==Var.INS){
jpayne@68 172 bb.append(' ').append(v.allele);
jpayne@68 173 }else if(vtype==Var.DEL || vtype==Var.NOCALL){
jpayne@68 174 bb.append(" length ").append(v.reflen());
jpayne@68 175 }else{assert(false) : vtype+"\n"+v;}
jpayne@68 176 attributes=bb.toString();
jpayne@68 177 bb.clear();
jpayne@68 178 }
jpayne@68 179
jpayne@68 180 public static ArrayList<GffLine> loadGffFile(String fname, String types, boolean banUnprocessed){
jpayne@68 181 FileFormat ff=FileFormat.testInput(fname, FileFormat.GFF, null, false, false);
jpayne@68 182 return loadGffFile(ff, types, banUnprocessed);
jpayne@68 183 }
jpayne@68 184
jpayne@68 185 public static ArrayList<GffLine>[] loadGffFileByType(FileFormat ff, String types, boolean banUnprocessed){
jpayne@68 186 ArrayList<GffLine> list=loadGffFile(ff, types, banUnprocessed);
jpayne@68 187 String[] typeArray=types.split(",");
jpayne@68 188 ArrayList<GffLine>[] lists=new ArrayList[typeArray.length];
jpayne@68 189 for(int i=0; i<typeArray.length; i++){
jpayne@68 190 String type=typeArray[i];
jpayne@68 191 lists[i]=new ArrayList<GffLine>();
jpayne@68 192 for(GffLine gline : list){
jpayne@68 193 if(gline.type.equals(type)){
jpayne@68 194 lists[i].add(gline);
jpayne@68 195 }
jpayne@68 196 }
jpayne@68 197 }
jpayne@68 198 return lists;
jpayne@68 199 }
jpayne@68 200
jpayne@68 201 public static ArrayList<GffLine> loadGffFile(FileFormat ff, String types, boolean banUnprocessed){
jpayne@68 202 HashSet<String> set=null;
jpayne@68 203 if(types!=null){
jpayne@68 204 String[] split=types.split(",");
jpayne@68 205 set=new HashSet<String>(split.length*2);
jpayne@68 206 for(String s : split){
jpayne@68 207 set.add(s);
jpayne@68 208 }
jpayne@68 209 }
jpayne@68 210
jpayne@68 211 ArrayList<GffLine> list=new ArrayList<GffLine>();
jpayne@68 212 ByteFile bf=ByteFile.makeByteFile(ff);
jpayne@68 213 for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
jpayne@68 214 if(line[0]=='#'){
jpayne@68 215 //skip
jpayne@68 216 }else{
jpayne@68 217 GffLine gline=new GffLine(line);
jpayne@68 218 assert(gline.strand>=0) : "\n"+gline.toString()+"\n"+new String(line)+"\n";
jpayne@68 219 if(set==null || (gline.type!=null && set.contains(gline.type))){
jpayne@68 220 if(!banUnprocessed || ProkObject.processType(gline.prokType())){
jpayne@68 221 list.add(gline);
jpayne@68 222 }
jpayne@68 223 }
jpayne@68 224 }
jpayne@68 225 }
jpayne@68 226
jpayne@68 227 boolean error=bf.close();
jpayne@68 228 assert(!error) : "Problem with file "+ff.name();
jpayne@68 229 return list;
jpayne@68 230 }
jpayne@68 231
jpayne@68 232 public static void toText(ByteBuilder bb, Var v, double properPairRate, double totalQualityAvg,
jpayne@68 233 double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){
jpayne@68 234 // assert(false);
jpayne@68 235 bb.append(v.scafName(map)).append('\t');
jpayne@68 236 bb.append('.').append('\t');
jpayne@68 237 bb.append("sequence_variant_obs").append('\t');
jpayne@68 238 bb.append(v.start+1).append('\t');
jpayne@68 239 bb.append(Tools.max(v.start+1, v.stop)).append('\t');
jpayne@68 240 bb.append(v.score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map), 2).append('\t');
jpayne@68 241 bb.append('+').append('\t');
jpayne@68 242 bb.append('.').append('\t');
jpayne@68 243 // System.err.println(v.typeString()+", "+v.start+", "+v.stop);
jpayne@68 244 final int vtype=v.type();
jpayne@68 245 bb.append("ID=").append(Var.typeArray[vtype]);
jpayne@68 246 if(vtype==Var.SUB || vtype==Var.INS){
jpayne@68 247 bb.append(' ').append(v.allele);
jpayne@68 248 }else if(vtype==Var.DEL || vtype==Var.NOCALL){
jpayne@68 249 bb.append(" length ").append(v.reflen());
jpayne@68 250 }else{assert(false) : vtype+"\n"+v;}
jpayne@68 251 }
jpayne@68 252
jpayne@68 253 public static String toHeader(double properPairRate, double totalQualityAvg, double mapqAvg, double rarity, double minAlleleFraction, int ploidy,
jpayne@68 254 long reads, long pairs, long properPairs, long bases, String ref){
jpayne@68 255 StringBuilder sb=new StringBuilder();
jpayne@68 256
jpayne@68 257 final double readLengthAvg=bases/Tools.max(1.0, reads);
jpayne@68 258 sb.append("##gff-version 3\n");
jpayne@68 259 sb.append("#BBMapVersion\t"+Shared.BBMAP_VERSION_STRING+"\n");
jpayne@68 260 sb.append("#ploidy\t"+ploidy+"\n");
jpayne@68 261 sb.append(String.format(Locale.ROOT, "#rarity\t%.5f\n", rarity));
jpayne@68 262 sb.append(String.format(Locale.ROOT, "#minAlleleFraction\t%.4f\n", minAlleleFraction));
jpayne@68 263 sb.append("#reads\t"+reads+"\n");
jpayne@68 264 sb.append("#pairedReads\t"+pairs+"\n");
jpayne@68 265 sb.append("#properlyPairedReads\t"+properPairs+"\n");
jpayne@68 266 sb.append(String.format(Locale.ROOT, "#readLengthAvg\t%.2f\n", readLengthAvg));
jpayne@68 267 sb.append(String.format(Locale.ROOT, "#properPairRate\t%.4f\n", properPairRate));
jpayne@68 268 sb.append(String.format(Locale.ROOT, "#totalQualityAvg\t%.4f\n", totalQualityAvg));
jpayne@68 269 sb.append(String.format(Locale.ROOT, "#mapqAvg\t%.2f\n", mapqAvg));
jpayne@68 270 if(ref!=null){sb.append("#reference\t"+ref+"\n");}
jpayne@68 271
jpayne@68 272 sb.append("#seqid source type start end score strand phase attributes");
jpayne@68 273 return sb.toString();
jpayne@68 274 }
jpayne@68 275
jpayne@68 276 @Override
jpayne@68 277 public String toString(){
jpayne@68 278 ByteBuilder bb=new ByteBuilder();
jpayne@68 279 appendTo(bb);
jpayne@68 280 return bb.toString();
jpayne@68 281 }
jpayne@68 282
jpayne@68 283 public ByteBuilder appendTo(ByteBuilder bb){
jpayne@68 284 bb.append(seqid==null ? "." : seqid).append('\t');
jpayne@68 285 bb.append(source==null ? "." : source).append('\t');
jpayne@68 286 bb.append(type==null ? "." : type).append('\t');
jpayne@68 287 bb.append(start).append('\t');
jpayne@68 288 bb.append(stop).append('\t');
jpayne@68 289 if(score<0){bb.append('.').append('\t');}
jpayne@68 290 else{bb.append(score, 2).append('\t');}
jpayne@68 291
jpayne@68 292 bb.append((strand>=0 ? STRANDS[strand] : (byte)'.')).append('\t');
jpayne@68 293
jpayne@68 294 if(phase<0){bb.append('.').append('\t');}
jpayne@68 295 else{bb.append(phase).append('\t');}
jpayne@68 296
jpayne@68 297 bb.append(attributes==null ? "." : attributes);
jpayne@68 298 return bb;
jpayne@68 299 }
jpayne@68 300
jpayne@68 301 public int length() {
jpayne@68 302 return stop-start+1;
jpayne@68 303 }
jpayne@68 304
jpayne@68 305 private static int find(byte a, byte[] array){
jpayne@68 306 for(int i=0; i<array.length; i++){
jpayne@68 307 if(array[i]==a){return i;}
jpayne@68 308 }
jpayne@68 309 return -1;
jpayne@68 310 }
jpayne@68 311
jpayne@68 312 private static String intern(String s){
jpayne@68 313 return Data.forceIntern(s);
jpayne@68 314 }
jpayne@68 315
jpayne@68 316 @Override
jpayne@68 317 public int hashCode(){
jpayne@68 318 return trueStop()^seqid.hashCode();
jpayne@68 319 }
jpayne@68 320
jpayne@68 321 @Override
jpayne@68 322 public boolean equals(Object o){
jpayne@68 323 GffLine b=(GffLine)o;
jpayne@68 324 if(start!=b.start){return false;}
jpayne@68 325 if(stop!=b.stop){return false;}
jpayne@68 326 if(strand!=b.strand){return false;}
jpayne@68 327 if(!seqid.equals(b.seqid)){return false;}
jpayne@68 328 if(!type.equals(b.type)){return false;}
jpayne@68 329 return true;
jpayne@68 330 }
jpayne@68 331
jpayne@68 332 public int trueStart(){
jpayne@68 333 return strand==0 ? start : stop;
jpayne@68 334 }
jpayne@68 335
jpayne@68 336 public int trueStop(){
jpayne@68 337 return strand==0 ? stop : start;
jpayne@68 338 }
jpayne@68 339
jpayne@68 340 public final int prokType(){
jpayne@68 341 if(type.equals("CDS")){
jpayne@68 342 return ProkObject.CDS;
jpayne@68 343 }else if(type.equals("tRNA")){
jpayne@68 344 return ProkObject.tRNA;
jpayne@68 345 }else if(type.equals("rRNA")){
jpayne@68 346 if(attributes.contains("16S")){
jpayne@68 347 return ProkObject.r16S;
jpayne@68 348 }else if(attributes.contains("23S")){
jpayne@68 349 return ProkObject.r23S;
jpayne@68 350 }else if(attributes.contains("18S")){
jpayne@68 351 return ProkObject.r18S;
jpayne@68 352 }else if(attributes.contains("5S") && length()<300){
jpayne@68 353 return ProkObject.r5S;
jpayne@68 354 }
jpayne@68 355 }
jpayne@68 356 return -1;
jpayne@68 357 }
jpayne@68 358
jpayne@68 359 public final boolean partial(){return attributes!=null && attributes.contains("partial=true");}
jpayne@68 360
jpayne@68 361 public final boolean inbounds(int scaflen){return start>=0 && stop<scaflen;}
jpayne@68 362
jpayne@68 363 public String seqid;
jpayne@68 364 public String source;
jpayne@68 365 public String type;
jpayne@68 366 public int start;
jpayne@68 367 public int stop;
jpayne@68 368 public float score;
jpayne@68 369 public int strand;
jpayne@68 370 public int phase;
jpayne@68 371 public String attributes;
jpayne@68 372
jpayne@68 373 private static final byte[] STRANDS=new byte[] {'+', '-', '?', '.'};
jpayne@68 374 public static final int PLUS=0, MINUS=1, QMARK=2, DOT=3;
jpayne@68 375 public static final String DOTS=".";
jpayne@68 376
jpayne@68 377 public static boolean parseSeqid=true;
jpayne@68 378 public static boolean paseSource=false;
jpayne@68 379 public static boolean parseType=true;
jpayne@68 380 public static boolean parseScore=false;
jpayne@68 381 public static boolean parseAttributes=true;
jpayne@68 382
jpayne@68 383 // public static boolean parseSeqid=true;
jpayne@68 384 // public static boolean paseSource=true;
jpayne@68 385 // public static boolean parseType=true;
jpayne@68 386 // public static boolean parseScore=true;
jpayne@68 387 // public static boolean parseAttributes=true;
jpayne@68 388
jpayne@68 389 }