comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/gff/GffLine.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package gff;
2
3 import java.util.ArrayList;
4 import java.util.HashSet;
5 import java.util.Locale;
6
7 import dna.Data;
8 import fileIO.ByteFile;
9 import fileIO.FileFormat;
10 import prok.ProkObject;
11 import shared.Parse;
12 import shared.Shared;
13 import shared.Tools;
14 import structures.ByteBuilder;
15 import var2.ScafMap;
16 import var2.VCFLine;
17 import var2.Var;
18
19 /**
20 * Used by both the var2 and prok packages for processing gff files.
21 * @author Brian Bushnell
22 * @date Sep 12, 2018
23 *
24 */
25 public class GffLine {
26
27 //#seqid source type start end score strand phase attributes
28 public GffLine(byte[] line){
29 int a=0, b=0;
30
31 while(b<line.length && line[b]!='\t'){b++;}
32 assert(b>a) : "Missing field 0: "+new String(line);
33 seqid=parseSeqid ? intern(new String(line, a, b-a)) : null;
34 // assert(seqid==null || seqid.equals(new String(line, a, b-a)));
35 // assert(seqid!=null) : new String(line, a, b-a)+", "+a+", "+b+"\n"+line;
36 b++;
37 a=b;
38
39 while(b<line.length && line[b]!='\t'){b++;}
40 assert(b>a) : "Missing field 1: "+new String(line);
41 if(b==a+1 && line[a]=='.'){source=DOTS;}
42 else{source=paseSource ? intern(new String(line, a, b-a)) : null;}
43 b++;
44 a=b;
45
46 while(b<line.length && line[b]!='\t'){b++;}
47 assert(b>a) : "Missing field 2: "+new String(line);
48 if(b==a+1 && line[a]=='.'){type=DOTS;}
49 else{
50 try {//This was to catch a probably intermittent hardware error; can't replicate.
51 type=(parseType ? intern(new String(line, a, b-a)) : null);
52 } catch (Exception e) {
53 // TODO Auto-generated catch block
54 e.printStackTrace();
55 System.err.println("\n"+new String(line)+"\n"+a+", "+b+", "+(b-a));
56 assert(false);
57 }
58 }
59 b++;
60 a=b;
61
62 while(b<line.length && line[b]!='\t'){b++;}
63 assert(b>a) : "Missing field 3: "+new String(line);
64 start=Parse.parseInt(line, a, b);
65 b++;
66 a=b;
67
68 while(b<line.length && line[b]!='\t'){b++;}
69 assert(b>a) : "Missing field 4: "+new String(line);
70 stop=Parse.parseInt(line, a, b);
71 b++;
72 a=b;
73
74 while(b<line.length && line[b]!='\t'){b++;}
75 if(b<=a){
76 //Badly formatted line; common in IMG
77 return;
78 }
79 assert(b>a) : "Missing field 5: "+new String(line);
80 if(b==a+1 && line[a]=='.'){score=-1;}
81 else{score=Parse.parseFloat(line, a, b);}
82 b++;
83 a=b;
84
85 while(b<line.length && line[b]!='\t'){b++;}
86 assert(b>a) : "Missing field 6: "+new String(line);
87 assert(b==a+1);
88 strand=find(line[a], STRANDS);
89 // assert(strand>0) : line[a]+", "+Arrays.toString(STRANDS)+", "+(char)line[b];
90 b++;
91 a=b;
92
93 while(b<line.length && line[b]!='\t'){b++;}
94 assert(b>a) : "Missing field 7: "+new String(line);
95 assert(b==a+1);
96 if(line[a]=='.'){phase=-1;}
97 else{phase=Parse.parseInt(line, a, b);}
98 b++;
99 a=b;
100
101 while(b<line.length && line[b]!='\t'){b++;}
102 assert(b>a) : "Missing field 8: "+new String(line);
103 if(b==a+1 && line[a]=='.'){attributes=DOTS;}
104 else{attributes=parseAttributes ? new String(line, a, b-a) : null;}
105 b++;
106 a=b;
107
108 // assert(strand>=0) : "\n"+this.toString()+"\n"+new String(line);
109 }
110
111 public GffLine(VCFLine vcf){
112 seqid=vcf.scaf;
113 source=DOTS;
114 type="sequence_variant_obs";
115 start=vcf.start()+1;
116 stop=vcf.stop()+1;
117 score=(float)vcf.qual;
118 strand=PLUS;
119 phase=-1;
120 final int vtype=vcf.type();
121 ByteBuilder bb=new ByteBuilder(16);
122 bb.append("ID=").append(Var.typeArray[vtype]).append(' ');
123 if(vtype==Var.SUB){
124 bb.append(vcf.ref).append('>').append(vcf.alt);
125 }else if(vtype==Var.DEL){
126 bb.append("length ").append(vcf.reflen()-vcf.readlen());
127 }else if(vtype==Var.INS){
128 int offset=vcf.reflen();
129 int length=vcf.readlen()-offset;
130 bb.append(vcf.alt, offset, length);
131 }else if(vtype==Var.NOCALL){
132 bb.append("length ").append(vcf.reflen());
133 }
134 attributes=bb.toString();
135 bb.clear();
136 }
137
138 public GffLine(Var v, double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){
139 seqid=v.scafName();
140 source=DOTS;
141 type="sequence_variant_obs";
142 start=v.start+1;
143 stop=Tools.max(v.start+1, v.stop);
144 score=(float)v.score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map);
145 strand=PLUS;
146 phase=-1;
147 final int vtype=v.type();
148 ByteBuilder bb=new ByteBuilder(16);
149 bb.append("ID=").append(Var.typeArray[vtype]);
150 if(vtype==Var.SUB || vtype==Var.INS){
151 bb.append(' ').append(v.allele);
152 }else if(vtype==Var.DEL || vtype==Var.NOCALL){
153 bb.append(" length ").append(v.reflen());
154 }else{assert(false) : vtype+"\n"+v;}
155 attributes=bb.toString();
156 bb.clear();
157 }
158
159 public GffLine(Var v){
160 seqid=v.scafName();
161 source="BBTools";
162 type="sequence_variant_obs";
163 start=v.start+1;
164 stop=Tools.max(v.start+1, v.stop);
165 score=-1;
166 strand=PLUS;
167 phase=-1;
168 final int vtype=v.type();
169 ByteBuilder bb=new ByteBuilder(16);
170 bb.append("ID=").append(Var.typeArray[vtype]);
171 if(vtype==Var.SUB || vtype==Var.INS){
172 bb.append(' ').append(v.allele);
173 }else if(vtype==Var.DEL || vtype==Var.NOCALL){
174 bb.append(" length ").append(v.reflen());
175 }else{assert(false) : vtype+"\n"+v;}
176 attributes=bb.toString();
177 bb.clear();
178 }
179
180 public static ArrayList<GffLine> loadGffFile(String fname, String types, boolean banUnprocessed){
181 FileFormat ff=FileFormat.testInput(fname, FileFormat.GFF, null, false, false);
182 return loadGffFile(ff, types, banUnprocessed);
183 }
184
185 public static ArrayList<GffLine>[] loadGffFileByType(FileFormat ff, String types, boolean banUnprocessed){
186 ArrayList<GffLine> list=loadGffFile(ff, types, banUnprocessed);
187 String[] typeArray=types.split(",");
188 ArrayList<GffLine>[] lists=new ArrayList[typeArray.length];
189 for(int i=0; i<typeArray.length; i++){
190 String type=typeArray[i];
191 lists[i]=new ArrayList<GffLine>();
192 for(GffLine gline : list){
193 if(gline.type.equals(type)){
194 lists[i].add(gline);
195 }
196 }
197 }
198 return lists;
199 }
200
201 public static ArrayList<GffLine> loadGffFile(FileFormat ff, String types, boolean banUnprocessed){
202 HashSet<String> set=null;
203 if(types!=null){
204 String[] split=types.split(",");
205 set=new HashSet<String>(split.length*2);
206 for(String s : split){
207 set.add(s);
208 }
209 }
210
211 ArrayList<GffLine> list=new ArrayList<GffLine>();
212 ByteFile bf=ByteFile.makeByteFile(ff);
213 for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
214 if(line[0]=='#'){
215 //skip
216 }else{
217 GffLine gline=new GffLine(line);
218 assert(gline.strand>=0) : "\n"+gline.toString()+"\n"+new String(line)+"\n";
219 if(set==null || (gline.type!=null && set.contains(gline.type))){
220 if(!banUnprocessed || ProkObject.processType(gline.prokType())){
221 list.add(gline);
222 }
223 }
224 }
225 }
226
227 boolean error=bf.close();
228 assert(!error) : "Problem with file "+ff.name();
229 return list;
230 }
231
232 public static void toText(ByteBuilder bb, Var v, double properPairRate, double totalQualityAvg,
233 double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){
234 // assert(false);
235 bb.append(v.scafName(map)).append('\t');
236 bb.append('.').append('\t');
237 bb.append("sequence_variant_obs").append('\t');
238 bb.append(v.start+1).append('\t');
239 bb.append(Tools.max(v.start+1, v.stop)).append('\t');
240 bb.append(v.score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map), 2).append('\t');
241 bb.append('+').append('\t');
242 bb.append('.').append('\t');
243 // System.err.println(v.typeString()+", "+v.start+", "+v.stop);
244 final int vtype=v.type();
245 bb.append("ID=").append(Var.typeArray[vtype]);
246 if(vtype==Var.SUB || vtype==Var.INS){
247 bb.append(' ').append(v.allele);
248 }else if(vtype==Var.DEL || vtype==Var.NOCALL){
249 bb.append(" length ").append(v.reflen());
250 }else{assert(false) : vtype+"\n"+v;}
251 }
252
253 public static String toHeader(double properPairRate, double totalQualityAvg, double mapqAvg, double rarity, double minAlleleFraction, int ploidy,
254 long reads, long pairs, long properPairs, long bases, String ref){
255 StringBuilder sb=new StringBuilder();
256
257 final double readLengthAvg=bases/Tools.max(1.0, reads);
258 sb.append("##gff-version 3\n");
259 sb.append("#BBMapVersion\t"+Shared.BBMAP_VERSION_STRING+"\n");
260 sb.append("#ploidy\t"+ploidy+"\n");
261 sb.append(String.format(Locale.ROOT, "#rarity\t%.5f\n", rarity));
262 sb.append(String.format(Locale.ROOT, "#minAlleleFraction\t%.4f\n", minAlleleFraction));
263 sb.append("#reads\t"+reads+"\n");
264 sb.append("#pairedReads\t"+pairs+"\n");
265 sb.append("#properlyPairedReads\t"+properPairs+"\n");
266 sb.append(String.format(Locale.ROOT, "#readLengthAvg\t%.2f\n", readLengthAvg));
267 sb.append(String.format(Locale.ROOT, "#properPairRate\t%.4f\n", properPairRate));
268 sb.append(String.format(Locale.ROOT, "#totalQualityAvg\t%.4f\n", totalQualityAvg));
269 sb.append(String.format(Locale.ROOT, "#mapqAvg\t%.2f\n", mapqAvg));
270 if(ref!=null){sb.append("#reference\t"+ref+"\n");}
271
272 sb.append("#seqid source type start end score strand phase attributes");
273 return sb.toString();
274 }
275
276 @Override
277 public String toString(){
278 ByteBuilder bb=new ByteBuilder();
279 appendTo(bb);
280 return bb.toString();
281 }
282
283 public ByteBuilder appendTo(ByteBuilder bb){
284 bb.append(seqid==null ? "." : seqid).append('\t');
285 bb.append(source==null ? "." : source).append('\t');
286 bb.append(type==null ? "." : type).append('\t');
287 bb.append(start).append('\t');
288 bb.append(stop).append('\t');
289 if(score<0){bb.append('.').append('\t');}
290 else{bb.append(score, 2).append('\t');}
291
292 bb.append((strand>=0 ? STRANDS[strand] : (byte)'.')).append('\t');
293
294 if(phase<0){bb.append('.').append('\t');}
295 else{bb.append(phase).append('\t');}
296
297 bb.append(attributes==null ? "." : attributes);
298 return bb;
299 }
300
301 public int length() {
302 return stop-start+1;
303 }
304
305 private static int find(byte a, byte[] array){
306 for(int i=0; i<array.length; i++){
307 if(array[i]==a){return i;}
308 }
309 return -1;
310 }
311
312 private static String intern(String s){
313 return Data.forceIntern(s);
314 }
315
316 @Override
317 public int hashCode(){
318 return trueStop()^seqid.hashCode();
319 }
320
321 @Override
322 public boolean equals(Object o){
323 GffLine b=(GffLine)o;
324 if(start!=b.start){return false;}
325 if(stop!=b.stop){return false;}
326 if(strand!=b.strand){return false;}
327 if(!seqid.equals(b.seqid)){return false;}
328 if(!type.equals(b.type)){return false;}
329 return true;
330 }
331
332 public int trueStart(){
333 return strand==0 ? start : stop;
334 }
335
336 public int trueStop(){
337 return strand==0 ? stop : start;
338 }
339
340 public final int prokType(){
341 if(type.equals("CDS")){
342 return ProkObject.CDS;
343 }else if(type.equals("tRNA")){
344 return ProkObject.tRNA;
345 }else if(type.equals("rRNA")){
346 if(attributes.contains("16S")){
347 return ProkObject.r16S;
348 }else if(attributes.contains("23S")){
349 return ProkObject.r23S;
350 }else if(attributes.contains("18S")){
351 return ProkObject.r18S;
352 }else if(attributes.contains("5S") && length()<300){
353 return ProkObject.r5S;
354 }
355 }
356 return -1;
357 }
358
359 public final boolean partial(){return attributes!=null && attributes.contains("partial=true");}
360
361 public final boolean inbounds(int scaflen){return start>=0 && stop<scaflen;}
362
363 public String seqid;
364 public String source;
365 public String type;
366 public int start;
367 public int stop;
368 public float score;
369 public int strand;
370 public int phase;
371 public String attributes;
372
373 private static final byte[] STRANDS=new byte[] {'+', '-', '?', '.'};
374 public static final int PLUS=0, MINUS=1, QMARK=2, DOT=3;
375 public static final String DOTS=".";
376
377 public static boolean parseSeqid=true;
378 public static boolean paseSource=false;
379 public static boolean parseType=true;
380 public static boolean parseScore=false;
381 public static boolean parseAttributes=true;
382
383 // public static boolean parseSeqid=true;
384 // public static boolean paseSource=true;
385 // public static boolean parseType=true;
386 // public static boolean parseScore=true;
387 // public static boolean parseAttributes=true;
388
389 }