Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/gff/GffLine.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package gff; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.HashSet; | |
5 import java.util.Locale; | |
6 | |
7 import dna.Data; | |
8 import fileIO.ByteFile; | |
9 import fileIO.FileFormat; | |
10 import prok.ProkObject; | |
11 import shared.Parse; | |
12 import shared.Shared; | |
13 import shared.Tools; | |
14 import structures.ByteBuilder; | |
15 import var2.ScafMap; | |
16 import var2.VCFLine; | |
17 import var2.Var; | |
18 | |
19 /** | |
20 * Used by both the var2 and prok packages for processing gff files. | |
21 * @author Brian Bushnell | |
22 * @date Sep 12, 2018 | |
23 * | |
24 */ | |
25 public class GffLine { | |
26 | |
27 //#seqid source type start end score strand phase attributes | |
28 public GffLine(byte[] line){ | |
29 int a=0, b=0; | |
30 | |
31 while(b<line.length && line[b]!='\t'){b++;} | |
32 assert(b>a) : "Missing field 0: "+new String(line); | |
33 seqid=parseSeqid ? intern(new String(line, a, b-a)) : null; | |
34 // assert(seqid==null || seqid.equals(new String(line, a, b-a))); | |
35 // assert(seqid!=null) : new String(line, a, b-a)+", "+a+", "+b+"\n"+line; | |
36 b++; | |
37 a=b; | |
38 | |
39 while(b<line.length && line[b]!='\t'){b++;} | |
40 assert(b>a) : "Missing field 1: "+new String(line); | |
41 if(b==a+1 && line[a]=='.'){source=DOTS;} | |
42 else{source=paseSource ? intern(new String(line, a, b-a)) : null;} | |
43 b++; | |
44 a=b; | |
45 | |
46 while(b<line.length && line[b]!='\t'){b++;} | |
47 assert(b>a) : "Missing field 2: "+new String(line); | |
48 if(b==a+1 && line[a]=='.'){type=DOTS;} | |
49 else{ | |
50 try {//This was to catch a probably intermittent hardware error; can't replicate. | |
51 type=(parseType ? intern(new String(line, a, b-a)) : null); | |
52 } catch (Exception e) { | |
53 // TODO Auto-generated catch block | |
54 e.printStackTrace(); | |
55 System.err.println("\n"+new String(line)+"\n"+a+", "+b+", "+(b-a)); | |
56 assert(false); | |
57 } | |
58 } | |
59 b++; | |
60 a=b; | |
61 | |
62 while(b<line.length && line[b]!='\t'){b++;} | |
63 assert(b>a) : "Missing field 3: "+new String(line); | |
64 start=Parse.parseInt(line, a, b); | |
65 b++; | |
66 a=b; | |
67 | |
68 while(b<line.length && line[b]!='\t'){b++;} | |
69 assert(b>a) : "Missing field 4: "+new String(line); | |
70 stop=Parse.parseInt(line, a, b); | |
71 b++; | |
72 a=b; | |
73 | |
74 while(b<line.length && line[b]!='\t'){b++;} | |
75 if(b<=a){ | |
76 //Badly formatted line; common in IMG | |
77 return; | |
78 } | |
79 assert(b>a) : "Missing field 5: "+new String(line); | |
80 if(b==a+1 && line[a]=='.'){score=-1;} | |
81 else{score=Parse.parseFloat(line, a, b);} | |
82 b++; | |
83 a=b; | |
84 | |
85 while(b<line.length && line[b]!='\t'){b++;} | |
86 assert(b>a) : "Missing field 6: "+new String(line); | |
87 assert(b==a+1); | |
88 strand=find(line[a], STRANDS); | |
89 // assert(strand>0) : line[a]+", "+Arrays.toString(STRANDS)+", "+(char)line[b]; | |
90 b++; | |
91 a=b; | |
92 | |
93 while(b<line.length && line[b]!='\t'){b++;} | |
94 assert(b>a) : "Missing field 7: "+new String(line); | |
95 assert(b==a+1); | |
96 if(line[a]=='.'){phase=-1;} | |
97 else{phase=Parse.parseInt(line, a, b);} | |
98 b++; | |
99 a=b; | |
100 | |
101 while(b<line.length && line[b]!='\t'){b++;} | |
102 assert(b>a) : "Missing field 8: "+new String(line); | |
103 if(b==a+1 && line[a]=='.'){attributes=DOTS;} | |
104 else{attributes=parseAttributes ? new String(line, a, b-a) : null;} | |
105 b++; | |
106 a=b; | |
107 | |
108 // assert(strand>=0) : "\n"+this.toString()+"\n"+new String(line); | |
109 } | |
110 | |
/**
 * Creates a "sequence_variant_obs" record from a VCF line.
 * start and stop are the VCF line's start() and stop() plus one, score is vcf.qual
 * cast to float, strand is PLUS, and phase is unset (-1).
 * The attributes column is "ID=" plus the variant type name and a space,
 * followed by a type-specific description.
 * @param vcf Source VCF line.
 */
public GffLine(VCFLine vcf){
	//Columns with fixed values
	source=DOTS;
	type="sequence_variant_obs";
	strand=PLUS;
	phase=-1;

	//Columns taken from the VCF line
	seqid=vcf.scaf;
	start=vcf.start()+1;
	stop=vcf.stop()+1;
	score=(float)vcf.qual;

	//Attributes column
	final int variantType=vcf.type();
	final ByteBuilder desc=new ByteBuilder(16);
	desc.append("ID=").append(Var.typeArray[variantType]).append(' ');
	if(variantType==Var.SUB){
		//Substitution: ref>alt
		desc.append(vcf.ref).append('>').append(vcf.alt);
	}else if(variantType==Var.DEL){
		//Deletion: difference between reflen and readlen
		desc.append("length ").append(vcf.reflen()-vcf.readlen());
	}else if(variantType==Var.INS){
		//Insertion: the part of alt that lies beyond the first reflen() positions
		final int skip=vcf.reflen();
		final int inserted=vcf.readlen()-skip;
		desc.append(vcf.alt, skip, inserted);
	}else if(variantType==Var.NOCALL){
		desc.append("length ").append(vcf.reflen());
	}
	attributes=desc.toString();
	desc.clear();
}
137 | |
/**
 * Creates a scored "sequence_variant_obs" record from a variant.
 * The score column is v.score(...) evaluated with the supplied statistics, cast to float.
 * The source column is set to {@link #DOTS}.
 * NOTE(review): seqid comes from v.scafName() without the map, whereas toText uses v.scafName(map) - confirm this is intended.
 * @param v Variant to describe.
 * @param properPairRate Passed through to v.score().
 * @param totalQualityAvg Passed through to v.score().
 * @param totalMapqAvg Passed through to v.score().
 * @param readLengthAvg Passed through to v.score().
 * @param rarity Passed through to v.score().
 * @param ploidy Passed through to v.score().
 * @param map Passed through to v.score().
 */
public GffLine(Var v, double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){
	//Columns with fixed values
	source=DOTS;
	type="sequence_variant_obs";
	strand=PLUS;
	phase=-1;

	//Columns taken from the variant
	seqid=v.scafName();
	start=v.start+1;
	stop=Tools.max(v.start+1, v.stop);
	score=(float)v.score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map);

	//Attributes column: ID=<type name>, then the allele or the reference length
	final int variantType=v.type();
	final boolean describedByAllele=(variantType==Var.SUB || variantType==Var.INS);
	final boolean describedByLength=(variantType==Var.DEL || variantType==Var.NOCALL);
	final ByteBuilder desc=new ByteBuilder(16);
	desc.append("ID=").append(Var.typeArray[variantType]);
	if(describedByAllele){
		desc.append(' ').append(v.allele);
	}else if(describedByLength){
		desc.append(" length ").append(v.reflen());
	}else{
		assert(false) : variantType+"\n"+v;
	}
	attributes=desc.toString();
	desc.clear();
}
158 | |
/**
 * Creates an unscored "sequence_variant_obs" record from a variant.
 * The source column is "BBTools" and the score is -1, which appendTo prints as '.'.
 * @param v Variant to describe.
 */
public GffLine(Var v){
	//Columns with fixed values
	source="BBTools";
	type="sequence_variant_obs";
	score=-1;
	strand=PLUS;
	phase=-1;

	//Columns taken from the variant
	seqid=v.scafName();
	start=v.start+1;
	stop=Tools.max(v.start+1, v.stop);

	//Attributes column: ID=<type name>, then the allele or the reference length
	final int variantType=v.type();
	final boolean describedByAllele=(variantType==Var.SUB || variantType==Var.INS);
	final boolean describedByLength=(variantType==Var.DEL || variantType==Var.NOCALL);
	final ByteBuilder desc=new ByteBuilder(16);
	desc.append("ID=").append(Var.typeArray[variantType]);
	if(describedByAllele){
		desc.append(' ').append(v.allele);
	}else if(describedByLength){
		desc.append(" length ").append(v.reflen());
	}else{
		assert(false) : variantType+"\n"+v;
	}
	attributes=desc.toString();
	desc.clear();
}
179 | |
180 public static ArrayList<GffLine> loadGffFile(String fname, String types, boolean banUnprocessed){ | |
181 FileFormat ff=FileFormat.testInput(fname, FileFormat.GFF, null, false, false); | |
182 return loadGffFile(ff, types, banUnprocessed); | |
183 } | |
184 | |
185 public static ArrayList<GffLine>[] loadGffFileByType(FileFormat ff, String types, boolean banUnprocessed){ | |
186 ArrayList<GffLine> list=loadGffFile(ff, types, banUnprocessed); | |
187 String[] typeArray=types.split(","); | |
188 ArrayList<GffLine>[] lists=new ArrayList[typeArray.length]; | |
189 for(int i=0; i<typeArray.length; i++){ | |
190 String type=typeArray[i]; | |
191 lists[i]=new ArrayList<GffLine>(); | |
192 for(GffLine gline : list){ | |
193 if(gline.type.equals(type)){ | |
194 lists[i].add(gline); | |
195 } | |
196 } | |
197 } | |
198 return lists; | |
199 } | |
200 | |
201 public static ArrayList<GffLine> loadGffFile(FileFormat ff, String types, boolean banUnprocessed){ | |
202 HashSet<String> set=null; | |
203 if(types!=null){ | |
204 String[] split=types.split(","); | |
205 set=new HashSet<String>(split.length*2); | |
206 for(String s : split){ | |
207 set.add(s); | |
208 } | |
209 } | |
210 | |
211 ArrayList<GffLine> list=new ArrayList<GffLine>(); | |
212 ByteFile bf=ByteFile.makeByteFile(ff); | |
213 for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){ | |
214 if(line[0]=='#'){ | |
215 //skip | |
216 }else{ | |
217 GffLine gline=new GffLine(line); | |
218 assert(gline.strand>=0) : "\n"+gline.toString()+"\n"+new String(line)+"\n"; | |
219 if(set==null || (gline.type!=null && set.contains(gline.type))){ | |
220 if(!banUnprocessed || ProkObject.processType(gline.prokType())){ | |
221 list.add(gline); | |
222 } | |
223 } | |
224 } | |
225 } | |
226 | |
227 boolean error=bf.close(); | |
228 assert(!error) : "Problem with file "+ff.name(); | |
229 return list; | |
230 } | |
231 | |
232 public static void toText(ByteBuilder bb, Var v, double properPairRate, double totalQualityAvg, | |
233 double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){ | |
234 // assert(false); | |
235 bb.append(v.scafName(map)).append('\t'); | |
236 bb.append('.').append('\t'); | |
237 bb.append("sequence_variant_obs").append('\t'); | |
238 bb.append(v.start+1).append('\t'); | |
239 bb.append(Tools.max(v.start+1, v.stop)).append('\t'); | |
240 bb.append(v.score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map), 2).append('\t'); | |
241 bb.append('+').append('\t'); | |
242 bb.append('.').append('\t'); | |
243 // System.err.println(v.typeString()+", "+v.start+", "+v.stop); | |
244 final int vtype=v.type(); | |
245 bb.append("ID=").append(Var.typeArray[vtype]); | |
246 if(vtype==Var.SUB || vtype==Var.INS){ | |
247 bb.append(' ').append(v.allele); | |
248 }else if(vtype==Var.DEL || vtype==Var.NOCALL){ | |
249 bb.append(" length ").append(v.reflen()); | |
250 }else{assert(false) : vtype+"\n"+v;} | |
251 } | |
252 | |
253 public static String toHeader(double properPairRate, double totalQualityAvg, double mapqAvg, double rarity, double minAlleleFraction, int ploidy, | |
254 long reads, long pairs, long properPairs, long bases, String ref){ | |
255 StringBuilder sb=new StringBuilder(); | |
256 | |
257 final double readLengthAvg=bases/Tools.max(1.0, reads); | |
258 sb.append("##gff-version 3\n"); | |
259 sb.append("#BBMapVersion\t"+Shared.BBMAP_VERSION_STRING+"\n"); | |
260 sb.append("#ploidy\t"+ploidy+"\n"); | |
261 sb.append(String.format(Locale.ROOT, "#rarity\t%.5f\n", rarity)); | |
262 sb.append(String.format(Locale.ROOT, "#minAlleleFraction\t%.4f\n", minAlleleFraction)); | |
263 sb.append("#reads\t"+reads+"\n"); | |
264 sb.append("#pairedReads\t"+pairs+"\n"); | |
265 sb.append("#properlyPairedReads\t"+properPairs+"\n"); | |
266 sb.append(String.format(Locale.ROOT, "#readLengthAvg\t%.2f\n", readLengthAvg)); | |
267 sb.append(String.format(Locale.ROOT, "#properPairRate\t%.4f\n", properPairRate)); | |
268 sb.append(String.format(Locale.ROOT, "#totalQualityAvg\t%.4f\n", totalQualityAvg)); | |
269 sb.append(String.format(Locale.ROOT, "#mapqAvg\t%.2f\n", mapqAvg)); | |
270 if(ref!=null){sb.append("#reference\t"+ref+"\n");} | |
271 | |
272 sb.append("#seqid source type start end score strand phase attributes"); | |
273 return sb.toString(); | |
274 } | |
275 | |
276 @Override | |
277 public String toString(){ | |
278 ByteBuilder bb=new ByteBuilder(); | |
279 appendTo(bb); | |
280 return bb.toString(); | |
281 } | |
282 | |
283 public ByteBuilder appendTo(ByteBuilder bb){ | |
284 bb.append(seqid==null ? "." : seqid).append('\t'); | |
285 bb.append(source==null ? "." : source).append('\t'); | |
286 bb.append(type==null ? "." : type).append('\t'); | |
287 bb.append(start).append('\t'); | |
288 bb.append(stop).append('\t'); | |
289 if(score<0){bb.append('.').append('\t');} | |
290 else{bb.append(score, 2).append('\t');} | |
291 | |
292 bb.append((strand>=0 ? STRANDS[strand] : (byte)'.')).append('\t'); | |
293 | |
294 if(phase<0){bb.append('.').append('\t');} | |
295 else{bb.append(phase).append('\t');} | |
296 | |
297 bb.append(attributes==null ? "." : attributes); | |
298 return bb; | |
299 } | |
300 | |
301 public int length() { | |
302 return stop-start+1; | |
303 } | |
304 | |
305 private static int find(byte a, byte[] array){ | |
306 for(int i=0; i<array.length; i++){ | |
307 if(array[i]==a){return i;} | |
308 } | |
309 return -1; | |
310 } | |
311 | |
312 private static String intern(String s){ | |
313 return Data.forceIntern(s); | |
314 } | |
315 | |
316 @Override | |
317 public int hashCode(){ | |
318 return trueStop()^seqid.hashCode(); | |
319 } | |
320 | |
321 @Override | |
322 public boolean equals(Object o){ | |
323 GffLine b=(GffLine)o; | |
324 if(start!=b.start){return false;} | |
325 if(stop!=b.stop){return false;} | |
326 if(strand!=b.strand){return false;} | |
327 if(!seqid.equals(b.seqid)){return false;} | |
328 if(!type.equals(b.type)){return false;} | |
329 return true; | |
330 } | |
331 | |
332 public int trueStart(){ | |
333 return strand==0 ? start : stop; | |
334 } | |
335 | |
336 public int trueStop(){ | |
337 return strand==0 ? stop : start; | |
338 } | |
339 | |
340 public final int prokType(){ | |
341 if(type.equals("CDS")){ | |
342 return ProkObject.CDS; | |
343 }else if(type.equals("tRNA")){ | |
344 return ProkObject.tRNA; | |
345 }else if(type.equals("rRNA")){ | |
346 if(attributes.contains("16S")){ | |
347 return ProkObject.r16S; | |
348 }else if(attributes.contains("23S")){ | |
349 return ProkObject.r23S; | |
350 }else if(attributes.contains("18S")){ | |
351 return ProkObject.r18S; | |
352 }else if(attributes.contains("5S") && length()<300){ | |
353 return ProkObject.r5S; | |
354 } | |
355 } | |
356 return -1; | |
357 } | |
358 | |
359 public final boolean partial(){return attributes!=null && attributes.contains("partial=true");} | |
360 | |
361 public final boolean inbounds(int scaflen){return start>=0 && stop<scaflen;} | |
362 | |
363 public String seqid; | |
364 public String source; | |
365 public String type; | |
366 public int start; | |
367 public int stop; | |
368 public float score; | |
369 public int strand; | |
370 public int phase; | |
371 public String attributes; | |
372 | |
373 private static final byte[] STRANDS=new byte[] {'+', '-', '?', '.'}; | |
374 public static final int PLUS=0, MINUS=1, QMARK=2, DOT=3; | |
375 public static final String DOTS="."; | |
376 | |
377 public static boolean parseSeqid=true; | |
378 public static boolean paseSource=false; | |
379 public static boolean parseType=true; | |
380 public static boolean parseScore=false; | |
381 public static boolean parseAttributes=true; | |
382 | |
383 // public static boolean parseSeqid=true; | |
384 // public static boolean paseSource=true; | |
385 // public static boolean parseType=true; | |
386 // public static boolean parseScore=true; | |
387 // public static boolean parseAttributes=true; | |
388 | |
389 } |