jpayne@68
|
1 package gff;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.util.ArrayList;
|
jpayne@68
|
4 import java.util.Arrays;
|
jpayne@68
|
5
|
jpayne@68
|
6 import fileIO.ByteStreamWriter;
|
jpayne@68
|
7 import shared.Shared;
|
jpayne@68
|
8 import shared.Tools;
|
jpayne@68
|
9 import structures.ByteBuilder;
|
jpayne@68
|
10
|
jpayne@68
|
11 public class GbffFeature {
|
jpayne@68
|
12
|
jpayne@68
|
13 public GbffFeature(final ArrayList<byte[]> lines0, final String typeString, final String accessionString){
|
jpayne@68
|
14 accession=accessionString;
|
jpayne@68
|
15 setType(typeString);
|
jpayne@68
|
16 parseSlow(lines0);
|
jpayne@68
|
17 if(type==rRNA){
|
jpayne@68
|
18 setSubtype();
|
jpayne@68
|
19 }
|
jpayne@68
|
20 if(stop<start){error=true;}
|
jpayne@68
|
21 }
|
jpayne@68
|
22
|
jpayne@68
|
23 private void parseSlow(final ArrayList<byte[]> lines0){
|
jpayne@68
|
24 ArrayList<byte[]> lines=fixLines(lines0);
|
jpayne@68
|
25 parseStartStop(lines.get(0));
|
jpayne@68
|
26 for(int i=1; i<lines.size(); i++){
|
jpayne@68
|
27 byte[] line=lines.get(i);
|
jpayne@68
|
28 if(Tools.startsWith(line, "product=")){
|
jpayne@68
|
29 product=parseLine(line);
|
jpayne@68
|
30 }else if(Tools.startsWith(line, "locus_tag=")){
|
jpayne@68
|
31 locus_tag=parseLine(line);
|
jpayne@68
|
32 }else if(Tools.equals(line, "pseudo")){
|
jpayne@68
|
33 pseudo=true;
|
jpayne@68
|
34 }
|
jpayne@68
|
35
|
jpayne@68
|
36 // else if(Tools.startsWith(line, "ID=")){
|
jpayne@68
|
37 // id=parseLine(line);
|
jpayne@68
|
38 // }else if(Tools.startsWith(line, "Name=")){
|
jpayne@68
|
39 // name=parseLine(line);
|
jpayne@68
|
40 // }
|
jpayne@68
|
41 }
|
jpayne@68
|
42 // System.err.println("\nvvvvv");
|
jpayne@68
|
43 // for(byte[] line : lines0){
|
jpayne@68
|
44 // System.err.println("'"+new String(line)+"'");
|
jpayne@68
|
45 // }
|
jpayne@68
|
46 // for(byte[] line : lines){
|
jpayne@68
|
47 // System.err.println("'"+new String(line)+"'");
|
jpayne@68
|
48 // }
|
jpayne@68
|
49 // System.err.println("^^^^^");
|
jpayne@68
|
50 }
|
jpayne@68
|
51
|
jpayne@68
|
52 ArrayList<byte[]> fixLines(ArrayList<byte[]> lines){
|
jpayne@68
|
53 ArrayList<byte[]> fixed=new ArrayList<byte[]>();
|
jpayne@68
|
54 ByteBuilder bb=new ByteBuilder();
|
jpayne@68
|
55 for(byte[] line : lines){
|
jpayne@68
|
56 if(bb.length()>0 && line[21]=='/'){
|
jpayne@68
|
57 fixed.add(bb.toBytes());
|
jpayne@68
|
58 bb.clear();
|
jpayne@68
|
59 }
|
jpayne@68
|
60 append(bb, line);
|
jpayne@68
|
61 }
|
jpayne@68
|
62 if(bb.length()>0){
|
jpayne@68
|
63 fixed.add(bb.toBytes());
|
jpayne@68
|
64 bb.clear();
|
jpayne@68
|
65 }
|
jpayne@68
|
66 return fixed;
|
jpayne@68
|
67 }
|
jpayne@68
|
68
|
jpayne@68
|
69 void append(ByteBuilder bb, byte[] line){
|
jpayne@68
|
70 assert(line[20]==' ');
|
jpayne@68
|
71 assert(line.length>21);
|
jpayne@68
|
72 // assert(line[21]!=' ') : "'"+new String(line)+"'";
|
jpayne@68
|
73 if(line[21]=='/'){
|
jpayne@68
|
74 bb.append(line, 22, line.length-22);
|
jpayne@68
|
75 }else{
|
jpayne@68
|
76 // System.err.println(line.length+", "+21+", "+(line.length-21+1)+"\n'"+new String(line)+"'");
|
jpayne@68
|
77 if(bb.length>0){bb.append(' ');}
|
jpayne@68
|
78 bb.append(line, 21, line.length-21);
|
jpayne@68
|
79 }
|
jpayne@68
|
80 }
|
jpayne@68
|
81
|
jpayne@68
|
82 void setType(String typeString){
|
jpayne@68
|
83 int x=Tools.find(typeString, typeStrings);
|
jpayne@68
|
84 assert(x>=0) : x+", "+typeString;
|
jpayne@68
|
85 type=x;
|
jpayne@68
|
86 }
|
jpayne@68
|
87
|
jpayne@68
|
88 void parseStartStop(final byte[] line0){
|
jpayne@68
|
89 byte[] line=line0;
|
jpayne@68
|
90
|
jpayne@68
|
91 if(line[0]=='c'){
|
jpayne@68
|
92 assert(Tools.startsWith(line, "complement("));
|
jpayne@68
|
93 line=Arrays.copyOfRange(line, 11, line.length-1);
|
jpayne@68
|
94 strand=Shared.MINUS;
|
jpayne@68
|
95 }
|
jpayne@68
|
96 if(line[0]=='j'){
|
jpayne@68
|
97 assert(Tools.startsWith(line, "join("));
|
jpayne@68
|
98 line=Arrays.copyOfRange(line, 5, line.length-1);
|
jpayne@68
|
99 strand=Shared.MINUS;
|
jpayne@68
|
100 }
|
jpayne@68
|
101
|
jpayne@68
|
102 int i=0;
|
jpayne@68
|
103 for(start=0; i<line.length; i++){
|
jpayne@68
|
104 int x=line[i];
|
jpayne@68
|
105 if(x=='.'){break;}
|
jpayne@68
|
106 else if(x!='<'){
|
jpayne@68
|
107 if(Tools.isDigit(x)){
|
jpayne@68
|
108 start=start*10+(x-'0');
|
jpayne@68
|
109 }else{
|
jpayne@68
|
110 //if(!error){System.err.println(new String(line0)+"\n"+new String(line));}
|
jpayne@68
|
111 error=true;
|
jpayne@68
|
112 }
|
jpayne@68
|
113 }
|
jpayne@68
|
114 }
|
jpayne@68
|
115 // while(line[i]=='.'){i++;} //Not needed
|
jpayne@68
|
116 for(stop=0; i<line.length; i++){
|
jpayne@68
|
117 int x=line[i];
|
jpayne@68
|
118 if(x=='.' || x==','){
|
jpayne@68
|
119 stop=0;
|
jpayne@68
|
120 }else if(x==' '){
|
jpayne@68
|
121 //do nothing; line wrap
|
jpayne@68
|
122 }else if(x!='>'){
|
jpayne@68
|
123 if(Tools.isDigit(x)){
|
jpayne@68
|
124 stop=stop*10+(x-'0');
|
jpayne@68
|
125 }else{
|
jpayne@68
|
126 //if(!error){System.err.println(new String(line0)+"\n"+new String(line));}
|
jpayne@68
|
127 error=true;
|
jpayne@68
|
128 }
|
jpayne@68
|
129 }
|
jpayne@68
|
130 }
|
jpayne@68
|
131 }
|
jpayne@68
|
132
|
jpayne@68
|
133 String parseLine(byte[] line){
|
jpayne@68
|
134 String[] split=Tools.equalsPattern.split(new String(line));
|
jpayne@68
|
135 String s=split[1];
|
jpayne@68
|
136 return s.substring(1, s.length()-1);
|
jpayne@68
|
137 }
|
jpayne@68
|
138
|
jpayne@68
|
139 void setSubtype(){
|
jpayne@68
|
140 subtype=-1;
|
jpayne@68
|
141 if(product==null){return;}
|
jpayne@68
|
142 String[] split=Tools.spacePattern.split(product);
|
jpayne@68
|
143 subtype=Tools.find(split[0], typeStrings);
|
jpayne@68
|
144 // assert(false) : type+", "+subtype+", "+split[0]+", "+this.toString()+"\n"+product;
|
jpayne@68
|
145 }
|
jpayne@68
|
146
|
jpayne@68
|
147 public void toGff(ByteStreamWriter bsw) {
|
jpayne@68
|
148 ByteBuilder bb=bsw.getBuffer();
|
jpayne@68
|
149 appendGff(bb);
|
jpayne@68
|
150 bb.nl();
|
jpayne@68
|
151 bsw.flushBuffer(false);
|
jpayne@68
|
152 }
|
jpayne@68
|
153
|
jpayne@68
|
154 public ByteBuilder appendGff(ByteBuilder bb) {
|
jpayne@68
|
155 // bsw.print("#seqid source type start end score strand phase attributes\n".getBytes());
|
jpayne@68
|
156 bb.append(accession).tab();
|
jpayne@68
|
157 bb.append('.').tab();
|
jpayne@68
|
158 bb.append((pseudo && type==GENE) ? "pseudogene" : typeStringsGff[type]).tab();
|
jpayne@68
|
159 bb.append(start).tab();
|
jpayne@68
|
160 bb.append(stop).tab();
|
jpayne@68
|
161 bb.append('.').tab();
|
jpayne@68
|
162 bb.append(Shared.strandCodes2[strand]).tab();
|
jpayne@68
|
163 bb.append('.').tab();
|
jpayne@68
|
164
|
jpayne@68
|
165 boolean attributes=false;
|
jpayne@68
|
166 // if(id!=null){
|
jpayne@68
|
167 // bb.append("ID=").append(id);
|
jpayne@68
|
168 // attributes=true;
|
jpayne@68
|
169 // }
|
jpayne@68
|
170 // if(name!=null){
|
jpayne@68
|
171 // if(attributes){bb.append(';');}
|
jpayne@68
|
172 // bb.append("Name=").append(name);
|
jpayne@68
|
173 // attributes=true;
|
jpayne@68
|
174 // }
|
jpayne@68
|
175 if(product!=null){
|
jpayne@68
|
176 if(attributes){bb.append(';');}
|
jpayne@68
|
177 bb.append("product=").append(product);
|
jpayne@68
|
178 attributes=true;
|
jpayne@68
|
179 }
|
jpayne@68
|
180 if(locus_tag!=null){
|
jpayne@68
|
181 if(attributes){bb.append(';');}
|
jpayne@68
|
182 bb.append("locus_tag=").append(locus_tag);
|
jpayne@68
|
183 attributes=true;
|
jpayne@68
|
184 }
|
jpayne@68
|
185 if(subtype>-1){
|
jpayne@68
|
186 if(attributes){bb.append(';');}
|
jpayne@68
|
187 bb.append("subtype=").append(typeStringsGff[subtype]);
|
jpayne@68
|
188 attributes=true;
|
jpayne@68
|
189 }
|
jpayne@68
|
190 if(!attributes){bb.append('.');}
|
jpayne@68
|
191 return bb;
|
jpayne@68
|
192 }
|
jpayne@68
|
193
|
jpayne@68
|
194
|
jpayne@68
|
195 @Override
|
jpayne@68
|
196 public String toString(){
|
jpayne@68
|
197 return appendGff(new ByteBuilder()).toString();
|
jpayne@68
|
198 }
|
jpayne@68
|
199
|
jpayne@68
|
200 public int type=-1;
|
jpayne@68
|
201 public int subtype=-1;
|
jpayne@68
|
202 //TODO: could have coding amino, for tRNA
|
jpayne@68
|
203 public String product;
|
jpayne@68
|
204 public String locus_tag;
|
jpayne@68
|
205 // public String id;
|
jpayne@68
|
206 // public String name;
|
jpayne@68
|
207
|
jpayne@68
|
208 public int start;
|
jpayne@68
|
209 public int stop;
|
jpayne@68
|
210 public byte strand=Shared.PLUS;
|
jpayne@68
|
211 public String accession;
|
jpayne@68
|
212 public boolean pseudo=false;
|
jpayne@68
|
213 public boolean error=false;
|
jpayne@68
|
214
|
jpayne@68
|
215 public static final String[] typeStrings={"gene", "CDS", "rRNA", "tRNA", "ncRNA", "repeat_region",
|
jpayne@68
|
216 "5'UTR", "3'UTR", "intron", "exon", "5S", "16S", "23S"};
|
jpayne@68
|
217 public static final String[] typeStringsGff={"gene", "CDS", "rRNA", "tRNA", "ncRNA", "repeat_region",
|
jpayne@68
|
218 "five_prime_UTR", "three_prime_UTR", "intron", "exon", "5S", "16S", "23S"};
|
jpayne@68
|
219
|
jpayne@68
|
220 //types
|
jpayne@68
|
221 public static final int GENE=0, CDS=1, rRNA=2, tRNA=3, ncRNA=4, repeat_region=5, UTR5=6, UTR3=7, intron=8, exon=9;
|
jpayne@68
|
222 //subtypes
|
jpayne@68
|
223 public static final int r5S=10, r16S=11, r23S=12;
|
jpayne@68
|
224
|
jpayne@68
|
225 }
|