jpayne@69
|
1 from cpython cimport PyBytes_FromStringAndSize
|
jpayne@69
|
2
|
jpayne@69
|
3 from libc.stdio cimport printf, feof, fgets
|
jpayne@69
|
4 from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr
|
jpayne@69
|
5 from libc.stdlib cimport free, malloc, calloc, realloc
|
jpayne@69
|
6 from libc.stdlib cimport atoi, atol, atof
|
jpayne@69
|
7
|
jpayne@69
|
8 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
|
jpayne@69
|
9 from pysam.libcutils cimport encode_filename, from_string_and_size
|
jpayne@69
|
10
|
jpayne@69
|
11 import collections
|
jpayne@69
|
12 import copy
|
jpayne@69
|
13
|
jpayne@69
|
14
|
jpayne@69
|
15 cdef char *StrOrEmpty(char * buffer):
|
jpayne@69
|
16 if buffer == NULL:
|
jpayne@69
|
17 return ""
|
jpayne@69
|
18 else: return buffer
|
jpayne@69
|
19
|
jpayne@69
|
20
|
jpayne@69
|
21 cdef int isNew(char * p, char * buffer, size_t nbytes):
|
jpayne@69
|
22 """return True if `p` is located within `buffer` of size
|
jpayne@69
|
23 `nbytes`
|
jpayne@69
|
24 """
|
jpayne@69
|
25 if p == NULL:
|
jpayne@69
|
26 return 0
|
jpayne@69
|
27
|
jpayne@69
|
28 return not (buffer <= p <= buffer + nbytes)
|
jpayne@69
|
29
|
jpayne@69
|
30
|
jpayne@69
|
31 cdef class TupleProxy:
|
jpayne@69
|
32 '''Proxy class for access to parsed row as a tuple.
|
jpayne@69
|
33
|
jpayne@69
|
34 This class represents a table row for fast read-access.
|
jpayne@69
|
35
|
jpayne@69
|
36 Access to individual fields is via the [] operator.
|
jpayne@69
|
37
|
jpayne@69
|
38 Only read-only access is implemented.
|
jpayne@69
|
39
|
jpayne@69
|
40 '''
|
jpayne@69
|
41
|
jpayne@69
|
42 def __cinit__(self, encoding="ascii"):
|
jpayne@69
|
43 self.data = NULL
|
jpayne@69
|
44 self.fields = NULL
|
jpayne@69
|
45 self.nbytes = 0
|
jpayne@69
|
46 self.is_modified = 0
|
jpayne@69
|
47 self.nfields = 0
|
jpayne@69
|
48 # start counting at field offset
|
jpayne@69
|
49 self.offset = 0
|
jpayne@69
|
50 self.encoding = encoding
|
jpayne@69
|
51
|
jpayne@69
|
52 def __dealloc__(self):
|
jpayne@69
|
53 cdef int x
|
jpayne@69
|
54 if self.is_modified:
|
jpayne@69
|
55 for x from 0 <= x < self.nfields:
|
jpayne@69
|
56 if isNew(self.fields[x], self.data, self.nbytes):
|
jpayne@69
|
57 free(self.fields[x])
|
jpayne@69
|
58 self.fields[x] = NULL
|
jpayne@69
|
59
|
jpayne@69
|
60 if self.data != NULL:
|
jpayne@69
|
61 free(self.data)
|
jpayne@69
|
62 if self.fields != NULL:
|
jpayne@69
|
63 free(self.fields)
|
jpayne@69
|
64
|
jpayne@69
|
65 def __copy__(self):
|
jpayne@69
|
66 if self.is_modified:
|
jpayne@69
|
67 raise NotImplementedError(
|
jpayne@69
|
68 "copying modified tuples is not implemented")
|
jpayne@69
|
69 cdef TupleProxy n = type(self)()
|
jpayne@69
|
70 n.copy(self.data, self.nbytes, reset=True)
|
jpayne@69
|
71 return n
|
jpayne@69
|
72
|
jpayne@69
|
73 def compare(self, TupleProxy other):
|
jpayne@69
|
74 '''return -1,0,1, if contents in this are binary
|
jpayne@69
|
75 <,=,> to *other*
|
jpayne@69
|
76
|
jpayne@69
|
77 '''
|
jpayne@69
|
78 if self.is_modified or other.is_modified:
|
jpayne@69
|
79 raise NotImplementedError(
|
jpayne@69
|
80 'comparison of modified TupleProxies is not implemented')
|
jpayne@69
|
81 if self.data == other.data:
|
jpayne@69
|
82 return 0
|
jpayne@69
|
83
|
jpayne@69
|
84 if self.nbytes < other.nbytes:
|
jpayne@69
|
85 return -1
|
jpayne@69
|
86 elif self.nbytes > other.nbytes:
|
jpayne@69
|
87 return 1
|
jpayne@69
|
88 return memcmp(self.data, other.data, self.nbytes)
|
jpayne@69
|
89
|
jpayne@69
|
90 def __richcmp__(self, TupleProxy other, int op):
|
jpayne@69
|
91 if op == 2: # == operator
|
jpayne@69
|
92 return self.compare(other) == 0
|
jpayne@69
|
93 elif op == 3: # != operator
|
jpayne@69
|
94 return self.compare(other) != 0
|
jpayne@69
|
95 else:
|
jpayne@69
|
96 err_msg = "op {0} isn't implemented yet".format(op)
|
jpayne@69
|
97 raise NotImplementedError(err_msg)
|
jpayne@69
|
98
|
jpayne@69
|
99 cdef take(self, char * buffer, size_t nbytes):
|
jpayne@69
|
100 '''start presenting buffer.
|
jpayne@69
|
101
|
jpayne@69
|
102 Take ownership of the pointer.
|
jpayne@69
|
103 '''
|
jpayne@69
|
104 self.data = buffer
|
jpayne@69
|
105 self.nbytes = nbytes
|
jpayne@69
|
106 self.update(buffer, nbytes)
|
jpayne@69
|
107
|
jpayne@69
|
108 cdef present(self, char * buffer, size_t nbytes):
|
jpayne@69
|
109 '''start presenting buffer.
|
jpayne@69
|
110
|
jpayne@69
|
111 Do not take ownership of the pointer.
|
jpayne@69
|
112 '''
|
jpayne@69
|
113 self.update(buffer, nbytes)
|
jpayne@69
|
114
|
jpayne@69
|
115 cdef copy(self, char * buffer, size_t nbytes, bint reset=False):
|
jpayne@69
|
116 '''start presenting buffer of size *nbytes*.
|
jpayne@69
|
117
|
jpayne@69
|
118 Buffer is a '\0'-terminated string without the '\n'.
|
jpayne@69
|
119
|
jpayne@69
|
120 Take a copy of buffer.
|
jpayne@69
|
121 '''
|
jpayne@69
|
122 # +1 for '\0'
|
jpayne@69
|
123 cdef int s = sizeof(char) * (nbytes + 1)
|
jpayne@69
|
124 self.data = <char*>malloc(s)
|
jpayne@69
|
125 if self.data == NULL:
|
jpayne@69
|
126 raise ValueError("out of memory in TupleProxy.copy()")
|
jpayne@69
|
127 memcpy(<char*>self.data, buffer, s)
|
jpayne@69
|
128
|
jpayne@69
|
129 if reset:
|
jpayne@69
|
130 for x from 0 <= x < nbytes:
|
jpayne@69
|
131 if self.data[x] == b'\0':
|
jpayne@69
|
132 self.data[x] = b'\t'
|
jpayne@69
|
133
|
jpayne@69
|
134 self.update(self.data, nbytes)
|
jpayne@69
|
135
|
jpayne@69
|
136 cpdef int getMinFields(self):
|
jpayne@69
|
137 '''return minimum number of fields.'''
|
jpayne@69
|
138 # 1 is not a valid tabix entry, but TupleProxy
|
jpayne@69
|
139 # could be more generic.
|
jpayne@69
|
140 return 1
|
jpayne@69
|
141
|
jpayne@69
|
142 cpdef int getMaxFields(self):
|
jpayne@69
|
143 '''return maximum number of fields. Return
|
jpayne@69
|
144 0 for unknown length.'''
|
jpayne@69
|
145 return 0
|
jpayne@69
|
146
|
jpayne@69
|
147 cdef update(self, char * buffer, size_t nbytes):
|
jpayne@69
|
148 '''update internal data.
|
jpayne@69
|
149
|
jpayne@69
|
150 *buffer* is a \0 terminated string.
|
jpayne@69
|
151
|
jpayne@69
|
152 *nbytes* is the number of bytes in buffer (excluding
|
jpayne@69
|
153 the \0)
|
jpayne@69
|
154
|
jpayne@69
|
155 Update starts work in buffer, thus can be used
|
jpayne@69
|
156 to collect any number of fields until nbytes
|
jpayne@69
|
157 is exhausted.
|
jpayne@69
|
158
|
jpayne@69
|
159 If max_fields is set, the number of fields is initialized to
|
jpayne@69
|
160 max_fields.
|
jpayne@69
|
161
|
jpayne@69
|
162 '''
|
jpayne@69
|
163 cdef char * pos
|
jpayne@69
|
164 cdef char * old_pos
|
jpayne@69
|
165 cdef int field
|
jpayne@69
|
166 cdef int max_fields, min_fields, x
|
jpayne@69
|
167
|
jpayne@69
|
168 assert strlen(buffer) == nbytes, \
|
jpayne@69
|
169 "length of buffer (%i) != number of bytes (%i)" % (
|
jpayne@69
|
170 strlen(buffer), nbytes)
|
jpayne@69
|
171
|
jpayne@69
|
172 if buffer[nbytes] != 0:
|
jpayne@69
|
173 raise ValueError("incomplete line at %s" % buffer)
|
jpayne@69
|
174
|
jpayne@69
|
175 #################################
|
jpayne@69
|
176 # remove line breaks and feeds and update number of bytes
|
jpayne@69
|
177 x = nbytes - 1
|
jpayne@69
|
178 while x > 0 and (buffer[x] == b'\n' or buffer[x] == b'\r'):
|
jpayne@69
|
179 buffer[x] = b'\0'
|
jpayne@69
|
180 x -= 1
|
jpayne@69
|
181 self.nbytes = x + 1
|
jpayne@69
|
182
|
jpayne@69
|
183 #################################
|
jpayne@69
|
184 # clear data
|
jpayne@69
|
185 if self.fields != NULL:
|
jpayne@69
|
186 free(self.fields)
|
jpayne@69
|
187
|
jpayne@69
|
188 for field from 0 <= field < self.nfields:
|
jpayne@69
|
189 if isNew(self.fields[field], self.data, self.nbytes):
|
jpayne@69
|
190 free(self.fields[field])
|
jpayne@69
|
191
|
jpayne@69
|
192 self.is_modified = self.nfields = 0
|
jpayne@69
|
193
|
jpayne@69
|
194 #################################
|
jpayne@69
|
195 # allocate new
|
jpayne@69
|
196 max_fields = self.getMaxFields()
|
jpayne@69
|
197 # pre-count fields - better would be
|
jpayne@69
|
198 # to guess or dynamically grow
|
jpayne@69
|
199 if max_fields == 0:
|
jpayne@69
|
200 for x from 0 <= x < nbytes:
|
jpayne@69
|
201 if buffer[x] == b'\t':
|
jpayne@69
|
202 max_fields += 1
|
jpayne@69
|
203 max_fields += 1
|
jpayne@69
|
204
|
jpayne@69
|
205 self.fields = <char **>calloc(max_fields, sizeof(char *))
|
jpayne@69
|
206 if self.fields == NULL:
|
jpayne@69
|
207 raise ValueError("out of memory in TupleProxy.update()")
|
jpayne@69
|
208
|
jpayne@69
|
209 #################################
|
jpayne@69
|
210 # start filling
|
jpayne@69
|
211 field = 0
|
jpayne@69
|
212 self.fields[field] = pos = buffer
|
jpayne@69
|
213 field += 1
|
jpayne@69
|
214 old_pos = pos
|
jpayne@69
|
215 while 1:
|
jpayne@69
|
216
|
jpayne@69
|
217 pos = <char*>memchr(pos, b'\t', nbytes)
|
jpayne@69
|
218 if pos == NULL:
|
jpayne@69
|
219 break
|
jpayne@69
|
220 if field >= max_fields:
|
jpayne@69
|
221 raise ValueError(
|
jpayne@69
|
222 "parsing error: more than %i fields in line: %s" %
|
jpayne@69
|
223 (max_fields, buffer))
|
jpayne@69
|
224
|
jpayne@69
|
225 pos[0] = b'\0'
|
jpayne@69
|
226 pos += 1
|
jpayne@69
|
227 self.fields[field] = pos
|
jpayne@69
|
228 field += 1
|
jpayne@69
|
229 nbytes -= pos - old_pos
|
jpayne@69
|
230 if nbytes < 0:
|
jpayne@69
|
231 break
|
jpayne@69
|
232 old_pos = pos
|
jpayne@69
|
233 self.nfields = field
|
jpayne@69
|
234 if self.nfields < self.getMinFields():
|
jpayne@69
|
235 raise ValueError(
|
jpayne@69
|
236 "parsing error: fewer than %i fields in line: %s" %
|
jpayne@69
|
237 (self.getMinFields(), buffer))
|
jpayne@69
|
238
|
jpayne@69
|
239 def _getindex(self, int index):
|
jpayne@69
|
240 '''return item at idx index'''
|
jpayne@69
|
241 cdef int i = index
|
jpayne@69
|
242 if i < 0:
|
jpayne@69
|
243 i += self.nfields
|
jpayne@69
|
244 if i < 0:
|
jpayne@69
|
245 raise IndexError("list index out of range")
|
jpayne@69
|
246 # apply offset - separating a fixed number
|
jpayne@69
|
247 # of fields from a variable number such as in VCF
|
jpayne@69
|
248 i += self.offset
|
jpayne@69
|
249 if i >= self.nfields:
|
jpayne@69
|
250 raise IndexError(
|
jpayne@69
|
251 "list index out of range %i >= %i" %
|
jpayne@69
|
252 (i, self.nfields))
|
jpayne@69
|
253 return force_str(self.fields[i], self.encoding)
|
jpayne@69
|
254
|
jpayne@69
|
255 def __getitem__(self, key):
|
jpayne@69
|
256 if type(key) == int:
|
jpayne@69
|
257 return self._getindex(key)
|
jpayne@69
|
258 # slice object
|
jpayne@69
|
259 start, end, step = key.indices(self.nfields)
|
jpayne@69
|
260 result = []
|
jpayne@69
|
261 for index in range(start, end, step):
|
jpayne@69
|
262 result.append(self._getindex(index))
|
jpayne@69
|
263 return result
|
jpayne@69
|
264
|
jpayne@69
|
265 def _setindex(self, index, value):
|
jpayne@69
|
266 '''set item at idx index.'''
|
jpayne@69
|
267 cdef int idx = index
|
jpayne@69
|
268 if idx < 0:
|
jpayne@69
|
269 raise IndexError("list index out of range")
|
jpayne@69
|
270 if idx >= self.nfields:
|
jpayne@69
|
271 raise IndexError("list index out of range")
|
jpayne@69
|
272
|
jpayne@69
|
273 if isNew(self.fields[idx], self.data, self.nbytes):
|
jpayne@69
|
274 free(self.fields[idx])
|
jpayne@69
|
275
|
jpayne@69
|
276 self.is_modified = 1
|
jpayne@69
|
277
|
jpayne@69
|
278 if value is None:
|
jpayne@69
|
279 self.fields[idx] = NULL
|
jpayne@69
|
280 return
|
jpayne@69
|
281
|
jpayne@69
|
282 # conversion with error checking
|
jpayne@69
|
283 value = force_bytes(value)
|
jpayne@69
|
284 cdef char * tmp = <char*>value
|
jpayne@69
|
285 self.fields[idx] = <char*>malloc((strlen( tmp ) + 1) * sizeof(char))
|
jpayne@69
|
286 if self.fields[idx] == NULL:
|
jpayne@69
|
287 raise ValueError("out of memory" )
|
jpayne@69
|
288 strcpy(self.fields[idx], tmp)
|
jpayne@69
|
289
|
jpayne@69
|
290 def __setitem__(self, index, value):
|
jpayne@69
|
291 '''set item at *index* to *value*'''
|
jpayne@69
|
292 cdef int i = index
|
jpayne@69
|
293 if i < 0:
|
jpayne@69
|
294 i += self.nfields
|
jpayne@69
|
295 i += self.offset
|
jpayne@69
|
296
|
jpayne@69
|
297 self._setindex(i, value)
|
jpayne@69
|
298
|
jpayne@69
|
299 def __len__(self):
|
jpayne@69
|
300 return self.nfields
|
jpayne@69
|
301
|
jpayne@69
|
302 def __iter__(self):
|
jpayne@69
|
303 return TupleProxyIterator(self)
|
jpayne@69
|
304
|
jpayne@69
|
305 def __str__(self):
|
jpayne@69
|
306 '''return original data'''
|
jpayne@69
|
307 # copy and replace \0 bytes with \t characters
|
jpayne@69
|
308 cdef char * cpy
|
jpayne@69
|
309 if self.is_modified:
|
jpayne@69
|
310 # todo: treat NULL values
|
jpayne@69
|
311 result = []
|
jpayne@69
|
312 for x in xrange(0, self.nfields):
|
jpayne@69
|
313 result.append(StrOrEmpty(self.fields[x]).decode(self.encoding))
|
jpayne@69
|
314 return "\t".join(result)
|
jpayne@69
|
315 else:
|
jpayne@69
|
316 cpy = <char*>calloc(sizeof(char), self.nbytes+1)
|
jpayne@69
|
317 if cpy == NULL:
|
jpayne@69
|
318 raise ValueError("out of memory")
|
jpayne@69
|
319 memcpy(cpy, self.data, self.nbytes+1)
|
jpayne@69
|
320 for x from 0 <= x < self.nbytes:
|
jpayne@69
|
321 if cpy[x] == b'\0':
|
jpayne@69
|
322 cpy[x] = b'\t'
|
jpayne@69
|
323 result = cpy[:self.nbytes]
|
jpayne@69
|
324 free(cpy)
|
jpayne@69
|
325 r = result.decode(self.encoding)
|
jpayne@69
|
326 return r
|
jpayne@69
|
327
|
jpayne@69
|
328
|
jpayne@69
|
329 cdef class TupleProxyIterator:
|
jpayne@69
|
330 def __init__(self, proxy):
|
jpayne@69
|
331 self.proxy = proxy
|
jpayne@69
|
332 self.index = 0
|
jpayne@69
|
333
|
jpayne@69
|
334 def __iter__(self):
|
jpayne@69
|
335 return self
|
jpayne@69
|
336
|
jpayne@69
|
337 def __next__(self):
|
jpayne@69
|
338 if self.index >= self.proxy.nfields:
|
jpayne@69
|
339 raise StopIteration
|
jpayne@69
|
340 cdef char *retval = self.proxy.fields[self.index]
|
jpayne@69
|
341 self.index += 1
|
jpayne@69
|
342 return force_str(retval, self.proxy.encoding) if retval != NULL else None
|
jpayne@69
|
343
|
jpayne@69
|
344
|
jpayne@69
|
345 def toDot(v):
|
jpayne@69
|
346 '''convert value to '.' if None'''
|
jpayne@69
|
347 if v is None:
|
jpayne@69
|
348 return "."
|
jpayne@69
|
349 else:
|
jpayne@69
|
350 return str(v)
|
jpayne@69
|
351
|
jpayne@69
|
352 def quote(v):
|
jpayne@69
|
353 '''return a quoted attribute.'''
|
jpayne@69
|
354 if isinstance(v, str):
|
jpayne@69
|
355 return '"%s"' % v
|
jpayne@69
|
356 else:
|
jpayne@69
|
357 return str(v)
|
jpayne@69
|
358
|
jpayne@69
|
359
|
jpayne@69
|
360 cdef class NamedTupleProxy(TupleProxy):
|
jpayne@69
|
361
|
jpayne@69
|
362 map_key2field = {}
|
jpayne@69
|
363
|
jpayne@69
|
364 def __setattr__(self, key, value):
|
jpayne@69
|
365 '''set attribute.'''
|
jpayne@69
|
366 cdef int idx
|
jpayne@69
|
367 idx, f = self.map_key2field[key]
|
jpayne@69
|
368 if idx >= self.nfields:
|
jpayne@69
|
369 raise KeyError("field %s not set" % key)
|
jpayne@69
|
370 TupleProxy.__setitem__(self, idx, str(value))
|
jpayne@69
|
371
|
jpayne@69
|
372 def __getattr__(self, key):
|
jpayne@69
|
373 cdef int idx
|
jpayne@69
|
374 idx, f = self.map_key2field[key]
|
jpayne@69
|
375 if idx >= self.nfields:
|
jpayne@69
|
376 raise KeyError("field %s not set" % key)
|
jpayne@69
|
377 if f == str:
|
jpayne@69
|
378 return force_str(self.fields[idx],
|
jpayne@69
|
379 self.encoding)
|
jpayne@69
|
380 return f(self.fields[idx])
|
jpayne@69
|
381
|
jpayne@69
|
382
|
jpayne@69
|
383 cdef dot_or_float(v):
|
jpayne@69
|
384 if v == "" or v == b".":
|
jpayne@69
|
385 return None
|
jpayne@69
|
386 else:
|
jpayne@69
|
387 try:
|
jpayne@69
|
388 return int(v)
|
jpayne@69
|
389 except ValueError:
|
jpayne@69
|
390 return float(v)
|
jpayne@69
|
391
|
jpayne@69
|
392
|
jpayne@69
|
393 cdef dot_or_int(v):
|
jpayne@69
|
394 if v == "" or v == b".":
|
jpayne@69
|
395 return None
|
jpayne@69
|
396 else:
|
jpayne@69
|
397 return int(v)
|
jpayne@69
|
398
|
jpayne@69
|
399
|
jpayne@69
|
400 cdef dot_or_str(v):
|
jpayne@69
|
401 if v == "" or v == b".":
|
jpayne@69
|
402 return None
|
jpayne@69
|
403 else:
|
jpayne@69
|
404 return force_str(v)
|
jpayne@69
|
405
|
jpayne@69
|
406
|
jpayne@69
|
407 cdef int from1based(v):
|
jpayne@69
|
408 return atoi(v) - 1
|
jpayne@69
|
409
|
jpayne@69
|
410
|
jpayne@69
|
411 cdef str to1based(int v):
|
jpayne@69
|
412 return str(v + 1)
|
jpayne@69
|
413
|
jpayne@69
|
414
|
jpayne@69
|
415 cdef class GTFProxy(NamedTupleProxy):
|
jpayne@69
|
416 '''Proxy class for access to GTF fields.
|
jpayne@69
|
417
|
jpayne@69
|
418 This class represents a GTF entry for fast read-access.
|
jpayne@69
|
419 Write-access has been added as well, though some care must
|
jpayne@69
|
420 be taken. If any of the string fields (contig, source, ...)
|
jpayne@69
|
421 are set, the new value is tied to the lifetime of the
|
jpayne@69
|
422 argument that was supplied.
|
jpayne@69
|
423
|
jpayne@69
|
424 The only exception is the attributes field when set from
|
jpayne@69
|
425 a dictionary - this field will manage its own memory.
|
jpayne@69
|
426
|
jpayne@69
|
427 '''
|
jpayne@69
|
428 separator = "; "
|
jpayne@69
|
429
|
jpayne@69
|
430 # first value is field index, the tuple contains conversion
|
jpayne@69
|
431 # functions for getting (converting internal string representation
|
jpayne@69
|
432 # to pythonic value) and setting (converting pythonic value to
|
jpayne@69
|
433 # interval string representation)
|
jpayne@69
|
434 map_key2field = {
|
jpayne@69
|
435 'contig' : (0, (str, str)),
|
jpayne@69
|
436 'source' : (1, (dot_or_str, str)),
|
jpayne@69
|
437 'feature': (2, (dot_or_str, str)),
|
jpayne@69
|
438 'start' : (3, (from1based, to1based)),
|
jpayne@69
|
439 'end' : (4, (int, int)),
|
jpayne@69
|
440 'score' : (5, (dot_or_float, toDot)),
|
jpayne@69
|
441 'strand' : (6, (dot_or_str, str)),
|
jpayne@69
|
442 'frame' : (7, (dot_or_int, toDot)),
|
jpayne@69
|
443 'attributes': (8, (str, str))}
|
jpayne@69
|
444
|
jpayne@69
|
445 def __cinit__(self):
|
jpayne@69
|
446 # automatically calls TupleProxy.__cinit__
|
jpayne@69
|
447 self.attribute_dict = None
|
jpayne@69
|
448
|
jpayne@69
|
449 cpdef int getMinFields(self):
|
jpayne@69
|
450 '''return minimum number of fields.'''
|
jpayne@69
|
451 return 9
|
jpayne@69
|
452
|
jpayne@69
|
453 cpdef int getMaxFields(self):
|
jpayne@69
|
454 '''return max number of fields.'''
|
jpayne@69
|
455 return 9
|
jpayne@69
|
456
|
jpayne@69
|
457 def to_dict(self):
|
jpayne@69
|
458 """parse attributes - return as dict
|
jpayne@69
|
459
|
jpayne@69
|
460 The dictionary can be modified to update attributes.
|
jpayne@69
|
461 """
|
jpayne@69
|
462 if not self.attribute_dict:
|
jpayne@69
|
463 self.attribute_dict = self.attribute_string2dict(
|
jpayne@69
|
464 self.attributes)
|
jpayne@69
|
465 self.is_modified = True
|
jpayne@69
|
466 return self.attribute_dict
|
jpayne@69
|
467
|
jpayne@69
|
468 def as_dict(self):
|
jpayne@69
|
469 """deprecated: use :meth:`to_dict`
|
jpayne@69
|
470 """
|
jpayne@69
|
471 return self.to_dict()
|
jpayne@69
|
472
|
jpayne@69
|
473 def from_dict(self, d):
|
jpayne@69
|
474 '''set attributes from a dictionary.'''
|
jpayne@69
|
475 self.attribute_dict = None
|
jpayne@69
|
476 attribute_string = force_bytes(
|
jpayne@69
|
477 self.attribute_dict2string(d),
|
jpayne@69
|
478 self.encoding)
|
jpayne@69
|
479 self._setindex(8, attribute_string)
|
jpayne@69
|
480
|
jpayne@69
|
481 def __str__(self):
|
jpayne@69
|
482 cdef char * cpy
|
jpayne@69
|
483 cdef int x
|
jpayne@69
|
484
|
jpayne@69
|
485 if self.is_modified:
|
jpayne@69
|
486 return "\t".join(
|
jpayne@69
|
487 (self.contig,
|
jpayne@69
|
488 toDot(self.source),
|
jpayne@69
|
489 toDot(self.feature),
|
jpayne@69
|
490 str(self.start + 1),
|
jpayne@69
|
491 str(self.end),
|
jpayne@69
|
492 toDot(self.score),
|
jpayne@69
|
493 toDot(self.strand),
|
jpayne@69
|
494 toDot(self.frame),
|
jpayne@69
|
495 self.attributes))
|
jpayne@69
|
496 else:
|
jpayne@69
|
497 return super().__str__()
|
jpayne@69
|
498
|
jpayne@69
|
499 def invert(self, int lcontig):
|
jpayne@69
|
500 '''invert coordinates to negative strand coordinates
|
jpayne@69
|
501
|
jpayne@69
|
502 This method will only act if the feature is on the
|
jpayne@69
|
503 negative strand.'''
|
jpayne@69
|
504
|
jpayne@69
|
505 if self.strand[0] == '-':
|
jpayne@69
|
506 start = min(self.start, self.end)
|
jpayne@69
|
507 end = max(self.start, self.end)
|
jpayne@69
|
508 self.start, self.end = lcontig - end, lcontig - start
|
jpayne@69
|
509
|
jpayne@69
|
510 def keys(self):
|
jpayne@69
|
511 '''return a list of attributes defined in this entry.'''
|
jpayne@69
|
512 if not self.attribute_dict:
|
jpayne@69
|
513 self.attribute_dict = self.attribute_string2dict(
|
jpayne@69
|
514 self.attributes)
|
jpayne@69
|
515 return self.attribute_dict.keys()
|
jpayne@69
|
516
|
jpayne@69
|
517 def __getitem__(self, key):
|
jpayne@69
|
518 return self.__getattr__(key)
|
jpayne@69
|
519
|
jpayne@69
|
520 def setAttribute(self, name, value):
|
jpayne@69
|
521 '''convenience method to set an attribute.
|
jpayne@69
|
522 '''
|
jpayne@69
|
523 if not self.attribute_dict:
|
jpayne@69
|
524 self.attribute_dict = self.attribute_string2dict(
|
jpayne@69
|
525 self.attributes)
|
jpayne@69
|
526 self.attribute_dict[name] = value
|
jpayne@69
|
527 self.is_modified = True
|
jpayne@69
|
528
|
jpayne@69
|
529 def attribute_string2dict(self, s):
|
jpayne@69
|
530 return collections.OrderedDict(
|
jpayne@69
|
531 self.attribute_string2iterator(s))
|
jpayne@69
|
532
|
jpayne@69
|
533 def __cmp__(self, other):
|
jpayne@69
|
534 return (self.contig, self.strand, self.start) < \
|
jpayne@69
|
535 (other.contig, other.strand, other.start)
|
jpayne@69
|
536
|
jpayne@69
|
537 # python 3 compatibility
|
jpayne@69
|
538 def __richcmp__(GTFProxy self, GTFProxy other, int op):
|
jpayne@69
|
539 if op == 0:
|
jpayne@69
|
540 return (self.contig, self.strand, self.start) < \
|
jpayne@69
|
541 (other.contig, other.strand, other.start)
|
jpayne@69
|
542 elif op == 1:
|
jpayne@69
|
543 return (self.contig, self.strand, self.start) <= \
|
jpayne@69
|
544 (other.contig, other.strand, other.start)
|
jpayne@69
|
545 elif op == 2:
|
jpayne@69
|
546 return self.compare(other) == 0
|
jpayne@69
|
547 elif op == 3:
|
jpayne@69
|
548 return self.compare(other) != 0
|
jpayne@69
|
549 else:
|
jpayne@69
|
550 err_msg = "op {0} isn't implemented yet".format(op)
|
jpayne@69
|
551 raise NotImplementedError(err_msg)
|
jpayne@69
|
552
|
jpayne@69
|
553 def dict2attribute_string(self, d):
|
jpayne@69
|
554 """convert dictionary to attribute string in GTF format.
|
jpayne@69
|
555
|
jpayne@69
|
556 """
|
jpayne@69
|
557 aa = []
|
jpayne@69
|
558 for k, v in d.items():
|
jpayne@69
|
559 if isinstance(v, str):
|
jpayne@69
|
560 aa.append('{} "{}"'.format(k, v))
|
jpayne@69
|
561 else:
|
jpayne@69
|
562 aa.append("{} {}".format(k, str(v)))
|
jpayne@69
|
563
|
jpayne@69
|
564 return self.separator.join(aa) + ";"
|
jpayne@69
|
565
|
jpayne@69
|
566 def attribute_string2iterator(self, s):
|
jpayne@69
|
567 """convert attribute string in GTF format to records
|
jpayne@69
|
568 and iterate over key, value pairs.
|
jpayne@69
|
569 """
|
jpayne@69
|
570
|
jpayne@69
|
571 # remove comments
|
jpayne@69
|
572 attributes = force_str(s, encoding=self.encoding)
|
jpayne@69
|
573
|
jpayne@69
|
574 # separate into fields
|
jpayne@69
|
575 # Fields might contain a ";", for example in ENSEMBL GTF file
|
jpayne@69
|
576 # for mouse, v78:
|
jpayne@69
|
577 # ...; transcript_name "TXNRD2;-001"; ....
|
jpayne@69
|
578 # The current heuristic is to split on a semicolon followed by a
|
jpayne@69
|
579 # space, see also http://mblab.wustl.edu/GTF22.html
|
jpayne@69
|
580
|
jpayne@69
|
581 # Remove white space to prevent a last empty field.
|
jpayne@69
|
582 fields = [x.strip() for x in attributes.strip().split("; ")]
|
jpayne@69
|
583 for f in fields:
|
jpayne@69
|
584
|
jpayne@69
|
585 # strip semicolon (GTF files without a space after the last semicolon)
|
jpayne@69
|
586 if f.endswith(";"):
|
jpayne@69
|
587 f = f[:-1]
|
jpayne@69
|
588
|
jpayne@69
|
589 # split at most once in order to avoid separating
|
jpayne@69
|
590 # multi-word values
|
jpayne@69
|
591 d = [x.strip() for x in f.split(" ", 1)]
|
jpayne@69
|
592
|
jpayne@69
|
593 n, v = d[0], d[1]
|
jpayne@69
|
594 if len(d) > 2:
|
jpayne@69
|
595 v = d[1:]
|
jpayne@69
|
596
|
jpayne@69
|
597 if v[0] == '"' and v[-1] == '"':
|
jpayne@69
|
598 v = v[1:-1]
|
jpayne@69
|
599 else:
|
jpayne@69
|
600 ## try to convert to a value
|
jpayne@69
|
601 try:
|
jpayne@69
|
602 v = float(v)
|
jpayne@69
|
603 v = int(v)
|
jpayne@69
|
604 except ValueError:
|
jpayne@69
|
605 pass
|
jpayne@69
|
606 except TypeError:
|
jpayne@69
|
607 pass
|
jpayne@69
|
608
|
jpayne@69
|
609 yield n, v
|
jpayne@69
|
610
|
jpayne@69
|
611 def __getattr__(self, key):
|
jpayne@69
|
612 """Generic lookup of attribute from GFF/GTF attributes
|
jpayne@69
|
613 """
|
jpayne@69
|
614
|
jpayne@69
|
615 # Only called if there *isn't* an attribute with this name
|
jpayne@69
|
616 cdef int idx
|
jpayne@69
|
617 idx, f = self.map_key2field.get(key, (-1, None))
|
jpayne@69
|
618 if idx >= 0:
|
jpayne@69
|
619 # deal with known attributes (fields 0-8)
|
jpayne@69
|
620 if idx == 8:
|
jpayne@69
|
621 # flush attributes if requested
|
jpayne@69
|
622 if self.is_modified and self.attribute_dict is not None:
|
jpayne@69
|
623 s = self.dict2attribute_string(self.attribute_dict)
|
jpayne@69
|
624 TupleProxy._setindex(self, idx, s)
|
jpayne@69
|
625 self.attribute_dict = None
|
jpayne@69
|
626 return s
|
jpayne@69
|
627
|
jpayne@69
|
628 if f[0] == str:
|
jpayne@69
|
629 return force_str(self.fields[idx],
|
jpayne@69
|
630 self.encoding)
|
jpayne@69
|
631 else:
|
jpayne@69
|
632 return f[0](self.fields[idx])
|
jpayne@69
|
633 else:
|
jpayne@69
|
634 # deal with generic attributes (gene_id, ...)
|
jpayne@69
|
635 if self.attribute_dict is None:
|
jpayne@69
|
636 self.attribute_dict = self.attribute_string2dict(
|
jpayne@69
|
637 self.attributes)
|
jpayne@69
|
638 return self.attribute_dict[key]
|
jpayne@69
|
639
|
jpayne@69
|
640 def __setattr__(self, key, value):
|
jpayne@69
|
641 '''set attribute.'''
|
jpayne@69
|
642
|
jpayne@69
|
643 # Note that __setattr__ is called before properties, so __setattr__ and
|
jpayne@69
|
644 # properties don't mix well. This is different from __getattr__ which is
|
jpayne@69
|
645 # called after any properties have been resolved.
|
jpayne@69
|
646 cdef int idx
|
jpayne@69
|
647 idx, f = self.map_key2field.get(key, (-1, None))
|
jpayne@69
|
648
|
jpayne@69
|
649 if idx >= 0:
|
jpayne@69
|
650 if value is None:
|
jpayne@69
|
651 s = "."
|
jpayne@69
|
652 elif f[1] == str:
|
jpayne@69
|
653 s = force_bytes(value,
|
jpayne@69
|
654 self.encoding)
|
jpayne@69
|
655 else:
|
jpayne@69
|
656 s = str(f[1](value))
|
jpayne@69
|
657 TupleProxy._setindex(self, idx, s)
|
jpayne@69
|
658 else:
|
jpayne@69
|
659 if self.attribute_dict is None:
|
jpayne@69
|
660 self.attribute_dict = self.attribute_string2dict(
|
jpayne@69
|
661 self.attributes)
|
jpayne@69
|
662 self.attribute_dict[key] = value
|
jpayne@69
|
663 self.is_modified = True
|
jpayne@69
|
664
|
jpayne@69
|
665 # for backwards compatibility
|
jpayne@69
|
666 def asDict(self, *args, **kwargs):
|
jpayne@69
|
667 return self.to_dict(*args, **kwargs)
|
jpayne@69
|
668
|
jpayne@69
|
669 def fromDict(self, *args, **kwargs):
|
jpayne@69
|
670 return self.from_dict(*args, **kwargs)
|
jpayne@69
|
671
|
jpayne@69
|
672
|
jpayne@69
|
673 cdef class GFF3Proxy(GTFProxy):
|
jpayne@69
|
674
|
jpayne@69
|
675 def dict2attribute_string(self, d):
|
jpayne@69
|
676 """convert dictionary to attribute string."""
|
jpayne@69
|
677 return ";".join(["{}={}".format(k, v) for k, v in d.items()])
|
jpayne@69
|
678
|
jpayne@69
|
679 def attribute_string2iterator(self, s):
|
jpayne@69
|
680 """convert attribute string in GFF3 format to records
|
jpayne@69
|
681 and iterate over key, value pairs.
|
jpayne@69
|
682 """
|
jpayne@69
|
683
|
jpayne@69
|
684 for f in (x.strip() for x in s.split(";")):
|
jpayne@69
|
685 if not f:
|
jpayne@69
|
686 continue
|
jpayne@69
|
687
|
jpayne@69
|
688 key, value = f.split("=", 1)
|
jpayne@69
|
689 value = value.strip()
|
jpayne@69
|
690
|
jpayne@69
|
691 ## try to convert to a value
|
jpayne@69
|
692 try:
|
jpayne@69
|
693 value = float(value)
|
jpayne@69
|
694 value = int(value)
|
jpayne@69
|
695 except ValueError:
|
jpayne@69
|
696 pass
|
jpayne@69
|
697 except TypeError:
|
jpayne@69
|
698 pass
|
jpayne@69
|
699
|
jpayne@69
|
700 yield key.strip(), value
|
jpayne@69
|
701
|
jpayne@69
|
702
|
jpayne@69
|
703 cdef class BedProxy(NamedTupleProxy):
|
jpayne@69
|
704 '''Proxy class for access to Bed fields.
|
jpayne@69
|
705
|
jpayne@69
|
706 This class represents a BED entry for fast read-access.
|
jpayne@69
|
707 '''
|
jpayne@69
|
708 map_key2field = {
|
jpayne@69
|
709 'contig' : (0, str),
|
jpayne@69
|
710 'start' : (1, int),
|
jpayne@69
|
711 'end' : (2, int),
|
jpayne@69
|
712 'name' : (3, str),
|
jpayne@69
|
713 'score' : (4, float),
|
jpayne@69
|
714 'strand' : (5, str),
|
jpayne@69
|
715 'thickStart' : (6, int),
|
jpayne@69
|
716 'thickEnd' : (7, int),
|
jpayne@69
|
717 'itemRGB' : (8, str),
|
jpayne@69
|
718 'blockCount': (9, int),
|
jpayne@69
|
719 'blockSizes': (10, str),
|
jpayne@69
|
720 'blockStarts': (11, str), }
|
jpayne@69
|
721
|
jpayne@69
|
722 cpdef int getMinFields(self):
|
jpayne@69
|
723 '''return minimum number of fields.'''
|
jpayne@69
|
724 return 3
|
jpayne@69
|
725
|
jpayne@69
|
726 cpdef int getMaxFields(self):
|
jpayne@69
|
727 '''return max number of fields.'''
|
jpayne@69
|
728 return 12
|
jpayne@69
|
729
|
jpayne@69
|
730 cdef update(self, char * buffer, size_t nbytes):
|
jpayne@69
|
731 '''update internal data.
|
jpayne@69
|
732
|
jpayne@69
|
733 nbytes does not include the terminal '\0'.
|
jpayne@69
|
734 '''
|
jpayne@69
|
735 NamedTupleProxy.update(self, buffer, nbytes)
|
jpayne@69
|
736
|
jpayne@69
|
737 if self.nfields < 3:
|
jpayne@69
|
738 raise ValueError(
|
jpayne@69
|
739 "bed format requires at least three columns")
|
jpayne@69
|
740
|
jpayne@69
|
741 # determines bed format
|
jpayne@69
|
742 self.bedfields = self.nfields
|
jpayne@69
|
743
|
jpayne@69
|
744 # do automatic conversion
|
jpayne@69
|
745 self.contig = self.fields[0]
|
jpayne@69
|
746 self.start = atoi(self.fields[1])
|
jpayne@69
|
747 self.end = atoi(self.fields[2])
|
jpayne@69
|
748
|
jpayne@69
|
749 # __setattr__ in base class seems to take precedence
|
jpayne@69
|
750 # hence implement setters in __setattr__
|
jpayne@69
|
751 #property start:
|
jpayne@69
|
752 # def __get__( self ): return self.start
|
jpayne@69
|
753 #property end:
|
jpayne@69
|
754 # def __get__( self ): return self.end
|
jpayne@69
|
755
|
jpayne@69
|
756 def __str__(self):
|
jpayne@69
|
757 cdef int save_fields = self.nfields
|
jpayne@69
|
758 # ensure fields to use correct format
|
jpayne@69
|
759 self.nfields = self.bedfields
|
jpayne@69
|
760 retval = super().__str__()
|
jpayne@69
|
761 self.nfields = save_fields
|
jpayne@69
|
762 return retval
|
jpayne@69
|
763
|
jpayne@69
|
764 def __setattr__(self, key, value):
|
jpayne@69
|
765 '''set attribute.'''
|
jpayne@69
|
766 if key == "start":
|
jpayne@69
|
767 self.start = value
|
jpayne@69
|
768 elif key == "end":
|
jpayne@69
|
769 self.end = value
|
jpayne@69
|
770
|
jpayne@69
|
771 cdef int idx
|
jpayne@69
|
772 idx, f = self.map_key2field[key]
|
jpayne@69
|
773 TupleProxy._setindex(self, idx, str(value))
|
jpayne@69
|
774
|
jpayne@69
|
775
|
jpayne@69
|
776 cdef class VCFProxy(NamedTupleProxy):
|
jpayne@69
|
777 '''Proxy class for access to VCF fields.
|
jpayne@69
|
778
|
jpayne@69
|
779 The genotypes are accessed via a numeric index.
|
jpayne@69
|
780 Sample headers are not available.
|
jpayne@69
|
781 '''
|
jpayne@69
|
782 map_key2field = {
|
jpayne@69
|
783 'contig' : (0, str),
|
jpayne@69
|
784 'pos' : (1, int),
|
jpayne@69
|
785 'id' : (2, str),
|
jpayne@69
|
786 'ref' : (3, str),
|
jpayne@69
|
787 'alt' : (4, str),
|
jpayne@69
|
788 'qual' : (5, str),
|
jpayne@69
|
789 'filter' : (6, str),
|
jpayne@69
|
790 'info' : (7, str),
|
jpayne@69
|
791 'format' : (8, str) }
|
jpayne@69
|
792
|
jpayne@69
|
793 def __cinit__(self):
|
jpayne@69
|
794 # automatically calls TupleProxy.__cinit__
|
jpayne@69
|
795 # start indexed access at genotypes
|
jpayne@69
|
796 self.offset = 9
|
jpayne@69
|
797
|
jpayne@69
|
798 cdef update(self, char * buffer, size_t nbytes):
|
jpayne@69
|
799 '''update internal data.
|
jpayne@69
|
800
|
jpayne@69
|
801 nbytes does not include the terminal '\0'.
|
jpayne@69
|
802 '''
|
jpayne@69
|
803 NamedTupleProxy.update(self, buffer, nbytes)
|
jpayne@69
|
804
|
jpayne@69
|
805 self.contig = self.fields[0]
|
jpayne@69
|
806 # vcf counts from 1 - correct here
|
jpayne@69
|
807 self.pos = atoi(self.fields[1]) - 1
|
jpayne@69
|
808
|
jpayne@69
|
809 def __len__(self):
|
jpayne@69
|
810 '''return number of genotype fields.'''
|
jpayne@69
|
811 return max(0, self.nfields - 9)
|
jpayne@69
|
812
|
jpayne@69
|
813 property pos:
|
jpayne@69
|
814 '''feature end (in 0-based open/closed coordinates).'''
|
jpayne@69
|
815 def __get__(self):
|
jpayne@69
|
816 return self.pos
|
jpayne@69
|
817
|
jpayne@69
|
818 def __setattr__(self, key, value):
|
jpayne@69
|
819 '''set attribute.'''
|
jpayne@69
|
820 if key == "pos":
|
jpayne@69
|
821 self.pos = value
|
jpayne@69
|
822 value += 1
|
jpayne@69
|
823
|
jpayne@69
|
824 cdef int idx
|
jpayne@69
|
825 idx, f = self.map_key2field[key]
|
jpayne@69
|
826 TupleProxy._setindex(self, idx, str(value))
|
jpayne@69
|
827
|
jpayne@69
|
828
|
jpayne@69
|
829 __all__ = [
|
jpayne@69
|
830 "TupleProxy",
|
jpayne@69
|
831 "NamedTupleProxy",
|
jpayne@69
|
832 "GTFProxy",
|
jpayne@69
|
833 "GFF3Proxy",
|
jpayne@69
|
834 "BedProxy",
|
jpayne@69
|
835 "VCFProxy"]
|