Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabixproxies.pyx @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 from cpython cimport PyBytes_FromStringAndSize | |
2 | |
3 from libc.stdio cimport printf, feof, fgets | |
4 from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr | |
5 from libc.stdlib cimport free, malloc, calloc, realloc | |
6 from libc.stdlib cimport atoi, atol, atof | |
7 | |
8 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str | |
9 from pysam.libcutils cimport encode_filename, from_string_and_size | |
10 | |
11 import collections | |
12 import copy | |
13 | |
14 | |
15 cdef char *StrOrEmpty(char * buffer): | |
16 if buffer == NULL: | |
17 return "" | |
18 else: return buffer | |
19 | |
20 | |
21 cdef int isNew(char * p, char * buffer, size_t nbytes): | |
22 """return True if `p` is located within `buffer` of size | |
23 `nbytes` | |
24 """ | |
25 if p == NULL: | |
26 return 0 | |
27 | |
28 return not (buffer <= p <= buffer + nbytes) | |
29 | |
30 | |
31 cdef class TupleProxy: | |
32 '''Proxy class for access to parsed row as a tuple. | |
33 | |
34 This class represents a table row for fast read-access. | |
35 | |
36 Access to individual fields is via the [] operator. | |
37 | |
38 Only read-only access is implemented. | |
39 | |
40 ''' | |
41 | |
42 def __cinit__(self, encoding="ascii"): | |
43 self.data = NULL | |
44 self.fields = NULL | |
45 self.nbytes = 0 | |
46 self.is_modified = 0 | |
47 self.nfields = 0 | |
48 # start counting at field offset | |
49 self.offset = 0 | |
50 self.encoding = encoding | |
51 | |
52 def __dealloc__(self): | |
53 cdef int x | |
54 if self.is_modified: | |
55 for x from 0 <= x < self.nfields: | |
56 if isNew(self.fields[x], self.data, self.nbytes): | |
57 free(self.fields[x]) | |
58 self.fields[x] = NULL | |
59 | |
60 if self.data != NULL: | |
61 free(self.data) | |
62 if self.fields != NULL: | |
63 free(self.fields) | |
64 | |
65 def __copy__(self): | |
66 if self.is_modified: | |
67 raise NotImplementedError( | |
68 "copying modified tuples is not implemented") | |
69 cdef TupleProxy n = type(self)() | |
70 n.copy(self.data, self.nbytes, reset=True) | |
71 return n | |
72 | |
73 def compare(self, TupleProxy other): | |
74 '''return -1,0,1, if contents in this are binary | |
75 <,=,> to *other* | |
76 | |
77 ''' | |
78 if self.is_modified or other.is_modified: | |
79 raise NotImplementedError( | |
80 'comparison of modified TupleProxies is not implemented') | |
81 if self.data == other.data: | |
82 return 0 | |
83 | |
84 if self.nbytes < other.nbytes: | |
85 return -1 | |
86 elif self.nbytes > other.nbytes: | |
87 return 1 | |
88 return memcmp(self.data, other.data, self.nbytes) | |
89 | |
90 def __richcmp__(self, TupleProxy other, int op): | |
91 if op == 2: # == operator | |
92 return self.compare(other) == 0 | |
93 elif op == 3: # != operator | |
94 return self.compare(other) != 0 | |
95 else: | |
96 err_msg = "op {0} isn't implemented yet".format(op) | |
97 raise NotImplementedError(err_msg) | |
98 | |
99 cdef take(self, char * buffer, size_t nbytes): | |
100 '''start presenting buffer. | |
101 | |
102 Take ownership of the pointer. | |
103 ''' | |
104 self.data = buffer | |
105 self.nbytes = nbytes | |
106 self.update(buffer, nbytes) | |
107 | |
108 cdef present(self, char * buffer, size_t nbytes): | |
109 '''start presenting buffer. | |
110 | |
111 Do not take ownership of the pointer. | |
112 ''' | |
113 self.update(buffer, nbytes) | |
114 | |
115 cdef copy(self, char * buffer, size_t nbytes, bint reset=False): | |
116 '''start presenting buffer of size *nbytes*. | |
117 | |
118 Buffer is a '\0'-terminated string without the '\n'. | |
119 | |
120 Take a copy of buffer. | |
121 ''' | |
122 # +1 for '\0' | |
123 cdef int s = sizeof(char) * (nbytes + 1) | |
124 self.data = <char*>malloc(s) | |
125 if self.data == NULL: | |
126 raise ValueError("out of memory in TupleProxy.copy()") | |
127 memcpy(<char*>self.data, buffer, s) | |
128 | |
129 if reset: | |
130 for x from 0 <= x < nbytes: | |
131 if self.data[x] == b'\0': | |
132 self.data[x] = b'\t' | |
133 | |
134 self.update(self.data, nbytes) | |
135 | |
136 cpdef int getMinFields(self): | |
137 '''return minimum number of fields.''' | |
138 # 1 is not a valid tabix entry, but TupleProxy | |
139 # could be more generic. | |
140 return 1 | |
141 | |
142 cpdef int getMaxFields(self): | |
143 '''return maximum number of fields. Return | |
144 0 for unknown length.''' | |
145 return 0 | |
146 | |
147 cdef update(self, char * buffer, size_t nbytes): | |
148 '''update internal data. | |
149 | |
150 *buffer* is a \0 terminated string. | |
151 | |
152 *nbytes* is the number of bytes in buffer (excluding | |
153 the \0) | |
154 | |
155 Update starts work in buffer, thus can be used | |
156 to collect any number of fields until nbytes | |
157 is exhausted. | |
158 | |
159 If max_fields is set, the number of fields is initialized to | |
160 max_fields. | |
161 | |
162 ''' | |
163 cdef char * pos | |
164 cdef char * old_pos | |
165 cdef int field | |
166 cdef int max_fields, min_fields, x | |
167 | |
168 assert strlen(buffer) == nbytes, \ | |
169 "length of buffer (%i) != number of bytes (%i)" % ( | |
170 strlen(buffer), nbytes) | |
171 | |
172 if buffer[nbytes] != 0: | |
173 raise ValueError("incomplete line at %s" % buffer) | |
174 | |
175 ################################# | |
176 # remove line breaks and feeds and update number of bytes | |
177 x = nbytes - 1 | |
178 while x > 0 and (buffer[x] == b'\n' or buffer[x] == b'\r'): | |
179 buffer[x] = b'\0' | |
180 x -= 1 | |
181 self.nbytes = x + 1 | |
182 | |
183 ################################# | |
184 # clear data | |
185 if self.fields != NULL: | |
186 free(self.fields) | |
187 | |
188 for field from 0 <= field < self.nfields: | |
189 if isNew(self.fields[field], self.data, self.nbytes): | |
190 free(self.fields[field]) | |
191 | |
192 self.is_modified = self.nfields = 0 | |
193 | |
194 ################################# | |
195 # allocate new | |
196 max_fields = self.getMaxFields() | |
197 # pre-count fields - better would be | |
198 # to guess or dynamically grow | |
199 if max_fields == 0: | |
200 for x from 0 <= x < nbytes: | |
201 if buffer[x] == b'\t': | |
202 max_fields += 1 | |
203 max_fields += 1 | |
204 | |
205 self.fields = <char **>calloc(max_fields, sizeof(char *)) | |
206 if self.fields == NULL: | |
207 raise ValueError("out of memory in TupleProxy.update()") | |
208 | |
209 ################################# | |
210 # start filling | |
211 field = 0 | |
212 self.fields[field] = pos = buffer | |
213 field += 1 | |
214 old_pos = pos | |
215 while 1: | |
216 | |
217 pos = <char*>memchr(pos, b'\t', nbytes) | |
218 if pos == NULL: | |
219 break | |
220 if field >= max_fields: | |
221 raise ValueError( | |
222 "parsing error: more than %i fields in line: %s" % | |
223 (max_fields, buffer)) | |
224 | |
225 pos[0] = b'\0' | |
226 pos += 1 | |
227 self.fields[field] = pos | |
228 field += 1 | |
229 nbytes -= pos - old_pos | |
230 if nbytes < 0: | |
231 break | |
232 old_pos = pos | |
233 self.nfields = field | |
234 if self.nfields < self.getMinFields(): | |
235 raise ValueError( | |
236 "parsing error: fewer than %i fields in line: %s" % | |
237 (self.getMinFields(), buffer)) | |
238 | |
239 def _getindex(self, int index): | |
240 '''return item at idx index''' | |
241 cdef int i = index | |
242 if i < 0: | |
243 i += self.nfields | |
244 if i < 0: | |
245 raise IndexError("list index out of range") | |
246 # apply offset - separating a fixed number | |
247 # of fields from a variable number such as in VCF | |
248 i += self.offset | |
249 if i >= self.nfields: | |
250 raise IndexError( | |
251 "list index out of range %i >= %i" % | |
252 (i, self.nfields)) | |
253 return force_str(self.fields[i], self.encoding) | |
254 | |
255 def __getitem__(self, key): | |
256 if type(key) == int: | |
257 return self._getindex(key) | |
258 # slice object | |
259 start, end, step = key.indices(self.nfields) | |
260 result = [] | |
261 for index in range(start, end, step): | |
262 result.append(self._getindex(index)) | |
263 return result | |
264 | |
265 def _setindex(self, index, value): | |
266 '''set item at idx index.''' | |
267 cdef int idx = index | |
268 if idx < 0: | |
269 raise IndexError("list index out of range") | |
270 if idx >= self.nfields: | |
271 raise IndexError("list index out of range") | |
272 | |
273 if isNew(self.fields[idx], self.data, self.nbytes): | |
274 free(self.fields[idx]) | |
275 | |
276 self.is_modified = 1 | |
277 | |
278 if value is None: | |
279 self.fields[idx] = NULL | |
280 return | |
281 | |
282 # conversion with error checking | |
283 value = force_bytes(value) | |
284 cdef char * tmp = <char*>value | |
285 self.fields[idx] = <char*>malloc((strlen( tmp ) + 1) * sizeof(char)) | |
286 if self.fields[idx] == NULL: | |
287 raise ValueError("out of memory" ) | |
288 strcpy(self.fields[idx], tmp) | |
289 | |
290 def __setitem__(self, index, value): | |
291 '''set item at *index* to *value*''' | |
292 cdef int i = index | |
293 if i < 0: | |
294 i += self.nfields | |
295 i += self.offset | |
296 | |
297 self._setindex(i, value) | |
298 | |
299 def __len__(self): | |
300 return self.nfields | |
301 | |
302 def __iter__(self): | |
303 return TupleProxyIterator(self) | |
304 | |
305 def __str__(self): | |
306 '''return original data''' | |
307 # copy and replace \0 bytes with \t characters | |
308 cdef char * cpy | |
309 if self.is_modified: | |
310 # todo: treat NULL values | |
311 result = [] | |
312 for x in xrange(0, self.nfields): | |
313 result.append(StrOrEmpty(self.fields[x]).decode(self.encoding)) | |
314 return "\t".join(result) | |
315 else: | |
316 cpy = <char*>calloc(sizeof(char), self.nbytes+1) | |
317 if cpy == NULL: | |
318 raise ValueError("out of memory") | |
319 memcpy(cpy, self.data, self.nbytes+1) | |
320 for x from 0 <= x < self.nbytes: | |
321 if cpy[x] == b'\0': | |
322 cpy[x] = b'\t' | |
323 result = cpy[:self.nbytes] | |
324 free(cpy) | |
325 r = result.decode(self.encoding) | |
326 return r | |
327 | |
328 | |
329 cdef class TupleProxyIterator: | |
330 def __init__(self, proxy): | |
331 self.proxy = proxy | |
332 self.index = 0 | |
333 | |
334 def __iter__(self): | |
335 return self | |
336 | |
337 def __next__(self): | |
338 if self.index >= self.proxy.nfields: | |
339 raise StopIteration | |
340 cdef char *retval = self.proxy.fields[self.index] | |
341 self.index += 1 | |
342 return force_str(retval, self.proxy.encoding) if retval != NULL else None | |
343 | |
344 | |
345 def toDot(v): | |
346 '''convert value to '.' if None''' | |
347 if v is None: | |
348 return "." | |
349 else: | |
350 return str(v) | |
351 | |
352 def quote(v): | |
353 '''return a quoted attribute.''' | |
354 if isinstance(v, str): | |
355 return '"%s"' % v | |
356 else: | |
357 return str(v) | |
358 | |
359 | |
360 cdef class NamedTupleProxy(TupleProxy): | |
361 | |
362 map_key2field = {} | |
363 | |
364 def __setattr__(self, key, value): | |
365 '''set attribute.''' | |
366 cdef int idx | |
367 idx, f = self.map_key2field[key] | |
368 if idx >= self.nfields: | |
369 raise KeyError("field %s not set" % key) | |
370 TupleProxy.__setitem__(self, idx, str(value)) | |
371 | |
372 def __getattr__(self, key): | |
373 cdef int idx | |
374 idx, f = self.map_key2field[key] | |
375 if idx >= self.nfields: | |
376 raise KeyError("field %s not set" % key) | |
377 if f == str: | |
378 return force_str(self.fields[idx], | |
379 self.encoding) | |
380 return f(self.fields[idx]) | |
381 | |
382 | |
383 cdef dot_or_float(v): | |
384 if v == "" or v == b".": | |
385 return None | |
386 else: | |
387 try: | |
388 return int(v) | |
389 except ValueError: | |
390 return float(v) | |
391 | |
392 | |
393 cdef dot_or_int(v): | |
394 if v == "" or v == b".": | |
395 return None | |
396 else: | |
397 return int(v) | |
398 | |
399 | |
400 cdef dot_or_str(v): | |
401 if v == "" or v == b".": | |
402 return None | |
403 else: | |
404 return force_str(v) | |
405 | |
406 | |
407 cdef int from1based(v): | |
408 return atoi(v) - 1 | |
409 | |
410 | |
411 cdef str to1based(int v): | |
412 return str(v + 1) | |
413 | |
414 | |
415 cdef class GTFProxy(NamedTupleProxy): | |
416 '''Proxy class for access to GTF fields. | |
417 | |
418 This class represents a GTF entry for fast read-access. | |
419 Write-access has been added as well, though some care must | |
420 be taken. If any of the string fields (contig, source, ...) | |
421 are set, the new value is tied to the lifetime of the | |
422 argument that was supplied. | |
423 | |
424 The only exception is the attributes field when set from | |
425 a dictionary - this field will manage its own memory. | |
426 | |
427 ''' | |
428 separator = "; " | |
429 | |
430 # first value is field index, the tuple contains conversion | |
431 # functions for getting (converting internal string representation | |
432 # to pythonic value) and setting (converting pythonic value to | |
433 # interval string representation) | |
434 map_key2field = { | |
435 'contig' : (0, (str, str)), | |
436 'source' : (1, (dot_or_str, str)), | |
437 'feature': (2, (dot_or_str, str)), | |
438 'start' : (3, (from1based, to1based)), | |
439 'end' : (4, (int, int)), | |
440 'score' : (5, (dot_or_float, toDot)), | |
441 'strand' : (6, (dot_or_str, str)), | |
442 'frame' : (7, (dot_or_int, toDot)), | |
443 'attributes': (8, (str, str))} | |
444 | |
445 def __cinit__(self): | |
446 # automatically calls TupleProxy.__cinit__ | |
447 self.attribute_dict = None | |
448 | |
449 cpdef int getMinFields(self): | |
450 '''return minimum number of fields.''' | |
451 return 9 | |
452 | |
453 cpdef int getMaxFields(self): | |
454 '''return max number of fields.''' | |
455 return 9 | |
456 | |
457 def to_dict(self): | |
458 """parse attributes - return as dict | |
459 | |
460 The dictionary can be modified to update attributes. | |
461 """ | |
462 if not self.attribute_dict: | |
463 self.attribute_dict = self.attribute_string2dict( | |
464 self.attributes) | |
465 self.is_modified = True | |
466 return self.attribute_dict | |
467 | |
468 def as_dict(self): | |
469 """deprecated: use :meth:`to_dict` | |
470 """ | |
471 return self.to_dict() | |
472 | |
473 def from_dict(self, d): | |
474 '''set attributes from a dictionary.''' | |
475 self.attribute_dict = None | |
476 attribute_string = force_bytes( | |
477 self.attribute_dict2string(d), | |
478 self.encoding) | |
479 self._setindex(8, attribute_string) | |
480 | |
481 def __str__(self): | |
482 cdef char * cpy | |
483 cdef int x | |
484 | |
485 if self.is_modified: | |
486 return "\t".join( | |
487 (self.contig, | |
488 toDot(self.source), | |
489 toDot(self.feature), | |
490 str(self.start + 1), | |
491 str(self.end), | |
492 toDot(self.score), | |
493 toDot(self.strand), | |
494 toDot(self.frame), | |
495 self.attributes)) | |
496 else: | |
497 return super().__str__() | |
498 | |
499 def invert(self, int lcontig): | |
500 '''invert coordinates to negative strand coordinates | |
501 | |
502 This method will only act if the feature is on the | |
503 negative strand.''' | |
504 | |
505 if self.strand[0] == '-': | |
506 start = min(self.start, self.end) | |
507 end = max(self.start, self.end) | |
508 self.start, self.end = lcontig - end, lcontig - start | |
509 | |
510 def keys(self): | |
511 '''return a list of attributes defined in this entry.''' | |
512 if not self.attribute_dict: | |
513 self.attribute_dict = self.attribute_string2dict( | |
514 self.attributes) | |
515 return self.attribute_dict.keys() | |
516 | |
517 def __getitem__(self, key): | |
518 return self.__getattr__(key) | |
519 | |
520 def setAttribute(self, name, value): | |
521 '''convenience method to set an attribute. | |
522 ''' | |
523 if not self.attribute_dict: | |
524 self.attribute_dict = self.attribute_string2dict( | |
525 self.attributes) | |
526 self.attribute_dict[name] = value | |
527 self.is_modified = True | |
528 | |
529 def attribute_string2dict(self, s): | |
530 return collections.OrderedDict( | |
531 self.attribute_string2iterator(s)) | |
532 | |
533 def __cmp__(self, other): | |
534 return (self.contig, self.strand, self.start) < \ | |
535 (other.contig, other.strand, other.start) | |
536 | |
537 # python 3 compatibility | |
538 def __richcmp__(GTFProxy self, GTFProxy other, int op): | |
539 if op == 0: | |
540 return (self.contig, self.strand, self.start) < \ | |
541 (other.contig, other.strand, other.start) | |
542 elif op == 1: | |
543 return (self.contig, self.strand, self.start) <= \ | |
544 (other.contig, other.strand, other.start) | |
545 elif op == 2: | |
546 return self.compare(other) == 0 | |
547 elif op == 3: | |
548 return self.compare(other) != 0 | |
549 else: | |
550 err_msg = "op {0} isn't implemented yet".format(op) | |
551 raise NotImplementedError(err_msg) | |
552 | |
553 def dict2attribute_string(self, d): | |
554 """convert dictionary to attribute string in GTF format. | |
555 | |
556 """ | |
557 aa = [] | |
558 for k, v in d.items(): | |
559 if isinstance(v, str): | |
560 aa.append('{} "{}"'.format(k, v)) | |
561 else: | |
562 aa.append("{} {}".format(k, str(v))) | |
563 | |
564 return self.separator.join(aa) + ";" | |
565 | |
566 def attribute_string2iterator(self, s): | |
567 """convert attribute string in GTF format to records | |
568 and iterate over key, value pairs. | |
569 """ | |
570 | |
571 # remove comments | |
572 attributes = force_str(s, encoding=self.encoding) | |
573 | |
574 # separate into fields | |
575 # Fields might contain a ";", for example in ENSEMBL GTF file | |
576 # for mouse, v78: | |
577 # ...; transcript_name "TXNRD2;-001"; .... | |
578 # The current heuristic is to split on a semicolon followed by a | |
579 # space, see also http://mblab.wustl.edu/GTF22.html | |
580 | |
581 # Remove white space to prevent a last empty field. | |
582 fields = [x.strip() for x in attributes.strip().split("; ")] | |
583 for f in fields: | |
584 | |
585 # strip semicolon (GTF files without a space after the last semicolon) | |
586 if f.endswith(";"): | |
587 f = f[:-1] | |
588 | |
589 # split at most once in order to avoid separating | |
590 # multi-word values | |
591 d = [x.strip() for x in f.split(" ", 1)] | |
592 | |
593 n, v = d[0], d[1] | |
594 if len(d) > 2: | |
595 v = d[1:] | |
596 | |
597 if v[0] == '"' and v[-1] == '"': | |
598 v = v[1:-1] | |
599 else: | |
600 ## try to convert to a value | |
601 try: | |
602 v = float(v) | |
603 v = int(v) | |
604 except ValueError: | |
605 pass | |
606 except TypeError: | |
607 pass | |
608 | |
609 yield n, v | |
610 | |
611 def __getattr__(self, key): | |
612 """Generic lookup of attribute from GFF/GTF attributes | |
613 """ | |
614 | |
615 # Only called if there *isn't* an attribute with this name | |
616 cdef int idx | |
617 idx, f = self.map_key2field.get(key, (-1, None)) | |
618 if idx >= 0: | |
619 # deal with known attributes (fields 0-8) | |
620 if idx == 8: | |
621 # flush attributes if requested | |
622 if self.is_modified and self.attribute_dict is not None: | |
623 s = self.dict2attribute_string(self.attribute_dict) | |
624 TupleProxy._setindex(self, idx, s) | |
625 self.attribute_dict = None | |
626 return s | |
627 | |
628 if f[0] == str: | |
629 return force_str(self.fields[idx], | |
630 self.encoding) | |
631 else: | |
632 return f[0](self.fields[idx]) | |
633 else: | |
634 # deal with generic attributes (gene_id, ...) | |
635 if self.attribute_dict is None: | |
636 self.attribute_dict = self.attribute_string2dict( | |
637 self.attributes) | |
638 return self.attribute_dict[key] | |
639 | |
640 def __setattr__(self, key, value): | |
641 '''set attribute.''' | |
642 | |
643 # Note that __setattr__ is called before properties, so __setattr__ and | |
644 # properties don't mix well. This is different from __getattr__ which is | |
645 # called after any properties have been resolved. | |
646 cdef int idx | |
647 idx, f = self.map_key2field.get(key, (-1, None)) | |
648 | |
649 if idx >= 0: | |
650 if value is None: | |
651 s = "." | |
652 elif f[1] == str: | |
653 s = force_bytes(value, | |
654 self.encoding) | |
655 else: | |
656 s = str(f[1](value)) | |
657 TupleProxy._setindex(self, idx, s) | |
658 else: | |
659 if self.attribute_dict is None: | |
660 self.attribute_dict = self.attribute_string2dict( | |
661 self.attributes) | |
662 self.attribute_dict[key] = value | |
663 self.is_modified = True | |
664 | |
665 # for backwards compatibility | |
666 def asDict(self, *args, **kwargs): | |
667 return self.to_dict(*args, **kwargs) | |
668 | |
669 def fromDict(self, *args, **kwargs): | |
670 return self.from_dict(*args, **kwargs) | |
671 | |
672 | |
673 cdef class GFF3Proxy(GTFProxy): | |
674 | |
675 def dict2attribute_string(self, d): | |
676 """convert dictionary to attribute string.""" | |
677 return ";".join(["{}={}".format(k, v) for k, v in d.items()]) | |
678 | |
679 def attribute_string2iterator(self, s): | |
680 """convert attribute string in GFF3 format to records | |
681 and iterate over key, value pairs. | |
682 """ | |
683 | |
684 for f in (x.strip() for x in s.split(";")): | |
685 if not f: | |
686 continue | |
687 | |
688 key, value = f.split("=", 1) | |
689 value = value.strip() | |
690 | |
691 ## try to convert to a value | |
692 try: | |
693 value = float(value) | |
694 value = int(value) | |
695 except ValueError: | |
696 pass | |
697 except TypeError: | |
698 pass | |
699 | |
700 yield key.strip(), value | |
701 | |
702 | |
703 cdef class BedProxy(NamedTupleProxy): | |
704 '''Proxy class for access to Bed fields. | |
705 | |
706 This class represents a BED entry for fast read-access. | |
707 ''' | |
708 map_key2field = { | |
709 'contig' : (0, str), | |
710 'start' : (1, int), | |
711 'end' : (2, int), | |
712 'name' : (3, str), | |
713 'score' : (4, float), | |
714 'strand' : (5, str), | |
715 'thickStart' : (6, int), | |
716 'thickEnd' : (7, int), | |
717 'itemRGB' : (8, str), | |
718 'blockCount': (9, int), | |
719 'blockSizes': (10, str), | |
720 'blockStarts': (11, str), } | |
721 | |
722 cpdef int getMinFields(self): | |
723 '''return minimum number of fields.''' | |
724 return 3 | |
725 | |
726 cpdef int getMaxFields(self): | |
727 '''return max number of fields.''' | |
728 return 12 | |
729 | |
730 cdef update(self, char * buffer, size_t nbytes): | |
731 '''update internal data. | |
732 | |
733 nbytes does not include the terminal '\0'. | |
734 ''' | |
735 NamedTupleProxy.update(self, buffer, nbytes) | |
736 | |
737 if self.nfields < 3: | |
738 raise ValueError( | |
739 "bed format requires at least three columns") | |
740 | |
741 # determines bed format | |
742 self.bedfields = self.nfields | |
743 | |
744 # do automatic conversion | |
745 self.contig = self.fields[0] | |
746 self.start = atoi(self.fields[1]) | |
747 self.end = atoi(self.fields[2]) | |
748 | |
749 # __setattr__ in base class seems to take precedence | |
750 # hence implement setters in __setattr__ | |
751 #property start: | |
752 # def __get__( self ): return self.start | |
753 #property end: | |
754 # def __get__( self ): return self.end | |
755 | |
756 def __str__(self): | |
757 cdef int save_fields = self.nfields | |
758 # ensure fields to use correct format | |
759 self.nfields = self.bedfields | |
760 retval = super().__str__() | |
761 self.nfields = save_fields | |
762 return retval | |
763 | |
764 def __setattr__(self, key, value): | |
765 '''set attribute.''' | |
766 if key == "start": | |
767 self.start = value | |
768 elif key == "end": | |
769 self.end = value | |
770 | |
771 cdef int idx | |
772 idx, f = self.map_key2field[key] | |
773 TupleProxy._setindex(self, idx, str(value)) | |
774 | |
775 | |
776 cdef class VCFProxy(NamedTupleProxy): | |
777 '''Proxy class for access to VCF fields. | |
778 | |
779 The genotypes are accessed via a numeric index. | |
780 Sample headers are not available. | |
781 ''' | |
782 map_key2field = { | |
783 'contig' : (0, str), | |
784 'pos' : (1, int), | |
785 'id' : (2, str), | |
786 'ref' : (3, str), | |
787 'alt' : (4, str), | |
788 'qual' : (5, str), | |
789 'filter' : (6, str), | |
790 'info' : (7, str), | |
791 'format' : (8, str) } | |
792 | |
793 def __cinit__(self): | |
794 # automatically calls TupleProxy.__cinit__ | |
795 # start indexed access at genotypes | |
796 self.offset = 9 | |
797 | |
798 cdef update(self, char * buffer, size_t nbytes): | |
799 '''update internal data. | |
800 | |
801 nbytes does not include the terminal '\0'. | |
802 ''' | |
803 NamedTupleProxy.update(self, buffer, nbytes) | |
804 | |
805 self.contig = self.fields[0] | |
806 # vcf counts from 1 - correct here | |
807 self.pos = atoi(self.fields[1]) - 1 | |
808 | |
809 def __len__(self): | |
810 '''return number of genotype fields.''' | |
811 return max(0, self.nfields - 9) | |
812 | |
813 property pos: | |
814 '''feature end (in 0-based open/closed coordinates).''' | |
815 def __get__(self): | |
816 return self.pos | |
817 | |
818 def __setattr__(self, key, value): | |
819 '''set attribute.''' | |
820 if key == "pos": | |
821 self.pos = value | |
822 value += 1 | |
823 | |
824 cdef int idx | |
825 idx, f = self.map_key2field[key] | |
826 TupleProxy._setindex(self, idx, str(value)) | |
827 | |
828 | |
829 __all__ = [ | |
830 "TupleProxy", | |
831 "NamedTupleProxy", | |
832 "GTFProxy", | |
833 "GFF3Proxy", | |
834 "BedProxy", | |
835 "VCFProxy"] |