comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabixproxies.pyx @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 from cpython cimport PyBytes_FromStringAndSize
2
3 from libc.stdio cimport printf, feof, fgets
4 from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr
5 from libc.stdlib cimport free, malloc, calloc, realloc
6 from libc.stdlib cimport atoi, atol, atof
7
8 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
9 from pysam.libcutils cimport encode_filename, from_string_and_size
10
11 import collections
12 import copy
13
14
15 cdef char *StrOrEmpty(char * buffer):
16 if buffer == NULL:
17 return ""
18 else: return buffer
19
20
21 cdef int isNew(char * p, char * buffer, size_t nbytes):
22 """return True if `p` is located within `buffer` of size
23 `nbytes`
24 """
25 if p == NULL:
26 return 0
27
28 return not (buffer <= p <= buffer + nbytes)
29
30
31 cdef class TupleProxy:
32 '''Proxy class for access to parsed row as a tuple.
33
34 This class represents a table row for fast read-access.
35
36 Access to individual fields is via the [] operator.
37
38 Only read-only access is implemented.
39
40 '''
41
42 def __cinit__(self, encoding="ascii"):
43 self.data = NULL
44 self.fields = NULL
45 self.nbytes = 0
46 self.is_modified = 0
47 self.nfields = 0
48 # start counting at field offset
49 self.offset = 0
50 self.encoding = encoding
51
52 def __dealloc__(self):
53 cdef int x
54 if self.is_modified:
55 for x from 0 <= x < self.nfields:
56 if isNew(self.fields[x], self.data, self.nbytes):
57 free(self.fields[x])
58 self.fields[x] = NULL
59
60 if self.data != NULL:
61 free(self.data)
62 if self.fields != NULL:
63 free(self.fields)
64
65 def __copy__(self):
66 if self.is_modified:
67 raise NotImplementedError(
68 "copying modified tuples is not implemented")
69 cdef TupleProxy n = type(self)()
70 n.copy(self.data, self.nbytes, reset=True)
71 return n
72
73 def compare(self, TupleProxy other):
74 '''return -1,0,1, if contents in this are binary
75 <,=,> to *other*
76
77 '''
78 if self.is_modified or other.is_modified:
79 raise NotImplementedError(
80 'comparison of modified TupleProxies is not implemented')
81 if self.data == other.data:
82 return 0
83
84 if self.nbytes < other.nbytes:
85 return -1
86 elif self.nbytes > other.nbytes:
87 return 1
88 return memcmp(self.data, other.data, self.nbytes)
89
90 def __richcmp__(self, TupleProxy other, int op):
91 if op == 2: # == operator
92 return self.compare(other) == 0
93 elif op == 3: # != operator
94 return self.compare(other) != 0
95 else:
96 err_msg = "op {0} isn't implemented yet".format(op)
97 raise NotImplementedError(err_msg)
98
99 cdef take(self, char * buffer, size_t nbytes):
100 '''start presenting buffer.
101
102 Take ownership of the pointer.
103 '''
104 self.data = buffer
105 self.nbytes = nbytes
106 self.update(buffer, nbytes)
107
108 cdef present(self, char * buffer, size_t nbytes):
109 '''start presenting buffer.
110
111 Do not take ownership of the pointer.
112 '''
113 self.update(buffer, nbytes)
114
115 cdef copy(self, char * buffer, size_t nbytes, bint reset=False):
116 '''start presenting buffer of size *nbytes*.
117
118 Buffer is a '\0'-terminated string without the '\n'.
119
120 Take a copy of buffer.
121 '''
122 # +1 for '\0'
123 cdef int s = sizeof(char) * (nbytes + 1)
124 self.data = <char*>malloc(s)
125 if self.data == NULL:
126 raise ValueError("out of memory in TupleProxy.copy()")
127 memcpy(<char*>self.data, buffer, s)
128
129 if reset:
130 for x from 0 <= x < nbytes:
131 if self.data[x] == b'\0':
132 self.data[x] = b'\t'
133
134 self.update(self.data, nbytes)
135
136 cpdef int getMinFields(self):
137 '''return minimum number of fields.'''
138 # 1 is not a valid tabix entry, but TupleProxy
139 # could be more generic.
140 return 1
141
142 cpdef int getMaxFields(self):
143 '''return maximum number of fields. Return
144 0 for unknown length.'''
145 return 0
146
147 cdef update(self, char * buffer, size_t nbytes):
148 '''update internal data.
149
150 *buffer* is a \0 terminated string.
151
152 *nbytes* is the number of bytes in buffer (excluding
153 the \0)
154
155 Update starts work in buffer, thus can be used
156 to collect any number of fields until nbytes
157 is exhausted.
158
159 If max_fields is set, the number of fields is initialized to
160 max_fields.
161
162 '''
163 cdef char * pos
164 cdef char * old_pos
165 cdef int field
166 cdef int max_fields, min_fields, x
167
168 assert strlen(buffer) == nbytes, \
169 "length of buffer (%i) != number of bytes (%i)" % (
170 strlen(buffer), nbytes)
171
172 if buffer[nbytes] != 0:
173 raise ValueError("incomplete line at %s" % buffer)
174
175 #################################
176 # remove line breaks and feeds and update number of bytes
177 x = nbytes - 1
178 while x > 0 and (buffer[x] == b'\n' or buffer[x] == b'\r'):
179 buffer[x] = b'\0'
180 x -= 1
181 self.nbytes = x + 1
182
183 #################################
184 # clear data
185 if self.fields != NULL:
186 free(self.fields)
187
188 for field from 0 <= field < self.nfields:
189 if isNew(self.fields[field], self.data, self.nbytes):
190 free(self.fields[field])
191
192 self.is_modified = self.nfields = 0
193
194 #################################
195 # allocate new
196 max_fields = self.getMaxFields()
197 # pre-count fields - better would be
198 # to guess or dynamically grow
199 if max_fields == 0:
200 for x from 0 <= x < nbytes:
201 if buffer[x] == b'\t':
202 max_fields += 1
203 max_fields += 1
204
205 self.fields = <char **>calloc(max_fields, sizeof(char *))
206 if self.fields == NULL:
207 raise ValueError("out of memory in TupleProxy.update()")
208
209 #################################
210 # start filling
211 field = 0
212 self.fields[field] = pos = buffer
213 field += 1
214 old_pos = pos
215 while 1:
216
217 pos = <char*>memchr(pos, b'\t', nbytes)
218 if pos == NULL:
219 break
220 if field >= max_fields:
221 raise ValueError(
222 "parsing error: more than %i fields in line: %s" %
223 (max_fields, buffer))
224
225 pos[0] = b'\0'
226 pos += 1
227 self.fields[field] = pos
228 field += 1
229 nbytes -= pos - old_pos
230 if nbytes < 0:
231 break
232 old_pos = pos
233 self.nfields = field
234 if self.nfields < self.getMinFields():
235 raise ValueError(
236 "parsing error: fewer than %i fields in line: %s" %
237 (self.getMinFields(), buffer))
238
239 def _getindex(self, int index):
240 '''return item at idx index'''
241 cdef int i = index
242 if i < 0:
243 i += self.nfields
244 if i < 0:
245 raise IndexError("list index out of range")
246 # apply offset - separating a fixed number
247 # of fields from a variable number such as in VCF
248 i += self.offset
249 if i >= self.nfields:
250 raise IndexError(
251 "list index out of range %i >= %i" %
252 (i, self.nfields))
253 return force_str(self.fields[i], self.encoding)
254
255 def __getitem__(self, key):
256 if type(key) == int:
257 return self._getindex(key)
258 # slice object
259 start, end, step = key.indices(self.nfields)
260 result = []
261 for index in range(start, end, step):
262 result.append(self._getindex(index))
263 return result
264
265 def _setindex(self, index, value):
266 '''set item at idx index.'''
267 cdef int idx = index
268 if idx < 0:
269 raise IndexError("list index out of range")
270 if idx >= self.nfields:
271 raise IndexError("list index out of range")
272
273 if isNew(self.fields[idx], self.data, self.nbytes):
274 free(self.fields[idx])
275
276 self.is_modified = 1
277
278 if value is None:
279 self.fields[idx] = NULL
280 return
281
282 # conversion with error checking
283 value = force_bytes(value)
284 cdef char * tmp = <char*>value
285 self.fields[idx] = <char*>malloc((strlen( tmp ) + 1) * sizeof(char))
286 if self.fields[idx] == NULL:
287 raise ValueError("out of memory" )
288 strcpy(self.fields[idx], tmp)
289
290 def __setitem__(self, index, value):
291 '''set item at *index* to *value*'''
292 cdef int i = index
293 if i < 0:
294 i += self.nfields
295 i += self.offset
296
297 self._setindex(i, value)
298
299 def __len__(self):
300 return self.nfields
301
302 def __iter__(self):
303 return TupleProxyIterator(self)
304
305 def __str__(self):
306 '''return original data'''
307 # copy and replace \0 bytes with \t characters
308 cdef char * cpy
309 if self.is_modified:
310 # todo: treat NULL values
311 result = []
312 for x in xrange(0, self.nfields):
313 result.append(StrOrEmpty(self.fields[x]).decode(self.encoding))
314 return "\t".join(result)
315 else:
316 cpy = <char*>calloc(sizeof(char), self.nbytes+1)
317 if cpy == NULL:
318 raise ValueError("out of memory")
319 memcpy(cpy, self.data, self.nbytes+1)
320 for x from 0 <= x < self.nbytes:
321 if cpy[x] == b'\0':
322 cpy[x] = b'\t'
323 result = cpy[:self.nbytes]
324 free(cpy)
325 r = result.decode(self.encoding)
326 return r
327
328
329 cdef class TupleProxyIterator:
330 def __init__(self, proxy):
331 self.proxy = proxy
332 self.index = 0
333
334 def __iter__(self):
335 return self
336
337 def __next__(self):
338 if self.index >= self.proxy.nfields:
339 raise StopIteration
340 cdef char *retval = self.proxy.fields[self.index]
341 self.index += 1
342 return force_str(retval, self.proxy.encoding) if retval != NULL else None
343
344
345 def toDot(v):
346 '''convert value to '.' if None'''
347 if v is None:
348 return "."
349 else:
350 return str(v)
351
352 def quote(v):
353 '''return a quoted attribute.'''
354 if isinstance(v, str):
355 return '"%s"' % v
356 else:
357 return str(v)
358
359
360 cdef class NamedTupleProxy(TupleProxy):
361
362 map_key2field = {}
363
364 def __setattr__(self, key, value):
365 '''set attribute.'''
366 cdef int idx
367 idx, f = self.map_key2field[key]
368 if idx >= self.nfields:
369 raise KeyError("field %s not set" % key)
370 TupleProxy.__setitem__(self, idx, str(value))
371
372 def __getattr__(self, key):
373 cdef int idx
374 idx, f = self.map_key2field[key]
375 if idx >= self.nfields:
376 raise KeyError("field %s not set" % key)
377 if f == str:
378 return force_str(self.fields[idx],
379 self.encoding)
380 return f(self.fields[idx])
381
382
383 cdef dot_or_float(v):
384 if v == "" or v == b".":
385 return None
386 else:
387 try:
388 return int(v)
389 except ValueError:
390 return float(v)
391
392
393 cdef dot_or_int(v):
394 if v == "" or v == b".":
395 return None
396 else:
397 return int(v)
398
399
400 cdef dot_or_str(v):
401 if v == "" or v == b".":
402 return None
403 else:
404 return force_str(v)
405
406
407 cdef int from1based(v):
408 return atoi(v) - 1
409
410
411 cdef str to1based(int v):
412 return str(v + 1)
413
414
415 cdef class GTFProxy(NamedTupleProxy):
416 '''Proxy class for access to GTF fields.
417
418 This class represents a GTF entry for fast read-access.
419 Write-access has been added as well, though some care must
420 be taken. If any of the string fields (contig, source, ...)
421 are set, the new value is tied to the lifetime of the
422 argument that was supplied.
423
424 The only exception is the attributes field when set from
425 a dictionary - this field will manage its own memory.
426
427 '''
428 separator = "; "
429
430 # first value is field index, the tuple contains conversion
431 # functions for getting (converting internal string representation
432 # to pythonic value) and setting (converting pythonic value to
433 # interval string representation)
434 map_key2field = {
435 'contig' : (0, (str, str)),
436 'source' : (1, (dot_or_str, str)),
437 'feature': (2, (dot_or_str, str)),
438 'start' : (3, (from1based, to1based)),
439 'end' : (4, (int, int)),
440 'score' : (5, (dot_or_float, toDot)),
441 'strand' : (6, (dot_or_str, str)),
442 'frame' : (7, (dot_or_int, toDot)),
443 'attributes': (8, (str, str))}
444
445 def __cinit__(self):
446 # automatically calls TupleProxy.__cinit__
447 self.attribute_dict = None
448
449 cpdef int getMinFields(self):
450 '''return minimum number of fields.'''
451 return 9
452
453 cpdef int getMaxFields(self):
454 '''return max number of fields.'''
455 return 9
456
457 def to_dict(self):
458 """parse attributes - return as dict
459
460 The dictionary can be modified to update attributes.
461 """
462 if not self.attribute_dict:
463 self.attribute_dict = self.attribute_string2dict(
464 self.attributes)
465 self.is_modified = True
466 return self.attribute_dict
467
468 def as_dict(self):
469 """deprecated: use :meth:`to_dict`
470 """
471 return self.to_dict()
472
473 def from_dict(self, d):
474 '''set attributes from a dictionary.'''
475 self.attribute_dict = None
476 attribute_string = force_bytes(
477 self.attribute_dict2string(d),
478 self.encoding)
479 self._setindex(8, attribute_string)
480
481 def __str__(self):
482 cdef char * cpy
483 cdef int x
484
485 if self.is_modified:
486 return "\t".join(
487 (self.contig,
488 toDot(self.source),
489 toDot(self.feature),
490 str(self.start + 1),
491 str(self.end),
492 toDot(self.score),
493 toDot(self.strand),
494 toDot(self.frame),
495 self.attributes))
496 else:
497 return super().__str__()
498
499 def invert(self, int lcontig):
500 '''invert coordinates to negative strand coordinates
501
502 This method will only act if the feature is on the
503 negative strand.'''
504
505 if self.strand[0] == '-':
506 start = min(self.start, self.end)
507 end = max(self.start, self.end)
508 self.start, self.end = lcontig - end, lcontig - start
509
510 def keys(self):
511 '''return a list of attributes defined in this entry.'''
512 if not self.attribute_dict:
513 self.attribute_dict = self.attribute_string2dict(
514 self.attributes)
515 return self.attribute_dict.keys()
516
517 def __getitem__(self, key):
518 return self.__getattr__(key)
519
520 def setAttribute(self, name, value):
521 '''convenience method to set an attribute.
522 '''
523 if not self.attribute_dict:
524 self.attribute_dict = self.attribute_string2dict(
525 self.attributes)
526 self.attribute_dict[name] = value
527 self.is_modified = True
528
529 def attribute_string2dict(self, s):
530 return collections.OrderedDict(
531 self.attribute_string2iterator(s))
532
533 def __cmp__(self, other):
534 return (self.contig, self.strand, self.start) < \
535 (other.contig, other.strand, other.start)
536
537 # python 3 compatibility
538 def __richcmp__(GTFProxy self, GTFProxy other, int op):
539 if op == 0:
540 return (self.contig, self.strand, self.start) < \
541 (other.contig, other.strand, other.start)
542 elif op == 1:
543 return (self.contig, self.strand, self.start) <= \
544 (other.contig, other.strand, other.start)
545 elif op == 2:
546 return self.compare(other) == 0
547 elif op == 3:
548 return self.compare(other) != 0
549 else:
550 err_msg = "op {0} isn't implemented yet".format(op)
551 raise NotImplementedError(err_msg)
552
553 def dict2attribute_string(self, d):
554 """convert dictionary to attribute string in GTF format.
555
556 """
557 aa = []
558 for k, v in d.items():
559 if isinstance(v, str):
560 aa.append('{} "{}"'.format(k, v))
561 else:
562 aa.append("{} {}".format(k, str(v)))
563
564 return self.separator.join(aa) + ";"
565
566 def attribute_string2iterator(self, s):
567 """convert attribute string in GTF format to records
568 and iterate over key, value pairs.
569 """
570
571 # remove comments
572 attributes = force_str(s, encoding=self.encoding)
573
574 # separate into fields
575 # Fields might contain a ";", for example in ENSEMBL GTF file
576 # for mouse, v78:
577 # ...; transcript_name "TXNRD2;-001"; ....
578 # The current heuristic is to split on a semicolon followed by a
579 # space, see also http://mblab.wustl.edu/GTF22.html
580
581 # Remove white space to prevent a last empty field.
582 fields = [x.strip() for x in attributes.strip().split("; ")]
583 for f in fields:
584
585 # strip semicolon (GTF files without a space after the last semicolon)
586 if f.endswith(";"):
587 f = f[:-1]
588
589 # split at most once in order to avoid separating
590 # multi-word values
591 d = [x.strip() for x in f.split(" ", 1)]
592
593 n, v = d[0], d[1]
594 if len(d) > 2:
595 v = d[1:]
596
597 if v[0] == '"' and v[-1] == '"':
598 v = v[1:-1]
599 else:
600 ## try to convert to a value
601 try:
602 v = float(v)
603 v = int(v)
604 except ValueError:
605 pass
606 except TypeError:
607 pass
608
609 yield n, v
610
611 def __getattr__(self, key):
612 """Generic lookup of attribute from GFF/GTF attributes
613 """
614
615 # Only called if there *isn't* an attribute with this name
616 cdef int idx
617 idx, f = self.map_key2field.get(key, (-1, None))
618 if idx >= 0:
619 # deal with known attributes (fields 0-8)
620 if idx == 8:
621 # flush attributes if requested
622 if self.is_modified and self.attribute_dict is not None:
623 s = self.dict2attribute_string(self.attribute_dict)
624 TupleProxy._setindex(self, idx, s)
625 self.attribute_dict = None
626 return s
627
628 if f[0] == str:
629 return force_str(self.fields[idx],
630 self.encoding)
631 else:
632 return f[0](self.fields[idx])
633 else:
634 # deal with generic attributes (gene_id, ...)
635 if self.attribute_dict is None:
636 self.attribute_dict = self.attribute_string2dict(
637 self.attributes)
638 return self.attribute_dict[key]
639
640 def __setattr__(self, key, value):
641 '''set attribute.'''
642
643 # Note that __setattr__ is called before properties, so __setattr__ and
644 # properties don't mix well. This is different from __getattr__ which is
645 # called after any properties have been resolved.
646 cdef int idx
647 idx, f = self.map_key2field.get(key, (-1, None))
648
649 if idx >= 0:
650 if value is None:
651 s = "."
652 elif f[1] == str:
653 s = force_bytes(value,
654 self.encoding)
655 else:
656 s = str(f[1](value))
657 TupleProxy._setindex(self, idx, s)
658 else:
659 if self.attribute_dict is None:
660 self.attribute_dict = self.attribute_string2dict(
661 self.attributes)
662 self.attribute_dict[key] = value
663 self.is_modified = True
664
665 # for backwards compatibility
666 def asDict(self, *args, **kwargs):
667 return self.to_dict(*args, **kwargs)
668
669 def fromDict(self, *args, **kwargs):
670 return self.from_dict(*args, **kwargs)
671
672
673 cdef class GFF3Proxy(GTFProxy):
674
675 def dict2attribute_string(self, d):
676 """convert dictionary to attribute string."""
677 return ";".join(["{}={}".format(k, v) for k, v in d.items()])
678
679 def attribute_string2iterator(self, s):
680 """convert attribute string in GFF3 format to records
681 and iterate over key, value pairs.
682 """
683
684 for f in (x.strip() for x in s.split(";")):
685 if not f:
686 continue
687
688 key, value = f.split("=", 1)
689 value = value.strip()
690
691 ## try to convert to a value
692 try:
693 value = float(value)
694 value = int(value)
695 except ValueError:
696 pass
697 except TypeError:
698 pass
699
700 yield key.strip(), value
701
702
703 cdef class BedProxy(NamedTupleProxy):
704 '''Proxy class for access to Bed fields.
705
706 This class represents a BED entry for fast read-access.
707 '''
708 map_key2field = {
709 'contig' : (0, str),
710 'start' : (1, int),
711 'end' : (2, int),
712 'name' : (3, str),
713 'score' : (4, float),
714 'strand' : (5, str),
715 'thickStart' : (6, int),
716 'thickEnd' : (7, int),
717 'itemRGB' : (8, str),
718 'blockCount': (9, int),
719 'blockSizes': (10, str),
720 'blockStarts': (11, str), }
721
722 cpdef int getMinFields(self):
723 '''return minimum number of fields.'''
724 return 3
725
726 cpdef int getMaxFields(self):
727 '''return max number of fields.'''
728 return 12
729
730 cdef update(self, char * buffer, size_t nbytes):
731 '''update internal data.
732
733 nbytes does not include the terminal '\0'.
734 '''
735 NamedTupleProxy.update(self, buffer, nbytes)
736
737 if self.nfields < 3:
738 raise ValueError(
739 "bed format requires at least three columns")
740
741 # determines bed format
742 self.bedfields = self.nfields
743
744 # do automatic conversion
745 self.contig = self.fields[0]
746 self.start = atoi(self.fields[1])
747 self.end = atoi(self.fields[2])
748
749 # __setattr__ in base class seems to take precedence
750 # hence implement setters in __setattr__
751 #property start:
752 # def __get__( self ): return self.start
753 #property end:
754 # def __get__( self ): return self.end
755
756 def __str__(self):
757 cdef int save_fields = self.nfields
758 # ensure fields to use correct format
759 self.nfields = self.bedfields
760 retval = super().__str__()
761 self.nfields = save_fields
762 return retval
763
764 def __setattr__(self, key, value):
765 '''set attribute.'''
766 if key == "start":
767 self.start = value
768 elif key == "end":
769 self.end = value
770
771 cdef int idx
772 idx, f = self.map_key2field[key]
773 TupleProxy._setindex(self, idx, str(value))
774
775
776 cdef class VCFProxy(NamedTupleProxy):
777 '''Proxy class for access to VCF fields.
778
779 The genotypes are accessed via a numeric index.
780 Sample headers are not available.
781 '''
782 map_key2field = {
783 'contig' : (0, str),
784 'pos' : (1, int),
785 'id' : (2, str),
786 'ref' : (3, str),
787 'alt' : (4, str),
788 'qual' : (5, str),
789 'filter' : (6, str),
790 'info' : (7, str),
791 'format' : (8, str) }
792
793 def __cinit__(self):
794 # automatically calls TupleProxy.__cinit__
795 # start indexed access at genotypes
796 self.offset = 9
797
798 cdef update(self, char * buffer, size_t nbytes):
799 '''update internal data.
800
801 nbytes does not include the terminal '\0'.
802 '''
803 NamedTupleProxy.update(self, buffer, nbytes)
804
805 self.contig = self.fields[0]
806 # vcf counts from 1 - correct here
807 self.pos = atoi(self.fields[1]) - 1
808
809 def __len__(self):
810 '''return number of genotype fields.'''
811 return max(0, self.nfields - 9)
812
813 property pos:
814 '''feature end (in 0-based open/closed coordinates).'''
815 def __get__(self):
816 return self.pos
817
818 def __setattr__(self, key, value):
819 '''set attribute.'''
820 if key == "pos":
821 self.pos = value
822 value += 1
823
824 cdef int idx
825 idx, f = self.map_key2field[key]
826 TupleProxy._setindex(self, idx, str(value))
827
828
829 __all__ = [
830 "TupleProxy",
831 "NamedTupleProxy",
832 "GTFProxy",
833 "GFF3Proxy",
834 "BedProxy",
835 "VCFProxy"]