annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabixproxies.pyx @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 from cpython cimport PyBytes_FromStringAndSize
jpayne@69 2
jpayne@69 3 from libc.stdio cimport printf, feof, fgets
jpayne@69 4 from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr
jpayne@69 5 from libc.stdlib cimport free, malloc, calloc, realloc
jpayne@69 6 from libc.stdlib cimport atoi, atol, atof
jpayne@69 7
jpayne@69 8 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
jpayne@69 9 from pysam.libcutils cimport encode_filename, from_string_and_size
jpayne@69 10
jpayne@69 11 import collections
jpayne@69 12 import copy
jpayne@69 13
jpayne@69 14
jpayne@69 15 cdef char *StrOrEmpty(char * buffer):
jpayne@69 16 if buffer == NULL:
jpayne@69 17 return ""
jpayne@69 18 else: return buffer
jpayne@69 19
jpayne@69 20
jpayne@69 21 cdef int isNew(char * p, char * buffer, size_t nbytes):
jpayne@69 22 """return True if `p` is located within `buffer` of size
jpayne@69 23 `nbytes`
jpayne@69 24 """
jpayne@69 25 if p == NULL:
jpayne@69 26 return 0
jpayne@69 27
jpayne@69 28 return not (buffer <= p <= buffer + nbytes)
jpayne@69 29
jpayne@69 30
jpayne@69 31 cdef class TupleProxy:
jpayne@69 32 '''Proxy class for access to parsed row as a tuple.
jpayne@69 33
jpayne@69 34 This class represents a table row for fast read-access.
jpayne@69 35
jpayne@69 36 Access to individual fields is via the [] operator.
jpayne@69 37
jpayne@69 38 Only read-only access is implemented.
jpayne@69 39
jpayne@69 40 '''
jpayne@69 41
jpayne@69 42 def __cinit__(self, encoding="ascii"):
jpayne@69 43 self.data = NULL
jpayne@69 44 self.fields = NULL
jpayne@69 45 self.nbytes = 0
jpayne@69 46 self.is_modified = 0
jpayne@69 47 self.nfields = 0
jpayne@69 48 # start counting at field offset
jpayne@69 49 self.offset = 0
jpayne@69 50 self.encoding = encoding
jpayne@69 51
jpayne@69 52 def __dealloc__(self):
jpayne@69 53 cdef int x
jpayne@69 54 if self.is_modified:
jpayne@69 55 for x from 0 <= x < self.nfields:
jpayne@69 56 if isNew(self.fields[x], self.data, self.nbytes):
jpayne@69 57 free(self.fields[x])
jpayne@69 58 self.fields[x] = NULL
jpayne@69 59
jpayne@69 60 if self.data != NULL:
jpayne@69 61 free(self.data)
jpayne@69 62 if self.fields != NULL:
jpayne@69 63 free(self.fields)
jpayne@69 64
jpayne@69 65 def __copy__(self):
jpayne@69 66 if self.is_modified:
jpayne@69 67 raise NotImplementedError(
jpayne@69 68 "copying modified tuples is not implemented")
jpayne@69 69 cdef TupleProxy n = type(self)()
jpayne@69 70 n.copy(self.data, self.nbytes, reset=True)
jpayne@69 71 return n
jpayne@69 72
jpayne@69 73 def compare(self, TupleProxy other):
jpayne@69 74 '''return -1,0,1, if contents in this are binary
jpayne@69 75 <,=,> to *other*
jpayne@69 76
jpayne@69 77 '''
jpayne@69 78 if self.is_modified or other.is_modified:
jpayne@69 79 raise NotImplementedError(
jpayne@69 80 'comparison of modified TupleProxies is not implemented')
jpayne@69 81 if self.data == other.data:
jpayne@69 82 return 0
jpayne@69 83
jpayne@69 84 if self.nbytes < other.nbytes:
jpayne@69 85 return -1
jpayne@69 86 elif self.nbytes > other.nbytes:
jpayne@69 87 return 1
jpayne@69 88 return memcmp(self.data, other.data, self.nbytes)
jpayne@69 89
jpayne@69 90 def __richcmp__(self, TupleProxy other, int op):
jpayne@69 91 if op == 2: # == operator
jpayne@69 92 return self.compare(other) == 0
jpayne@69 93 elif op == 3: # != operator
jpayne@69 94 return self.compare(other) != 0
jpayne@69 95 else:
jpayne@69 96 err_msg = "op {0} isn't implemented yet".format(op)
jpayne@69 97 raise NotImplementedError(err_msg)
jpayne@69 98
jpayne@69 99 cdef take(self, char * buffer, size_t nbytes):
jpayne@69 100 '''start presenting buffer.
jpayne@69 101
jpayne@69 102 Take ownership of the pointer.
jpayne@69 103 '''
jpayne@69 104 self.data = buffer
jpayne@69 105 self.nbytes = nbytes
jpayne@69 106 self.update(buffer, nbytes)
jpayne@69 107
jpayne@69 108 cdef present(self, char * buffer, size_t nbytes):
jpayne@69 109 '''start presenting buffer.
jpayne@69 110
jpayne@69 111 Do not take ownership of the pointer.
jpayne@69 112 '''
jpayne@69 113 self.update(buffer, nbytes)
jpayne@69 114
jpayne@69 115 cdef copy(self, char * buffer, size_t nbytes, bint reset=False):
jpayne@69 116 '''start presenting buffer of size *nbytes*.
jpayne@69 117
jpayne@69 118 Buffer is a '\0'-terminated string without the '\n'.
jpayne@69 119
jpayne@69 120 Take a copy of buffer.
jpayne@69 121 '''
jpayne@69 122 # +1 for '\0'
jpayne@69 123 cdef int s = sizeof(char) * (nbytes + 1)
jpayne@69 124 self.data = <char*>malloc(s)
jpayne@69 125 if self.data == NULL:
jpayne@69 126 raise ValueError("out of memory in TupleProxy.copy()")
jpayne@69 127 memcpy(<char*>self.data, buffer, s)
jpayne@69 128
jpayne@69 129 if reset:
jpayne@69 130 for x from 0 <= x < nbytes:
jpayne@69 131 if self.data[x] == b'\0':
jpayne@69 132 self.data[x] = b'\t'
jpayne@69 133
jpayne@69 134 self.update(self.data, nbytes)
jpayne@69 135
jpayne@69 136 cpdef int getMinFields(self):
jpayne@69 137 '''return minimum number of fields.'''
jpayne@69 138 # 1 is not a valid tabix entry, but TupleProxy
jpayne@69 139 # could be more generic.
jpayne@69 140 return 1
jpayne@69 141
jpayne@69 142 cpdef int getMaxFields(self):
jpayne@69 143 '''return maximum number of fields. Return
jpayne@69 144 0 for unknown length.'''
jpayne@69 145 return 0
jpayne@69 146
jpayne@69 147 cdef update(self, char * buffer, size_t nbytes):
jpayne@69 148 '''update internal data.
jpayne@69 149
jpayne@69 150 *buffer* is a \0 terminated string.
jpayne@69 151
jpayne@69 152 *nbytes* is the number of bytes in buffer (excluding
jpayne@69 153 the \0)
jpayne@69 154
jpayne@69 155 Update starts work in buffer, thus can be used
jpayne@69 156 to collect any number of fields until nbytes
jpayne@69 157 is exhausted.
jpayne@69 158
jpayne@69 159 If max_fields is set, the number of fields is initialized to
jpayne@69 160 max_fields.
jpayne@69 161
jpayne@69 162 '''
jpayne@69 163 cdef char * pos
jpayne@69 164 cdef char * old_pos
jpayne@69 165 cdef int field
jpayne@69 166 cdef int max_fields, min_fields, x
jpayne@69 167
jpayne@69 168 assert strlen(buffer) == nbytes, \
jpayne@69 169 "length of buffer (%i) != number of bytes (%i)" % (
jpayne@69 170 strlen(buffer), nbytes)
jpayne@69 171
jpayne@69 172 if buffer[nbytes] != 0:
jpayne@69 173 raise ValueError("incomplete line at %s" % buffer)
jpayne@69 174
jpayne@69 175 #################################
jpayne@69 176 # remove line breaks and feeds and update number of bytes
jpayne@69 177 x = nbytes - 1
jpayne@69 178 while x > 0 and (buffer[x] == b'\n' or buffer[x] == b'\r'):
jpayne@69 179 buffer[x] = b'\0'
jpayne@69 180 x -= 1
jpayne@69 181 self.nbytes = x + 1
jpayne@69 182
jpayne@69 183 #################################
jpayne@69 184 # clear data
jpayne@69 185 if self.fields != NULL:
jpayne@69 186 free(self.fields)
jpayne@69 187
jpayne@69 188 for field from 0 <= field < self.nfields:
jpayne@69 189 if isNew(self.fields[field], self.data, self.nbytes):
jpayne@69 190 free(self.fields[field])
jpayne@69 191
jpayne@69 192 self.is_modified = self.nfields = 0
jpayne@69 193
jpayne@69 194 #################################
jpayne@69 195 # allocate new
jpayne@69 196 max_fields = self.getMaxFields()
jpayne@69 197 # pre-count fields - better would be
jpayne@69 198 # to guess or dynamically grow
jpayne@69 199 if max_fields == 0:
jpayne@69 200 for x from 0 <= x < nbytes:
jpayne@69 201 if buffer[x] == b'\t':
jpayne@69 202 max_fields += 1
jpayne@69 203 max_fields += 1
jpayne@69 204
jpayne@69 205 self.fields = <char **>calloc(max_fields, sizeof(char *))
jpayne@69 206 if self.fields == NULL:
jpayne@69 207 raise ValueError("out of memory in TupleProxy.update()")
jpayne@69 208
jpayne@69 209 #################################
jpayne@69 210 # start filling
jpayne@69 211 field = 0
jpayne@69 212 self.fields[field] = pos = buffer
jpayne@69 213 field += 1
jpayne@69 214 old_pos = pos
jpayne@69 215 while 1:
jpayne@69 216
jpayne@69 217 pos = <char*>memchr(pos, b'\t', nbytes)
jpayne@69 218 if pos == NULL:
jpayne@69 219 break
jpayne@69 220 if field >= max_fields:
jpayne@69 221 raise ValueError(
jpayne@69 222 "parsing error: more than %i fields in line: %s" %
jpayne@69 223 (max_fields, buffer))
jpayne@69 224
jpayne@69 225 pos[0] = b'\0'
jpayne@69 226 pos += 1
jpayne@69 227 self.fields[field] = pos
jpayne@69 228 field += 1
jpayne@69 229 nbytes -= pos - old_pos
jpayne@69 230 if nbytes < 0:
jpayne@69 231 break
jpayne@69 232 old_pos = pos
jpayne@69 233 self.nfields = field
jpayne@69 234 if self.nfields < self.getMinFields():
jpayne@69 235 raise ValueError(
jpayne@69 236 "parsing error: fewer than %i fields in line: %s" %
jpayne@69 237 (self.getMinFields(), buffer))
jpayne@69 238
jpayne@69 239 def _getindex(self, int index):
jpayne@69 240 '''return item at idx index'''
jpayne@69 241 cdef int i = index
jpayne@69 242 if i < 0:
jpayne@69 243 i += self.nfields
jpayne@69 244 if i < 0:
jpayne@69 245 raise IndexError("list index out of range")
jpayne@69 246 # apply offset - separating a fixed number
jpayne@69 247 # of fields from a variable number such as in VCF
jpayne@69 248 i += self.offset
jpayne@69 249 if i >= self.nfields:
jpayne@69 250 raise IndexError(
jpayne@69 251 "list index out of range %i >= %i" %
jpayne@69 252 (i, self.nfields))
jpayne@69 253 return force_str(self.fields[i], self.encoding)
jpayne@69 254
jpayne@69 255 def __getitem__(self, key):
jpayne@69 256 if type(key) == int:
jpayne@69 257 return self._getindex(key)
jpayne@69 258 # slice object
jpayne@69 259 start, end, step = key.indices(self.nfields)
jpayne@69 260 result = []
jpayne@69 261 for index in range(start, end, step):
jpayne@69 262 result.append(self._getindex(index))
jpayne@69 263 return result
jpayne@69 264
jpayne@69 265 def _setindex(self, index, value):
jpayne@69 266 '''set item at idx index.'''
jpayne@69 267 cdef int idx = index
jpayne@69 268 if idx < 0:
jpayne@69 269 raise IndexError("list index out of range")
jpayne@69 270 if idx >= self.nfields:
jpayne@69 271 raise IndexError("list index out of range")
jpayne@69 272
jpayne@69 273 if isNew(self.fields[idx], self.data, self.nbytes):
jpayne@69 274 free(self.fields[idx])
jpayne@69 275
jpayne@69 276 self.is_modified = 1
jpayne@69 277
jpayne@69 278 if value is None:
jpayne@69 279 self.fields[idx] = NULL
jpayne@69 280 return
jpayne@69 281
jpayne@69 282 # conversion with error checking
jpayne@69 283 value = force_bytes(value)
jpayne@69 284 cdef char * tmp = <char*>value
jpayne@69 285 self.fields[idx] = <char*>malloc((strlen( tmp ) + 1) * sizeof(char))
jpayne@69 286 if self.fields[idx] == NULL:
jpayne@69 287 raise ValueError("out of memory" )
jpayne@69 288 strcpy(self.fields[idx], tmp)
jpayne@69 289
jpayne@69 290 def __setitem__(self, index, value):
jpayne@69 291 '''set item at *index* to *value*'''
jpayne@69 292 cdef int i = index
jpayne@69 293 if i < 0:
jpayne@69 294 i += self.nfields
jpayne@69 295 i += self.offset
jpayne@69 296
jpayne@69 297 self._setindex(i, value)
jpayne@69 298
jpayne@69 299 def __len__(self):
jpayne@69 300 return self.nfields
jpayne@69 301
jpayne@69 302 def __iter__(self):
jpayne@69 303 return TupleProxyIterator(self)
jpayne@69 304
jpayne@69 305 def __str__(self):
jpayne@69 306 '''return original data'''
jpayne@69 307 # copy and replace \0 bytes with \t characters
jpayne@69 308 cdef char * cpy
jpayne@69 309 if self.is_modified:
jpayne@69 310 # todo: treat NULL values
jpayne@69 311 result = []
jpayne@69 312 for x in xrange(0, self.nfields):
jpayne@69 313 result.append(StrOrEmpty(self.fields[x]).decode(self.encoding))
jpayne@69 314 return "\t".join(result)
jpayne@69 315 else:
jpayne@69 316 cpy = <char*>calloc(sizeof(char), self.nbytes+1)
jpayne@69 317 if cpy == NULL:
jpayne@69 318 raise ValueError("out of memory")
jpayne@69 319 memcpy(cpy, self.data, self.nbytes+1)
jpayne@69 320 for x from 0 <= x < self.nbytes:
jpayne@69 321 if cpy[x] == b'\0':
jpayne@69 322 cpy[x] = b'\t'
jpayne@69 323 result = cpy[:self.nbytes]
jpayne@69 324 free(cpy)
jpayne@69 325 r = result.decode(self.encoding)
jpayne@69 326 return r
jpayne@69 327
jpayne@69 328
jpayne@69 329 cdef class TupleProxyIterator:
jpayne@69 330 def __init__(self, proxy):
jpayne@69 331 self.proxy = proxy
jpayne@69 332 self.index = 0
jpayne@69 333
jpayne@69 334 def __iter__(self):
jpayne@69 335 return self
jpayne@69 336
jpayne@69 337 def __next__(self):
jpayne@69 338 if self.index >= self.proxy.nfields:
jpayne@69 339 raise StopIteration
jpayne@69 340 cdef char *retval = self.proxy.fields[self.index]
jpayne@69 341 self.index += 1
jpayne@69 342 return force_str(retval, self.proxy.encoding) if retval != NULL else None
jpayne@69 343
jpayne@69 344
jpayne@69 345 def toDot(v):
jpayne@69 346 '''convert value to '.' if None'''
jpayne@69 347 if v is None:
jpayne@69 348 return "."
jpayne@69 349 else:
jpayne@69 350 return str(v)
jpayne@69 351
jpayne@69 352 def quote(v):
jpayne@69 353 '''return a quoted attribute.'''
jpayne@69 354 if isinstance(v, str):
jpayne@69 355 return '"%s"' % v
jpayne@69 356 else:
jpayne@69 357 return str(v)
jpayne@69 358
jpayne@69 359
jpayne@69 360 cdef class NamedTupleProxy(TupleProxy):
jpayne@69 361
jpayne@69 362 map_key2field = {}
jpayne@69 363
jpayne@69 364 def __setattr__(self, key, value):
jpayne@69 365 '''set attribute.'''
jpayne@69 366 cdef int idx
jpayne@69 367 idx, f = self.map_key2field[key]
jpayne@69 368 if idx >= self.nfields:
jpayne@69 369 raise KeyError("field %s not set" % key)
jpayne@69 370 TupleProxy.__setitem__(self, idx, str(value))
jpayne@69 371
jpayne@69 372 def __getattr__(self, key):
jpayne@69 373 cdef int idx
jpayne@69 374 idx, f = self.map_key2field[key]
jpayne@69 375 if idx >= self.nfields:
jpayne@69 376 raise KeyError("field %s not set" % key)
jpayne@69 377 if f == str:
jpayne@69 378 return force_str(self.fields[idx],
jpayne@69 379 self.encoding)
jpayne@69 380 return f(self.fields[idx])
jpayne@69 381
jpayne@69 382
jpayne@69 383 cdef dot_or_float(v):
jpayne@69 384 if v == "" or v == b".":
jpayne@69 385 return None
jpayne@69 386 else:
jpayne@69 387 try:
jpayne@69 388 return int(v)
jpayne@69 389 except ValueError:
jpayne@69 390 return float(v)
jpayne@69 391
jpayne@69 392
jpayne@69 393 cdef dot_or_int(v):
jpayne@69 394 if v == "" or v == b".":
jpayne@69 395 return None
jpayne@69 396 else:
jpayne@69 397 return int(v)
jpayne@69 398
jpayne@69 399
jpayne@69 400 cdef dot_or_str(v):
jpayne@69 401 if v == "" or v == b".":
jpayne@69 402 return None
jpayne@69 403 else:
jpayne@69 404 return force_str(v)
jpayne@69 405
jpayne@69 406
jpayne@69 407 cdef int from1based(v):
jpayne@69 408 return atoi(v) - 1
jpayne@69 409
jpayne@69 410
jpayne@69 411 cdef str to1based(int v):
jpayne@69 412 return str(v + 1)
jpayne@69 413
jpayne@69 414
jpayne@69 415 cdef class GTFProxy(NamedTupleProxy):
jpayne@69 416 '''Proxy class for access to GTF fields.
jpayne@69 417
jpayne@69 418 This class represents a GTF entry for fast read-access.
jpayne@69 419 Write-access has been added as well, though some care must
jpayne@69 420 be taken. If any of the string fields (contig, source, ...)
jpayne@69 421 are set, the new value is tied to the lifetime of the
jpayne@69 422 argument that was supplied.
jpayne@69 423
jpayne@69 424 The only exception is the attributes field when set from
jpayne@69 425 a dictionary - this field will manage its own memory.
jpayne@69 426
jpayne@69 427 '''
jpayne@69 428 separator = "; "
jpayne@69 429
jpayne@69 430 # first value is field index, the tuple contains conversion
jpayne@69 431 # functions for getting (converting internal string representation
jpayne@69 432 # to pythonic value) and setting (converting pythonic value to
jpayne@69 433 # interval string representation)
jpayne@69 434 map_key2field = {
jpayne@69 435 'contig' : (0, (str, str)),
jpayne@69 436 'source' : (1, (dot_or_str, str)),
jpayne@69 437 'feature': (2, (dot_or_str, str)),
jpayne@69 438 'start' : (3, (from1based, to1based)),
jpayne@69 439 'end' : (4, (int, int)),
jpayne@69 440 'score' : (5, (dot_or_float, toDot)),
jpayne@69 441 'strand' : (6, (dot_or_str, str)),
jpayne@69 442 'frame' : (7, (dot_or_int, toDot)),
jpayne@69 443 'attributes': (8, (str, str))}
jpayne@69 444
jpayne@69 445 def __cinit__(self):
jpayne@69 446 # automatically calls TupleProxy.__cinit__
jpayne@69 447 self.attribute_dict = None
jpayne@69 448
jpayne@69 449 cpdef int getMinFields(self):
jpayne@69 450 '''return minimum number of fields.'''
jpayne@69 451 return 9
jpayne@69 452
jpayne@69 453 cpdef int getMaxFields(self):
jpayne@69 454 '''return max number of fields.'''
jpayne@69 455 return 9
jpayne@69 456
jpayne@69 457 def to_dict(self):
jpayne@69 458 """parse attributes - return as dict
jpayne@69 459
jpayne@69 460 The dictionary can be modified to update attributes.
jpayne@69 461 """
jpayne@69 462 if not self.attribute_dict:
jpayne@69 463 self.attribute_dict = self.attribute_string2dict(
jpayne@69 464 self.attributes)
jpayne@69 465 self.is_modified = True
jpayne@69 466 return self.attribute_dict
jpayne@69 467
jpayne@69 468 def as_dict(self):
jpayne@69 469 """deprecated: use :meth:`to_dict`
jpayne@69 470 """
jpayne@69 471 return self.to_dict()
jpayne@69 472
jpayne@69 473 def from_dict(self, d):
jpayne@69 474 '''set attributes from a dictionary.'''
jpayne@69 475 self.attribute_dict = None
jpayne@69 476 attribute_string = force_bytes(
jpayne@69 477 self.attribute_dict2string(d),
jpayne@69 478 self.encoding)
jpayne@69 479 self._setindex(8, attribute_string)
jpayne@69 480
jpayne@69 481 def __str__(self):
jpayne@69 482 cdef char * cpy
jpayne@69 483 cdef int x
jpayne@69 484
jpayne@69 485 if self.is_modified:
jpayne@69 486 return "\t".join(
jpayne@69 487 (self.contig,
jpayne@69 488 toDot(self.source),
jpayne@69 489 toDot(self.feature),
jpayne@69 490 str(self.start + 1),
jpayne@69 491 str(self.end),
jpayne@69 492 toDot(self.score),
jpayne@69 493 toDot(self.strand),
jpayne@69 494 toDot(self.frame),
jpayne@69 495 self.attributes))
jpayne@69 496 else:
jpayne@69 497 return super().__str__()
jpayne@69 498
jpayne@69 499 def invert(self, int lcontig):
jpayne@69 500 '''invert coordinates to negative strand coordinates
jpayne@69 501
jpayne@69 502 This method will only act if the feature is on the
jpayne@69 503 negative strand.'''
jpayne@69 504
jpayne@69 505 if self.strand[0] == '-':
jpayne@69 506 start = min(self.start, self.end)
jpayne@69 507 end = max(self.start, self.end)
jpayne@69 508 self.start, self.end = lcontig - end, lcontig - start
jpayne@69 509
jpayne@69 510 def keys(self):
jpayne@69 511 '''return a list of attributes defined in this entry.'''
jpayne@69 512 if not self.attribute_dict:
jpayne@69 513 self.attribute_dict = self.attribute_string2dict(
jpayne@69 514 self.attributes)
jpayne@69 515 return self.attribute_dict.keys()
jpayne@69 516
jpayne@69 517 def __getitem__(self, key):
jpayne@69 518 return self.__getattr__(key)
jpayne@69 519
jpayne@69 520 def setAttribute(self, name, value):
jpayne@69 521 '''convenience method to set an attribute.
jpayne@69 522 '''
jpayne@69 523 if not self.attribute_dict:
jpayne@69 524 self.attribute_dict = self.attribute_string2dict(
jpayne@69 525 self.attributes)
jpayne@69 526 self.attribute_dict[name] = value
jpayne@69 527 self.is_modified = True
jpayne@69 528
jpayne@69 529 def attribute_string2dict(self, s):
jpayne@69 530 return collections.OrderedDict(
jpayne@69 531 self.attribute_string2iterator(s))
jpayne@69 532
jpayne@69 533 def __cmp__(self, other):
jpayne@69 534 return (self.contig, self.strand, self.start) < \
jpayne@69 535 (other.contig, other.strand, other.start)
jpayne@69 536
jpayne@69 537 # python 3 compatibility
jpayne@69 538 def __richcmp__(GTFProxy self, GTFProxy other, int op):
jpayne@69 539 if op == 0:
jpayne@69 540 return (self.contig, self.strand, self.start) < \
jpayne@69 541 (other.contig, other.strand, other.start)
jpayne@69 542 elif op == 1:
jpayne@69 543 return (self.contig, self.strand, self.start) <= \
jpayne@69 544 (other.contig, other.strand, other.start)
jpayne@69 545 elif op == 2:
jpayne@69 546 return self.compare(other) == 0
jpayne@69 547 elif op == 3:
jpayne@69 548 return self.compare(other) != 0
jpayne@69 549 else:
jpayne@69 550 err_msg = "op {0} isn't implemented yet".format(op)
jpayne@69 551 raise NotImplementedError(err_msg)
jpayne@69 552
jpayne@69 553 def dict2attribute_string(self, d):
jpayne@69 554 """convert dictionary to attribute string in GTF format.
jpayne@69 555
jpayne@69 556 """
jpayne@69 557 aa = []
jpayne@69 558 for k, v in d.items():
jpayne@69 559 if isinstance(v, str):
jpayne@69 560 aa.append('{} "{}"'.format(k, v))
jpayne@69 561 else:
jpayne@69 562 aa.append("{} {}".format(k, str(v)))
jpayne@69 563
jpayne@69 564 return self.separator.join(aa) + ";"
jpayne@69 565
jpayne@69 566 def attribute_string2iterator(self, s):
jpayne@69 567 """convert attribute string in GTF format to records
jpayne@69 568 and iterate over key, value pairs.
jpayne@69 569 """
jpayne@69 570
jpayne@69 571 # remove comments
jpayne@69 572 attributes = force_str(s, encoding=self.encoding)
jpayne@69 573
jpayne@69 574 # separate into fields
jpayne@69 575 # Fields might contain a ";", for example in ENSEMBL GTF file
jpayne@69 576 # for mouse, v78:
jpayne@69 577 # ...; transcript_name "TXNRD2;-001"; ....
jpayne@69 578 # The current heuristic is to split on a semicolon followed by a
jpayne@69 579 # space, see also http://mblab.wustl.edu/GTF22.html
jpayne@69 580
jpayne@69 581 # Remove white space to prevent a last empty field.
jpayne@69 582 fields = [x.strip() for x in attributes.strip().split("; ")]
jpayne@69 583 for f in fields:
jpayne@69 584
jpayne@69 585 # strip semicolon (GTF files without a space after the last semicolon)
jpayne@69 586 if f.endswith(";"):
jpayne@69 587 f = f[:-1]
jpayne@69 588
jpayne@69 589 # split at most once in order to avoid separating
jpayne@69 590 # multi-word values
jpayne@69 591 d = [x.strip() for x in f.split(" ", 1)]
jpayne@69 592
jpayne@69 593 n, v = d[0], d[1]
jpayne@69 594 if len(d) > 2:
jpayne@69 595 v = d[1:]
jpayne@69 596
jpayne@69 597 if v[0] == '"' and v[-1] == '"':
jpayne@69 598 v = v[1:-1]
jpayne@69 599 else:
jpayne@69 600 ## try to convert to a value
jpayne@69 601 try:
jpayne@69 602 v = float(v)
jpayne@69 603 v = int(v)
jpayne@69 604 except ValueError:
jpayne@69 605 pass
jpayne@69 606 except TypeError:
jpayne@69 607 pass
jpayne@69 608
jpayne@69 609 yield n, v
jpayne@69 610
jpayne@69 611 def __getattr__(self, key):
jpayne@69 612 """Generic lookup of attribute from GFF/GTF attributes
jpayne@69 613 """
jpayne@69 614
jpayne@69 615 # Only called if there *isn't* an attribute with this name
jpayne@69 616 cdef int idx
jpayne@69 617 idx, f = self.map_key2field.get(key, (-1, None))
jpayne@69 618 if idx >= 0:
jpayne@69 619 # deal with known attributes (fields 0-8)
jpayne@69 620 if idx == 8:
jpayne@69 621 # flush attributes if requested
jpayne@69 622 if self.is_modified and self.attribute_dict is not None:
jpayne@69 623 s = self.dict2attribute_string(self.attribute_dict)
jpayne@69 624 TupleProxy._setindex(self, idx, s)
jpayne@69 625 self.attribute_dict = None
jpayne@69 626 return s
jpayne@69 627
jpayne@69 628 if f[0] == str:
jpayne@69 629 return force_str(self.fields[idx],
jpayne@69 630 self.encoding)
jpayne@69 631 else:
jpayne@69 632 return f[0](self.fields[idx])
jpayne@69 633 else:
jpayne@69 634 # deal with generic attributes (gene_id, ...)
jpayne@69 635 if self.attribute_dict is None:
jpayne@69 636 self.attribute_dict = self.attribute_string2dict(
jpayne@69 637 self.attributes)
jpayne@69 638 return self.attribute_dict[key]
jpayne@69 639
jpayne@69 640 def __setattr__(self, key, value):
jpayne@69 641 '''set attribute.'''
jpayne@69 642
jpayne@69 643 # Note that __setattr__ is called before properties, so __setattr__ and
jpayne@69 644 # properties don't mix well. This is different from __getattr__ which is
jpayne@69 645 # called after any properties have been resolved.
jpayne@69 646 cdef int idx
jpayne@69 647 idx, f = self.map_key2field.get(key, (-1, None))
jpayne@69 648
jpayne@69 649 if idx >= 0:
jpayne@69 650 if value is None:
jpayne@69 651 s = "."
jpayne@69 652 elif f[1] == str:
jpayne@69 653 s = force_bytes(value,
jpayne@69 654 self.encoding)
jpayne@69 655 else:
jpayne@69 656 s = str(f[1](value))
jpayne@69 657 TupleProxy._setindex(self, idx, s)
jpayne@69 658 else:
jpayne@69 659 if self.attribute_dict is None:
jpayne@69 660 self.attribute_dict = self.attribute_string2dict(
jpayne@69 661 self.attributes)
jpayne@69 662 self.attribute_dict[key] = value
jpayne@69 663 self.is_modified = True
jpayne@69 664
jpayne@69 665 # for backwards compatibility
jpayne@69 666 def asDict(self, *args, **kwargs):
jpayne@69 667 return self.to_dict(*args, **kwargs)
jpayne@69 668
jpayne@69 669 def fromDict(self, *args, **kwargs):
jpayne@69 670 return self.from_dict(*args, **kwargs)
jpayne@69 671
jpayne@69 672
jpayne@69 673 cdef class GFF3Proxy(GTFProxy):
jpayne@69 674
jpayne@69 675 def dict2attribute_string(self, d):
jpayne@69 676 """convert dictionary to attribute string."""
jpayne@69 677 return ";".join(["{}={}".format(k, v) for k, v in d.items()])
jpayne@69 678
jpayne@69 679 def attribute_string2iterator(self, s):
jpayne@69 680 """convert attribute string in GFF3 format to records
jpayne@69 681 and iterate over key, value pairs.
jpayne@69 682 """
jpayne@69 683
jpayne@69 684 for f in (x.strip() for x in s.split(";")):
jpayne@69 685 if not f:
jpayne@69 686 continue
jpayne@69 687
jpayne@69 688 key, value = f.split("=", 1)
jpayne@69 689 value = value.strip()
jpayne@69 690
jpayne@69 691 ## try to convert to a value
jpayne@69 692 try:
jpayne@69 693 value = float(value)
jpayne@69 694 value = int(value)
jpayne@69 695 except ValueError:
jpayne@69 696 pass
jpayne@69 697 except TypeError:
jpayne@69 698 pass
jpayne@69 699
jpayne@69 700 yield key.strip(), value
jpayne@69 701
jpayne@69 702
jpayne@69 703 cdef class BedProxy(NamedTupleProxy):
jpayne@69 704 '''Proxy class for access to Bed fields.
jpayne@69 705
jpayne@69 706 This class represents a BED entry for fast read-access.
jpayne@69 707 '''
jpayne@69 708 map_key2field = {
jpayne@69 709 'contig' : (0, str),
jpayne@69 710 'start' : (1, int),
jpayne@69 711 'end' : (2, int),
jpayne@69 712 'name' : (3, str),
jpayne@69 713 'score' : (4, float),
jpayne@69 714 'strand' : (5, str),
jpayne@69 715 'thickStart' : (6, int),
jpayne@69 716 'thickEnd' : (7, int),
jpayne@69 717 'itemRGB' : (8, str),
jpayne@69 718 'blockCount': (9, int),
jpayne@69 719 'blockSizes': (10, str),
jpayne@69 720 'blockStarts': (11, str), }
jpayne@69 721
jpayne@69 722 cpdef int getMinFields(self):
jpayne@69 723 '''return minimum number of fields.'''
jpayne@69 724 return 3
jpayne@69 725
jpayne@69 726 cpdef int getMaxFields(self):
jpayne@69 727 '''return max number of fields.'''
jpayne@69 728 return 12
jpayne@69 729
jpayne@69 730 cdef update(self, char * buffer, size_t nbytes):
jpayne@69 731 '''update internal data.
jpayne@69 732
jpayne@69 733 nbytes does not include the terminal '\0'.
jpayne@69 734 '''
jpayne@69 735 NamedTupleProxy.update(self, buffer, nbytes)
jpayne@69 736
jpayne@69 737 if self.nfields < 3:
jpayne@69 738 raise ValueError(
jpayne@69 739 "bed format requires at least three columns")
jpayne@69 740
jpayne@69 741 # determines bed format
jpayne@69 742 self.bedfields = self.nfields
jpayne@69 743
jpayne@69 744 # do automatic conversion
jpayne@69 745 self.contig = self.fields[0]
jpayne@69 746 self.start = atoi(self.fields[1])
jpayne@69 747 self.end = atoi(self.fields[2])
jpayne@69 748
jpayne@69 749 # __setattr__ in base class seems to take precedence
jpayne@69 750 # hence implement setters in __setattr__
jpayne@69 751 #property start:
jpayne@69 752 # def __get__( self ): return self.start
jpayne@69 753 #property end:
jpayne@69 754 # def __get__( self ): return self.end
jpayne@69 755
jpayne@69 756 def __str__(self):
jpayne@69 757 cdef int save_fields = self.nfields
jpayne@69 758 # ensure fields to use correct format
jpayne@69 759 self.nfields = self.bedfields
jpayne@69 760 retval = super().__str__()
jpayne@69 761 self.nfields = save_fields
jpayne@69 762 return retval
jpayne@69 763
jpayne@69 764 def __setattr__(self, key, value):
jpayne@69 765 '''set attribute.'''
jpayne@69 766 if key == "start":
jpayne@69 767 self.start = value
jpayne@69 768 elif key == "end":
jpayne@69 769 self.end = value
jpayne@69 770
jpayne@69 771 cdef int idx
jpayne@69 772 idx, f = self.map_key2field[key]
jpayne@69 773 TupleProxy._setindex(self, idx, str(value))
jpayne@69 774
jpayne@69 775
jpayne@69 776 cdef class VCFProxy(NamedTupleProxy):
jpayne@69 777 '''Proxy class for access to VCF fields.
jpayne@69 778
jpayne@69 779 The genotypes are accessed via a numeric index.
jpayne@69 780 Sample headers are not available.
jpayne@69 781 '''
jpayne@69 782 map_key2field = {
jpayne@69 783 'contig' : (0, str),
jpayne@69 784 'pos' : (1, int),
jpayne@69 785 'id' : (2, str),
jpayne@69 786 'ref' : (3, str),
jpayne@69 787 'alt' : (4, str),
jpayne@69 788 'qual' : (5, str),
jpayne@69 789 'filter' : (6, str),
jpayne@69 790 'info' : (7, str),
jpayne@69 791 'format' : (8, str) }
jpayne@69 792
jpayne@69 793 def __cinit__(self):
jpayne@69 794 # automatically calls TupleProxy.__cinit__
jpayne@69 795 # start indexed access at genotypes
jpayne@69 796 self.offset = 9
jpayne@69 797
jpayne@69 798 cdef update(self, char * buffer, size_t nbytes):
jpayne@69 799 '''update internal data.
jpayne@69 800
jpayne@69 801 nbytes does not include the terminal '\0'.
jpayne@69 802 '''
jpayne@69 803 NamedTupleProxy.update(self, buffer, nbytes)
jpayne@69 804
jpayne@69 805 self.contig = self.fields[0]
jpayne@69 806 # vcf counts from 1 - correct here
jpayne@69 807 self.pos = atoi(self.fields[1]) - 1
jpayne@69 808
jpayne@69 809 def __len__(self):
jpayne@69 810 '''return number of genotype fields.'''
jpayne@69 811 return max(0, self.nfields - 9)
jpayne@69 812
jpayne@69 813 property pos:
jpayne@69 814 '''feature end (in 0-based open/closed coordinates).'''
jpayne@69 815 def __get__(self):
jpayne@69 816 return self.pos
jpayne@69 817
jpayne@69 818 def __setattr__(self, key, value):
jpayne@69 819 '''set attribute.'''
jpayne@69 820 if key == "pos":
jpayne@69 821 self.pos = value
jpayne@69 822 value += 1
jpayne@69 823
jpayne@69 824 cdef int idx
jpayne@69 825 idx, f = self.map_key2field[key]
jpayne@69 826 TupleProxy._setindex(self, idx, str(value))
jpayne@69 827
jpayne@69 828
jpayne@69 829 __all__ = [
jpayne@69 830 "TupleProxy",
jpayne@69 831 "NamedTupleProxy",
jpayne@69 832 "GTFProxy",
jpayne@69 833 "GFF3Proxy",
jpayne@69 834 "BedProxy",
jpayne@69 835 "VCFProxy"]