annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libctabixproxies.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 from cpython cimport PyBytes_FromStringAndSize
jpayne@68 2
jpayne@68 3 from libc.stdio cimport printf, feof, fgets
jpayne@68 4 from libc.string cimport strcpy, strlen, memcmp, memcpy, memchr, strstr, strchr
jpayne@68 5 from libc.stdlib cimport free, malloc, calloc, realloc
jpayne@68 6 from libc.stdlib cimport atoi, atol, atof
jpayne@68 7
jpayne@68 8 from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
jpayne@68 9 from pysam.libcutils cimport encode_filename, from_string_and_size
jpayne@68 10
jpayne@68 11 import collections
jpayne@68 12 import copy
jpayne@68 13
jpayne@68 14
jpayne@68 15 cdef char *StrOrEmpty(char * buffer):
jpayne@68 16 if buffer == NULL:
jpayne@68 17 return ""
jpayne@68 18 else: return buffer
jpayne@68 19
jpayne@68 20
jpayne@68 21 cdef int isNew(char * p, char * buffer, size_t nbytes):
jpayne@68 22 """return True if `p` is located within `buffer` of size
jpayne@68 23 `nbytes`
jpayne@68 24 """
jpayne@68 25 if p == NULL:
jpayne@68 26 return 0
jpayne@68 27
jpayne@68 28 return not (buffer <= p <= buffer + nbytes)
jpayne@68 29
jpayne@68 30
jpayne@68 31 cdef class TupleProxy:
jpayne@68 32 '''Proxy class for access to parsed row as a tuple.
jpayne@68 33
jpayne@68 34 This class represents a table row for fast read-access.
jpayne@68 35
jpayne@68 36 Access to individual fields is via the [] operator.
jpayne@68 37
jpayne@68 38 Only read-only access is implemented.
jpayne@68 39
jpayne@68 40 '''
jpayne@68 41
jpayne@68 42 def __cinit__(self, encoding="ascii"):
jpayne@68 43 self.data = NULL
jpayne@68 44 self.fields = NULL
jpayne@68 45 self.nbytes = 0
jpayne@68 46 self.is_modified = 0
jpayne@68 47 self.nfields = 0
jpayne@68 48 # start counting at field offset
jpayne@68 49 self.offset = 0
jpayne@68 50 self.encoding = encoding
jpayne@68 51
jpayne@68 52 def __dealloc__(self):
jpayne@68 53 cdef int x
jpayne@68 54 if self.is_modified:
jpayne@68 55 for x from 0 <= x < self.nfields:
jpayne@68 56 if isNew(self.fields[x], self.data, self.nbytes):
jpayne@68 57 free(self.fields[x])
jpayne@68 58 self.fields[x] = NULL
jpayne@68 59
jpayne@68 60 if self.data != NULL:
jpayne@68 61 free(self.data)
jpayne@68 62 if self.fields != NULL:
jpayne@68 63 free(self.fields)
jpayne@68 64
jpayne@68 65 def __copy__(self):
jpayne@68 66 if self.is_modified:
jpayne@68 67 raise NotImplementedError(
jpayne@68 68 "copying modified tuples is not implemented")
jpayne@68 69 cdef TupleProxy n = type(self)()
jpayne@68 70 n.copy(self.data, self.nbytes, reset=True)
jpayne@68 71 return n
jpayne@68 72
jpayne@68 73 def compare(self, TupleProxy other):
jpayne@68 74 '''return -1,0,1, if contents in this are binary
jpayne@68 75 <,=,> to *other*
jpayne@68 76
jpayne@68 77 '''
jpayne@68 78 if self.is_modified or other.is_modified:
jpayne@68 79 raise NotImplementedError(
jpayne@68 80 'comparison of modified TupleProxies is not implemented')
jpayne@68 81 if self.data == other.data:
jpayne@68 82 return 0
jpayne@68 83
jpayne@68 84 if self.nbytes < other.nbytes:
jpayne@68 85 return -1
jpayne@68 86 elif self.nbytes > other.nbytes:
jpayne@68 87 return 1
jpayne@68 88 return memcmp(self.data, other.data, self.nbytes)
jpayne@68 89
jpayne@68 90 def __richcmp__(self, TupleProxy other, int op):
jpayne@68 91 if op == 2: # == operator
jpayne@68 92 return self.compare(other) == 0
jpayne@68 93 elif op == 3: # != operator
jpayne@68 94 return self.compare(other) != 0
jpayne@68 95 else:
jpayne@68 96 err_msg = "op {0} isn't implemented yet".format(op)
jpayne@68 97 raise NotImplementedError(err_msg)
jpayne@68 98
jpayne@68 99 cdef take(self, char * buffer, size_t nbytes):
jpayne@68 100 '''start presenting buffer.
jpayne@68 101
jpayne@68 102 Take ownership of the pointer.
jpayne@68 103 '''
jpayne@68 104 self.data = buffer
jpayne@68 105 self.nbytes = nbytes
jpayne@68 106 self.update(buffer, nbytes)
jpayne@68 107
jpayne@68 108 cdef present(self, char * buffer, size_t nbytes):
jpayne@68 109 '''start presenting buffer.
jpayne@68 110
jpayne@68 111 Do not take ownership of the pointer.
jpayne@68 112 '''
jpayne@68 113 self.update(buffer, nbytes)
jpayne@68 114
jpayne@68 115 cdef copy(self, char * buffer, size_t nbytes, bint reset=False):
jpayne@68 116 '''start presenting buffer of size *nbytes*.
jpayne@68 117
jpayne@68 118 Buffer is a '\0'-terminated string without the '\n'.
jpayne@68 119
jpayne@68 120 Take a copy of buffer.
jpayne@68 121 '''
jpayne@68 122 # +1 for '\0'
jpayne@68 123 cdef int s = sizeof(char) * (nbytes + 1)
jpayne@68 124 self.data = <char*>malloc(s)
jpayne@68 125 if self.data == NULL:
jpayne@68 126 raise ValueError("out of memory in TupleProxy.copy()")
jpayne@68 127 memcpy(<char*>self.data, buffer, s)
jpayne@68 128
jpayne@68 129 if reset:
jpayne@68 130 for x from 0 <= x < nbytes:
jpayne@68 131 if self.data[x] == b'\0':
jpayne@68 132 self.data[x] = b'\t'
jpayne@68 133
jpayne@68 134 self.update(self.data, nbytes)
jpayne@68 135
jpayne@68 136 cpdef int getMinFields(self):
jpayne@68 137 '''return minimum number of fields.'''
jpayne@68 138 # 1 is not a valid tabix entry, but TupleProxy
jpayne@68 139 # could be more generic.
jpayne@68 140 return 1
jpayne@68 141
jpayne@68 142 cpdef int getMaxFields(self):
jpayne@68 143 '''return maximum number of fields. Return
jpayne@68 144 0 for unknown length.'''
jpayne@68 145 return 0
jpayne@68 146
jpayne@68 147 cdef update(self, char * buffer, size_t nbytes):
jpayne@68 148 '''update internal data.
jpayne@68 149
jpayne@68 150 *buffer* is a \0 terminated string.
jpayne@68 151
jpayne@68 152 *nbytes* is the number of bytes in buffer (excluding
jpayne@68 153 the \0)
jpayne@68 154
jpayne@68 155 Update starts work in buffer, thus can be used
jpayne@68 156 to collect any number of fields until nbytes
jpayne@68 157 is exhausted.
jpayne@68 158
jpayne@68 159 If max_fields is set, the number of fields is initialized to
jpayne@68 160 max_fields.
jpayne@68 161
jpayne@68 162 '''
jpayne@68 163 cdef char * pos
jpayne@68 164 cdef char * old_pos
jpayne@68 165 cdef int field
jpayne@68 166 cdef int max_fields, min_fields, x
jpayne@68 167
jpayne@68 168 assert strlen(buffer) == nbytes, \
jpayne@68 169 "length of buffer (%i) != number of bytes (%i)" % (
jpayne@68 170 strlen(buffer), nbytes)
jpayne@68 171
jpayne@68 172 if buffer[nbytes] != 0:
jpayne@68 173 raise ValueError("incomplete line at %s" % buffer)
jpayne@68 174
jpayne@68 175 #################################
jpayne@68 176 # remove line breaks and feeds and update number of bytes
jpayne@68 177 x = nbytes - 1
jpayne@68 178 while x > 0 and (buffer[x] == b'\n' or buffer[x] == b'\r'):
jpayne@68 179 buffer[x] = b'\0'
jpayne@68 180 x -= 1
jpayne@68 181 self.nbytes = x + 1
jpayne@68 182
jpayne@68 183 #################################
jpayne@68 184 # clear data
jpayne@68 185 if self.fields != NULL:
jpayne@68 186 free(self.fields)
jpayne@68 187
jpayne@68 188 for field from 0 <= field < self.nfields:
jpayne@68 189 if isNew(self.fields[field], self.data, self.nbytes):
jpayne@68 190 free(self.fields[field])
jpayne@68 191
jpayne@68 192 self.is_modified = self.nfields = 0
jpayne@68 193
jpayne@68 194 #################################
jpayne@68 195 # allocate new
jpayne@68 196 max_fields = self.getMaxFields()
jpayne@68 197 # pre-count fields - better would be
jpayne@68 198 # to guess or dynamically grow
jpayne@68 199 if max_fields == 0:
jpayne@68 200 for x from 0 <= x < nbytes:
jpayne@68 201 if buffer[x] == b'\t':
jpayne@68 202 max_fields += 1
jpayne@68 203 max_fields += 1
jpayne@68 204
jpayne@68 205 self.fields = <char **>calloc(max_fields, sizeof(char *))
jpayne@68 206 if self.fields == NULL:
jpayne@68 207 raise ValueError("out of memory in TupleProxy.update()")
jpayne@68 208
jpayne@68 209 #################################
jpayne@68 210 # start filling
jpayne@68 211 field = 0
jpayne@68 212 self.fields[field] = pos = buffer
jpayne@68 213 field += 1
jpayne@68 214 old_pos = pos
jpayne@68 215 while 1:
jpayne@68 216
jpayne@68 217 pos = <char*>memchr(pos, b'\t', nbytes)
jpayne@68 218 if pos == NULL:
jpayne@68 219 break
jpayne@68 220 if field >= max_fields:
jpayne@68 221 raise ValueError(
jpayne@68 222 "parsing error: more than %i fields in line: %s" %
jpayne@68 223 (max_fields, buffer))
jpayne@68 224
jpayne@68 225 pos[0] = b'\0'
jpayne@68 226 pos += 1
jpayne@68 227 self.fields[field] = pos
jpayne@68 228 field += 1
jpayne@68 229 nbytes -= pos - old_pos
jpayne@68 230 if nbytes < 0:
jpayne@68 231 break
jpayne@68 232 old_pos = pos
jpayne@68 233 self.nfields = field
jpayne@68 234 if self.nfields < self.getMinFields():
jpayne@68 235 raise ValueError(
jpayne@68 236 "parsing error: fewer than %i fields in line: %s" %
jpayne@68 237 (self.getMinFields(), buffer))
jpayne@68 238
jpayne@68 239 def _getindex(self, int index):
jpayne@68 240 '''return item at idx index'''
jpayne@68 241 cdef int i = index
jpayne@68 242 if i < 0:
jpayne@68 243 i += self.nfields
jpayne@68 244 if i < 0:
jpayne@68 245 raise IndexError("list index out of range")
jpayne@68 246 # apply offset - separating a fixed number
jpayne@68 247 # of fields from a variable number such as in VCF
jpayne@68 248 i += self.offset
jpayne@68 249 if i >= self.nfields:
jpayne@68 250 raise IndexError(
jpayne@68 251 "list index out of range %i >= %i" %
jpayne@68 252 (i, self.nfields))
jpayne@68 253 return force_str(self.fields[i], self.encoding)
jpayne@68 254
jpayne@68 255 def __getitem__(self, key):
jpayne@68 256 if type(key) == int:
jpayne@68 257 return self._getindex(key)
jpayne@68 258 # slice object
jpayne@68 259 start, end, step = key.indices(self.nfields)
jpayne@68 260 result = []
jpayne@68 261 for index in range(start, end, step):
jpayne@68 262 result.append(self._getindex(index))
jpayne@68 263 return result
jpayne@68 264
jpayne@68 265 def _setindex(self, index, value):
jpayne@68 266 '''set item at idx index.'''
jpayne@68 267 cdef int idx = index
jpayne@68 268 if idx < 0:
jpayne@68 269 raise IndexError("list index out of range")
jpayne@68 270 if idx >= self.nfields:
jpayne@68 271 raise IndexError("list index out of range")
jpayne@68 272
jpayne@68 273 if isNew(self.fields[idx], self.data, self.nbytes):
jpayne@68 274 free(self.fields[idx])
jpayne@68 275
jpayne@68 276 self.is_modified = 1
jpayne@68 277
jpayne@68 278 if value is None:
jpayne@68 279 self.fields[idx] = NULL
jpayne@68 280 return
jpayne@68 281
jpayne@68 282 # conversion with error checking
jpayne@68 283 value = force_bytes(value)
jpayne@68 284 cdef char * tmp = <char*>value
jpayne@68 285 self.fields[idx] = <char*>malloc((strlen( tmp ) + 1) * sizeof(char))
jpayne@68 286 if self.fields[idx] == NULL:
jpayne@68 287 raise ValueError("out of memory" )
jpayne@68 288 strcpy(self.fields[idx], tmp)
jpayne@68 289
jpayne@68 290 def __setitem__(self, index, value):
jpayne@68 291 '''set item at *index* to *value*'''
jpayne@68 292 cdef int i = index
jpayne@68 293 if i < 0:
jpayne@68 294 i += self.nfields
jpayne@68 295 i += self.offset
jpayne@68 296
jpayne@68 297 self._setindex(i, value)
jpayne@68 298
jpayne@68 299 def __len__(self):
jpayne@68 300 return self.nfields
jpayne@68 301
jpayne@68 302 def __iter__(self):
jpayne@68 303 return TupleProxyIterator(self)
jpayne@68 304
jpayne@68 305 def __str__(self):
jpayne@68 306 '''return original data'''
jpayne@68 307 # copy and replace \0 bytes with \t characters
jpayne@68 308 cdef char * cpy
jpayne@68 309 if self.is_modified:
jpayne@68 310 # todo: treat NULL values
jpayne@68 311 result = []
jpayne@68 312 for x in xrange(0, self.nfields):
jpayne@68 313 result.append(StrOrEmpty(self.fields[x]).decode(self.encoding))
jpayne@68 314 return "\t".join(result)
jpayne@68 315 else:
jpayne@68 316 cpy = <char*>calloc(sizeof(char), self.nbytes+1)
jpayne@68 317 if cpy == NULL:
jpayne@68 318 raise ValueError("out of memory")
jpayne@68 319 memcpy(cpy, self.data, self.nbytes+1)
jpayne@68 320 for x from 0 <= x < self.nbytes:
jpayne@68 321 if cpy[x] == b'\0':
jpayne@68 322 cpy[x] = b'\t'
jpayne@68 323 result = cpy[:self.nbytes]
jpayne@68 324 free(cpy)
jpayne@68 325 r = result.decode(self.encoding)
jpayne@68 326 return r
jpayne@68 327
jpayne@68 328
jpayne@68 329 cdef class TupleProxyIterator:
jpayne@68 330 def __init__(self, proxy):
jpayne@68 331 self.proxy = proxy
jpayne@68 332 self.index = 0
jpayne@68 333
jpayne@68 334 def __iter__(self):
jpayne@68 335 return self
jpayne@68 336
jpayne@68 337 def __next__(self):
jpayne@68 338 if self.index >= self.proxy.nfields:
jpayne@68 339 raise StopIteration
jpayne@68 340 cdef char *retval = self.proxy.fields[self.index]
jpayne@68 341 self.index += 1
jpayne@68 342 return force_str(retval, self.proxy.encoding) if retval != NULL else None
jpayne@68 343
jpayne@68 344
jpayne@68 345 def toDot(v):
jpayne@68 346 '''convert value to '.' if None'''
jpayne@68 347 if v is None:
jpayne@68 348 return "."
jpayne@68 349 else:
jpayne@68 350 return str(v)
jpayne@68 351
jpayne@68 352 def quote(v):
jpayne@68 353 '''return a quoted attribute.'''
jpayne@68 354 if isinstance(v, str):
jpayne@68 355 return '"%s"' % v
jpayne@68 356 else:
jpayne@68 357 return str(v)
jpayne@68 358
jpayne@68 359
jpayne@68 360 cdef class NamedTupleProxy(TupleProxy):
jpayne@68 361
jpayne@68 362 map_key2field = {}
jpayne@68 363
jpayne@68 364 def __setattr__(self, key, value):
jpayne@68 365 '''set attribute.'''
jpayne@68 366 cdef int idx
jpayne@68 367 idx, f = self.map_key2field[key]
jpayne@68 368 if idx >= self.nfields:
jpayne@68 369 raise KeyError("field %s not set" % key)
jpayne@68 370 TupleProxy.__setitem__(self, idx, str(value))
jpayne@68 371
jpayne@68 372 def __getattr__(self, key):
jpayne@68 373 cdef int idx
jpayne@68 374 idx, f = self.map_key2field[key]
jpayne@68 375 if idx >= self.nfields:
jpayne@68 376 raise KeyError("field %s not set" % key)
jpayne@68 377 if f == str:
jpayne@68 378 return force_str(self.fields[idx],
jpayne@68 379 self.encoding)
jpayne@68 380 return f(self.fields[idx])
jpayne@68 381
jpayne@68 382
jpayne@68 383 cdef dot_or_float(v):
jpayne@68 384 if v == "" or v == b".":
jpayne@68 385 return None
jpayne@68 386 else:
jpayne@68 387 try:
jpayne@68 388 return int(v)
jpayne@68 389 except ValueError:
jpayne@68 390 return float(v)
jpayne@68 391
jpayne@68 392
jpayne@68 393 cdef dot_or_int(v):
jpayne@68 394 if v == "" or v == b".":
jpayne@68 395 return None
jpayne@68 396 else:
jpayne@68 397 return int(v)
jpayne@68 398
jpayne@68 399
jpayne@68 400 cdef dot_or_str(v):
jpayne@68 401 if v == "" or v == b".":
jpayne@68 402 return None
jpayne@68 403 else:
jpayne@68 404 return force_str(v)
jpayne@68 405
jpayne@68 406
jpayne@68 407 cdef int from1based(v):
jpayne@68 408 return atoi(v) - 1
jpayne@68 409
jpayne@68 410
jpayne@68 411 cdef str to1based(int v):
jpayne@68 412 return str(v + 1)
jpayne@68 413
jpayne@68 414
jpayne@68 415 cdef class GTFProxy(NamedTupleProxy):
jpayne@68 416 '''Proxy class for access to GTF fields.
jpayne@68 417
jpayne@68 418 This class represents a GTF entry for fast read-access.
jpayne@68 419 Write-access has been added as well, though some care must
jpayne@68 420 be taken. If any of the string fields (contig, source, ...)
jpayne@68 421 are set, the new value is tied to the lifetime of the
jpayne@68 422 argument that was supplied.
jpayne@68 423
jpayne@68 424 The only exception is the attributes field when set from
jpayne@68 425 a dictionary - this field will manage its own memory.
jpayne@68 426
jpayne@68 427 '''
jpayne@68 428 separator = "; "
jpayne@68 429
jpayne@68 430 # first value is field index, the tuple contains conversion
jpayne@68 431 # functions for getting (converting internal string representation
jpayne@68 432 # to pythonic value) and setting (converting pythonic value to
jpayne@68 433 # interval string representation)
jpayne@68 434 map_key2field = {
jpayne@68 435 'contig' : (0, (str, str)),
jpayne@68 436 'source' : (1, (dot_or_str, str)),
jpayne@68 437 'feature': (2, (dot_or_str, str)),
jpayne@68 438 'start' : (3, (from1based, to1based)),
jpayne@68 439 'end' : (4, (int, int)),
jpayne@68 440 'score' : (5, (dot_or_float, toDot)),
jpayne@68 441 'strand' : (6, (dot_or_str, str)),
jpayne@68 442 'frame' : (7, (dot_or_int, toDot)),
jpayne@68 443 'attributes': (8, (str, str))}
jpayne@68 444
jpayne@68 445 def __cinit__(self):
jpayne@68 446 # automatically calls TupleProxy.__cinit__
jpayne@68 447 self.attribute_dict = None
jpayne@68 448
jpayne@68 449 cpdef int getMinFields(self):
jpayne@68 450 '''return minimum number of fields.'''
jpayne@68 451 return 9
jpayne@68 452
jpayne@68 453 cpdef int getMaxFields(self):
jpayne@68 454 '''return max number of fields.'''
jpayne@68 455 return 9
jpayne@68 456
jpayne@68 457 def to_dict(self):
jpayne@68 458 """parse attributes - return as dict
jpayne@68 459
jpayne@68 460 The dictionary can be modified to update attributes.
jpayne@68 461 """
jpayne@68 462 if not self.attribute_dict:
jpayne@68 463 self.attribute_dict = self.attribute_string2dict(
jpayne@68 464 self.attributes)
jpayne@68 465 self.is_modified = True
jpayne@68 466 return self.attribute_dict
jpayne@68 467
jpayne@68 468 def as_dict(self):
jpayne@68 469 """deprecated: use :meth:`to_dict`
jpayne@68 470 """
jpayne@68 471 return self.to_dict()
jpayne@68 472
jpayne@68 473 def from_dict(self, d):
jpayne@68 474 '''set attributes from a dictionary.'''
jpayne@68 475 self.attribute_dict = None
jpayne@68 476 attribute_string = force_bytes(
jpayne@68 477 self.attribute_dict2string(d),
jpayne@68 478 self.encoding)
jpayne@68 479 self._setindex(8, attribute_string)
jpayne@68 480
jpayne@68 481 def __str__(self):
jpayne@68 482 cdef char * cpy
jpayne@68 483 cdef int x
jpayne@68 484
jpayne@68 485 if self.is_modified:
jpayne@68 486 return "\t".join(
jpayne@68 487 (self.contig,
jpayne@68 488 toDot(self.source),
jpayne@68 489 toDot(self.feature),
jpayne@68 490 str(self.start + 1),
jpayne@68 491 str(self.end),
jpayne@68 492 toDot(self.score),
jpayne@68 493 toDot(self.strand),
jpayne@68 494 toDot(self.frame),
jpayne@68 495 self.attributes))
jpayne@68 496 else:
jpayne@68 497 return super().__str__()
jpayne@68 498
jpayne@68 499 def invert(self, int lcontig):
jpayne@68 500 '''invert coordinates to negative strand coordinates
jpayne@68 501
jpayne@68 502 This method will only act if the feature is on the
jpayne@68 503 negative strand.'''
jpayne@68 504
jpayne@68 505 if self.strand[0] == '-':
jpayne@68 506 start = min(self.start, self.end)
jpayne@68 507 end = max(self.start, self.end)
jpayne@68 508 self.start, self.end = lcontig - end, lcontig - start
jpayne@68 509
jpayne@68 510 def keys(self):
jpayne@68 511 '''return a list of attributes defined in this entry.'''
jpayne@68 512 if not self.attribute_dict:
jpayne@68 513 self.attribute_dict = self.attribute_string2dict(
jpayne@68 514 self.attributes)
jpayne@68 515 return self.attribute_dict.keys()
jpayne@68 516
jpayne@68 517 def __getitem__(self, key):
jpayne@68 518 return self.__getattr__(key)
jpayne@68 519
jpayne@68 520 def setAttribute(self, name, value):
jpayne@68 521 '''convenience method to set an attribute.
jpayne@68 522 '''
jpayne@68 523 if not self.attribute_dict:
jpayne@68 524 self.attribute_dict = self.attribute_string2dict(
jpayne@68 525 self.attributes)
jpayne@68 526 self.attribute_dict[name] = value
jpayne@68 527 self.is_modified = True
jpayne@68 528
jpayne@68 529 def attribute_string2dict(self, s):
jpayne@68 530 return collections.OrderedDict(
jpayne@68 531 self.attribute_string2iterator(s))
jpayne@68 532
jpayne@68 533 def __cmp__(self, other):
jpayne@68 534 return (self.contig, self.strand, self.start) < \
jpayne@68 535 (other.contig, other.strand, other.start)
jpayne@68 536
jpayne@68 537 # python 3 compatibility
jpayne@68 538 def __richcmp__(GTFProxy self, GTFProxy other, int op):
jpayne@68 539 if op == 0:
jpayne@68 540 return (self.contig, self.strand, self.start) < \
jpayne@68 541 (other.contig, other.strand, other.start)
jpayne@68 542 elif op == 1:
jpayne@68 543 return (self.contig, self.strand, self.start) <= \
jpayne@68 544 (other.contig, other.strand, other.start)
jpayne@68 545 elif op == 2:
jpayne@68 546 return self.compare(other) == 0
jpayne@68 547 elif op == 3:
jpayne@68 548 return self.compare(other) != 0
jpayne@68 549 else:
jpayne@68 550 err_msg = "op {0} isn't implemented yet".format(op)
jpayne@68 551 raise NotImplementedError(err_msg)
jpayne@68 552
jpayne@68 553 def dict2attribute_string(self, d):
jpayne@68 554 """convert dictionary to attribute string in GTF format.
jpayne@68 555
jpayne@68 556 """
jpayne@68 557 aa = []
jpayne@68 558 for k, v in d.items():
jpayne@68 559 if isinstance(v, str):
jpayne@68 560 aa.append('{} "{}"'.format(k, v))
jpayne@68 561 else:
jpayne@68 562 aa.append("{} {}".format(k, str(v)))
jpayne@68 563
jpayne@68 564 return self.separator.join(aa) + ";"
jpayne@68 565
jpayne@68 566 def attribute_string2iterator(self, s):
jpayne@68 567 """convert attribute string in GTF format to records
jpayne@68 568 and iterate over key, value pairs.
jpayne@68 569 """
jpayne@68 570
jpayne@68 571 # remove comments
jpayne@68 572 attributes = force_str(s, encoding=self.encoding)
jpayne@68 573
jpayne@68 574 # separate into fields
jpayne@68 575 # Fields might contain a ";", for example in ENSEMBL GTF file
jpayne@68 576 # for mouse, v78:
jpayne@68 577 # ...; transcript_name "TXNRD2;-001"; ....
jpayne@68 578 # The current heuristic is to split on a semicolon followed by a
jpayne@68 579 # space, see also http://mblab.wustl.edu/GTF22.html
jpayne@68 580
jpayne@68 581 # Remove white space to prevent a last empty field.
jpayne@68 582 fields = [x.strip() for x in attributes.strip().split("; ")]
jpayne@68 583 for f in fields:
jpayne@68 584
jpayne@68 585 # strip semicolon (GTF files without a space after the last semicolon)
jpayne@68 586 if f.endswith(";"):
jpayne@68 587 f = f[:-1]
jpayne@68 588
jpayne@68 589 # split at most once in order to avoid separating
jpayne@68 590 # multi-word values
jpayne@68 591 d = [x.strip() for x in f.split(" ", 1)]
jpayne@68 592
jpayne@68 593 n, v = d[0], d[1]
jpayne@68 594 if len(d) > 2:
jpayne@68 595 v = d[1:]
jpayne@68 596
jpayne@68 597 if v[0] == '"' and v[-1] == '"':
jpayne@68 598 v = v[1:-1]
jpayne@68 599 else:
jpayne@68 600 ## try to convert to a value
jpayne@68 601 try:
jpayne@68 602 v = float(v)
jpayne@68 603 v = int(v)
jpayne@68 604 except ValueError:
jpayne@68 605 pass
jpayne@68 606 except TypeError:
jpayne@68 607 pass
jpayne@68 608
jpayne@68 609 yield n, v
jpayne@68 610
jpayne@68 611 def __getattr__(self, key):
jpayne@68 612 """Generic lookup of attribute from GFF/GTF attributes
jpayne@68 613 """
jpayne@68 614
jpayne@68 615 # Only called if there *isn't* an attribute with this name
jpayne@68 616 cdef int idx
jpayne@68 617 idx, f = self.map_key2field.get(key, (-1, None))
jpayne@68 618 if idx >= 0:
jpayne@68 619 # deal with known attributes (fields 0-8)
jpayne@68 620 if idx == 8:
jpayne@68 621 # flush attributes if requested
jpayne@68 622 if self.is_modified and self.attribute_dict is not None:
jpayne@68 623 s = self.dict2attribute_string(self.attribute_dict)
jpayne@68 624 TupleProxy._setindex(self, idx, s)
jpayne@68 625 self.attribute_dict = None
jpayne@68 626 return s
jpayne@68 627
jpayne@68 628 if f[0] == str:
jpayne@68 629 return force_str(self.fields[idx],
jpayne@68 630 self.encoding)
jpayne@68 631 else:
jpayne@68 632 return f[0](self.fields[idx])
jpayne@68 633 else:
jpayne@68 634 # deal with generic attributes (gene_id, ...)
jpayne@68 635 if self.attribute_dict is None:
jpayne@68 636 self.attribute_dict = self.attribute_string2dict(
jpayne@68 637 self.attributes)
jpayne@68 638 return self.attribute_dict[key]
jpayne@68 639
jpayne@68 640 def __setattr__(self, key, value):
jpayne@68 641 '''set attribute.'''
jpayne@68 642
jpayne@68 643 # Note that __setattr__ is called before properties, so __setattr__ and
jpayne@68 644 # properties don't mix well. This is different from __getattr__ which is
jpayne@68 645 # called after any properties have been resolved.
jpayne@68 646 cdef int idx
jpayne@68 647 idx, f = self.map_key2field.get(key, (-1, None))
jpayne@68 648
jpayne@68 649 if idx >= 0:
jpayne@68 650 if value is None:
jpayne@68 651 s = "."
jpayne@68 652 elif f[1] == str:
jpayne@68 653 s = force_bytes(value,
jpayne@68 654 self.encoding)
jpayne@68 655 else:
jpayne@68 656 s = str(f[1](value))
jpayne@68 657 TupleProxy._setindex(self, idx, s)
jpayne@68 658 else:
jpayne@68 659 if self.attribute_dict is None:
jpayne@68 660 self.attribute_dict = self.attribute_string2dict(
jpayne@68 661 self.attributes)
jpayne@68 662 self.attribute_dict[key] = value
jpayne@68 663 self.is_modified = True
jpayne@68 664
jpayne@68 665 # for backwards compatibility
jpayne@68 666 def asDict(self, *args, **kwargs):
jpayne@68 667 return self.to_dict(*args, **kwargs)
jpayne@68 668
jpayne@68 669 def fromDict(self, *args, **kwargs):
jpayne@68 670 return self.from_dict(*args, **kwargs)
jpayne@68 671
jpayne@68 672
jpayne@68 673 cdef class GFF3Proxy(GTFProxy):
jpayne@68 674
jpayne@68 675 def dict2attribute_string(self, d):
jpayne@68 676 """convert dictionary to attribute string."""
jpayne@68 677 return ";".join(["{}={}".format(k, v) for k, v in d.items()])
jpayne@68 678
jpayne@68 679 def attribute_string2iterator(self, s):
jpayne@68 680 """convert attribute string in GFF3 format to records
jpayne@68 681 and iterate over key, value pairs.
jpayne@68 682 """
jpayne@68 683
jpayne@68 684 for f in (x.strip() for x in s.split(";")):
jpayne@68 685 if not f:
jpayne@68 686 continue
jpayne@68 687
jpayne@68 688 key, value = f.split("=", 1)
jpayne@68 689 value = value.strip()
jpayne@68 690
jpayne@68 691 ## try to convert to a value
jpayne@68 692 try:
jpayne@68 693 value = float(value)
jpayne@68 694 value = int(value)
jpayne@68 695 except ValueError:
jpayne@68 696 pass
jpayne@68 697 except TypeError:
jpayne@68 698 pass
jpayne@68 699
jpayne@68 700 yield key.strip(), value
jpayne@68 701
jpayne@68 702
jpayne@68 703 cdef class BedProxy(NamedTupleProxy):
jpayne@68 704 '''Proxy class for access to Bed fields.
jpayne@68 705
jpayne@68 706 This class represents a BED entry for fast read-access.
jpayne@68 707 '''
jpayne@68 708 map_key2field = {
jpayne@68 709 'contig' : (0, str),
jpayne@68 710 'start' : (1, int),
jpayne@68 711 'end' : (2, int),
jpayne@68 712 'name' : (3, str),
jpayne@68 713 'score' : (4, float),
jpayne@68 714 'strand' : (5, str),
jpayne@68 715 'thickStart' : (6, int),
jpayne@68 716 'thickEnd' : (7, int),
jpayne@68 717 'itemRGB' : (8, str),
jpayne@68 718 'blockCount': (9, int),
jpayne@68 719 'blockSizes': (10, str),
jpayne@68 720 'blockStarts': (11, str), }
jpayne@68 721
jpayne@68 722 cpdef int getMinFields(self):
jpayne@68 723 '''return minimum number of fields.'''
jpayne@68 724 return 3
jpayne@68 725
jpayne@68 726 cpdef int getMaxFields(self):
jpayne@68 727 '''return max number of fields.'''
jpayne@68 728 return 12
jpayne@68 729
jpayne@68 730 cdef update(self, char * buffer, size_t nbytes):
jpayne@68 731 '''update internal data.
jpayne@68 732
jpayne@68 733 nbytes does not include the terminal '\0'.
jpayne@68 734 '''
jpayne@68 735 NamedTupleProxy.update(self, buffer, nbytes)
jpayne@68 736
jpayne@68 737 if self.nfields < 3:
jpayne@68 738 raise ValueError(
jpayne@68 739 "bed format requires at least three columns")
jpayne@68 740
jpayne@68 741 # determines bed format
jpayne@68 742 self.bedfields = self.nfields
jpayne@68 743
jpayne@68 744 # do automatic conversion
jpayne@68 745 self.contig = self.fields[0]
jpayne@68 746 self.start = atoi(self.fields[1])
jpayne@68 747 self.end = atoi(self.fields[2])
jpayne@68 748
jpayne@68 749 # __setattr__ in base class seems to take precedence
jpayne@68 750 # hence implement setters in __setattr__
jpayne@68 751 #property start:
jpayne@68 752 # def __get__( self ): return self.start
jpayne@68 753 #property end:
jpayne@68 754 # def __get__( self ): return self.end
jpayne@68 755
jpayne@68 756 def __str__(self):
jpayne@68 757 cdef int save_fields = self.nfields
jpayne@68 758 # ensure fields to use correct format
jpayne@68 759 self.nfields = self.bedfields
jpayne@68 760 retval = super().__str__()
jpayne@68 761 self.nfields = save_fields
jpayne@68 762 return retval
jpayne@68 763
jpayne@68 764 def __setattr__(self, key, value):
jpayne@68 765 '''set attribute.'''
jpayne@68 766 if key == "start":
jpayne@68 767 self.start = value
jpayne@68 768 elif key == "end":
jpayne@68 769 self.end = value
jpayne@68 770
jpayne@68 771 cdef int idx
jpayne@68 772 idx, f = self.map_key2field[key]
jpayne@68 773 TupleProxy._setindex(self, idx, str(value))
jpayne@68 774
jpayne@68 775
jpayne@68 776 cdef class VCFProxy(NamedTupleProxy):
jpayne@68 777 '''Proxy class for access to VCF fields.
jpayne@68 778
jpayne@68 779 The genotypes are accessed via a numeric index.
jpayne@68 780 Sample headers are not available.
jpayne@68 781 '''
jpayne@68 782 map_key2field = {
jpayne@68 783 'contig' : (0, str),
jpayne@68 784 'pos' : (1, int),
jpayne@68 785 'id' : (2, str),
jpayne@68 786 'ref' : (3, str),
jpayne@68 787 'alt' : (4, str),
jpayne@68 788 'qual' : (5, str),
jpayne@68 789 'filter' : (6, str),
jpayne@68 790 'info' : (7, str),
jpayne@68 791 'format' : (8, str) }
jpayne@68 792
jpayne@68 793 def __cinit__(self):
jpayne@68 794 # automatically calls TupleProxy.__cinit__
jpayne@68 795 # start indexed access at genotypes
jpayne@68 796 self.offset = 9
jpayne@68 797
jpayne@68 798 cdef update(self, char * buffer, size_t nbytes):
jpayne@68 799 '''update internal data.
jpayne@68 800
jpayne@68 801 nbytes does not include the terminal '\0'.
jpayne@68 802 '''
jpayne@68 803 NamedTupleProxy.update(self, buffer, nbytes)
jpayne@68 804
jpayne@68 805 self.contig = self.fields[0]
jpayne@68 806 # vcf counts from 1 - correct here
jpayne@68 807 self.pos = atoi(self.fields[1]) - 1
jpayne@68 808
jpayne@68 809 def __len__(self):
jpayne@68 810 '''return number of genotype fields.'''
jpayne@68 811 return max(0, self.nfields - 9)
jpayne@68 812
jpayne@68 813 property pos:
jpayne@68 814 '''feature end (in 0-based open/closed coordinates).'''
jpayne@68 815 def __get__(self):
jpayne@68 816 return self.pos
jpayne@68 817
jpayne@68 818 def __setattr__(self, key, value):
jpayne@68 819 '''set attribute.'''
jpayne@68 820 if key == "pos":
jpayne@68 821 self.pos = value
jpayne@68 822 value += 1
jpayne@68 823
jpayne@68 824 cdef int idx
jpayne@68 825 idx, f = self.map_key2field[key]
jpayne@68 826 TupleProxy._setindex(self, idx, str(value))
jpayne@68 827
jpayne@68 828
jpayne@68 829 __all__ = [
jpayne@68 830 "TupleProxy",
jpayne@68 831 "NamedTupleProxy",
jpayne@68 832 "GTFProxy",
jpayne@68 833 "GFF3Proxy",
jpayne@68 834 "BedProxy",
jpayne@68 835 "VCFProxy"]