annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/featurefuncs.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # cython: language_level=2
jpayne@68 2 # distutils: language = c++
jpayne@68 3 from cbedtools cimport Interval
jpayne@68 4 from cbedtools import create_interval_from_list
jpayne@68 5
jpayne@68 6
jpayne@68 7 cpdef extend_fields(Interval feature, int n):
jpayne@68 8 """
jpayne@68 9 Pads the fields of the feature with "." to a total length of `n` fields,
jpayne@68 10 """
jpayne@68 11 fields = feature.fields[:]
jpayne@68 12 while len(fields) < n:
jpayne@68 13 fields.append('.')
jpayne@68 14 i = create_interval_from_list(fields)
jpayne@68 15
jpayne@68 16 if n > 4 and (i[4] == '.'):
jpayne@68 17 i[4] = '0'
jpayne@68 18 if n > 6 and (i[6] == '.'):
jpayne@68 19 i[6] = str(i.start)
jpayne@68 20 if n > 7 and (i[7] == '.'):
jpayne@68 21 i[7] = str(i.stop)
jpayne@68 22 if n > 8 and (i[8] == '.'):
jpayne@68 23 i[8] = '0,0,0'
jpayne@68 24 return i
jpayne@68 25
jpayne@68 26
jpayne@68 27
jpayne@68 28 cpdef center(Interval feature, int width=100):
jpayne@68 29 """
jpayne@68 30 Return the *width* bp from the center of a feature. If a feature is
jpayne@68 31 smaller than *width*, then return the entire feature.
jpayne@68 32 """
jpayne@68 33 if len(feature) < width:
jpayne@68 34 return feature
jpayne@68 35 cdef int start = feature.start
jpayne@68 36 cdef int stop = feature.stop
jpayne@68 37 cdef int center = start + (stop - start) / 2
jpayne@68 38 halfwidth = width / 2
jpayne@68 39 feature.start = center - halfwidth
jpayne@68 40 if feature.start < 1:
jpayne@68 41 feature.start = 1
jpayne@68 42 if halfwidth == 0:
jpayne@68 43 halfwidth = 1
jpayne@68 44 feature.stop = center + halfwidth
jpayne@68 45 return feature
jpayne@68 46
jpayne@68 47
jpayne@68 48 cpdef midpoint(Interval feature):
jpayne@68 49 """
jpayne@68 50 Specialized version of `center()` that just returns the single-bp midpoint
jpayne@68 51 """
jpayne@68 52 start = feature.start + (feature.stop - feature.start) / 2
jpayne@68 53 stop = start + 1
jpayne@68 54 feature.start = start
jpayne@68 55 feature.stop = stop
jpayne@68 56 return feature
jpayne@68 57
jpayne@68 58
jpayne@68 59 cpdef greater_than(Interval feature, int size=100):
jpayne@68 60 """
jpayne@68 61 Return True if feature length > *size*
jpayne@68 62 """
jpayne@68 63 return len(feature) > size
jpayne@68 64
jpayne@68 65
jpayne@68 66 cpdef less_than(Interval feature, int size=100):
jpayne@68 67 """
jpayne@68 68 Return True if feature length < *size*
jpayne@68 69 """
jpayne@68 70 return len(feature) < size
jpayne@68 71
jpayne@68 72
jpayne@68 73 cpdef normalized_to_length(Interval feature, int idx=4, float scalar=0.001):
jpayne@68 74 """
jpayne@68 75 Normalizes the value at feature[idx] to the feature's length, in kb.
jpayne@68 76
jpayne@68 77 *idx*, by default, is the score field for a BED file, but specify any
jpayne@68 78 integer.
jpayne@68 79
jpayne@68 80 The value at *idx* will be replaced with its scaled value.
jpayne@68 81
jpayne@68 82 *scalar* will be multiplied by the value at *idx*, by default this is
jpayne@68 83 0.001, or per kb.
jpayne@68 84
jpayne@68 85 Useful for calculating RPKM after running intersect with counts
jpayne@68 86 """
jpayne@68 87 feature[idx] = str(float(feature[idx]) * scalar / len(feature))
jpayne@68 88 return feature
jpayne@68 89
jpayne@68 90
jpayne@68 91 cpdef rename(Interval feature, str name):
jpayne@68 92 """
jpayne@68 93 Forces a rename of all features, e.g., for renaming everything in a file
jpayne@68 94 'exon'
jpayne@68 95 """
jpayne@68 96 feature.name = name
jpayne@68 97 return feature
jpayne@68 98
jpayne@68 99
jpayne@68 100 cpdef bedgraph_scale(Interval feature, float scalar):
jpayne@68 101 feature[3] = str(float(feature[3]) * scalar)
jpayne@68 102 return feature
jpayne@68 103
jpayne@68 104
jpayne@68 105 cpdef TSS(Interval feature, int upstream=500, int downstream=500, add_to_name=None, genome=None):
jpayne@68 106 """
jpayne@68 107 Alias for five_prime.
jpayne@68 108 """
jpayne@68 109 return star_prime(feature, upstream, downstream, prime=5,
jpayne@68 110 add_to_name=add_to_name, genome=genome)
jpayne@68 111
jpayne@68 112
jpayne@68 113 cdef star_prime(Interval feature, int upstream=500, int downstream=500, int prime=5,
jpayne@68 114 add_to_name=None, genome=None):
jpayne@68 115
jpayne@68 116 if prime == 5:
jpayne@68 117 if feature.strand == '-':
jpayne@68 118 start = feature.stop - downstream
jpayne@68 119 stop = feature.stop + upstream
jpayne@68 120 else:
jpayne@68 121 start = feature.start - upstream
jpayne@68 122 stop = feature.start + downstream
jpayne@68 123 elif prime == 3:
jpayne@68 124 if feature.strand == '-':
jpayne@68 125 start = feature.start - downstream
jpayne@68 126 stop = feature.start + upstream
jpayne@68 127 else:
jpayne@68 128 start = feature.stop - upstream
jpayne@68 129 stop = feature.stop + downstream
jpayne@68 130 if add_to_name:
jpayne@68 131 try:
jpayne@68 132 feature.name += add_to_name
jpayne@68 133 except AttributeError:
jpayne@68 134 pass
jpayne@68 135 if genome is not None:
jpayne@68 136 gstart, gstop = genome[feature.chrom]
jpayne@68 137 stop = min(stop, gstop)
jpayne@68 138 start = max(start, gstart)
jpayne@68 139 if start < 0:
jpayne@68 140 start = 0
jpayne@68 141 if start > stop:
jpayne@68 142 start = stop
jpayne@68 143 feature.start = start
jpayne@68 144 feature.stop = stop
jpayne@68 145 return feature
jpayne@68 146
jpayne@68 147 cpdef five_prime(Interval feature, int upstream=500, int downstream=500,
jpayne@68 148 add_to_name=None, genome=None):
jpayne@68 149 """
jpayne@68 150 Returns the 5'-most coordinate, plus `upstream` and `downstream` bp; adds
jpayne@68 151 the string `add_to_name` to the feature's name if provided (e.g., "_TSS")
jpayne@68 152
jpayne@68 153 Parameters
jpayne@68 154 ----------
jpayne@68 155 feature : pybedtools.Interval instance
jpayne@68 156
jpayne@68 157 upstream, downstream : int
jpayne@68 158 Number of bp upstream or downstream of the strand-specific start
jpayne@68 159 position of the feature to include. Default is 500 for both upstream
jpayne@68 160 and downstream so that the returned feature is 1kb centered on the 5'
jpayne@68 161 end of the feature. Unstranded features (where strand=".") are treated
jpayne@68 162 as plus-strand features.
jpayne@68 163
jpayne@68 164 add_to_name : str or None
jpayne@68 165 If not None, append the string suffix to the name field of the feature (for
jpayne@68 166 example "_TSS").
jpayne@68 167
jpayne@68 168 genome : dict or None
jpayne@68 169 If not None, then ensure that the start/stop positions are within the
jpayne@68 170 boundaries of the chromosome.
jpayne@68 171 """
jpayne@68 172 return star_prime(feature, upstream, downstream, prime=5,
jpayne@68 173 add_to_name=add_to_name, genome=genome)
jpayne@68 174
jpayne@68 175
jpayne@68 176 cpdef three_prime(Interval feature, int upstream=500, int downstream=500,
jpayne@68 177 add_to_name=None, genome=None):
jpayne@68 178 """
jpayne@68 179 Returns the 3'-most coordinate, plus `upstream` and `downstream` bp; adds
jpayne@68 180 the string `add_to_name` to the feature's name if provided (e.g.,
jpayne@68 181 "_polyA_site")
jpayne@68 182
jpayne@68 183 Parameters
jpayne@68 184 ----------
jpayne@68 185 feature : pybedtools.Interval instance
jpayne@68 186
jpayne@68 187 upstream, downstrea : int
jpayne@68 188 Number of bp upstream or downstream of the strand-specific stop
jpayne@68 189 position of the feature to include. Default is 500 for both upstream
jpayne@68 190 and downstream so that the returned feature is 1kb centered on the 5'
jpayne@68 191 end of the feature. Unstranded features (where strand=".") are treated
jpayne@68 192 as plus-strand features.
jpayne@68 193
jpayne@68 194 add_to_name : str or None
jpayne@68 195 If not None, append the string suffix to the name field of the feature (for
jpayne@68 196 example "_TSS").
jpayne@68 197
jpayne@68 198 genome : dict or None
jpayne@68 199 If not None, then ensure that the start/stop positions are within the
jpayne@68 200 boundaries of the chromosome.
jpayne@68 201
jpayne@68 202
jpayne@68 203 """
jpayne@68 204 return star_prime(feature, upstream, downstream, prime=3,
jpayne@68 205 add_to_name=add_to_name, genome=genome)
jpayne@68 206
jpayne@68 207 cpdef add_color(Interval feature, cmap, norm):
jpayne@68 208 """
jpayne@68 209 Signature:
jpayne@68 210
jpayne@68 211 add_color(feature, cmap, norm)
jpayne@68 212
jpayne@68 213 Given the matplotlib colormap `cmap` and the matplotlib Normalize instance
jpayne@68 214 `norm`, return a new 9-field feature (extended out if needed) with the RGB
jpayne@68 215 tuple set according to the score.
jpayne@68 216 """
jpayne@68 217 if len(feature.fields) < 9:
jpayne@68 218 feature = extend_fields(feature, 9)
jpayne@68 219 feature[6] = str(feature.start)
jpayne@68 220 feature[7] = str(feature.stop)
jpayne@68 221
jpayne@68 222 rgb_float = cmap(norm(float(feature.score)))
jpayne@68 223 feature[8] = ','.join([str(int(i * 255)) for i in rgb_float[:3]])
jpayne@68 224 return feature
jpayne@68 225
jpayne@68 226
jpayne@68 227 cpdef gff2bed(Interval feature, name_field=None):
jpayne@68 228 """
jpayne@68 229 Signature:
jpayne@68 230
jpayne@68 231 gff2bed(feature, name_field=None)
jpayne@68 232
jpayne@68 233 Converts a GFF feature into a BED6 feature. By default, the name of the
jpayne@68 234 new BED will be feature.name, but if `name_field` is provided then the name
jpayne@68 235 of the new BED will be feature.attrs[name_field].
jpayne@68 236
jpayne@68 237 `name_field` can also be an integer to index into the fields of the object,
jpayne@68 238 so if you want the BED name to be the GFF featuretype, then you can use
jpayne@68 239 `name_field=2`.
jpayne@68 240
jpayne@68 241 If the specified field does not exist, then "." will be used for the name.
jpayne@68 242 """
jpayne@68 243 if name_field is None:
jpayne@68 244 name = feature.name
jpayne@68 245 else:
jpayne@68 246 try:
jpayne@68 247 if isinstance(name_field, basestring):
jpayne@68 248 name = feature.attrs[name_field]
jpayne@68 249 if isinstance(name_field, int):
jpayne@68 250 name = feature[name_field]
jpayne@68 251 except (NameError, KeyError):
jpayne@68 252 name = "."
jpayne@68 253 return create_interval_from_list([
jpayne@68 254 str(feature.chrom),
jpayne@68 255 str(feature.start),
jpayne@68 256 str(feature.stop),
jpayne@68 257 name,
jpayne@68 258 feature.score,
jpayne@68 259 feature.strand])
jpayne@68 260
jpayne@68 261
jpayne@68 262 cpdef bed2gff(Interval feature):
jpayne@68 263 """
jpayne@68 264 Signature:
jpayne@68 265
jpayne@68 266 bed2gff(feature)
jpayne@68 267
jpayne@68 268 Converts a BED feature (BED3 through BED12) into a GFF format.
jpayne@68 269
jpayne@68 270 Chrom, start, stop, score, and strand are put directly into the
jpayne@68 271 corresponding GFF fields. Other BED fields are put into the GFF attributes
jpayne@68 272 field, named according to the UCSC BED format definition.
jpayne@68 273
jpayne@68 274 If there are more than 12 BED fields, the additional fields will be added
jpayne@68 275 to the GFF attributes using the 0-based index (so starting at "12") as the
jpayne@68 276 key.
jpayne@68 277
jpayne@68 278 GFF fields that do not have a direct mapping to BED format (feature type,
jpayne@68 279 source, phase) are set to ".".
jpayne@68 280
jpayne@68 281 1 bp is added to the start position to finish the conversion to GFF.
jpayne@68 282 """
jpayne@68 283
jpayne@68 284 # Note that Interval.score, .strand, and .name have a default of ".", so no
jpayne@68 285 # need to do the extra try/except IndexError for those fields.
jpayne@68 286 mapping = (
jpayne@68 287 (6, "thickStart"),
jpayne@68 288 (7, "thickEnd"),
jpayne@68 289 (8, "itemRgb"),
jpayne@68 290 (9, "blockCount"),
jpayne@68 291 (10, "blockSizes"),
jpayne@68 292 (11, "blockStarts")
jpayne@68 293 )
jpayne@68 294
jpayne@68 295 # Add any standard BED fields we might have
jpayne@68 296 attributes = ['Name="%s"' % feature.name]
jpayne@68 297 for k, v in mapping:
jpayne@68 298 try:
jpayne@68 299 attributes.append('%s="%s"' % (v, feature.fields[k]))
jpayne@68 300 except IndexError:
jpayne@68 301 break
jpayne@68 302
jpayne@68 303 # Add any additional fields, keyed by their index
jpayne@68 304 if len(feature.fields) > 12:
jpayne@68 305 for i in range(12, len(feature.fields)):
jpayne@68 306 attributes.append('%s="%s"' % (i, feature.fields[i]))
jpayne@68 307
jpayne@68 308 attributes = '; '.join(attributes) + ';'
jpayne@68 309
jpayne@68 310 return create_interval_from_list([
jpayne@68 311 str(feature.chrom),
jpayne@68 312 '.',
jpayne@68 313 '.',
jpayne@68 314 str(feature.start + 1),
jpayne@68 315 str(feature.stop),
jpayne@68 316 feature.score,
jpayne@68 317 feature.strand,
jpayne@68 318 '.',
jpayne@68 319 attributes])
jpayne@68 320
jpayne@68 321
jpayne@68 322 class UniqueID(object):
jpayne@68 323 def __init__(self, pattern="%d", first=0):
jpayne@68 324 """
jpayne@68 325 Class to help create uniquely-named features.
jpayne@68 326
jpayne@68 327 Example usage:
jpayne@68 328
jpayne@68 329 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 330 >>> uid = UniqueID("f_%d")
jpayne@68 331 >>> print(a.each(uid)) # doctest: +NORMALIZE_WHITESPACE
jpayne@68 332 chr1 1 100 f_0 0 +
jpayne@68 333 chr1 100 200 f_1 0 +
jpayne@68 334 chr1 150 500 f_2 0 -
jpayne@68 335 chr1 900 950 f_3 0 +
jpayne@68 336
jpayne@68 337 Parameters
jpayne@68 338 ----------
jpayne@68 339 pattern : str
jpayne@68 340
jpayne@68 341 Pattern will be filled in using `% self.count`
jpayne@68 342
jpayne@68 343 first : int
jpayne@68 344 `self.count` will be initialzed to this value.
jpayne@68 345
jpayne@68 346 """
jpayne@68 347 self.pattern = pattern
jpayne@68 348 self.first = first
jpayne@68 349 self.count = first
jpayne@68 350
jpayne@68 351 def __call__(self, feature):
jpayne@68 352 feature.name = self.pattern % self.count
jpayne@68 353 self.count += 1
jpayne@68 354 return feature
jpayne@68 355