comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/featurefuncs.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 # cython: language_level=2
2 # distutils: language = c++
3 from cbedtools cimport Interval
4 from cbedtools import create_interval_from_list
5
6
7 cpdef extend_fields(Interval feature, int n):
8 """
9 Pads the fields of the feature with "." to a total length of `n` fields,
10 """
11 fields = feature.fields[:]
12 while len(fields) < n:
13 fields.append('.')
14 i = create_interval_from_list(fields)
15
16 if n > 4 and (i[4] == '.'):
17 i[4] = '0'
18 if n > 6 and (i[6] == '.'):
19 i[6] = str(i.start)
20 if n > 7 and (i[7] == '.'):
21 i[7] = str(i.stop)
22 if n > 8 and (i[8] == '.'):
23 i[8] = '0,0,0'
24 return i
25
26
27
28 cpdef center(Interval feature, int width=100):
29 """
30 Return the *width* bp from the center of a feature. If a feature is
31 smaller than *width*, then return the entire feature.
32 """
33 if len(feature) < width:
34 return feature
35 cdef int start = feature.start
36 cdef int stop = feature.stop
37 cdef int center = start + (stop - start) / 2
38 halfwidth = width / 2
39 feature.start = center - halfwidth
40 if feature.start < 1:
41 feature.start = 1
42 if halfwidth == 0:
43 halfwidth = 1
44 feature.stop = center + halfwidth
45 return feature
46
47
48 cpdef midpoint(Interval feature):
49 """
50 Specialized version of `center()` that just returns the single-bp midpoint
51 """
52 start = feature.start + (feature.stop - feature.start) / 2
53 stop = start + 1
54 feature.start = start
55 feature.stop = stop
56 return feature
57
58
59 cpdef greater_than(Interval feature, int size=100):
60 """
61 Return True if feature length > *size*
62 """
63 return len(feature) > size
64
65
66 cpdef less_than(Interval feature, int size=100):
67 """
68 Return True if feature length < *size*
69 """
70 return len(feature) < size
71
72
73 cpdef normalized_to_length(Interval feature, int idx=4, float scalar=0.001):
74 """
75 Normalizes the value at feature[idx] to the feature's length, in kb.
76
77 *idx*, by default, is the score field for a BED file, but specify any
78 integer.
79
80 The value at *idx* will be replaced with its scaled value.
81
82 *scalar* will be multiplied by the value at *idx*, by default this is
83 0.001, or per kb.
84
85 Useful for calculating RPKM after running intersect with counts
86 """
87 feature[idx] = str(float(feature[idx]) * scalar / len(feature))
88 return feature
89
90
91 cpdef rename(Interval feature, str name):
92 """
93 Forces a rename of all features, e.g., for renaming everything in a file
94 'exon'
95 """
96 feature.name = name
97 return feature
98
99
100 cpdef bedgraph_scale(Interval feature, float scalar):
101 feature[3] = str(float(feature[3]) * scalar)
102 return feature
103
104
105 cpdef TSS(Interval feature, int upstream=500, int downstream=500, add_to_name=None, genome=None):
106 """
107 Alias for five_prime.
108 """
109 return star_prime(feature, upstream, downstream, prime=5,
110 add_to_name=add_to_name, genome=genome)
111
112
113 cdef star_prime(Interval feature, int upstream=500, int downstream=500, int prime=5,
114 add_to_name=None, genome=None):
115
116 if prime == 5:
117 if feature.strand == '-':
118 start = feature.stop - downstream
119 stop = feature.stop + upstream
120 else:
121 start = feature.start - upstream
122 stop = feature.start + downstream
123 elif prime == 3:
124 if feature.strand == '-':
125 start = feature.start - downstream
126 stop = feature.start + upstream
127 else:
128 start = feature.stop - upstream
129 stop = feature.stop + downstream
130 if add_to_name:
131 try:
132 feature.name += add_to_name
133 except AttributeError:
134 pass
135 if genome is not None:
136 gstart, gstop = genome[feature.chrom]
137 stop = min(stop, gstop)
138 start = max(start, gstart)
139 if start < 0:
140 start = 0
141 if start > stop:
142 start = stop
143 feature.start = start
144 feature.stop = stop
145 return feature
146
147 cpdef five_prime(Interval feature, int upstream=500, int downstream=500,
148 add_to_name=None, genome=None):
149 """
150 Returns the 5'-most coordinate, plus `upstream` and `downstream` bp; adds
151 the string `add_to_name` to the feature's name if provided (e.g., "_TSS")
152
153 Parameters
154 ----------
155 feature : pybedtools.Interval instance
156
157 upstream, downstream : int
158 Number of bp upstream or downstream of the strand-specific start
159 position of the feature to include. Default is 500 for both upstream
160 and downstream so that the returned feature is 1kb centered on the 5'
161 end of the feature. Unstranded features (where strand=".") are treated
162 as plus-strand features.
163
164 add_to_name : str or None
165 If not None, append the string suffix to the name field of the feature (for
166 example "_TSS").
167
168 genome : dict or None
169 If not None, then ensure that the start/stop positions are within the
170 boundaries of the chromosome.
171 """
172 return star_prime(feature, upstream, downstream, prime=5,
173 add_to_name=add_to_name, genome=genome)
174
175
176 cpdef three_prime(Interval feature, int upstream=500, int downstream=500,
177 add_to_name=None, genome=None):
178 """
179 Returns the 3'-most coordinate, plus `upstream` and `downstream` bp; adds
180 the string `add_to_name` to the feature's name if provided (e.g.,
181 "_polyA_site")
182
183 Parameters
184 ----------
185 feature : pybedtools.Interval instance
186
187 upstream, downstrea : int
188 Number of bp upstream or downstream of the strand-specific stop
189 position of the feature to include. Default is 500 for both upstream
190 and downstream so that the returned feature is 1kb centered on the 5'
191 end of the feature. Unstranded features (where strand=".") are treated
192 as plus-strand features.
193
194 add_to_name : str or None
195 If not None, append the string suffix to the name field of the feature (for
196 example "_TSS").
197
198 genome : dict or None
199 If not None, then ensure that the start/stop positions are within the
200 boundaries of the chromosome.
201
202
203 """
204 return star_prime(feature, upstream, downstream, prime=3,
205 add_to_name=add_to_name, genome=genome)
206
207 cpdef add_color(Interval feature, cmap, norm):
208 """
209 Signature:
210
211 add_color(feature, cmap, norm)
212
213 Given the matplotlib colormap `cmap` and the matplotlib Normalize instance
214 `norm`, return a new 9-field feature (extended out if needed) with the RGB
215 tuple set according to the score.
216 """
217 if len(feature.fields) < 9:
218 feature = extend_fields(feature, 9)
219 feature[6] = str(feature.start)
220 feature[7] = str(feature.stop)
221
222 rgb_float = cmap(norm(float(feature.score)))
223 feature[8] = ','.join([str(int(i * 255)) for i in rgb_float[:3]])
224 return feature
225
226
227 cpdef gff2bed(Interval feature, name_field=None):
228 """
229 Signature:
230
231 gff2bed(feature, name_field=None)
232
233 Converts a GFF feature into a BED6 feature. By default, the name of the
234 new BED will be feature.name, but if `name_field` is provided then the name
235 of the new BED will be feature.attrs[name_field].
236
237 `name_field` can also be an integer to index into the fields of the object,
238 so if you want the BED name to be the GFF featuretype, then you can use
239 `name_field=2`.
240
241 If the specified field does not exist, then "." will be used for the name.
242 """
243 if name_field is None:
244 name = feature.name
245 else:
246 try:
247 if isinstance(name_field, basestring):
248 name = feature.attrs[name_field]
249 if isinstance(name_field, int):
250 name = feature[name_field]
251 except (NameError, KeyError):
252 name = "."
253 return create_interval_from_list([
254 str(feature.chrom),
255 str(feature.start),
256 str(feature.stop),
257 name,
258 feature.score,
259 feature.strand])
260
261
262 cpdef bed2gff(Interval feature):
263 """
264 Signature:
265
266 bed2gff(feature)
267
268 Converts a BED feature (BED3 through BED12) into a GFF format.
269
270 Chrom, start, stop, score, and strand are put directly into the
271 corresponding GFF fields. Other BED fields are put into the GFF attributes
272 field, named according to the UCSC BED format definition.
273
274 If there are more than 12 BED fields, the additional fields will be added
275 to the GFF attributes using the 0-based index (so starting at "12") as the
276 key.
277
278 GFF fields that do not have a direct mapping to BED format (feature type,
279 source, phase) are set to ".".
280
281 1 bp is added to the start position to finish the conversion to GFF.
282 """
283
284 # Note that Interval.score, .strand, and .name have a default of ".", so no
285 # need to do the extra try/except IndexError for those fields.
286 mapping = (
287 (6, "thickStart"),
288 (7, "thickEnd"),
289 (8, "itemRgb"),
290 (9, "blockCount"),
291 (10, "blockSizes"),
292 (11, "blockStarts")
293 )
294
295 # Add any standard BED fields we might have
296 attributes = ['Name="%s"' % feature.name]
297 for k, v in mapping:
298 try:
299 attributes.append('%s="%s"' % (v, feature.fields[k]))
300 except IndexError:
301 break
302
303 # Add any additional fields, keyed by their index
304 if len(feature.fields) > 12:
305 for i in range(12, len(feature.fields)):
306 attributes.append('%s="%s"' % (i, feature.fields[i]))
307
308 attributes = '; '.join(attributes) + ';'
309
310 return create_interval_from_list([
311 str(feature.chrom),
312 '.',
313 '.',
314 str(feature.start + 1),
315 str(feature.stop),
316 feature.score,
317 feature.strand,
318 '.',
319 attributes])
320
321
322 class UniqueID(object):
323 def __init__(self, pattern="%d", first=0):
324 """
325 Class to help create uniquely-named features.
326
327 Example usage:
328
329 >>> a = pybedtools.example_bedtool('a.bed')
330 >>> uid = UniqueID("f_%d")
331 >>> print(a.each(uid)) # doctest: +NORMALIZE_WHITESPACE
332 chr1 1 100 f_0 0 +
333 chr1 100 200 f_1 0 +
334 chr1 150 500 f_2 0 -
335 chr1 900 950 f_3 0 +
336
337 Parameters
338 ----------
339 pattern : str
340
341 Pattern will be filled in using `% self.count`
342
343 first : int
344 `self.count` will be initialzed to this value.
345
346 """
347 self.pattern = pattern
348 self.first = first
349 self.count = first
350
351 def __call__(self, feature):
352 feature.name = self.pattern % self.count
353 self.count += 1
354 return feature
355