Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/featurefuncs.pyx @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 # cython: language_level=2 | |
2 # distutils: language = c++ | |
3 from cbedtools cimport Interval | |
4 from cbedtools import create_interval_from_list | |
5 | |
6 | |
7 cpdef extend_fields(Interval feature, int n): | |
8 """ | |
9 Pads the fields of the feature with "." to a total length of `n` fields, | |
10 """ | |
11 fields = feature.fields[:] | |
12 while len(fields) < n: | |
13 fields.append('.') | |
14 i = create_interval_from_list(fields) | |
15 | |
16 if n > 4 and (i[4] == '.'): | |
17 i[4] = '0' | |
18 if n > 6 and (i[6] == '.'): | |
19 i[6] = str(i.start) | |
20 if n > 7 and (i[7] == '.'): | |
21 i[7] = str(i.stop) | |
22 if n > 8 and (i[8] == '.'): | |
23 i[8] = '0,0,0' | |
24 return i | |
25 | |
26 | |
27 | |
28 cpdef center(Interval feature, int width=100): | |
29 """ | |
30 Return the *width* bp from the center of a feature. If a feature is | |
31 smaller than *width*, then return the entire feature. | |
32 """ | |
33 if len(feature) < width: | |
34 return feature | |
35 cdef int start = feature.start | |
36 cdef int stop = feature.stop | |
37 cdef int center = start + (stop - start) / 2 | |
38 halfwidth = width / 2 | |
39 feature.start = center - halfwidth | |
40 if feature.start < 1: | |
41 feature.start = 1 | |
42 if halfwidth == 0: | |
43 halfwidth = 1 | |
44 feature.stop = center + halfwidth | |
45 return feature | |
46 | |
47 | |
48 cpdef midpoint(Interval feature): | |
49 """ | |
50 Specialized version of `center()` that just returns the single-bp midpoint | |
51 """ | |
52 start = feature.start + (feature.stop - feature.start) / 2 | |
53 stop = start + 1 | |
54 feature.start = start | |
55 feature.stop = stop | |
56 return feature | |
57 | |
58 | |
59 cpdef greater_than(Interval feature, int size=100): | |
60 """ | |
61 Return True if feature length > *size* | |
62 """ | |
63 return len(feature) > size | |
64 | |
65 | |
66 cpdef less_than(Interval feature, int size=100): | |
67 """ | |
68 Return True if feature length < *size* | |
69 """ | |
70 return len(feature) < size | |
71 | |
72 | |
73 cpdef normalized_to_length(Interval feature, int idx=4, float scalar=0.001): | |
74 """ | |
75 Normalizes the value at feature[idx] to the feature's length, in kb. | |
76 | |
77 *idx*, by default, is the score field for a BED file, but specify any | |
78 integer. | |
79 | |
80 The value at *idx* will be replaced with its scaled value. | |
81 | |
82 *scalar* will be multiplied by the value at *idx*, by default this is | |
83 0.001, or per kb. | |
84 | |
85 Useful for calculating RPKM after running intersect with counts | |
86 """ | |
87 feature[idx] = str(float(feature[idx]) * scalar / len(feature)) | |
88 return feature | |
89 | |
90 | |
91 cpdef rename(Interval feature, str name): | |
92 """ | |
93 Forces a rename of all features, e.g., for renaming everything in a file | |
94 'exon' | |
95 """ | |
96 feature.name = name | |
97 return feature | |
98 | |
99 | |
100 cpdef bedgraph_scale(Interval feature, float scalar): | |
101 feature[3] = str(float(feature[3]) * scalar) | |
102 return feature | |
103 | |
104 | |
105 cpdef TSS(Interval feature, int upstream=500, int downstream=500, add_to_name=None, genome=None): | |
106 """ | |
107 Alias for five_prime. | |
108 """ | |
109 return star_prime(feature, upstream, downstream, prime=5, | |
110 add_to_name=add_to_name, genome=genome) | |
111 | |
112 | |
113 cdef star_prime(Interval feature, int upstream=500, int downstream=500, int prime=5, | |
114 add_to_name=None, genome=None): | |
115 | |
116 if prime == 5: | |
117 if feature.strand == '-': | |
118 start = feature.stop - downstream | |
119 stop = feature.stop + upstream | |
120 else: | |
121 start = feature.start - upstream | |
122 stop = feature.start + downstream | |
123 elif prime == 3: | |
124 if feature.strand == '-': | |
125 start = feature.start - downstream | |
126 stop = feature.start + upstream | |
127 else: | |
128 start = feature.stop - upstream | |
129 stop = feature.stop + downstream | |
130 if add_to_name: | |
131 try: | |
132 feature.name += add_to_name | |
133 except AttributeError: | |
134 pass | |
135 if genome is not None: | |
136 gstart, gstop = genome[feature.chrom] | |
137 stop = min(stop, gstop) | |
138 start = max(start, gstart) | |
139 if start < 0: | |
140 start = 0 | |
141 if start > stop: | |
142 start = stop | |
143 feature.start = start | |
144 feature.stop = stop | |
145 return feature | |
146 | |
147 cpdef five_prime(Interval feature, int upstream=500, int downstream=500, | |
148 add_to_name=None, genome=None): | |
149 """ | |
150 Returns the 5'-most coordinate, plus `upstream` and `downstream` bp; adds | |
151 the string `add_to_name` to the feature's name if provided (e.g., "_TSS") | |
152 | |
153 Parameters | |
154 ---------- | |
155 feature : pybedtools.Interval instance | |
156 | |
157 upstream, downstream : int | |
158 Number of bp upstream or downstream of the strand-specific start | |
159 position of the feature to include. Default is 500 for both upstream | |
160 and downstream so that the returned feature is 1kb centered on the 5' | |
161 end of the feature. Unstranded features (where strand=".") are treated | |
162 as plus-strand features. | |
163 | |
164 add_to_name : str or None | |
165 If not None, append the string suffix to the name field of the feature (for | |
166 example "_TSS"). | |
167 | |
168 genome : dict or None | |
169 If not None, then ensure that the start/stop positions are within the | |
170 boundaries of the chromosome. | |
171 """ | |
172 return star_prime(feature, upstream, downstream, prime=5, | |
173 add_to_name=add_to_name, genome=genome) | |
174 | |
175 | |
176 cpdef three_prime(Interval feature, int upstream=500, int downstream=500, | |
177 add_to_name=None, genome=None): | |
178 """ | |
179 Returns the 3'-most coordinate, plus `upstream` and `downstream` bp; adds | |
180 the string `add_to_name` to the feature's name if provided (e.g., | |
181 "_polyA_site") | |
182 | |
183 Parameters | |
184 ---------- | |
185 feature : pybedtools.Interval instance | |
186 | |
187 upstream, downstrea : int | |
188 Number of bp upstream or downstream of the strand-specific stop | |
189 position of the feature to include. Default is 500 for both upstream | |
190 and downstream so that the returned feature is 1kb centered on the 5' | |
191 end of the feature. Unstranded features (where strand=".") are treated | |
192 as plus-strand features. | |
193 | |
194 add_to_name : str or None | |
195 If not None, append the string suffix to the name field of the feature (for | |
196 example "_TSS"). | |
197 | |
198 genome : dict or None | |
199 If not None, then ensure that the start/stop positions are within the | |
200 boundaries of the chromosome. | |
201 | |
202 | |
203 """ | |
204 return star_prime(feature, upstream, downstream, prime=3, | |
205 add_to_name=add_to_name, genome=genome) | |
206 | |
207 cpdef add_color(Interval feature, cmap, norm): | |
208 """ | |
209 Signature: | |
210 | |
211 add_color(feature, cmap, norm) | |
212 | |
213 Given the matplotlib colormap `cmap` and the matplotlib Normalize instance | |
214 `norm`, return a new 9-field feature (extended out if needed) with the RGB | |
215 tuple set according to the score. | |
216 """ | |
217 if len(feature.fields) < 9: | |
218 feature = extend_fields(feature, 9) | |
219 feature[6] = str(feature.start) | |
220 feature[7] = str(feature.stop) | |
221 | |
222 rgb_float = cmap(norm(float(feature.score))) | |
223 feature[8] = ','.join([str(int(i * 255)) for i in rgb_float[:3]]) | |
224 return feature | |
225 | |
226 | |
227 cpdef gff2bed(Interval feature, name_field=None): | |
228 """ | |
229 Signature: | |
230 | |
231 gff2bed(feature, name_field=None) | |
232 | |
233 Converts a GFF feature into a BED6 feature. By default, the name of the | |
234 new BED will be feature.name, but if `name_field` is provided then the name | |
235 of the new BED will be feature.attrs[name_field]. | |
236 | |
237 `name_field` can also be an integer to index into the fields of the object, | |
238 so if you want the BED name to be the GFF featuretype, then you can use | |
239 `name_field=2`. | |
240 | |
241 If the specified field does not exist, then "." will be used for the name. | |
242 """ | |
243 if name_field is None: | |
244 name = feature.name | |
245 else: | |
246 try: | |
247 if isinstance(name_field, basestring): | |
248 name = feature.attrs[name_field] | |
249 if isinstance(name_field, int): | |
250 name = feature[name_field] | |
251 except (NameError, KeyError): | |
252 name = "." | |
253 return create_interval_from_list([ | |
254 str(feature.chrom), | |
255 str(feature.start), | |
256 str(feature.stop), | |
257 name, | |
258 feature.score, | |
259 feature.strand]) | |
260 | |
261 | |
262 cpdef bed2gff(Interval feature): | |
263 """ | |
264 Signature: | |
265 | |
266 bed2gff(feature) | |
267 | |
268 Converts a BED feature (BED3 through BED12) into a GFF format. | |
269 | |
270 Chrom, start, stop, score, and strand are put directly into the | |
271 corresponding GFF fields. Other BED fields are put into the GFF attributes | |
272 field, named according to the UCSC BED format definition. | |
273 | |
274 If there are more than 12 BED fields, the additional fields will be added | |
275 to the GFF attributes using the 0-based index (so starting at "12") as the | |
276 key. | |
277 | |
278 GFF fields that do not have a direct mapping to BED format (feature type, | |
279 source, phase) are set to ".". | |
280 | |
281 1 bp is added to the start position to finish the conversion to GFF. | |
282 """ | |
283 | |
284 # Note that Interval.score, .strand, and .name have a default of ".", so no | |
285 # need to do the extra try/except IndexError for those fields. | |
286 mapping = ( | |
287 (6, "thickStart"), | |
288 (7, "thickEnd"), | |
289 (8, "itemRgb"), | |
290 (9, "blockCount"), | |
291 (10, "blockSizes"), | |
292 (11, "blockStarts") | |
293 ) | |
294 | |
295 # Add any standard BED fields we might have | |
296 attributes = ['Name="%s"' % feature.name] | |
297 for k, v in mapping: | |
298 try: | |
299 attributes.append('%s="%s"' % (v, feature.fields[k])) | |
300 except IndexError: | |
301 break | |
302 | |
303 # Add any additional fields, keyed by their index | |
304 if len(feature.fields) > 12: | |
305 for i in range(12, len(feature.fields)): | |
306 attributes.append('%s="%s"' % (i, feature.fields[i])) | |
307 | |
308 attributes = '; '.join(attributes) + ';' | |
309 | |
310 return create_interval_from_list([ | |
311 str(feature.chrom), | |
312 '.', | |
313 '.', | |
314 str(feature.start + 1), | |
315 str(feature.stop), | |
316 feature.score, | |
317 feature.strand, | |
318 '.', | |
319 attributes]) | |
320 | |
321 | |
322 class UniqueID(object): | |
323 def __init__(self, pattern="%d", first=0): | |
324 """ | |
325 Class to help create uniquely-named features. | |
326 | |
327 Example usage: | |
328 | |
329 >>> a = pybedtools.example_bedtool('a.bed') | |
330 >>> uid = UniqueID("f_%d") | |
331 >>> print(a.each(uid)) # doctest: +NORMALIZE_WHITESPACE | |
332 chr1 1 100 f_0 0 + | |
333 chr1 100 200 f_1 0 + | |
334 chr1 150 500 f_2 0 - | |
335 chr1 900 950 f_3 0 + | |
336 | |
337 Parameters | |
338 ---------- | |
339 pattern : str | |
340 | |
341 Pattern will be filled in using `% self.count` | |
342 | |
343 first : int | |
344 `self.count` will be initialzed to this value. | |
345 | |
346 """ | |
347 self.pattern = pattern | |
348 self.first = first | |
349 self.count = first | |
350 | |
351 def __call__(self, feature): | |
352 feature.name = self.pattern % self.count | |
353 self.count += 1 | |
354 return feature | |
355 |