jpayne@68
|
1 # cython: language_level=2
|
jpayne@68
|
2 # distutils: language = c++
|
jpayne@68
|
3 from cbedtools cimport Interval
|
jpayne@68
|
4 from cbedtools import create_interval_from_list
|
jpayne@68
|
5
|
jpayne@68
|
6
|
jpayne@68
|
7 cpdef extend_fields(Interval feature, int n):
|
jpayne@68
|
8 """
|
jpayne@68
|
9 Pads the fields of the feature with "." to a total length of `n` fields,
|
jpayne@68
|
10 """
|
jpayne@68
|
11 fields = feature.fields[:]
|
jpayne@68
|
12 while len(fields) < n:
|
jpayne@68
|
13 fields.append('.')
|
jpayne@68
|
14 i = create_interval_from_list(fields)
|
jpayne@68
|
15
|
jpayne@68
|
16 if n > 4 and (i[4] == '.'):
|
jpayne@68
|
17 i[4] = '0'
|
jpayne@68
|
18 if n > 6 and (i[6] == '.'):
|
jpayne@68
|
19 i[6] = str(i.start)
|
jpayne@68
|
20 if n > 7 and (i[7] == '.'):
|
jpayne@68
|
21 i[7] = str(i.stop)
|
jpayne@68
|
22 if n > 8 and (i[8] == '.'):
|
jpayne@68
|
23 i[8] = '0,0,0'
|
jpayne@68
|
24 return i
|
jpayne@68
|
25
|
jpayne@68
|
26
|
jpayne@68
|
27
|
jpayne@68
|
28 cpdef center(Interval feature, int width=100):
|
jpayne@68
|
29 """
|
jpayne@68
|
30 Return the *width* bp from the center of a feature. If a feature is
|
jpayne@68
|
31 smaller than *width*, then return the entire feature.
|
jpayne@68
|
32 """
|
jpayne@68
|
33 if len(feature) < width:
|
jpayne@68
|
34 return feature
|
jpayne@68
|
35 cdef int start = feature.start
|
jpayne@68
|
36 cdef int stop = feature.stop
|
jpayne@68
|
37 cdef int center = start + (stop - start) / 2
|
jpayne@68
|
38 halfwidth = width / 2
|
jpayne@68
|
39 feature.start = center - halfwidth
|
jpayne@68
|
40 if feature.start < 1:
|
jpayne@68
|
41 feature.start = 1
|
jpayne@68
|
42 if halfwidth == 0:
|
jpayne@68
|
43 halfwidth = 1
|
jpayne@68
|
44 feature.stop = center + halfwidth
|
jpayne@68
|
45 return feature
|
jpayne@68
|
46
|
jpayne@68
|
47
|
jpayne@68
|
48 cpdef midpoint(Interval feature):
|
jpayne@68
|
49 """
|
jpayne@68
|
50 Specialized version of `center()` that just returns the single-bp midpoint
|
jpayne@68
|
51 """
|
jpayne@68
|
52 start = feature.start + (feature.stop - feature.start) / 2
|
jpayne@68
|
53 stop = start + 1
|
jpayne@68
|
54 feature.start = start
|
jpayne@68
|
55 feature.stop = stop
|
jpayne@68
|
56 return feature
|
jpayne@68
|
57
|
jpayne@68
|
58
|
jpayne@68
|
59 cpdef greater_than(Interval feature, int size=100):
|
jpayne@68
|
60 """
|
jpayne@68
|
61 Return True if feature length > *size*
|
jpayne@68
|
62 """
|
jpayne@68
|
63 return len(feature) > size
|
jpayne@68
|
64
|
jpayne@68
|
65
|
jpayne@68
|
66 cpdef less_than(Interval feature, int size=100):
|
jpayne@68
|
67 """
|
jpayne@68
|
68 Return True if feature length < *size*
|
jpayne@68
|
69 """
|
jpayne@68
|
70 return len(feature) < size
|
jpayne@68
|
71
|
jpayne@68
|
72
|
jpayne@68
|
73 cpdef normalized_to_length(Interval feature, int idx=4, float scalar=0.001):
|
jpayne@68
|
74 """
|
jpayne@68
|
75 Normalizes the value at feature[idx] to the feature's length, in kb.
|
jpayne@68
|
76
|
jpayne@68
|
77 *idx*, by default, is the score field for a BED file, but specify any
|
jpayne@68
|
78 integer.
|
jpayne@68
|
79
|
jpayne@68
|
80 The value at *idx* will be replaced with its scaled value.
|
jpayne@68
|
81
|
jpayne@68
|
82 *scalar* will be multiplied by the value at *idx*, by default this is
|
jpayne@68
|
83 0.001, or per kb.
|
jpayne@68
|
84
|
jpayne@68
|
85 Useful for calculating RPKM after running intersect with counts
|
jpayne@68
|
86 """
|
jpayne@68
|
87 feature[idx] = str(float(feature[idx]) * scalar / len(feature))
|
jpayne@68
|
88 return feature
|
jpayne@68
|
89
|
jpayne@68
|
90
|
jpayne@68
|
91 cpdef rename(Interval feature, str name):
|
jpayne@68
|
92 """
|
jpayne@68
|
93 Forces a rename of all features, e.g., for renaming everything in a file
|
jpayne@68
|
94 'exon'
|
jpayne@68
|
95 """
|
jpayne@68
|
96 feature.name = name
|
jpayne@68
|
97 return feature
|
jpayne@68
|
98
|
jpayne@68
|
99
|
jpayne@68
|
100 cpdef bedgraph_scale(Interval feature, float scalar):
|
jpayne@68
|
101 feature[3] = str(float(feature[3]) * scalar)
|
jpayne@68
|
102 return feature
|
jpayne@68
|
103
|
jpayne@68
|
104
|
jpayne@68
|
105 cpdef TSS(Interval feature, int upstream=500, int downstream=500, add_to_name=None, genome=None):
|
jpayne@68
|
106 """
|
jpayne@68
|
107 Alias for five_prime.
|
jpayne@68
|
108 """
|
jpayne@68
|
109 return star_prime(feature, upstream, downstream, prime=5,
|
jpayne@68
|
110 add_to_name=add_to_name, genome=genome)
|
jpayne@68
|
111
|
jpayne@68
|
112
|
jpayne@68
|
113 cdef star_prime(Interval feature, int upstream=500, int downstream=500, int prime=5,
|
jpayne@68
|
114 add_to_name=None, genome=None):
|
jpayne@68
|
115
|
jpayne@68
|
116 if prime == 5:
|
jpayne@68
|
117 if feature.strand == '-':
|
jpayne@68
|
118 start = feature.stop - downstream
|
jpayne@68
|
119 stop = feature.stop + upstream
|
jpayne@68
|
120 else:
|
jpayne@68
|
121 start = feature.start - upstream
|
jpayne@68
|
122 stop = feature.start + downstream
|
jpayne@68
|
123 elif prime == 3:
|
jpayne@68
|
124 if feature.strand == '-':
|
jpayne@68
|
125 start = feature.start - downstream
|
jpayne@68
|
126 stop = feature.start + upstream
|
jpayne@68
|
127 else:
|
jpayne@68
|
128 start = feature.stop - upstream
|
jpayne@68
|
129 stop = feature.stop + downstream
|
jpayne@68
|
130 if add_to_name:
|
jpayne@68
|
131 try:
|
jpayne@68
|
132 feature.name += add_to_name
|
jpayne@68
|
133 except AttributeError:
|
jpayne@68
|
134 pass
|
jpayne@68
|
135 if genome is not None:
|
jpayne@68
|
136 gstart, gstop = genome[feature.chrom]
|
jpayne@68
|
137 stop = min(stop, gstop)
|
jpayne@68
|
138 start = max(start, gstart)
|
jpayne@68
|
139 if start < 0:
|
jpayne@68
|
140 start = 0
|
jpayne@68
|
141 if start > stop:
|
jpayne@68
|
142 start = stop
|
jpayne@68
|
143 feature.start = start
|
jpayne@68
|
144 feature.stop = stop
|
jpayne@68
|
145 return feature
|
jpayne@68
|
146
|
jpayne@68
|
147 cpdef five_prime(Interval feature, int upstream=500, int downstream=500,
|
jpayne@68
|
148 add_to_name=None, genome=None):
|
jpayne@68
|
149 """
|
jpayne@68
|
150 Returns the 5'-most coordinate, plus `upstream` and `downstream` bp; adds
|
jpayne@68
|
151 the string `add_to_name` to the feature's name if provided (e.g., "_TSS")
|
jpayne@68
|
152
|
jpayne@68
|
153 Parameters
|
jpayne@68
|
154 ----------
|
jpayne@68
|
155 feature : pybedtools.Interval instance
|
jpayne@68
|
156
|
jpayne@68
|
157 upstream, downstream : int
|
jpayne@68
|
158 Number of bp upstream or downstream of the strand-specific start
|
jpayne@68
|
159 position of the feature to include. Default is 500 for both upstream
|
jpayne@68
|
160 and downstream so that the returned feature is 1kb centered on the 5'
|
jpayne@68
|
161 end of the feature. Unstranded features (where strand=".") are treated
|
jpayne@68
|
162 as plus-strand features.
|
jpayne@68
|
163
|
jpayne@68
|
164 add_to_name : str or None
|
jpayne@68
|
165 If not None, append the string suffix to the name field of the feature (for
|
jpayne@68
|
166 example "_TSS").
|
jpayne@68
|
167
|
jpayne@68
|
168 genome : dict or None
|
jpayne@68
|
169 If not None, then ensure that the start/stop positions are within the
|
jpayne@68
|
170 boundaries of the chromosome.
|
jpayne@68
|
171 """
|
jpayne@68
|
172 return star_prime(feature, upstream, downstream, prime=5,
|
jpayne@68
|
173 add_to_name=add_to_name, genome=genome)
|
jpayne@68
|
174
|
jpayne@68
|
175
|
jpayne@68
|
176 cpdef three_prime(Interval feature, int upstream=500, int downstream=500,
|
jpayne@68
|
177 add_to_name=None, genome=None):
|
jpayne@68
|
178 """
|
jpayne@68
|
179 Returns the 3'-most coordinate, plus `upstream` and `downstream` bp; adds
|
jpayne@68
|
180 the string `add_to_name` to the feature's name if provided (e.g.,
|
jpayne@68
|
181 "_polyA_site")
|
jpayne@68
|
182
|
jpayne@68
|
183 Parameters
|
jpayne@68
|
184 ----------
|
jpayne@68
|
185 feature : pybedtools.Interval instance
|
jpayne@68
|
186
|
jpayne@68
|
187 upstream, downstrea : int
|
jpayne@68
|
188 Number of bp upstream or downstream of the strand-specific stop
|
jpayne@68
|
189 position of the feature to include. Default is 500 for both upstream
|
jpayne@68
|
190 and downstream so that the returned feature is 1kb centered on the 5'
|
jpayne@68
|
191 end of the feature. Unstranded features (where strand=".") are treated
|
jpayne@68
|
192 as plus-strand features.
|
jpayne@68
|
193
|
jpayne@68
|
194 add_to_name : str or None
|
jpayne@68
|
195 If not None, append the string suffix to the name field of the feature (for
|
jpayne@68
|
196 example "_TSS").
|
jpayne@68
|
197
|
jpayne@68
|
198 genome : dict or None
|
jpayne@68
|
199 If not None, then ensure that the start/stop positions are within the
|
jpayne@68
|
200 boundaries of the chromosome.
|
jpayne@68
|
201
|
jpayne@68
|
202
|
jpayne@68
|
203 """
|
jpayne@68
|
204 return star_prime(feature, upstream, downstream, prime=3,
|
jpayne@68
|
205 add_to_name=add_to_name, genome=genome)
|
jpayne@68
|
206
|
jpayne@68
|
207 cpdef add_color(Interval feature, cmap, norm):
|
jpayne@68
|
208 """
|
jpayne@68
|
209 Signature:
|
jpayne@68
|
210
|
jpayne@68
|
211 add_color(feature, cmap, norm)
|
jpayne@68
|
212
|
jpayne@68
|
213 Given the matplotlib colormap `cmap` and the matplotlib Normalize instance
|
jpayne@68
|
214 `norm`, return a new 9-field feature (extended out if needed) with the RGB
|
jpayne@68
|
215 tuple set according to the score.
|
jpayne@68
|
216 """
|
jpayne@68
|
217 if len(feature.fields) < 9:
|
jpayne@68
|
218 feature = extend_fields(feature, 9)
|
jpayne@68
|
219 feature[6] = str(feature.start)
|
jpayne@68
|
220 feature[7] = str(feature.stop)
|
jpayne@68
|
221
|
jpayne@68
|
222 rgb_float = cmap(norm(float(feature.score)))
|
jpayne@68
|
223 feature[8] = ','.join([str(int(i * 255)) for i in rgb_float[:3]])
|
jpayne@68
|
224 return feature
|
jpayne@68
|
225
|
jpayne@68
|
226
|
jpayne@68
|
227 cpdef gff2bed(Interval feature, name_field=None):
|
jpayne@68
|
228 """
|
jpayne@68
|
229 Signature:
|
jpayne@68
|
230
|
jpayne@68
|
231 gff2bed(feature, name_field=None)
|
jpayne@68
|
232
|
jpayne@68
|
233 Converts a GFF feature into a BED6 feature. By default, the name of the
|
jpayne@68
|
234 new BED will be feature.name, but if `name_field` is provided then the name
|
jpayne@68
|
235 of the new BED will be feature.attrs[name_field].
|
jpayne@68
|
236
|
jpayne@68
|
237 `name_field` can also be an integer to index into the fields of the object,
|
jpayne@68
|
238 so if you want the BED name to be the GFF featuretype, then you can use
|
jpayne@68
|
239 `name_field=2`.
|
jpayne@68
|
240
|
jpayne@68
|
241 If the specified field does not exist, then "." will be used for the name.
|
jpayne@68
|
242 """
|
jpayne@68
|
243 if name_field is None:
|
jpayne@68
|
244 name = feature.name
|
jpayne@68
|
245 else:
|
jpayne@68
|
246 try:
|
jpayne@68
|
247 if isinstance(name_field, basestring):
|
jpayne@68
|
248 name = feature.attrs[name_field]
|
jpayne@68
|
249 if isinstance(name_field, int):
|
jpayne@68
|
250 name = feature[name_field]
|
jpayne@68
|
251 except (NameError, KeyError):
|
jpayne@68
|
252 name = "."
|
jpayne@68
|
253 return create_interval_from_list([
|
jpayne@68
|
254 str(feature.chrom),
|
jpayne@68
|
255 str(feature.start),
|
jpayne@68
|
256 str(feature.stop),
|
jpayne@68
|
257 name,
|
jpayne@68
|
258 feature.score,
|
jpayne@68
|
259 feature.strand])
|
jpayne@68
|
260
|
jpayne@68
|
261
|
jpayne@68
|
262 cpdef bed2gff(Interval feature):
|
jpayne@68
|
263 """
|
jpayne@68
|
264 Signature:
|
jpayne@68
|
265
|
jpayne@68
|
266 bed2gff(feature)
|
jpayne@68
|
267
|
jpayne@68
|
268 Converts a BED feature (BED3 through BED12) into a GFF format.
|
jpayne@68
|
269
|
jpayne@68
|
270 Chrom, start, stop, score, and strand are put directly into the
|
jpayne@68
|
271 corresponding GFF fields. Other BED fields are put into the GFF attributes
|
jpayne@68
|
272 field, named according to the UCSC BED format definition.
|
jpayne@68
|
273
|
jpayne@68
|
274 If there are more than 12 BED fields, the additional fields will be added
|
jpayne@68
|
275 to the GFF attributes using the 0-based index (so starting at "12") as the
|
jpayne@68
|
276 key.
|
jpayne@68
|
277
|
jpayne@68
|
278 GFF fields that do not have a direct mapping to BED format (feature type,
|
jpayne@68
|
279 source, phase) are set to ".".
|
jpayne@68
|
280
|
jpayne@68
|
281 1 bp is added to the start position to finish the conversion to GFF.
|
jpayne@68
|
282 """
|
jpayne@68
|
283
|
jpayne@68
|
284 # Note that Interval.score, .strand, and .name have a default of ".", so no
|
jpayne@68
|
285 # need to do the extra try/except IndexError for those fields.
|
jpayne@68
|
286 mapping = (
|
jpayne@68
|
287 (6, "thickStart"),
|
jpayne@68
|
288 (7, "thickEnd"),
|
jpayne@68
|
289 (8, "itemRgb"),
|
jpayne@68
|
290 (9, "blockCount"),
|
jpayne@68
|
291 (10, "blockSizes"),
|
jpayne@68
|
292 (11, "blockStarts")
|
jpayne@68
|
293 )
|
jpayne@68
|
294
|
jpayne@68
|
295 # Add any standard BED fields we might have
|
jpayne@68
|
296 attributes = ['Name="%s"' % feature.name]
|
jpayne@68
|
297 for k, v in mapping:
|
jpayne@68
|
298 try:
|
jpayne@68
|
299 attributes.append('%s="%s"' % (v, feature.fields[k]))
|
jpayne@68
|
300 except IndexError:
|
jpayne@68
|
301 break
|
jpayne@68
|
302
|
jpayne@68
|
303 # Add any additional fields, keyed by their index
|
jpayne@68
|
304 if len(feature.fields) > 12:
|
jpayne@68
|
305 for i in range(12, len(feature.fields)):
|
jpayne@68
|
306 attributes.append('%s="%s"' % (i, feature.fields[i]))
|
jpayne@68
|
307
|
jpayne@68
|
308 attributes = '; '.join(attributes) + ';'
|
jpayne@68
|
309
|
jpayne@68
|
310 return create_interval_from_list([
|
jpayne@68
|
311 str(feature.chrom),
|
jpayne@68
|
312 '.',
|
jpayne@68
|
313 '.',
|
jpayne@68
|
314 str(feature.start + 1),
|
jpayne@68
|
315 str(feature.stop),
|
jpayne@68
|
316 feature.score,
|
jpayne@68
|
317 feature.strand,
|
jpayne@68
|
318 '.',
|
jpayne@68
|
319 attributes])
|
jpayne@68
|
320
|
jpayne@68
|
321
|
jpayne@68
|
322 class UniqueID(object):
|
jpayne@68
|
323 def __init__(self, pattern="%d", first=0):
|
jpayne@68
|
324 """
|
jpayne@68
|
325 Class to help create uniquely-named features.
|
jpayne@68
|
326
|
jpayne@68
|
327 Example usage:
|
jpayne@68
|
328
|
jpayne@68
|
329 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
330 >>> uid = UniqueID("f_%d")
|
jpayne@68
|
331 >>> print(a.each(uid)) # doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
332 chr1 1 100 f_0 0 +
|
jpayne@68
|
333 chr1 100 200 f_1 0 +
|
jpayne@68
|
334 chr1 150 500 f_2 0 -
|
jpayne@68
|
335 chr1 900 950 f_3 0 +
|
jpayne@68
|
336
|
jpayne@68
|
337 Parameters
|
jpayne@68
|
338 ----------
|
jpayne@68
|
339 pattern : str
|
jpayne@68
|
340
|
jpayne@68
|
341 Pattern will be filled in using `% self.count`
|
jpayne@68
|
342
|
jpayne@68
|
343 first : int
|
jpayne@68
|
344 `self.count` will be initialzed to this value.
|
jpayne@68
|
345
|
jpayne@68
|
346 """
|
jpayne@68
|
347 self.pattern = pattern
|
jpayne@68
|
348 self.first = first
|
jpayne@68
|
349 self.count = first
|
jpayne@68
|
350
|
jpayne@68
|
351 def __call__(self, feature):
|
jpayne@68
|
352 feature.name = self.pattern % self.count
|
jpayne@68
|
353 self.count += 1
|
jpayne@68
|
354 return feature
|
jpayne@68
|
355
|