jpayne@68: # cython: language_level=2 jpayne@68: # distutils: language = c++ jpayne@68: from cbedtools cimport Interval jpayne@68: from cbedtools import create_interval_from_list jpayne@68: jpayne@68: jpayne@68: cpdef extend_fields(Interval feature, int n): jpayne@68: """ jpayne@68: Pads the fields of the feature with "." to a total length of `n` fields, jpayne@68: """ jpayne@68: fields = feature.fields[:] jpayne@68: while len(fields) < n: jpayne@68: fields.append('.') jpayne@68: i = create_interval_from_list(fields) jpayne@68: jpayne@68: if n > 4 and (i[4] == '.'): jpayne@68: i[4] = '0' jpayne@68: if n > 6 and (i[6] == '.'): jpayne@68: i[6] = str(i.start) jpayne@68: if n > 7 and (i[7] == '.'): jpayne@68: i[7] = str(i.stop) jpayne@68: if n > 8 and (i[8] == '.'): jpayne@68: i[8] = '0,0,0' jpayne@68: return i jpayne@68: jpayne@68: jpayne@68: jpayne@68: cpdef center(Interval feature, int width=100): jpayne@68: """ jpayne@68: Return the *width* bp from the center of a feature. If a feature is jpayne@68: smaller than *width*, then return the entire feature. jpayne@68: """ jpayne@68: if len(feature) < width: jpayne@68: return feature jpayne@68: cdef int start = feature.start jpayne@68: cdef int stop = feature.stop jpayne@68: cdef int center = start + (stop - start) / 2 jpayne@68: halfwidth = width / 2 jpayne@68: feature.start = center - halfwidth jpayne@68: if feature.start < 1: jpayne@68: feature.start = 1 jpayne@68: if halfwidth == 0: jpayne@68: halfwidth = 1 jpayne@68: feature.stop = center + halfwidth jpayne@68: return feature jpayne@68: jpayne@68: jpayne@68: cpdef midpoint(Interval feature): jpayne@68: """ jpayne@68: Specialized version of `center()` that just returns the single-bp midpoint jpayne@68: """ jpayne@68: start = feature.start + (feature.stop - feature.start) / 2 jpayne@68: stop = start + 1 jpayne@68: feature.start = start jpayne@68: feature.stop = stop jpayne@68: return feature jpayne@68: jpayne@68: jpayne@68: cpdef greater_than(Interval feature, int size=100): jpayne@68: """ jpayne@68: Return True if feature length > *size* jpayne@68: """ jpayne@68: return len(feature) > size jpayne@68: jpayne@68: jpayne@68: cpdef less_than(Interval feature, int size=100): jpayne@68: """ jpayne@68: Return True if feature length < *size* jpayne@68: """ jpayne@68: return len(feature) < size jpayne@68: jpayne@68: jpayne@68: cpdef normalized_to_length(Interval feature, int idx=4, float scalar=0.001): jpayne@68: """ jpayne@68: Normalizes the value at feature[idx] to the feature's length, in kb. jpayne@68: jpayne@68: *idx*, by default, is the score field for a BED file, but specify any jpayne@68: integer. jpayne@68: jpayne@68: The value at *idx* will be replaced with its scaled value. jpayne@68: jpayne@68: *scalar* will be multiplied by the value at *idx*, by default this is jpayne@68: 0.001, or per kb. jpayne@68: jpayne@68: Useful for calculating RPKM after running intersect with counts jpayne@68: """ jpayne@68: feature[idx] = str(float(feature[idx]) * scalar / len(feature)) jpayne@68: return feature jpayne@68: jpayne@68: jpayne@68: cpdef rename(Interval feature, str name): jpayne@68: """ jpayne@68: Forces a rename of all features, e.g., for renaming everything in a file jpayne@68: 'exon' jpayne@68: """ jpayne@68: feature.name = name jpayne@68: return feature jpayne@68: jpayne@68: jpayne@68: cpdef bedgraph_scale(Interval feature, float scalar): jpayne@68: feature[3] = str(float(feature[3]) * scalar) jpayne@68: return feature jpayne@68: jpayne@68: jpayne@68: cpdef TSS(Interval feature, int upstream=500, int downstream=500, add_to_name=None, genome=None): jpayne@68: """ jpayne@68: Alias for five_prime. jpayne@68: """ jpayne@68: return star_prime(feature, upstream, downstream, prime=5, jpayne@68: add_to_name=add_to_name, genome=genome) jpayne@68: jpayne@68: jpayne@68: cdef star_prime(Interval feature, int upstream=500, int downstream=500, int prime=5, jpayne@68: add_to_name=None, genome=None): jpayne@68: jpayne@68: if prime == 5: jpayne@68: if feature.strand == '-': jpayne@68: start = feature.stop - downstream jpayne@68: stop = feature.stop + upstream jpayne@68: else: jpayne@68: start = feature.start - upstream jpayne@68: stop = feature.start + downstream jpayne@68: elif prime == 3: jpayne@68: if feature.strand == '-': jpayne@68: start = feature.start - downstream jpayne@68: stop = feature.start + upstream jpayne@68: else: jpayne@68: start = feature.stop - upstream jpayne@68: stop = feature.stop + downstream jpayne@68: if add_to_name: jpayne@68: try: jpayne@68: feature.name += add_to_name jpayne@68: except AttributeError: jpayne@68: pass jpayne@68: if genome is not None: jpayne@68: gstart, gstop = genome[feature.chrom] jpayne@68: stop = min(stop, gstop) jpayne@68: start = max(start, gstart) jpayne@68: if start < 0: jpayne@68: start = 0 jpayne@68: if start > stop: jpayne@68: start = stop jpayne@68: feature.start = start jpayne@68: feature.stop = stop jpayne@68: return feature jpayne@68: jpayne@68: cpdef five_prime(Interval feature, int upstream=500, int downstream=500, jpayne@68: add_to_name=None, genome=None): jpayne@68: """ jpayne@68: Returns the 5'-most coordinate, plus `upstream` and `downstream` bp; adds jpayne@68: the string `add_to_name` to the feature's name if provided (e.g., "_TSS") jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: feature : pybedtools.Interval instance jpayne@68: jpayne@68: upstream, downstream : int jpayne@68: Number of bp upstream or downstream of the strand-specific start jpayne@68: position of the feature to include. Default is 500 for both upstream jpayne@68: and downstream so that the returned feature is 1kb centered on the 5' jpayne@68: end of the feature. Unstranded features (where strand=".") are treated jpayne@68: as plus-strand features. jpayne@68: jpayne@68: add_to_name : str or None jpayne@68: If not None, append the string suffix to the name field of the feature (for jpayne@68: example "_TSS"). jpayne@68: jpayne@68: genome : dict or None jpayne@68: If not None, then ensure that the start/stop positions are within the jpayne@68: boundaries of the chromosome. jpayne@68: """ jpayne@68: return star_prime(feature, upstream, downstream, prime=5, jpayne@68: add_to_name=add_to_name, genome=genome) jpayne@68: jpayne@68: jpayne@68: cpdef three_prime(Interval feature, int upstream=500, int downstream=500, jpayne@68: add_to_name=None, genome=None): jpayne@68: """ jpayne@68: Returns the 3'-most coordinate, plus `upstream` and `downstream` bp; adds jpayne@68: the string `add_to_name` to the feature's name if provided (e.g., jpayne@68: "_polyA_site") jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: feature : pybedtools.Interval instance jpayne@68: jpayne@68: upstream, downstrea : int jpayne@68: Number of bp upstream or downstream of the strand-specific stop jpayne@68: position of the feature to include. Default is 500 for both upstream jpayne@68: and downstream so that the returned feature is 1kb centered on the 5' jpayne@68: end of the feature. Unstranded features (where strand=".") are treated jpayne@68: as plus-strand features. jpayne@68: jpayne@68: add_to_name : str or None jpayne@68: If not None, append the string suffix to the name field of the feature (for jpayne@68: example "_TSS"). jpayne@68: jpayne@68: genome : dict or None jpayne@68: If not None, then ensure that the start/stop positions are within the jpayne@68: boundaries of the chromosome. jpayne@68: jpayne@68: jpayne@68: """ jpayne@68: return star_prime(feature, upstream, downstream, prime=3, jpayne@68: add_to_name=add_to_name, genome=genome) jpayne@68: jpayne@68: cpdef add_color(Interval feature, cmap, norm): jpayne@68: """ jpayne@68: Signature: jpayne@68: jpayne@68: add_color(feature, cmap, norm) jpayne@68: jpayne@68: Given the matplotlib colormap `cmap` and the matplotlib Normalize instance jpayne@68: `norm`, return a new 9-field feature (extended out if needed) with the RGB jpayne@68: tuple set according to the score. jpayne@68: """ jpayne@68: if len(feature.fields) < 9: jpayne@68: feature = extend_fields(feature, 9) jpayne@68: feature[6] = str(feature.start) jpayne@68: feature[7] = str(feature.stop) jpayne@68: jpayne@68: rgb_float = cmap(norm(float(feature.score))) jpayne@68: feature[8] = ','.join([str(int(i * 255)) for i in rgb_float[:3]]) jpayne@68: return feature jpayne@68: jpayne@68: jpayne@68: cpdef gff2bed(Interval feature, name_field=None): jpayne@68: """ jpayne@68: Signature: jpayne@68: jpayne@68: gff2bed(feature, name_field=None) jpayne@68: jpayne@68: Converts a GFF feature into a BED6 feature. By default, the name of the jpayne@68: new BED will be feature.name, but if `name_field` is provided then the name jpayne@68: of the new BED will be feature.attrs[name_field]. jpayne@68: jpayne@68: `name_field` can also be an integer to index into the fields of the object, jpayne@68: so if you want the BED name to be the GFF featuretype, then you can use jpayne@68: `name_field=2`. jpayne@68: jpayne@68: If the specified field does not exist, then "." will be used for the name. jpayne@68: """ jpayne@68: if name_field is None: jpayne@68: name = feature.name jpayne@68: else: jpayne@68: try: jpayne@68: if isinstance(name_field, basestring): jpayne@68: name = feature.attrs[name_field] jpayne@68: if isinstance(name_field, int): jpayne@68: name = feature[name_field] jpayne@68: except (NameError, KeyError): jpayne@68: name = "." jpayne@68: return create_interval_from_list([ jpayne@68: str(feature.chrom), jpayne@68: str(feature.start), jpayne@68: str(feature.stop), jpayne@68: name, jpayne@68: feature.score, jpayne@68: feature.strand]) jpayne@68: jpayne@68: jpayne@68: cpdef bed2gff(Interval feature): jpayne@68: """ jpayne@68: Signature: jpayne@68: jpayne@68: bed2gff(feature) jpayne@68: jpayne@68: Converts a BED feature (BED3 through BED12) into a GFF format. jpayne@68: jpayne@68: Chrom, start, stop, score, and strand are put directly into the jpayne@68: corresponding GFF fields. Other BED fields are put into the GFF attributes jpayne@68: field, named according to the UCSC BED format definition. jpayne@68: jpayne@68: If there are more than 12 BED fields, the additional fields will be added jpayne@68: to the GFF attributes using the 0-based index (so starting at "12") as the jpayne@68: key. jpayne@68: jpayne@68: GFF fields that do not have a direct mapping to BED format (feature type, jpayne@68: source, phase) are set to ".". jpayne@68: jpayne@68: 1 bp is added to the start position to finish the conversion to GFF. jpayne@68: """ jpayne@68: jpayne@68: # Note that Interval.score, .strand, and .name have a default of ".", so no jpayne@68: # need to do the extra try/except IndexError for those fields. jpayne@68: mapping = ( jpayne@68: (6, "thickStart"), jpayne@68: (7, "thickEnd"), jpayne@68: (8, "itemRgb"), jpayne@68: (9, "blockCount"), jpayne@68: (10, "blockSizes"), jpayne@68: (11, "blockStarts") jpayne@68: ) jpayne@68: jpayne@68: # Add any standard BED fields we might have jpayne@68: attributes = ['Name="%s"' % feature.name] jpayne@68: for k, v in mapping: jpayne@68: try: jpayne@68: attributes.append('%s="%s"' % (v, feature.fields[k])) jpayne@68: except IndexError: jpayne@68: break jpayne@68: jpayne@68: # Add any additional fields, keyed by their index jpayne@68: if len(feature.fields) > 12: jpayne@68: for i in range(12, len(feature.fields)): jpayne@68: attributes.append('%s="%s"' % (i, feature.fields[i])) jpayne@68: jpayne@68: attributes = '; '.join(attributes) + ';' jpayne@68: jpayne@68: return create_interval_from_list([ jpayne@68: str(feature.chrom), jpayne@68: '.', jpayne@68: '.', jpayne@68: str(feature.start + 1), jpayne@68: str(feature.stop), jpayne@68: feature.score, jpayne@68: feature.strand, jpayne@68: '.', jpayne@68: attributes]) jpayne@68: jpayne@68: jpayne@68: class UniqueID(object): jpayne@68: def __init__(self, pattern="%d", first=0): jpayne@68: """ jpayne@68: Class to help create uniquely-named features. jpayne@68: jpayne@68: Example usage: jpayne@68: jpayne@68: >>> a = pybedtools.example_bedtool('a.bed') jpayne@68: >>> uid = UniqueID("f_%d") jpayne@68: >>> print(a.each(uid)) # doctest: +NORMALIZE_WHITESPACE jpayne@68: chr1 1 100 f_0 0 + jpayne@68: chr1 100 200 f_1 0 + jpayne@68: chr1 150 500 f_2 0 - jpayne@68: chr1 900 950 f_3 0 + jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: pattern : str jpayne@68: jpayne@68: Pattern will be filled in using `% self.count` jpayne@68: jpayne@68: first : int jpayne@68: `self.count` will be initialzed to this value. jpayne@68: jpayne@68: """ jpayne@68: self.pattern = pattern jpayne@68: self.first = first jpayne@68: self.count = first jpayne@68: jpayne@68: def __call__(self, feature): jpayne@68: feature.name = self.pattern % self.count jpayne@68: self.count += 1 jpayne@68: return feature jpayne@68: