jpayne@68: # cython: language_level=2
jpayne@68: # distutils: language = c++
jpayne@68: from cbedtools cimport Interval
jpayne@68: from cbedtools import create_interval_from_list
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef extend_fields(Interval feature, int n):
jpayne@68:     """
jpayne@68:     Pads the fields of the feature with "." to a total length of `n` fields,
jpayne@68:     """
jpayne@68:     fields = feature.fields[:]
jpayne@68:     while len(fields) < n:
jpayne@68:         fields.append('.')
jpayne@68:     i = create_interval_from_list(fields)
jpayne@68: 
jpayne@68:     if n > 4 and (i[4] == '.'):
jpayne@68:         i[4] = '0'
jpayne@68:     if n > 6 and (i[6] == '.'):
jpayne@68:         i[6] = str(i.start)
jpayne@68:     if n > 7 and (i[7] == '.'):
jpayne@68:         i[7] = str(i.stop)
jpayne@68:     if n > 8 and (i[8] == '.'):
jpayne@68:         i[8] = '0,0,0'
jpayne@68:     return i
jpayne@68: 
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef center(Interval feature, int width=100):
jpayne@68:     """
jpayne@68:     Return the *width* bp from the center of a feature.  If a feature is
jpayne@68:     smaller than *width*, then return the entire feature.
jpayne@68:     """
jpayne@68:     if len(feature) < width:
jpayne@68:         return feature
jpayne@68:     cdef int start = feature.start
jpayne@68:     cdef int stop = feature.stop
jpayne@68:     cdef int center = start + (stop - start) / 2
jpayne@68:     halfwidth = width / 2
jpayne@68:     feature.start = center - halfwidth
jpayne@68:     if feature.start < 1:
jpayne@68:         feature.start = 1
jpayne@68:     if halfwidth == 0:
jpayne@68:         halfwidth = 1
jpayne@68:     feature.stop = center + halfwidth
jpayne@68:     return feature
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef midpoint(Interval feature):
jpayne@68:     """
jpayne@68:     Specialized version of `center()` that just returns the single-bp midpoint
jpayne@68:     """
jpayne@68:     start = feature.start + (feature.stop - feature.start) / 2
jpayne@68:     stop = start + 1
jpayne@68:     feature.start = start
jpayne@68:     feature.stop = stop
jpayne@68:     return feature
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef greater_than(Interval feature, int size=100):
jpayne@68:     """
jpayne@68:     Return True if feature length > *size*
jpayne@68:     """
jpayne@68:     return len(feature) > size
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef less_than(Interval feature, int size=100):
jpayne@68:     """
jpayne@68:     Return True if feature length < *size*
jpayne@68:     """
jpayne@68:     return len(feature) < size
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef normalized_to_length(Interval feature, int idx=4, float scalar=0.001):
jpayne@68:     """
jpayne@68:     Normalizes the value at feature[idx] to the feature's length, in kb.
jpayne@68: 
jpayne@68:     *idx*, by default, is the score field for a BED file, but specify any
jpayne@68:     integer.
jpayne@68: 
jpayne@68:     The value at *idx* will be replaced with its scaled value.
jpayne@68: 
jpayne@68:     *scalar* will be multiplied by the value at *idx*, by default this is
jpayne@68:     0.001, or per kb.
jpayne@68: 
jpayne@68:     Useful for calculating RPKM after running intersect with counts
jpayne@68:     """
jpayne@68:     feature[idx] = str(float(feature[idx]) * scalar / len(feature))
jpayne@68:     return feature
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef rename(Interval feature, str name):
jpayne@68:     """
jpayne@68:     Forces a rename of all features, e.g., for renaming everything in a file
jpayne@68:     'exon'
jpayne@68:     """
jpayne@68:     feature.name = name
jpayne@68:     return feature
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef bedgraph_scale(Interval feature, float scalar):
jpayne@68:     feature[3] = str(float(feature[3]) * scalar)
jpayne@68:     return feature
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef TSS(Interval feature, int upstream=500, int downstream=500, add_to_name=None, genome=None):
jpayne@68:     """
jpayne@68:     Alias for five_prime.
jpayne@68:     """
jpayne@68:     return star_prime(feature, upstream, downstream, prime=5,
jpayne@68:                       add_to_name=add_to_name, genome=genome)
jpayne@68: 
jpayne@68: 
jpayne@68: cdef star_prime(Interval feature, int upstream=500, int downstream=500, int prime=5,
jpayne@68:                  add_to_name=None, genome=None):
jpayne@68: 
jpayne@68:     if prime == 5:
jpayne@68:         if feature.strand == '-':
jpayne@68:             start = feature.stop - downstream
jpayne@68:             stop = feature.stop + upstream
jpayne@68:         else:
jpayne@68:             start = feature.start - upstream
jpayne@68:             stop = feature.start + downstream
jpayne@68:     elif prime == 3:
jpayne@68:         if feature.strand == '-':
jpayne@68:             start = feature.start - downstream
jpayne@68:             stop = feature.start + upstream
jpayne@68:         else:
jpayne@68:             start = feature.stop - upstream
jpayne@68:             stop = feature.stop + downstream
jpayne@68:     if add_to_name:
jpayne@68:         try:
jpayne@68:             feature.name += add_to_name
jpayne@68:         except AttributeError:
jpayne@68:             pass
jpayne@68:     if genome is not None:
jpayne@68:         gstart, gstop = genome[feature.chrom]
jpayne@68:         stop = min(stop, gstop)
jpayne@68:         start = max(start, gstart)
jpayne@68:     if start < 0:
jpayne@68:         start = 0
jpayne@68:     if start > stop:
jpayne@68:         start = stop
jpayne@68:     feature.start = start
jpayne@68:     feature.stop = stop
jpayne@68:     return feature
jpayne@68: 
jpayne@68: cpdef five_prime(Interval feature, int upstream=500, int downstream=500,
jpayne@68:                  add_to_name=None, genome=None):
jpayne@68:     """
jpayne@68:     Returns the 5'-most coordinate, plus `upstream` and `downstream` bp; adds
jpayne@68:     the string `add_to_name` to the feature's name if provided (e.g., "_TSS")
jpayne@68: 
jpayne@68:     Parameters
jpayne@68:     ----------
jpayne@68:     feature : pybedtools.Interval instance
jpayne@68: 
jpayne@68:     upstream, downstream : int
jpayne@68:         Number of bp upstream or downstream of the strand-specific start
jpayne@68:         position of the feature to include. Default is 500 for both upstream
jpayne@68:         and downstream so that the returned feature is 1kb centered on the 5'
jpayne@68:         end of the feature. Unstranded features (where strand=".") are treated
jpayne@68:         as plus-strand features.
jpayne@68: 
jpayne@68:     add_to_name : str or None
jpayne@68:         If not None, append the string suffix to the name field of the feature (for
jpayne@68:         example "_TSS").
jpayne@68: 
jpayne@68:     genome : dict or None
jpayne@68:         If not None, then ensure that the start/stop positions are within the
jpayne@68:         boundaries of the chromosome.
jpayne@68:     """
jpayne@68:     return star_prime(feature, upstream, downstream, prime=5,
jpayne@68:                       add_to_name=add_to_name, genome=genome)
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef three_prime(Interval feature, int upstream=500, int downstream=500,
jpayne@68:                   add_to_name=None, genome=None):
jpayne@68:     """
jpayne@68:     Returns the 3'-most coordinate, plus `upstream` and `downstream` bp; adds
jpayne@68:     the string `add_to_name` to the feature's name if provided (e.g.,
jpayne@68:     "_polyA_site")
jpayne@68: 
jpayne@68:     Parameters
jpayne@68:     ----------
jpayne@68:     feature : pybedtools.Interval instance
jpayne@68: 
jpayne@68:     upstream, downstrea : int
jpayne@68:         Number of bp upstream or downstream of the strand-specific stop
jpayne@68:         position of the feature to include. Default is 500 for both upstream
jpayne@68:         and downstream so that the returned feature is 1kb centered on the 5'
jpayne@68:         end of the feature. Unstranded features (where strand=".") are treated
jpayne@68:         as plus-strand features.
jpayne@68: 
jpayne@68:     add_to_name : str or None
jpayne@68:         If not None, append the string suffix to the name field of the feature (for
jpayne@68:         example "_TSS").
jpayne@68: 
jpayne@68:     genome : dict or None
jpayne@68:         If not None, then ensure that the start/stop positions are within the
jpayne@68:         boundaries of the chromosome.
jpayne@68: 
jpayne@68: 
jpayne@68:     """
jpayne@68:     return star_prime(feature, upstream, downstream, prime=3,
jpayne@68:                       add_to_name=add_to_name, genome=genome)
jpayne@68: 
jpayne@68: cpdef add_color(Interval feature, cmap, norm):
jpayne@68:     """
jpayne@68:     Signature:
jpayne@68: 
jpayne@68:         add_color(feature, cmap, norm)
jpayne@68: 
jpayne@68:     Given the matplotlib colormap `cmap` and the matplotlib Normalize instance
jpayne@68:     `norm`, return a new 9-field feature (extended out if needed) with the RGB
jpayne@68:     tuple set according to the score.
jpayne@68:     """
jpayne@68:     if len(feature.fields) < 9:
jpayne@68:         feature = extend_fields(feature, 9)
jpayne@68:         feature[6] = str(feature.start)
jpayne@68:         feature[7] = str(feature.stop)
jpayne@68: 
jpayne@68:     rgb_float = cmap(norm(float(feature.score)))
jpayne@68:     feature[8] = ','.join([str(int(i * 255)) for i in rgb_float[:3]])
jpayne@68:     return feature
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef gff2bed(Interval feature, name_field=None):
jpayne@68:     """
jpayne@68:     Signature:
jpayne@68: 
jpayne@68:         gff2bed(feature, name_field=None)
jpayne@68: 
jpayne@68:     Converts a GFF feature into a BED6 feature.  By default, the name of the
jpayne@68:     new BED will be feature.name, but if `name_field` is provided then the name
jpayne@68:     of the new BED will be feature.attrs[name_field].
jpayne@68: 
jpayne@68:     `name_field` can also be an integer to index into the fields of the object,
jpayne@68:     so if you want the BED name to be the GFF featuretype, then you can use
jpayne@68:     `name_field=2`.
jpayne@68: 
jpayne@68:     If the specified field does not exist, then "." will be used for the name.
jpayne@68:     """
jpayne@68:     if name_field is None:
jpayne@68:         name = feature.name
jpayne@68:     else:
jpayne@68:         try:
jpayne@68:             if isinstance(name_field, basestring):
jpayne@68:                 name = feature.attrs[name_field]
jpayne@68:             if isinstance(name_field, int):
jpayne@68:                 name = feature[name_field]
jpayne@68:         except (NameError, KeyError):
jpayne@68:             name = "."
jpayne@68:     return create_interval_from_list([
jpayne@68:         str(feature.chrom),
jpayne@68:         str(feature.start),
jpayne@68:         str(feature.stop),
jpayne@68:         name,
jpayne@68:         feature.score,
jpayne@68:         feature.strand])
jpayne@68: 
jpayne@68: 
jpayne@68: cpdef bed2gff(Interval feature):
jpayne@68:     """
jpayne@68:     Signature:
jpayne@68: 
jpayne@68:         bed2gff(feature)
jpayne@68: 
jpayne@68:     Converts a BED feature (BED3 through BED12) into a GFF format.
jpayne@68: 
jpayne@68:     Chrom, start, stop, score, and strand are put directly into the
jpayne@68:     corresponding GFF fields.  Other BED fields are put into the GFF attributes
jpayne@68:     field, named according to the UCSC BED format definition.
jpayne@68: 
jpayne@68:     If there are more than 12 BED fields, the additional fields will be added
jpayne@68:     to the GFF attributes using the 0-based index (so starting at "12") as the
jpayne@68:     key.
jpayne@68: 
jpayne@68:     GFF fields that do not have a direct mapping to BED format (feature type,
jpayne@68:     source, phase) are set to ".".
jpayne@68: 
jpayne@68:     1 bp is added to the start position to finish the conversion to GFF.
jpayne@68:     """
jpayne@68: 
jpayne@68:     # Note that Interval.score, .strand, and .name have a default of ".", so no
jpayne@68:     # need to do the extra try/except IndexError for those fields.
jpayne@68:     mapping = (
jpayne@68:         (6, "thickStart"),
jpayne@68:         (7, "thickEnd"),
jpayne@68:         (8, "itemRgb"),
jpayne@68:         (9, "blockCount"),
jpayne@68:         (10, "blockSizes"),
jpayne@68:         (11, "blockStarts")
jpayne@68:     )
jpayne@68: 
jpayne@68:     # Add any standard BED fields we might have
jpayne@68:     attributes = ['Name="%s"' % feature.name]
jpayne@68:     for k, v in mapping:
jpayne@68:         try:
jpayne@68:             attributes.append('%s="%s"' % (v, feature.fields[k]))
jpayne@68:         except IndexError:
jpayne@68:             break
jpayne@68: 
jpayne@68:     # Add any additional fields, keyed by their index
jpayne@68:     if len(feature.fields) > 12:
jpayne@68:         for i in range(12, len(feature.fields)):
jpayne@68:             attributes.append('%s="%s"' % (i, feature.fields[i]))
jpayne@68: 
jpayne@68:     attributes = '; '.join(attributes) + ';'
jpayne@68: 
jpayne@68:     return create_interval_from_list([
jpayne@68:         str(feature.chrom),
jpayne@68:         '.',
jpayne@68:         '.',
jpayne@68:         str(feature.start + 1),
jpayne@68:         str(feature.stop),
jpayne@68:         feature.score,
jpayne@68:         feature.strand,
jpayne@68:         '.',
jpayne@68:         attributes])
jpayne@68: 
jpayne@68: 
jpayne@68: class UniqueID(object):
jpayne@68:     def __init__(self, pattern="%d", first=0):
jpayne@68:         """
jpayne@68:         Class to help create uniquely-named features.
jpayne@68: 
jpayne@68:         Example usage:
jpayne@68: 
jpayne@68:         >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68:         >>> uid = UniqueID("f_%d")
jpayne@68:         >>> print(a.each(uid))  # doctest: +NORMALIZE_WHITESPACE
jpayne@68:         chr1    1    100    f_0    0    +
jpayne@68:         chr1  100    200    f_1    0    +
jpayne@68:         chr1  150    500    f_2    0    -
jpayne@68:         chr1  900    950    f_3    0    +
jpayne@68: 
jpayne@68:         Parameters
jpayne@68:         ----------
jpayne@68:         pattern : str
jpayne@68: 
jpayne@68:             Pattern will be filled in using `% self.count`
jpayne@68: 
jpayne@68:         first : int
jpayne@68:             `self.count` will be initialzed to this value.
jpayne@68: 
jpayne@68:         """
jpayne@68:         self.pattern = pattern
jpayne@68:         self.first = first
jpayne@68:         self.count = first
jpayne@68: 
jpayne@68:     def __call__(self, feature):
jpayne@68:         feature.name = self.pattern % self.count
jpayne@68:         self.count += 1
jpayne@68:         return feature
jpayne@68: