Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/featurefuncs.pyx @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line source
# cython: language_level=2 # distutils: language = c++ from cbedtools cimport Interval from cbedtools import create_interval_from_list cpdef extend_fields(Interval feature, int n): """ Pads the fields of the feature with "." to a total length of `n` fields, """ fields = feature.fields[:] while len(fields) < n: fields.append('.') i = create_interval_from_list(fields) if n > 4 and (i[4] == '.'): i[4] = '0' if n > 6 and (i[6] == '.'): i[6] = str(i.start) if n > 7 and (i[7] == '.'): i[7] = str(i.stop) if n > 8 and (i[8] == '.'): i[8] = '0,0,0' return i cpdef center(Interval feature, int width=100): """ Return the *width* bp from the center of a feature. If a feature is smaller than *width*, then return the entire feature. """ if len(feature) < width: return feature cdef int start = feature.start cdef int stop = feature.stop cdef int center = start + (stop - start) / 2 halfwidth = width / 2 feature.start = center - halfwidth if feature.start < 1: feature.start = 1 if halfwidth == 0: halfwidth = 1 feature.stop = center + halfwidth return feature cpdef midpoint(Interval feature): """ Specialized version of `center()` that just returns the single-bp midpoint """ start = feature.start + (feature.stop - feature.start) / 2 stop = start + 1 feature.start = start feature.stop = stop return feature cpdef greater_than(Interval feature, int size=100): """ Return True if feature length > *size* """ return len(feature) > size cpdef less_than(Interval feature, int size=100): """ Return True if feature length < *size* """ return len(feature) < size cpdef normalized_to_length(Interval feature, int idx=4, float scalar=0.001): """ Normalizes the value at feature[idx] to the feature's length, in kb. *idx*, by default, is the score field for a BED file, but specify any integer. The value at *idx* will be replaced with its scaled value. *scalar* will be multiplied by the value at *idx*, by default this is 0.001, or per kb. Useful for calculating RPKM after running intersect with counts """ feature[idx] = str(float(feature[idx]) * scalar / len(feature)) return feature cpdef rename(Interval feature, str name): """ Forces a rename of all features, e.g., for renaming everything in a file 'exon' """ feature.name = name return feature cpdef bedgraph_scale(Interval feature, float scalar): feature[3] = str(float(feature[3]) * scalar) return feature cpdef TSS(Interval feature, int upstream=500, int downstream=500, add_to_name=None, genome=None): """ Alias for five_prime. """ return star_prime(feature, upstream, downstream, prime=5, add_to_name=add_to_name, genome=genome) cdef star_prime(Interval feature, int upstream=500, int downstream=500, int prime=5, add_to_name=None, genome=None): if prime == 5: if feature.strand == '-': start = feature.stop - downstream stop = feature.stop + upstream else: start = feature.start - upstream stop = feature.start + downstream elif prime == 3: if feature.strand == '-': start = feature.start - downstream stop = feature.start + upstream else: start = feature.stop - upstream stop = feature.stop + downstream if add_to_name: try: feature.name += add_to_name except AttributeError: pass if genome is not None: gstart, gstop = genome[feature.chrom] stop = min(stop, gstop) start = max(start, gstart) if start < 0: start = 0 if start > stop: start = stop feature.start = start feature.stop = stop return feature cpdef five_prime(Interval feature, int upstream=500, int downstream=500, add_to_name=None, genome=None): """ Returns the 5'-most coordinate, plus `upstream` and `downstream` bp; adds the string `add_to_name` to the feature's name if provided (e.g., "_TSS") Parameters ---------- feature : pybedtools.Interval instance upstream, downstream : int Number of bp upstream or downstream of the strand-specific start position of the feature to include. Default is 500 for both upstream and downstream so that the returned feature is 1kb centered on the 5' end of the feature. Unstranded features (where strand=".") are treated as plus-strand features. add_to_name : str or None If not None, append the string suffix to the name field of the feature (for example "_TSS"). genome : dict or None If not None, then ensure that the start/stop positions are within the boundaries of the chromosome. """ return star_prime(feature, upstream, downstream, prime=5, add_to_name=add_to_name, genome=genome) cpdef three_prime(Interval feature, int upstream=500, int downstream=500, add_to_name=None, genome=None): """ Returns the 3'-most coordinate, plus `upstream` and `downstream` bp; adds the string `add_to_name` to the feature's name if provided (e.g., "_polyA_site") Parameters ---------- feature : pybedtools.Interval instance upstream, downstrea : int Number of bp upstream or downstream of the strand-specific stop position of the feature to include. Default is 500 for both upstream and downstream so that the returned feature is 1kb centered on the 5' end of the feature. Unstranded features (where strand=".") are treated as plus-strand features. add_to_name : str or None If not None, append the string suffix to the name field of the feature (for example "_TSS"). genome : dict or None If not None, then ensure that the start/stop positions are within the boundaries of the chromosome. """ return star_prime(feature, upstream, downstream, prime=3, add_to_name=add_to_name, genome=genome) cpdef add_color(Interval feature, cmap, norm): """ Signature: add_color(feature, cmap, norm) Given the matplotlib colormap `cmap` and the matplotlib Normalize instance `norm`, return a new 9-field feature (extended out if needed) with the RGB tuple set according to the score. """ if len(feature.fields) < 9: feature = extend_fields(feature, 9) feature[6] = str(feature.start) feature[7] = str(feature.stop) rgb_float = cmap(norm(float(feature.score))) feature[8] = ','.join([str(int(i * 255)) for i in rgb_float[:3]]) return feature cpdef gff2bed(Interval feature, name_field=None): """ Signature: gff2bed(feature, name_field=None) Converts a GFF feature into a BED6 feature. By default, the name of the new BED will be feature.name, but if `name_field` is provided then the name of the new BED will be feature.attrs[name_field]. `name_field` can also be an integer to index into the fields of the object, so if you want the BED name to be the GFF featuretype, then you can use `name_field=2`. If the specified field does not exist, then "." will be used for the name. """ if name_field is None: name = feature.name else: try: if isinstance(name_field, basestring): name = feature.attrs[name_field] if isinstance(name_field, int): name = feature[name_field] except (NameError, KeyError): name = "." return create_interval_from_list([ str(feature.chrom), str(feature.start), str(feature.stop), name, feature.score, feature.strand]) cpdef bed2gff(Interval feature): """ Signature: bed2gff(feature) Converts a BED feature (BED3 through BED12) into a GFF format. Chrom, start, stop, score, and strand are put directly into the corresponding GFF fields. Other BED fields are put into the GFF attributes field, named according to the UCSC BED format definition. If there are more than 12 BED fields, the additional fields will be added to the GFF attributes using the 0-based index (so starting at "12") as the key. GFF fields that do not have a direct mapping to BED format (feature type, source, phase) are set to ".". 1 bp is added to the start position to finish the conversion to GFF. """ # Note that Interval.score, .strand, and .name have a default of ".", so no # need to do the extra try/except IndexError for those fields. mapping = ( (6, "thickStart"), (7, "thickEnd"), (8, "itemRgb"), (9, "blockCount"), (10, "blockSizes"), (11, "blockStarts") ) # Add any standard BED fields we might have attributes = ['Name="%s"' % feature.name] for k, v in mapping: try: attributes.append('%s="%s"' % (v, feature.fields[k])) except IndexError: break # Add any additional fields, keyed by their index if len(feature.fields) > 12: for i in range(12, len(feature.fields)): attributes.append('%s="%s"' % (i, feature.fields[i])) attributes = '; '.join(attributes) + ';' return create_interval_from_list([ str(feature.chrom), '.', '.', str(feature.start + 1), str(feature.stop), feature.score, feature.strand, '.', attributes]) class UniqueID(object): def __init__(self, pattern="%d", first=0): """ Class to help create uniquely-named features. Example usage: >>> a = pybedtools.example_bedtool('a.bed') >>> uid = UniqueID("f_%d") >>> print(a.each(uid)) # doctest: +NORMALIZE_WHITESPACE chr1 1 100 f_0 0 + chr1 100 200 f_1 0 + chr1 150 500 f_2 0 - chr1 900 950 f_3 0 + Parameters ---------- pattern : str Pattern will be filled in using `% self.count` first : int `self.count` will be initialzed to this value. """ self.pattern = pattern self.first = first self.count = first def __call__(self, feature): feature.name = self.pattern % self.count self.count += 1 return feature