Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/featurefuncs.pyx @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/featurefuncs.pyx Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,355 @@ +# cython: language_level=2 +# distutils: language = c++ +from cbedtools cimport Interval +from cbedtools import create_interval_from_list + + +cpdef extend_fields(Interval feature, int n): + """ + Pads the fields of the feature with "." to a total length of `n` fields, + """ + fields = feature.fields[:] + while len(fields) < n: + fields.append('.') + i = create_interval_from_list(fields) + + if n > 4 and (i[4] == '.'): + i[4] = '0' + if n > 6 and (i[6] == '.'): + i[6] = str(i.start) + if n > 7 and (i[7] == '.'): + i[7] = str(i.stop) + if n > 8 and (i[8] == '.'): + i[8] = '0,0,0' + return i + + + +cpdef center(Interval feature, int width=100): + """ + Return the *width* bp from the center of a feature. If a feature is + smaller than *width*, then return the entire feature. + """ + if len(feature) < width: + return feature + cdef int start = feature.start + cdef int stop = feature.stop + cdef int center = start + (stop - start) / 2 + halfwidth = width / 2 + feature.start = center - halfwidth + if feature.start < 1: + feature.start = 1 + if halfwidth == 0: + halfwidth = 1 + feature.stop = center + halfwidth + return feature + + +cpdef midpoint(Interval feature): + """ + Specialized version of `center()` that just returns the single-bp midpoint + """ + start = feature.start + (feature.stop - feature.start) / 2 + stop = start + 1 + feature.start = start + feature.stop = stop + return feature + + +cpdef greater_than(Interval feature, int size=100): + """ + Return True if feature length > *size* + """ + return len(feature) > size + + +cpdef less_than(Interval feature, int size=100): + """ + Return True if feature length < *size* + """ + return len(feature) < size + + +cpdef normalized_to_length(Interval feature, int idx=4, float scalar=0.001): + """ + Normalizes the value at feature[idx] to the feature's length, in kb. + + *idx*, by default, is the score field for a BED file, but specify any + integer. + + The value at *idx* will be replaced with its scaled value. + + *scalar* will be multiplied by the value at *idx*, by default this is + 0.001, or per kb. + + Useful for calculating RPKM after running intersect with counts + """ + feature[idx] = str(float(feature[idx]) * scalar / len(feature)) + return feature + + +cpdef rename(Interval feature, str name): + """ + Forces a rename of all features, e.g., for renaming everything in a file + 'exon' + """ + feature.name = name + return feature + + +cpdef bedgraph_scale(Interval feature, float scalar): + feature[3] = str(float(feature[3]) * scalar) + return feature + + +cpdef TSS(Interval feature, int upstream=500, int downstream=500, add_to_name=None, genome=None): + """ + Alias for five_prime. + """ + return star_prime(feature, upstream, downstream, prime=5, + add_to_name=add_to_name, genome=genome) + + +cdef star_prime(Interval feature, int upstream=500, int downstream=500, int prime=5, + add_to_name=None, genome=None): + + if prime == 5: + if feature.strand == '-': + start = feature.stop - downstream + stop = feature.stop + upstream + else: + start = feature.start - upstream + stop = feature.start + downstream + elif prime == 3: + if feature.strand == '-': + start = feature.start - downstream + stop = feature.start + upstream + else: + start = feature.stop - upstream + stop = feature.stop + downstream + if add_to_name: + try: + feature.name += add_to_name + except AttributeError: + pass + if genome is not None: + gstart, gstop = genome[feature.chrom] + stop = min(stop, gstop) + start = max(start, gstart) + if start < 0: + start = 0 + if start > stop: + start = stop + feature.start = start + feature.stop = stop + return feature + +cpdef five_prime(Interval feature, int upstream=500, int downstream=500, + add_to_name=None, genome=None): + """ + Returns the 5'-most coordinate, plus `upstream` and `downstream` bp; adds + the string `add_to_name` to the feature's name if provided (e.g., "_TSS") + + Parameters + ---------- + feature : pybedtools.Interval instance + + upstream, downstream : int + Number of bp upstream or downstream of the strand-specific start + position of the feature to include. Default is 500 for both upstream + and downstream so that the returned feature is 1kb centered on the 5' + end of the feature. Unstranded features (where strand=".") are treated + as plus-strand features. + + add_to_name : str or None + If not None, append the string suffix to the name field of the feature (for + example "_TSS"). + + genome : dict or None + If not None, then ensure that the start/stop positions are within the + boundaries of the chromosome. + """ + return star_prime(feature, upstream, downstream, prime=5, + add_to_name=add_to_name, genome=genome) + + +cpdef three_prime(Interval feature, int upstream=500, int downstream=500, + add_to_name=None, genome=None): + """ + Returns the 3'-most coordinate, plus `upstream` and `downstream` bp; adds + the string `add_to_name` to the feature's name if provided (e.g., + "_polyA_site") + + Parameters + ---------- + feature : pybedtools.Interval instance + + upstream, downstrea : int + Number of bp upstream or downstream of the strand-specific stop + position of the feature to include. Default is 500 for both upstream + and downstream so that the returned feature is 1kb centered on the 5' + end of the feature. Unstranded features (where strand=".") are treated + as plus-strand features. + + add_to_name : str or None + If not None, append the string suffix to the name field of the feature (for + example "_TSS"). + + genome : dict or None + If not None, then ensure that the start/stop positions are within the + boundaries of the chromosome. + + + """ + return star_prime(feature, upstream, downstream, prime=3, + add_to_name=add_to_name, genome=genome) + +cpdef add_color(Interval feature, cmap, norm): + """ + Signature: + + add_color(feature, cmap, norm) + + Given the matplotlib colormap `cmap` and the matplotlib Normalize instance + `norm`, return a new 9-field feature (extended out if needed) with the RGB + tuple set according to the score. + """ + if len(feature.fields) < 9: + feature = extend_fields(feature, 9) + feature[6] = str(feature.start) + feature[7] = str(feature.stop) + + rgb_float = cmap(norm(float(feature.score))) + feature[8] = ','.join([str(int(i * 255)) for i in rgb_float[:3]]) + return feature + + +cpdef gff2bed(Interval feature, name_field=None): + """ + Signature: + + gff2bed(feature, name_field=None) + + Converts a GFF feature into a BED6 feature. By default, the name of the + new BED will be feature.name, but if `name_field` is provided then the name + of the new BED will be feature.attrs[name_field]. + + `name_field` can also be an integer to index into the fields of the object, + so if you want the BED name to be the GFF featuretype, then you can use + `name_field=2`. + + If the specified field does not exist, then "." will be used for the name. + """ + if name_field is None: + name = feature.name + else: + try: + if isinstance(name_field, basestring): + name = feature.attrs[name_field] + if isinstance(name_field, int): + name = feature[name_field] + except (NameError, KeyError): + name = "." + return create_interval_from_list([ + str(feature.chrom), + str(feature.start), + str(feature.stop), + name, + feature.score, + feature.strand]) + + +cpdef bed2gff(Interval feature): + """ + Signature: + + bed2gff(feature) + + Converts a BED feature (BED3 through BED12) into a GFF format. + + Chrom, start, stop, score, and strand are put directly into the + corresponding GFF fields. Other BED fields are put into the GFF attributes + field, named according to the UCSC BED format definition. + + If there are more than 12 BED fields, the additional fields will be added + to the GFF attributes using the 0-based index (so starting at "12") as the + key. + + GFF fields that do not have a direct mapping to BED format (feature type, + source, phase) are set to ".". + + 1 bp is added to the start position to finish the conversion to GFF. + """ + + # Note that Interval.score, .strand, and .name have a default of ".", so no + # need to do the extra try/except IndexError for those fields. + mapping = ( + (6, "thickStart"), + (7, "thickEnd"), + (8, "itemRgb"), + (9, "blockCount"), + (10, "blockSizes"), + (11, "blockStarts") + ) + + # Add any standard BED fields we might have + attributes = ['Name="%s"' % feature.name] + for k, v in mapping: + try: + attributes.append('%s="%s"' % (v, feature.fields[k])) + except IndexError: + break + + # Add any additional fields, keyed by their index + if len(feature.fields) > 12: + for i in range(12, len(feature.fields)): + attributes.append('%s="%s"' % (i, feature.fields[i])) + + attributes = '; '.join(attributes) + ';' + + return create_interval_from_list([ + str(feature.chrom), + '.', + '.', + str(feature.start + 1), + str(feature.stop), + feature.score, + feature.strand, + '.', + attributes]) + + +class UniqueID(object): + def __init__(self, pattern="%d", first=0): + """ + Class to help create uniquely-named features. + + Example usage: + + >>> a = pybedtools.example_bedtool('a.bed') + >>> uid = UniqueID("f_%d") + >>> print(a.each(uid)) # doctest: +NORMALIZE_WHITESPACE + chr1 1 100 f_0 0 + + chr1 100 200 f_1 0 + + chr1 150 500 f_2 0 - + chr1 900 950 f_3 0 + + + Parameters + ---------- + pattern : str + + Pattern will be filled in using `% self.count` + + first : int + `self.count` will be initialzed to this value. + + """ + self.pattern = pattern + self.first = first + self.count = first + + def __call__(self, feature): + feature.name = self.pattern % self.count + self.count += 1 + return feature +