jpayne@68
|
1 from __future__ import annotations
|
jpayne@68
|
2 import tempfile
|
jpayne@68
|
3 from textwrap import dedent
|
jpayne@68
|
4 import shutil
|
jpayne@68
|
5 import subprocess
|
jpayne@68
|
6 import operator
|
jpayne@68
|
7 import os
|
jpayne@68
|
8 import sys
|
jpayne@68
|
9 import random
|
jpayne@68
|
10 import string
|
jpayne@68
|
11 import pprint
|
jpayne@68
|
12 from itertools import islice
|
jpayne@68
|
13 from multiprocessing import Pool
|
jpayne@68
|
14 import gzip
|
jpayne@68
|
15 from typing import Any, Callable, Iterable, Iterator, Literal, Optional, TYPE_CHECKING, cast
|
jpayne@68
|
16 import pysam
|
jpayne@68
|
17 from warnings import warn
|
jpayne@68
|
18 import pathlib
|
jpayne@68
|
19 from pathlib import Path
|
jpayne@68
|
20
|
jpayne@68
|
21 from .helpers import (
|
jpayne@68
|
22 get_tempdir,
|
jpayne@68
|
23 _tags,
|
jpayne@68
|
24 call_bedtools,
|
jpayne@68
|
25 _flatten_list,
|
jpayne@68
|
26 _check_sequence_stderr,
|
jpayne@68
|
27 isBAM,
|
jpayne@68
|
28 isBGZIP,
|
jpayne@68
|
29 isGZIP,
|
jpayne@68
|
30 BEDToolsError,
|
jpayne@68
|
31 pybedtoolsError,
|
jpayne@68
|
32 _call_randomintersect,
|
jpayne@68
|
33 SplitOutput,
|
jpayne@68
|
34 FisherOutput,
|
jpayne@68
|
35 )
|
jpayne@68
|
36 from . import helpers
|
jpayne@68
|
37 from .cbedtools import (
|
jpayne@68
|
38 IntervalFile,
|
jpayne@68
|
39 IntervalIterator,
|
jpayne@68
|
40 Interval,
|
jpayne@68
|
41 create_interval_from_list,
|
jpayne@68
|
42 BedToolsFileError,
|
jpayne@68
|
43 )
|
jpayne@68
|
44 import pybedtools
|
jpayne@68
|
45 from . import settings
|
jpayne@68
|
46 from . import filenames
|
jpayne@68
|
47
|
jpayne@68
|
48 if TYPE_CHECKING:
|
jpayne@68
|
49 import pandas as pd
|
jpayne@68
|
50 import matplotlib.colors as mcolors
|
jpayne@68
|
51
|
jpayne@68
|
52 _implicit_registry = {}
|
jpayne@68
|
53 _other_registry = {}
|
jpayne@68
|
54 _bam_registry = {}
|
jpayne@68
|
55
|
jpayne@68
|
56
|
jpayne@68
|
57 def _jaccard_output_to_dict(s, **kwargs) -> dict:
|
jpayne@68
|
58 """
|
jpayne@68
|
59 jaccard method doesn't return an interval file, rather, it returns a short
|
jpayne@68
|
60 summary of results. Here, we simply parse it into a dict for convenience.
|
jpayne@68
|
61 """
|
jpayne@68
|
62 if isinstance(s, str):
|
jpayne@68
|
63 _s = open(s).read()
|
jpayne@68
|
64 elif hasattr(s, "next") or hasattr(s, "__next__"):
|
jpayne@68
|
65 _s = "".join([i for i in s])
|
jpayne@68
|
66 else:
|
jpayne@68
|
67 raise ValueError("Unexpected object %r" % s)
|
jpayne@68
|
68 header, data = _s.splitlines()
|
jpayne@68
|
69 header = header.split()
|
jpayne@68
|
70 data = data.split()
|
jpayne@68
|
71 data[0] = int(data[0])
|
jpayne@68
|
72 data[1] = int(data[1])
|
jpayne@68
|
73 data[2] = float(data[2])
|
jpayne@68
|
74 data[3] = int(data[3])
|
jpayne@68
|
75 return dict(list(zip(header, data)))
|
jpayne@68
|
76
|
jpayne@68
|
77
|
jpayne@68
|
78 def _reldist_output_handler(s, **kwargs):
|
jpayne@68
|
79 """
|
jpayne@68
|
80 reldist, if called with -detail, returns a valid BED file with the relative
|
jpayne@68
|
81 distance as the last field. In that case, return the BedTool immediately.
|
jpayne@68
|
82 If not -detail, then the results are a table, in which case here we parse
|
jpayne@68
|
83 into a dict for convenience.
|
jpayne@68
|
84 """
|
jpayne@68
|
85 if "detail" in kwargs:
|
jpayne@68
|
86 return BedTool(s)
|
jpayne@68
|
87 if isinstance(s, str):
|
jpayne@68
|
88 iterable = open(s)
|
jpayne@68
|
89 if hasattr(s, "next"):
|
jpayne@68
|
90 iterable = s
|
jpayne@68
|
91 header = next(iterable).split()
|
jpayne@68
|
92 results = {}
|
jpayne@68
|
93 for h in header:
|
jpayne@68
|
94 results[h] = []
|
jpayne@68
|
95 for i in iterable:
|
jpayne@68
|
96 reldist, count, total, fraction = i.split()
|
jpayne@68
|
97 data = [float(reldist), int(count), int(total), float(fraction)]
|
jpayne@68
|
98 for h, d in zip(header, data):
|
jpayne@68
|
99 results[h].append(d)
|
jpayne@68
|
100 return results
|
jpayne@68
|
101
|
jpayne@68
|
102
|
jpayne@68
|
103 def _wraps(
|
jpayne@68
|
104 prog: Optional[str] = None,
|
jpayne@68
|
105 implicit: Optional[str] = None,
|
jpayne@68
|
106 bam: Optional[str] = None,
|
jpayne@68
|
107 other: Optional[str] = None,
|
jpayne@68
|
108 uses_genome: bool = False,
|
jpayne@68
|
109 make_tempfile_for: Optional[str] = None,
|
jpayne@68
|
110 check_stderr:Optional[Callable]=None,
|
jpayne@68
|
111 add_to_bedtool:Optional[dict] = None,
|
jpayne@68
|
112 nonbam: Optional[Literal["ALL"] | str | list[str]] = None,
|
jpayne@68
|
113 force_bam: bool = False,
|
jpayne@68
|
114 genome_none_if: Optional[list[str]] =None,
|
jpayne@68
|
115 genome_if: Optional[list[str]] = None,
|
jpayne@68
|
116 genome_ok_if: Optional[list[str]] = None,
|
jpayne@68
|
117 does_not_return_bedtool: Optional[Callable] =None,
|
jpayne@68
|
118 arg_order: Optional[list[str]] = None,
|
jpayne@68
|
119 ):
|
jpayne@68
|
120 """
|
jpayne@68
|
121 Do-it-all wrapper, to be used as a decorator.
|
jpayne@68
|
122
|
jpayne@68
|
123 *prog* is the name of the BEDTools program that will be called. The help
|
jpayne@68
|
124 for this program will also be added to the decorated method's docstring.
|
jpayne@68
|
125
|
jpayne@68
|
126 *implicit* is the BEDTools program arg that should be filled in
|
jpayne@68
|
127 automatically.
|
jpayne@68
|
128
|
jpayne@68
|
129 *bam* will disable the implicit substitution if *bam* is in the kwargs.
|
jpayne@68
|
130 This is typically 'abam' or 'ibam' if the program accepts BAM input.
|
jpayne@68
|
131
|
jpayne@68
|
132 *other* is the BEDTools program arg that is passed in as the second input,
|
jpayne@68
|
133 if supported. Within the semantics of BEDTools, the typical case will be
|
jpayne@68
|
134 that if implicit='a' then other='b'; if implicit='i' then other=None.
|
jpayne@68
|
135
|
jpayne@68
|
136 *uses_genome*, if True, will check for 'g' and/or 'genome' args and
|
jpayne@68
|
137 retrieve the corresponding genome files as needed.
|
jpayne@68
|
138
|
jpayne@68
|
139 *make_tempfile_for* is used for the sequence methods and indicates which
|
jpayne@68
|
140 kwarg should have a tempfile made for it if it's not provided ('fo' for the
|
jpayne@68
|
141 sequence methods)
|
jpayne@68
|
142
|
jpayne@68
|
143 *check_stderr*, if not None, is a function that accepts a string (which
|
jpayne@68
|
144 will be anything written to stdout when calling the wrapped program). This
|
jpayne@68
|
145 function should return True if the string is OK, and False if it should
|
jpayne@68
|
146 truly be considered an error. This is needed for wrapping fastaFromBed,
|
jpayne@68
|
147 which will report to stderr that it's creating an index file.
|
jpayne@68
|
148
|
jpayne@68
|
149 *add_to_bedtool* is used for sequence methods. It is a dictionary mapping
|
jpayne@68
|
150 kwargs to attributes to be created in the resulting BedTool. Typically it
|
jpayne@68
|
151 is {'fo':'seqfn'} which will add the resulting sequence name to the
|
jpayne@68
|
152 BedTool's .seqfn attribute. If *add_to_bedtool* is not None, then the
|
jpayne@68
|
153 returned BedTool will be *self* with the added attribute. If a key is
|
jpayne@68
|
154 "stdout" (e.g., {"stdout": attr_name}), then save the stdout of the command
|
jpayne@68
|
155 as a tempfile and store the tempfile's name in the attribute. This is
|
jpayne@68
|
156 required for linksBed and bedToIgv.
|
jpayne@68
|
157
|
jpayne@68
|
158 *nonbam* is a kwarg that even if the input file was a BAM, the output will
|
jpayne@68
|
159 *not* be BAM format. For example, the `-bed` arg for intersectBed will
|
jpayne@68
|
160 cause the output to be in BED format, not BAM. If not None, this can be a
|
jpayne@68
|
161 string, a list of strings, or the special string "ALL", which means that
|
jpayne@68
|
162 the wrapped program will never return BAM output.
|
jpayne@68
|
163
|
jpayne@68
|
164 *force_bam*, if True, will force the output to be BAM. This is used for
|
jpayne@68
|
165 bedToBam.
|
jpayne@68
|
166
|
jpayne@68
|
167 *genome_none_if* is a list of arguments that will ignore the requirement
|
jpayne@68
|
168 for a genome. This is needed for window_maker, where -b and -g are
|
jpayne@68
|
169 mutually exclusive.
|
jpayne@68
|
170
|
jpayne@68
|
171 *genome_ok_if* is a list of arguments that, if they are in
|
jpayne@68
|
172 *genome_none_if*, are still OK to pass in. This is needed for bedtool
|
jpayne@68
|
173 genomecov, where -g is not needed if -ibam is specified...but it's still OK
|
jpayne@68
|
174 if the user passes a genome arg.
|
jpayne@68
|
175
|
jpayne@68
|
176 *genome_if* is a list of arguments that will trigger the requirement for
|
jpayne@68
|
177 a genome; otherwise no genome needs to be specified.
|
jpayne@68
|
178
|
jpayne@68
|
179 *does_not_return_bedtool*, if not None, should be a function that handles
|
jpayne@68
|
180 the returned output. Its signature should be ``func(output, kwargs)``,
|
jpayne@68
|
181 where `output` is the output from the [possibly streaming] call to BEDTools
|
jpayne@68
|
182 and `kwargs` are passed verbatim from the wrapped method call. Some
|
jpayne@68
|
183 examples of methods that use this are jaccard, reldist, fisher, and split
|
jpayne@68
|
184 methods.
|
jpayne@68
|
185
|
jpayne@68
|
186 *arg_order*, if not None, is a sorted list of arguments. This is used by
|
jpayne@68
|
187 handle_kwargs() to deal with things like issues 81 and 345, where some
|
jpayne@68
|
188 BEDTools programs are sensitive to argument order.
|
jpayne@68
|
189 """
|
jpayne@68
|
190
|
jpayne@68
|
191 # NOTE: We are calling each BEDTools program to get its help and adding
|
jpayne@68
|
192 # that to the docstring of each method. This is run at import time. However
|
jpayne@68
|
193 # if BEDTools is not on the path at import time, `not_implemented` is set
|
jpayne@68
|
194 # to True and isn't reset later until the module is reloaded.
|
jpayne@68
|
195 #
|
jpayne@68
|
196 # helpers.set_bedtools_path therefore will trigger a module reload.
|
jpayne@68
|
197 not_implemented = False
|
jpayne@68
|
198
|
jpayne@68
|
199 # Call the program with -h to get help, which prints to stderr.
|
jpayne@68
|
200 try:
|
jpayne@68
|
201 p = subprocess.Popen(
|
jpayne@68
|
202 helpers._version_2_15_plus_names(prog) + ["-h"],
|
jpayne@68
|
203 stdout=subprocess.PIPE,
|
jpayne@68
|
204 stderr=subprocess.PIPE,
|
jpayne@68
|
205 )
|
jpayne@68
|
206 help_str = p.communicate()[1].decode()
|
jpayne@68
|
207
|
jpayne@68
|
208 # underscores throw off ReStructuredText syntax of docstrings, so
|
jpayne@68
|
209 # replace 'em
|
jpayne@68
|
210 help_str = help_str.replace("_", "**")
|
jpayne@68
|
211
|
jpayne@68
|
212 # indent
|
jpayne@68
|
213 help_str = help_str.split("\n")
|
jpayne@68
|
214 help_str = ["\n\n**Original BEDTools help:**::"] + ["\t" + i for i in help_str]
|
jpayne@68
|
215 help_str = "\n".join(help_str) + "\n"
|
jpayne@68
|
216
|
jpayne@68
|
217 # If the program can't be found, then we'll eventually replace the method
|
jpayne@68
|
218 # with a version that does nothing but raise a NotImplementedError (plus
|
jpayne@68
|
219 # a helpful message).
|
jpayne@68
|
220 except OSError:
|
jpayne@68
|
221 help_str = (
|
jpayne@68
|
222 '"%s" does not appear to be installed '
|
jpayne@68
|
223 "or on the path, so this method is "
|
jpayne@68
|
224 "disabled. Please install a more recent "
|
jpayne@68
|
225 "version of BEDTools and re-import to "
|
jpayne@68
|
226 "use this method." % prog
|
jpayne@68
|
227 )
|
jpayne@68
|
228 not_implemented = True
|
jpayne@68
|
229
|
jpayne@68
|
230 def decorator(func):
|
jpayne@68
|
231 """
|
jpayne@68
|
232 Accepts a function to be wrapped; discards the original and returns a
|
jpayne@68
|
233 new, rebuilt-from-scratch function based on the kwargs passed to
|
jpayne@68
|
234 _wraps().
|
jpayne@68
|
235 """
|
jpayne@68
|
236 # Register the implicit (as well as bam and other) args in the global
|
jpayne@68
|
237 # registry. BedTool.handle_kwargs() will access these at runtime. The
|
jpayne@68
|
238 # registry is keyed by program name (like intersectBed).
|
jpayne@68
|
239 _implicit_registry[prog] = implicit
|
jpayne@68
|
240 if other is not None:
|
jpayne@68
|
241 _other_registry[prog] = other
|
jpayne@68
|
242 if bam is not None:
|
jpayne@68
|
243 _bam_registry[prog] = bam
|
jpayne@68
|
244
|
jpayne@68
|
245 # Here's where we replace an unable-to-be-found program's method with
|
jpayne@68
|
246 # one that only returns a NotImplementedError
|
jpayne@68
|
247 if not_implemented:
|
jpayne@68
|
248
|
jpayne@68
|
249 def not_implemented_func(*args, **kwargs):
|
jpayne@68
|
250 raise NotImplementedError(help_str)
|
jpayne@68
|
251
|
jpayne@68
|
252 return not_implemented_func
|
jpayne@68
|
253
|
jpayne@68
|
254 _add_doc = []
|
jpayne@68
|
255 if implicit:
|
jpayne@68
|
256 _add_doc.append(
|
jpayne@68
|
257 dedent(
|
jpayne@68
|
258 """
|
jpayne@68
|
259 For convenience, the file or stream this BedTool points to
|
jpayne@68
|
260 is implicitly passed as the `-%s` argument to `%s`
|
jpayne@68
|
261 """
|
jpayne@68
|
262 % (implicit, prog)
|
jpayne@68
|
263 )
|
jpayne@68
|
264 )
|
jpayne@68
|
265
|
jpayne@68
|
266 if uses_genome:
|
jpayne@68
|
267 _add_doc.append(
|
jpayne@68
|
268 dedent(
|
jpayne@68
|
269 """
|
jpayne@68
|
270 There are two alternatives for supplying a genome. Use
|
jpayne@68
|
271 `g="genome.filename"` if you have a genome's chrom sizes
|
jpayne@68
|
272 saved as a file. This is the what BEDTools expects when
|
jpayne@68
|
273 using it from the command line. Alternatively, use the
|
jpayne@68
|
274 `genome="assembly.name"` (for example, `genome="hg19"`) to
|
jpayne@68
|
275 use chrom sizes for that assembly without having to manage
|
jpayne@68
|
276 a separate file. The `genome` argument triggers a call
|
jpayne@68
|
277 `pybedtools.chromsizes`, so see that method for more
|
jpayne@68
|
278 details.
|
jpayne@68
|
279 """
|
jpayne@68
|
280 )
|
jpayne@68
|
281 )
|
jpayne@68
|
282
|
jpayne@68
|
283 def wrapped(self, *args, **kwargs):
|
jpayne@68
|
284 """
|
jpayne@68
|
285 A newly created function that will be returned by the _wraps()
|
jpayne@68
|
286 decorator
|
jpayne@68
|
287 """
|
jpayne@68
|
288
|
jpayne@68
|
289 # Only one non-keyword argument is supported; this is then assumed
|
jpayne@68
|
290 # to be "other" (e.g., `-b` for intersectBed)
|
jpayne@68
|
291 if len(args) > 0:
|
jpayne@68
|
292 assert len(args) == 1
|
jpayne@68
|
293 kwargs[other] = args[0]
|
jpayne@68
|
294
|
jpayne@68
|
295 # Add the implicit values to kwargs. If the current BedTool is
|
jpayne@68
|
296 # BAM, it will automatically be passed to the appropriate
|
jpayne@68
|
297 # BAM-support arg (like `-abam`). But this also allows the user to
|
jpayne@68
|
298 # explicitly specify the abam kwarg, which will override the
|
jpayne@68
|
299 # auto-substitution.
|
jpayne@68
|
300 # Note: here, `implicit` is something like "a"; `bam` is something
|
jpayne@68
|
301 # like "abam"
|
jpayne@68
|
302 if (
|
jpayne@68
|
303 (implicit not in kwargs)
|
jpayne@68
|
304 and (bam not in kwargs)
|
jpayne@68
|
305 and (implicit is not None)
|
jpayne@68
|
306 ):
|
jpayne@68
|
307 if not self._isbam:
|
jpayne@68
|
308 kwargs[implicit] = self.fn
|
jpayne@68
|
309 else:
|
jpayne@68
|
310 # It is a bam file. If this program supports BAM as the
|
jpayne@68
|
311 # first input, then we set it here
|
jpayne@68
|
312 if bam is not None:
|
jpayne@68
|
313 kwargs[bam] = self.fn
|
jpayne@68
|
314
|
jpayne@68
|
315 # Otherwise, BEDTools can't currently handle it, so raise
|
jpayne@68
|
316 # an exception.
|
jpayne@68
|
317 else:
|
jpayne@68
|
318 raise pybedtoolsError(
|
jpayne@68
|
319 '"%s" currently can\'t handle BAM '
|
jpayne@68
|
320 "input, please use bam_to_bed() first." % prog
|
jpayne@68
|
321 )
|
jpayne@68
|
322
|
jpayne@68
|
323 # Should this function handle genome files?
|
jpayne@68
|
324 check_for_genome = uses_genome
|
jpayne@68
|
325 if uses_genome:
|
jpayne@68
|
326 if genome_none_if:
|
jpayne@68
|
327 for i in genome_none_if:
|
jpayne@68
|
328 if i in kwargs or i == implicit:
|
jpayne@68
|
329 check_for_genome = False
|
jpayne@68
|
330
|
jpayne@68
|
331 # for genomecov, if -ibam then -g is optional. So it's OK
|
jpayne@68
|
332 # for the user to provide genome or g kwargs, even if
|
jpayne@68
|
333 # -ibam.
|
jpayne@68
|
334 if genome_ok_if:
|
jpayne@68
|
335 for i in genome_ok_if:
|
jpayne@68
|
336 if i in kwargs or i == implicit:
|
jpayne@68
|
337 if ("g" in kwargs) or ("genome" in kwargs):
|
jpayne@68
|
338 check_for_genome = True
|
jpayne@68
|
339 if genome_if:
|
jpayne@68
|
340 check_for_genome = False
|
jpayne@68
|
341 for i in genome_if:
|
jpayne@68
|
342 if (i in kwargs) or (i == implicit):
|
jpayne@68
|
343 check_for_genome = True
|
jpayne@68
|
344 if check_for_genome:
|
jpayne@68
|
345 kwargs = self.check_genome(**kwargs)
|
jpayne@68
|
346
|
jpayne@68
|
347 # For sequence methods, we may need to make a tempfile that will
|
jpayne@68
|
348 # hold the resulting sequence. For example, fastaFromBed needs to
|
jpayne@68
|
349 # make a tempfile for 'fo' if no 'fo' was explicitly specified by
|
jpayne@68
|
350 # the user.
|
jpayne@68
|
351 if make_tempfile_for is not None:
|
jpayne@68
|
352 if make_tempfile_for not in kwargs:
|
jpayne@68
|
353 kwargs[make_tempfile_for] = self._tmp()
|
jpayne@68
|
354
|
jpayne@68
|
355 # At runtime, this will parse the kwargs, convert streams to
|
jpayne@68
|
356 # tempfiles if needed, and return all the goodies
|
jpayne@68
|
357 cmds, tmp, stdin = self.handle_kwargs(prog=prog,
|
jpayne@68
|
358 arg_order=arg_order,
|
jpayne@68
|
359 **kwargs)
|
jpayne@68
|
360
|
jpayne@68
|
361 # Decide whether the output is BAM format or not.
|
jpayne@68
|
362 result_is_bam = False
|
jpayne@68
|
363
|
jpayne@68
|
364 # By default, if the current BedTool is BAM, then the result should
|
jpayne@68
|
365 # be, too.
|
jpayne@68
|
366 if self._isbam:
|
jpayne@68
|
367 result_is_bam = True
|
jpayne@68
|
368
|
jpayne@68
|
369 # If nonbam is "ALL", then this method will never return BAM
|
jpayne@68
|
370 # output.
|
jpayne@68
|
371 if nonbam == "ALL":
|
jpayne@68
|
372 result_is_bam = False
|
jpayne@68
|
373
|
jpayne@68
|
374 # If any of the `nonbam` args are found in kwargs, then result is
|
jpayne@68
|
375 # not a BAM. Side note: the _nonbam name mangling is necessary to
|
jpayne@68
|
376 # keep the nonbam arg passed into the original _wraps() decorator
|
jpayne@68
|
377 # in scope.
|
jpayne@68
|
378 if nonbam is not None and nonbam != "ALL":
|
jpayne@68
|
379 if isinstance(nonbam, str):
|
jpayne@68
|
380 _nonbam = [nonbam]
|
jpayne@68
|
381 else:
|
jpayne@68
|
382 _nonbam = nonbam
|
jpayne@68
|
383 for i in _nonbam:
|
jpayne@68
|
384 if i in kwargs:
|
jpayne@68
|
385 result_is_bam = False
|
jpayne@68
|
386 break
|
jpayne@68
|
387
|
jpayne@68
|
388 if force_bam:
|
jpayne@68
|
389 result_is_bam = True
|
jpayne@68
|
390
|
jpayne@68
|
391 decode_output = not result_is_bam
|
jpayne@68
|
392
|
jpayne@68
|
393 # Do the actual call
|
jpayne@68
|
394 stream = call_bedtools(
|
jpayne@68
|
395 cmds,
|
jpayne@68
|
396 tmp,
|
jpayne@68
|
397 stdin=stdin,
|
jpayne@68
|
398 check_stderr=check_stderr,
|
jpayne@68
|
399 decode_output=decode_output,
|
jpayne@68
|
400 )
|
jpayne@68
|
401
|
jpayne@68
|
402 if does_not_return_bedtool:
|
jpayne@68
|
403 return does_not_return_bedtool(stream, **kwargs)
|
jpayne@68
|
404
|
jpayne@68
|
405 # Post-hoc editing of the BedTool -- for example, this is used for
|
jpayne@68
|
406 # the sequence methods to add a `seqfn` attribute to the resulting
|
jpayne@68
|
407 # BedTool.
|
jpayne@68
|
408 if add_to_bedtool is not None:
|
jpayne@68
|
409 for kw, attr in list(add_to_bedtool.items()):
|
jpayne@68
|
410 if kw == "stdout":
|
jpayne@68
|
411 value = stream
|
jpayne@68
|
412 else:
|
jpayne@68
|
413 value = kwargs[kw]
|
jpayne@68
|
414 setattr(self, attr, value)
|
jpayne@68
|
415 result = self
|
jpayne@68
|
416 else:
|
jpayne@68
|
417 result = BedTool(stream)
|
jpayne@68
|
418
|
jpayne@68
|
419 result._isbam = result_is_bam
|
jpayne@68
|
420 result._cmds = cmds
|
jpayne@68
|
421 del kwargs
|
jpayne@68
|
422 return result
|
jpayne@68
|
423
|
jpayne@68
|
424 # Now add the edited docstring (original Python docstring plus BEDTools
|
jpayne@68
|
425 # help) to the newly created method above
|
jpayne@68
|
426 if func.__doc__ is None:
|
jpayne@68
|
427 orig = ""
|
jpayne@68
|
428 else:
|
jpayne@68
|
429 orig = func.__doc__
|
jpayne@68
|
430
|
jpayne@68
|
431 wrapped.__doc__ = orig + "\n".join(_add_doc) + help_str
|
jpayne@68
|
432
|
jpayne@68
|
433 # Add the original method's name to a new attribute so we can access it
|
jpayne@68
|
434 # when logging history
|
jpayne@68
|
435 wrapped._name = func.__name__ # type: ignore
|
jpayne@68
|
436
|
jpayne@68
|
437 return wrapped
|
jpayne@68
|
438
|
jpayne@68
|
439 return decorator
|
jpayne@68
|
440
|
jpayne@68
|
441
|
jpayne@68
|
442 def _log_to_history(method: Callable):
|
jpayne@68
|
443 """
|
jpayne@68
|
444 Decorator to add a method and its kwargs to the history.
|
jpayne@68
|
445
|
jpayne@68
|
446 Assumes that you only add this decorator to bedtool instances that
|
jpayne@68
|
447 return other bedtool instances
|
jpayne@68
|
448 """
|
jpayne@68
|
449
|
jpayne@68
|
450 def decorated(self, *args, **kwargs):
|
jpayne@68
|
451
|
jpayne@68
|
452 # this calls the actual method in the first place; *result* is
|
jpayne@68
|
453 # whatever you get back
|
jpayne@68
|
454 result = method(self, *args, **kwargs)
|
jpayne@68
|
455
|
jpayne@68
|
456 # add appropriate tags
|
jpayne@68
|
457 parent_tag = self._tag
|
jpayne@68
|
458 result_tag = result._tag
|
jpayne@68
|
459
|
jpayne@68
|
460 history_step = HistoryStep(
|
jpayne@68
|
461 method, args, kwargs, self, parent_tag, result_tag
|
jpayne@68
|
462 )
|
jpayne@68
|
463
|
jpayne@68
|
464 # only add the current history to the new bedtool if there's
|
jpayne@68
|
465 # something to add
|
jpayne@68
|
466 if len(self.history) > 0:
|
jpayne@68
|
467 result.history.append(self.history)
|
jpayne@68
|
468
|
jpayne@68
|
469 # but either way, add this history step to the result.
|
jpayne@68
|
470 result.history.append(history_step)
|
jpayne@68
|
471
|
jpayne@68
|
472 return result
|
jpayne@68
|
473
|
jpayne@68
|
474 decorated.__doc__ = method.__doc__
|
jpayne@68
|
475 return decorated
|
jpayne@68
|
476
|
jpayne@68
|
477
|
jpayne@68
|
478 class BedTool(object):
|
jpayne@68
|
479 TEMPFILES = filenames.TEMPFILES
|
jpayne@68
|
480
|
jpayne@68
|
481 def __init__(self, fn: Optional[Any] = None,
|
jpayne@68
|
482 from_string: bool = False,
|
jpayne@68
|
483 remote: bool = False):
|
jpayne@68
|
484 """
|
jpayne@68
|
485 Wrapper around Aaron Quinlan's ``BEDtools`` suite of programs
|
jpayne@68
|
486 (https://github.com/arq5x/bedtools); also contains many useful
|
jpayne@68
|
487 methods for more detailed work with BED files.
|
jpayne@68
|
488
|
jpayne@68
|
489 *fn* is typically the name of a BED-like file, but can also be
|
jpayne@68
|
490 one of the following:
|
jpayne@68
|
491
|
jpayne@68
|
492 * a string filename
|
jpayne@68
|
493 * another BedTool object
|
jpayne@68
|
494 * an iterable of Interval objects
|
jpayne@68
|
495 * an open file object
|
jpayne@68
|
496 * a "file contents" string (see below)
|
jpayne@68
|
497
|
jpayne@68
|
498 If *from_string* is True, then you can pass a string that contains
|
jpayne@68
|
499 the contents of the BedTool you want to create. This will treat all
|
jpayne@68
|
500 spaces as TABs and write to tempfile, treating whatever you pass as
|
jpayne@68
|
501 *fn* as the contents of the bed file. This also strips empty lines.
|
jpayne@68
|
502
|
jpayne@68
|
503 Typical usage is to point to an existing file::
|
jpayne@68
|
504
|
jpayne@68
|
505 a = BedTool('a.bed')
|
jpayne@68
|
506
|
jpayne@68
|
507 But you can also create one from scratch from a string::
|
jpayne@68
|
508
|
jpayne@68
|
509 >>> s = '''
|
jpayne@68
|
510 ... chrX 1 100
|
jpayne@68
|
511 ... chrX 25 800
|
jpayne@68
|
512 ... '''
|
jpayne@68
|
513 >>> a = BedTool(s, from_string=True)
|
jpayne@68
|
514
|
jpayne@68
|
515 Or use examples that come with pybedtools::
|
jpayne@68
|
516
|
jpayne@68
|
517 >>> example_files = pybedtools.list_example_files()
|
jpayne@68
|
518 >>> assert 'a.bed' in example_files
|
jpayne@68
|
519 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
520
|
jpayne@68
|
521 """
|
jpayne@68
|
522 if remote:
|
jpayne@68
|
523 raise ValueError(
|
jpayne@68
|
524 "Remote BAM no longer supported (since BEDTools does not " "support it)"
|
jpayne@68
|
525 )
|
jpayne@68
|
526 self.remote = remote
|
jpayne@68
|
527 self._isbam = False
|
jpayne@68
|
528 self._bam_header = ""
|
jpayne@68
|
529 self._cmds = []
|
jpayne@68
|
530 if from_string:
|
jpayne@68
|
531 if fn is None or not isinstance(fn, str):
|
jpayne@68
|
532 raise ValueError("from_string=True requires a string to parse")
|
jpayne@68
|
533 bed_contents = fn
|
jpayne@68
|
534 fn = self._tmp()
|
jpayne@68
|
535 fout = open(fn, "w")
|
jpayne@68
|
536 for line in bed_contents.splitlines():
|
jpayne@68
|
537 if len(line.strip()) == 0:
|
jpayne@68
|
538 continue
|
jpayne@68
|
539 line = "\t".join(line.split()) + "\n"
|
jpayne@68
|
540 fout.write(line)
|
jpayne@68
|
541 fout.close()
|
jpayne@68
|
542
|
jpayne@68
|
543 else:
|
jpayne@68
|
544 # if fn is a Path object, we have to use its string representation
|
jpayne@68
|
545 if isinstance(fn, pathlib.PurePath):
|
jpayne@68
|
546 fn = str(fn)
|
jpayne@68
|
547
|
jpayne@68
|
548 # our work is already done
|
jpayne@68
|
549 if isinstance(fn, BedTool):
|
jpayne@68
|
550 fn = fn.fn
|
jpayne@68
|
551
|
jpayne@68
|
552 # from_string=False, so assume it's a filename
|
jpayne@68
|
553 elif isinstance(fn, str):
|
jpayne@68
|
554 if remote:
|
jpayne@68
|
555 self._isbam = True
|
jpayne@68
|
556 else:
|
jpayne@68
|
557 if not os.path.exists(fn):
|
jpayne@68
|
558 msg = 'File "%s" does not exist' % fn
|
jpayne@68
|
559 raise FileNotFoundError(msg)
|
jpayne@68
|
560 self._isbam = isBAM(fn)
|
jpayne@68
|
561
|
jpayne@68
|
562 # TODO: we dont' really need this, but it's added here for
|
jpayne@68
|
563 # compatibility with existing tests
|
jpayne@68
|
564 if self._isbam:
|
jpayne@68
|
565 header = pysam.Samfile(fn).header.to_dict()
|
jpayne@68
|
566 # For example:
|
jpayne@68
|
567 # {
|
jpayne@68
|
568 # 'HD': {'VN': '1.0', 'SO': 'coordinate'},
|
jpayne@68
|
569 # 'SQ': [
|
jpayne@68
|
570 # {'LN': 23011544,
|
jpayne@68
|
571 # 'SN': 'chr2L'},
|
jpayne@68
|
572 # {'LN': 21146708,
|
jpayne@68
|
573 # 'SN': 'chr2R'},
|
jpayne@68
|
574 # {'LN': 24543557,
|
jpayne@68
|
575 # 'SN': 'chr3L'},
|
jpayne@68
|
576 # {'LN': 27905053,
|
jpayne@68
|
577 # 'SN': 'chr3R'},
|
jpayne@68
|
578 # {'LN': 1351857,
|
jpayne@68
|
579 # 'SN': 'chr4'},
|
jpayne@68
|
580 # {'LN': 22422827,
|
jpayne@68
|
581 # 'SN': 'chrX'}
|
jpayne@68
|
582 # ]
|
jpayne@68
|
583 # }
|
jpayne@68
|
584
|
jpayne@68
|
585 txt_header = []
|
jpayne@68
|
586 for k, v in header.items():
|
jpayne@68
|
587 if isinstance(v, list):
|
jpayne@68
|
588 for i in v:
|
jpayne@68
|
589 if isinstance(i, dict):
|
jpayne@68
|
590 txt_header.append(
|
jpayne@68
|
591 "\t".join(
|
jpayne@68
|
592 ["@" + k]
|
jpayne@68
|
593 + [
|
jpayne@68
|
594 ":".join(map(str, j))
|
jpayne@68
|
595 for j in sorted(i.items(), reverse=True)
|
jpayne@68
|
596 ]
|
jpayne@68
|
597 )
|
jpayne@68
|
598 )
|
jpayne@68
|
599 elif isinstance(i, str):
|
jpayne@68
|
600 txt_header.append(i)
|
jpayne@68
|
601
|
jpayne@68
|
602 elif isinstance(v, dict):
|
jpayne@68
|
603 txt_header.append(
|
jpayne@68
|
604 "\t".join(
|
jpayne@68
|
605 ["@" + k]
|
jpayne@68
|
606 + [
|
jpayne@68
|
607 ":".join(map(str, j))
|
jpayne@68
|
608 for j in sorted(v.items(), reverse=True)
|
jpayne@68
|
609 ]
|
jpayne@68
|
610 )
|
jpayne@68
|
611 )
|
jpayne@68
|
612 else:
|
jpayne@68
|
613 raise ValueError("unhandled type in BAM header")
|
jpayne@68
|
614 self._bam_header = "\n".join(txt_header) + "\n"
|
jpayne@68
|
615
|
jpayne@68
|
616 # If tuple or list, then save as file first
|
jpayne@68
|
617 # (fixes #73)
|
jpayne@68
|
618 elif isinstance(fn, (list, tuple)):
|
jpayne@68
|
619 fn = BedTool(iter(fn)).saveas().fn
|
jpayne@68
|
620
|
jpayne@68
|
621 # Otherwise assume iterator, say an open file as from
|
jpayne@68
|
622 # subprocess.PIPE
|
jpayne@68
|
623 else:
|
jpayne@68
|
624 fn = fn
|
jpayne@68
|
625
|
jpayne@68
|
626 self.fn = fn
|
jpayne@68
|
627 tag = "".join([random.choice(string.ascii_lowercase) for _ in range(8)])
|
jpayne@68
|
628 self._tag = tag
|
jpayne@68
|
629 _tags[tag] = self
|
jpayne@68
|
630 self._hascounts = False
|
jpayne@68
|
631 self._file_type = None
|
jpayne@68
|
632 self.seqfn = None
|
jpayne@68
|
633 self.fastq = None
|
jpayne@68
|
634 self.igv_script = None
|
jpayne@68
|
635 self.links_html = None
|
jpayne@68
|
636 self.history = History()
|
jpayne@68
|
637
|
jpayne@68
|
638 @classmethod
|
jpayne@68
|
639 def from_dataframe(
|
jpayne@68
|
640 cls,
|
jpayne@68
|
641 df: pd.DataFrame,
|
jpayne@68
|
642 outfile: Optional[str] =None,
|
jpayne@68
|
643 sep: str ="\t",
|
jpayne@68
|
644 header: bool =False,
|
jpayne@68
|
645 na_rep:str =".",
|
jpayne@68
|
646 index:bool =False,
|
jpayne@68
|
647 **kwargs
|
jpayne@68
|
648 ) -> BedTool:
|
jpayne@68
|
649 """
|
jpayne@68
|
650 Creates a BedTool from a pandas.DataFrame.
|
jpayne@68
|
651
|
jpayne@68
|
652 If `outfile` is None, a temporary file will be used. Otherwise it can
|
jpayne@68
|
653 be a specific filename or an open file handle. Additional kwargs will
|
jpayne@68
|
654 be passed to `pandas.DataFrame.to_csv`.
|
jpayne@68
|
655
|
jpayne@68
|
656 The fields of the resulting BedTool will match the order of columns in
|
jpayne@68
|
657 the dataframe.
|
jpayne@68
|
658 """
|
jpayne@68
|
659 try:
|
jpayne@68
|
660 import pandas
|
jpayne@68
|
661 except ImportError:
|
jpayne@68
|
662 raise ImportError("pandas must be installed to use dataframes")
|
jpayne@68
|
663 if outfile is None:
|
jpayne@68
|
664 outfile = cls._tmp()
|
jpayne@68
|
665 default_kwargs = dict(sep=sep, header=header, na_rep=na_rep, index=index)
|
jpayne@68
|
666 default_kwargs.update(kwargs)
|
jpayne@68
|
667 df.to_csv(outfile, **default_kwargs)
|
jpayne@68
|
668
|
jpayne@68
|
669 if isinstance(outfile, str):
|
jpayne@68
|
670 fn = outfile
|
jpayne@68
|
671 else:
|
jpayne@68
|
672 try:
|
jpayne@68
|
673 fn = outfile.name
|
jpayne@68
|
674 except AttributeError:
|
jpayne@68
|
675 raise ValueError(
|
jpayne@68
|
676 "`outfile` is not a string and doesn't have a `name` attribute. "
|
jpayne@68
|
677 "Unable to determine filename."
|
jpayne@68
|
678 )
|
jpayne@68
|
679 return BedTool(fn)
|
jpayne@68
|
680
|
jpayne@68
|
681 def split(self, func: Callable, *args, **kwargs) -> BedTool:
|
jpayne@68
|
682 """
|
jpayne@68
|
683 Split each feature using a user-defined function.
|
jpayne@68
|
684
|
jpayne@68
|
685 Calls the provided function `func` with each interval. In contrast to
|
jpayne@68
|
686 `each` (which does something similar), this method expects `func` to
|
jpayne@68
|
687 return an *iterable* of Interval objects.
|
jpayne@68
|
688
|
jpayne@68
|
689 args and kwargs are passed directly to `func`.
|
jpayne@68
|
690
|
jpayne@68
|
691 Returns a new BedTool.
|
jpayne@68
|
692 """
|
jpayne@68
|
693
|
jpayne@68
|
694 def generator():
|
jpayne@68
|
695 for orig_interval in self:
|
jpayne@68
|
696 for interval in func(orig_interval, *args, **kwargs):
|
jpayne@68
|
697 yield interval
|
jpayne@68
|
698
|
jpayne@68
|
699 return BedTool(generator())
|
jpayne@68
|
700
|
jpayne@68
|
701 def truncate_to_chrom(self, genome: str | dict) -> BedTool:
|
jpayne@68
|
702 """
|
jpayne@68
|
703 Ensure all features fall within chromosome limits.
|
jpayne@68
|
704
|
jpayne@68
|
705 Some peak-callers extend peaks such that the boundaries overstep
|
jpayne@68
|
706 chromosome coordinates. Upon uploading such a file to a genome browser
|
jpayne@68
|
707 like UCSC, this results in an error like::
|
jpayne@68
|
708
|
jpayne@68
|
709 Error line 101 of custom track: chromEnd larger than chrom chr2
|
jpayne@68
|
710 size
|
jpayne@68
|
711
|
jpayne@68
|
712 Use this method to clean your file, truncating any out-of-bounds
|
jpayne@68
|
713 features to fit within the chromosome coordinates of `genome`.
|
jpayne@68
|
714
|
jpayne@68
|
715 `genome` can be either an assembly name ('dm3') or a dictionary where
|
jpayne@68
|
716 keys are chrom and values are (start, stop) tuples.
|
jpayne@68
|
717 """
|
jpayne@68
|
718 if isinstance(genome, dict):
|
jpayne@68
|
719 chromdict = genome
|
jpayne@68
|
720 else:
|
jpayne@68
|
721 assert isinstance(genome, str)
|
jpayne@68
|
722 chromdict = helpers.chromsizes(genome)
|
jpayne@68
|
723
|
jpayne@68
|
724 tmp = self._tmp()
|
jpayne@68
|
725 with open(tmp, "w") as fout:
|
jpayne@68
|
726 for chrom, coords in list(chromdict.items()):
|
jpayne@68
|
727 start, stop = coords
|
jpayne@68
|
728 start = str(start)
|
jpayne@68
|
729 stop = str(stop)
|
jpayne@68
|
730 fout.write("\t".join([chrom, start, stop]) + "\n")
|
jpayne@68
|
731 return self.intersect(tmp)
|
jpayne@68
|
732
|
jpayne@68
|
733 def tabix_intervals(self, interval_or_string: Interval | str, check_coordinates: bool=False) -> BedTool:
|
jpayne@68
|
734 """
|
jpayne@68
|
735 Retrieve all intervals within coordinates from a "tabixed" BedTool.
|
jpayne@68
|
736
|
jpayne@68
|
737 Given either a string in "chrom:start-stop" format, or an interval-like
|
jpayne@68
|
738 object with chrom, start, stop attributes, return a *streaming* BedTool
|
jpayne@68
|
739 of the features in this BedTool that overlap the provided interval.
|
jpayne@68
|
740
|
jpayne@68
|
741 If the coordinates are invalid, an empty generator is returned unless
|
jpayne@68
|
742 `check_coordinates=True` in which case a ValueError will be raised.
|
jpayne@68
|
743 """
|
jpayne@68
|
744 if not self._tabixed():
|
jpayne@68
|
745 raise ValueError(
|
jpayne@68
|
746 "This BedTool has not been indexed for tabix "
|
jpayne@68
|
747 "-- please use the .tabix() method"
|
jpayne@68
|
748 )
|
jpayne@68
|
749
|
jpayne@68
|
750 # tabix expects 1-based coords, but BEDTools works with
|
jpayne@68
|
751 # zero-based. pybedtools and pysam also work with zero-based. So we can
|
jpayne@68
|
752 # pass zero-based directly to the pysam tabix interface.
|
jpayne@68
|
753 tbx = pysam.TabixFile(self.fn)
|
jpayne@68
|
754
|
jpayne@68
|
755 # If an interval is passed, use its coordinates directly
|
jpayne@68
|
756 if isinstance(interval_or_string, Interval):
|
jpayne@68
|
757 interval: Interval = interval_or_string
|
jpayne@68
|
758 chrom, start, end = interval.chrom, interval.start, interval.stop
|
jpayne@68
|
759 # Parse string directly instead of relying on Interval, in order to
|
jpayne@68
|
760 # permit full chromosome fetching
|
jpayne@68
|
761 else:
|
jpayne@68
|
762 match = helpers.coord_re.search(interval_or_string)
|
jpayne@68
|
763 # Assume string is contig if it doesn't fit chrom:start-end format
|
jpayne@68
|
764 if match is None:
|
jpayne@68
|
765 chrom = interval_or_string
|
jpayne@68
|
766 start, end = None, None
|
jpayne@68
|
767 # Otherwise parse the coordinates
|
jpayne@68
|
768 else:
|
jpayne@68
|
769 chrom, start, end = match.group(1, 2, 3)
|
jpayne@68
|
770 start, end = int(start), int(end)
|
jpayne@68
|
771
|
jpayne@68
|
772 # Fetch results.
|
jpayne@68
|
773 try:
|
jpayne@68
|
774 results = tbx.fetch(str(chrom), start, end)
|
jpayne@68
|
775 except ValueError:
|
jpayne@68
|
776 if check_coordinates:
|
jpayne@68
|
777 raise
|
jpayne@68
|
778 else:
|
jpayne@68
|
779 results = []
|
jpayne@68
|
780
|
jpayne@68
|
781 # pysam.ctabix.TabixIterator does not include newlines when yielding so
|
jpayne@68
|
782 # we need to add them.
|
jpayne@68
|
783 def gen():
|
jpayne@68
|
784 for i in results:
|
jpayne@68
|
785 yield i + "\n"
|
jpayne@68
|
786
|
jpayne@68
|
787 # xref #190
|
jpayne@68
|
788 x = BedTool(gen()).saveas()
|
jpayne@68
|
789 tbx.close()
|
jpayne@68
|
790 return x
|
jpayne@68
|
791
|
jpayne@68
|
792 def tabix_contigs(self):
|
jpayne@68
|
793 """
|
jpayne@68
|
794 Returns a list of contigs from the tabix index.
|
jpayne@68
|
795 """
|
jpayne@68
|
796 if not self._tabixed():
|
jpayne@68
|
797 raise ValueError(
|
jpayne@68
|
798 "This BedTool has not been indexed for tabix "
|
jpayne@68
|
799 "-- please use the .tabix() method"
|
jpayne@68
|
800 )
|
jpayne@68
|
801
|
jpayne@68
|
802 tbx = pysam.TabixFile(self.fn)
|
jpayne@68
|
803 return tbx.contigs
|
jpayne@68
|
804
|
jpayne@68
|
805 def tabix(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> BedTool:
|
jpayne@68
|
806 """
|
jpayne@68
|
807 Prepare a BedTool for use with Tabix.
|
jpayne@68
|
808
|
jpayne@68
|
809 Returns a new BedTool that has been BGZIP compressed
|
jpayne@68
|
810 and indexed by tabix.
|
jpayne@68
|
811
|
jpayne@68
|
812 Parameters
|
jpayne@68
|
813 ----------
|
jpayne@68
|
814
|
jpayne@68
|
815 in_place : bool
|
jpayne@68
|
816 If True (default), then assume the file is already sorted and
|
jpayne@68
|
817 replace the existing file with the BGZIPed version.
|
jpayne@68
|
818
|
jpayne@68
|
819 force : bool
|
jpayne@68
|
820 If True (default is False), then overwrite both the index and the
|
jpayne@68
|
821 BGZIP file.
|
jpayne@68
|
822
|
jpayne@68
|
823 is_sorted : bool
|
jpayne@68
|
824 If True (default is False), then assume the file is already sorted
|
jpayne@68
|
825 so that BedTool.bgzip() doesn't have to do that work.
|
jpayne@68
|
826 """
|
jpayne@68
|
827 # Return quickly if nothing to do
|
jpayne@68
|
828 if self._tabixed() and not force:
|
jpayne@68
|
829 return self
|
jpayne@68
|
830
|
jpayne@68
|
831 # Make sure it's BGZIPed
|
jpayne@68
|
832 fn = self.bgzip(in_place=in_place, force=force, is_sorted=is_sorted)
|
jpayne@68
|
833 if self.file_type is not None and self.file_type not in ["bam", "empty"]:
|
jpayne@68
|
834 pysam.tabix_index(fn, force=force, preset=self.file_type) # type: ignore
|
jpayne@68
|
835 return BedTool(fn)
|
jpayne@68
|
836
|
jpayne@68
|
837 def _tabixed(self):
|
jpayne@68
|
838 """
|
jpayne@68
|
839 Verifies that we're working with a tabixed file: a string filename
|
jpayne@68
|
840 pointing to a BGZIPed file with a .tbi file in the same dir.
|
jpayne@68
|
841 """
|
jpayne@68
|
842 if (
|
jpayne@68
|
843 isinstance(self.fn, str)
|
jpayne@68
|
844 and isBGZIP(self.fn)
|
jpayne@68
|
845 and os.path.exists(self.fn + ".tbi")
|
jpayne@68
|
846 ):
|
jpayne@68
|
847 return True
|
jpayne@68
|
848
|
jpayne@68
|
849 def bgzip(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> str:
|
jpayne@68
|
850 """
|
jpayne@68
|
851 Helper function for more control over "tabixed" BedTools.
|
jpayne@68
|
852
|
jpayne@68
|
853 Checks to see if we already have a BGZIP file; if not then prepare one.
|
jpayne@68
|
854 Always leaves the original file alone. You can always just make a
|
jpayne@68
|
855 BedTool out of an already sorted and BGZIPed file to avoid this step.
|
jpayne@68
|
856
|
jpayne@68
|
857 `in_place` will put the BGZIPed file in the same dir (possibly after
|
jpayne@68
|
858 sorting to tempfile).
|
jpayne@68
|
859
|
jpayne@68
|
860 If `is_sorted`, then assume the file is already sorted. Otherwise call
|
jpayne@68
|
861 bedtools sort with the `-header` option.
|
jpayne@68
|
862
|
jpayne@68
|
863 `force` will overwrite without asking.
|
jpayne@68
|
864 """
|
jpayne@68
|
865 # It may already be BGZIPed...
|
jpayne@68
|
866 if isinstance(self.fn, str) and not force:
|
jpayne@68
|
867 if isBGZIP(self.fn):
|
jpayne@68
|
868 return self.fn
|
jpayne@68
|
869
|
jpayne@68
|
870 # If not in_place, then make a tempfile for the BGZIPed version
|
jpayne@68
|
871 if not in_place:
|
jpayne@68
|
872 # Get tempfile name, sorted or not
|
jpayne@68
|
873 if not is_sorted:
|
jpayne@68
|
874 fn = self.sort(header=True).fn
|
jpayne@68
|
875 else:
|
jpayne@68
|
876 fn = self._tmp()
|
jpayne@68
|
877
|
jpayne@68
|
878 # Register for later deletion
|
jpayne@68
|
879 outfn = fn + ".gz"
|
jpayne@68
|
880 BedTool.TEMPFILES.append(outfn)
|
jpayne@68
|
881
|
jpayne@68
|
882 # Creates tempfile.gz
|
jpayne@68
|
883 pysam.tabix_compress(fn, outfn, force=force)
|
jpayne@68
|
884 return outfn
|
jpayne@68
|
885
|
jpayne@68
|
886 # Otherwise, make sure the BGZIPed version has a similar name to the
|
jpayne@68
|
887 # current BedTool's file
|
jpayne@68
|
888 if in_place:
|
jpayne@68
|
889 if not is_sorted:
|
jpayne@68
|
890 fn = self.sort(header=True).saveas().fn
|
jpayne@68
|
891 else:
|
jpayne@68
|
892 fn = self.fn
|
jpayne@68
|
893 outfn = self.fn + ".gz"
|
jpayne@68
|
894 pysam.tabix_compress(fn, outfn, force=force)
|
jpayne@68
|
895 return outfn
|
jpayne@68
|
896
|
jpayne@68
|
897 def delete_temporary_history(self, ask: bool = True, raw_input_func=None):
|
jpayne@68
|
898 """
|
jpayne@68
|
899 Use at your own risk! This method will delete temp files. You will be
|
jpayne@68
|
900 prompted for deletion of files unless you specify *ask=False*.
|
jpayne@68
|
901
|
jpayne@68
|
902 Deletes all temporary files created during the history of this BedTool
|
jpayne@68
|
903 up to but not including the file this current BedTool points to.
|
jpayne@68
|
904
|
jpayne@68
|
905 Any filenames that are in the history and have the following pattern
|
jpayne@68
|
906 will be deleted::
|
jpayne@68
|
907
|
jpayne@68
|
908 <TEMP_DIR>/pybedtools.*.tmp
|
jpayne@68
|
909
|
jpayne@68
|
910 (where <TEMP_DIR> is the result from get_tempdir() and is by default
|
jpayne@68
|
911 "/tmp")
|
jpayne@68
|
912
|
jpayne@68
|
913 Any files that don't have this format will be left alone.
|
jpayne@68
|
914
|
jpayne@68
|
915 (*raw_input_func* is used for testing)
|
jpayne@68
|
916 """
|
jpayne@68
|
917 flattened_history = _flatten_list(self.history)
|
jpayne@68
|
918 to_delete = []
|
jpayne@68
|
919 tempdir = get_tempdir()
|
jpayne@68
|
920 for i in flattened_history:
|
jpayne@68
|
921 fn = i.fn
|
jpayne@68
|
922 if fn.startswith(os.path.join(os.path.abspath(tempdir), "pybedtools")):
|
jpayne@68
|
923 if fn.endswith(".tmp"):
|
jpayne@68
|
924 to_delete.append(fn)
|
jpayne@68
|
925
|
jpayne@68
|
926 if raw_input_func is None:
|
jpayne@68
|
927 raw_input_func = input
|
jpayne@68
|
928
|
jpayne@68
|
929 str_fns = "\n\t".join(to_delete)
|
jpayne@68
|
930 if ask:
|
jpayne@68
|
931 answer = raw_input_func("Delete these files?\n\t%s\n(y/N) " % str_fns)
|
jpayne@68
|
932
|
jpayne@68
|
933 if not answer.lower()[0] == "y":
|
jpayne@68
|
934 print("OK, not deleting.")
|
jpayne@68
|
935 return
|
jpayne@68
|
936 for fn in to_delete:
|
jpayne@68
|
937 os.unlink(fn)
|
jpayne@68
|
938 return
|
jpayne@68
|
939
|
jpayne@68
|
940 def filter(self, func: Callable, *args, **kwargs) -> BedTool:
|
jpayne@68
|
941 """
|
jpayne@68
|
942 Filter features by user-defined function.
|
jpayne@68
|
943
|
jpayne@68
|
944 Takes a function *func* that is called for each feature in the
|
jpayne@68
|
945 `BedTool` object and returns only those for which the function returns
|
jpayne@68
|
946 True.
|
jpayne@68
|
947
|
jpayne@68
|
948 *args and **kwargs are passed directly to *func*.
|
jpayne@68
|
949
|
jpayne@68
|
950 Returns a streaming BedTool; if you want the filename then use the
|
jpayne@68
|
951 .saveas() method.
|
jpayne@68
|
952
|
jpayne@68
|
953 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
954 >>> subset = a.filter(lambda b: b.chrom == 'chr1' and b.start < 150)
|
jpayne@68
|
955 >>> len(a), len(subset)
|
jpayne@68
|
956 (4, 2)
|
jpayne@68
|
957
|
jpayne@68
|
958 so it has extracted 2 records from the original 4.
|
jpayne@68
|
959
|
jpayne@68
|
960 """
|
jpayne@68
|
961 return BedTool((f for f in self if func(f, *args, **kwargs)))
|
jpayne@68
|
962
|
jpayne@68
|
963 def field_count(self, n:int=10) -> int:
|
jpayne@68
|
964 """
|
jpayne@68
|
965 Number of fields in each line of this BedTool (checks `n` lines)
|
jpayne@68
|
966
|
jpayne@68
|
967 Return the number of fields in the features this file contains. Checks
|
jpayne@68
|
968 the first *n* features.
|
jpayne@68
|
969
|
jpayne@68
|
970 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
971 >>> a.field_count()
|
jpayne@68
|
972 6
|
jpayne@68
|
973 """
|
jpayne@68
|
974 if self.file_type == "empty":
|
jpayne@68
|
975 return 0
|
jpayne@68
|
976 i = 0
|
jpayne@68
|
977 fields = set([])
|
jpayne@68
|
978 for feat in self:
|
jpayne@68
|
979 if i > n:
|
jpayne@68
|
980 break
|
jpayne@68
|
981 i += 1
|
jpayne@68
|
982 # TODO: make this more efficient.
|
jpayne@68
|
983 fields.update([len(feat.fields)])
|
jpayne@68
|
984 assert len(fields) == 1, fields
|
jpayne@68
|
985 return list(fields)[0]
|
jpayne@68
|
986
|
jpayne@68
|
987 def each(self, func: Callable, *args, **kwargs) -> BedTool:
|
jpayne@68
|
988 """
|
jpayne@68
|
989 Modify each feature with a user-defined function.
|
jpayne@68
|
990
|
jpayne@68
|
991 Applies user-defined function *func* to each feature. *func* must
|
jpayne@68
|
992 accept an Interval as its first argument; *args and **kwargs will be
|
jpayne@68
|
993 passed to *func*.
|
jpayne@68
|
994
|
jpayne@68
|
995 *func* must return an Interval object OR a value that evaluates to
|
jpayne@68
|
996 False, in which case the original feature will be removed from the
|
jpayne@68
|
997 output. This way, an additional "filter" call is not necessary.
|
jpayne@68
|
998
|
jpayne@68
|
999 >>> def truncate_feature(feature, limit=0):
|
jpayne@68
|
1000 ... feature.score = str(len(feature))
|
jpayne@68
|
1001 ... if len(feature) > limit:
|
jpayne@68
|
1002 ... feature.stop = feature.start + limit
|
jpayne@68
|
1003 ... feature.name = feature.name + '.short'
|
jpayne@68
|
1004 ... return feature
|
jpayne@68
|
1005
|
jpayne@68
|
1006 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
1007 >>> b = a.each(truncate_feature, limit=100)
|
jpayne@68
|
1008 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
1009 chr1 1 100 feature1 99 +
|
jpayne@68
|
1010 chr1 100 200 feature2 100 +
|
jpayne@68
|
1011 chr1 150 250 feature3.short 350 -
|
jpayne@68
|
1012 chr1 900 950 feature4 50 +
|
jpayne@68
|
1013 <BLANKLINE>
|
jpayne@68
|
1014
|
jpayne@68
|
1015 """
|
jpayne@68
|
1016
|
jpayne@68
|
1017 def _generator():
|
jpayne@68
|
1018 for f in self:
|
jpayne@68
|
1019 result = func(f, *args, **kwargs)
|
jpayne@68
|
1020 if result:
|
jpayne@68
|
1021 yield result
|
jpayne@68
|
1022
|
jpayne@68
|
1023 return BedTool(_generator())
|
jpayne@68
|
1024
|
jpayne@68
|
1025 def introns(self, gene: str = "gene", exon: str = "exon") -> BedTool:
|
jpayne@68
|
1026 """
|
jpayne@68
|
1027 Create intron features (requires specific input format).
|
jpayne@68
|
1028
|
jpayne@68
|
1029 NOTE: this method assumes a simple file with non-overlapping exons. For
|
jpayne@68
|
1030 more sophisticated features, consider the gffutils package instead.
|
jpayne@68
|
1031
|
jpayne@68
|
1032 Given a BED12 or a GFF with exons, create a new `BedTool` with just
|
jpayne@68
|
1033 introns. The output is a bed6 file with the score column (5) being one
|
jpayne@68
|
1034 of 'intron'/'utr5'/'utr3'
|
jpayne@68
|
1035 """
|
jpayne@68
|
1036 # iterate over all the features in the gene.
|
jpayne@68
|
1037 s = self.sort()
|
jpayne@68
|
1038 if self.file_type == "gff":
|
jpayne@68
|
1039 exon_iter = BedTool((f for f in s if f[2] == exon)).saveas()
|
jpayne@68
|
1040 gene_iter = BedTool((f for f in s if f[2] == gene)).saveas()
|
jpayne@68
|
1041
|
jpayne@68
|
1042 elif self.file_type == "bed":
|
jpayne@68
|
1043 if s.field_count() == 12:
|
jpayne@68
|
1044 exon_iter = s.bed6().saveas()
|
jpayne@68
|
1045 gene_iter = s.saveas()
|
jpayne@68
|
1046 else:
|
jpayne@68
|
1047 # TODO: bed6. groupby on name and find smallest start,
|
jpayne@68
|
1048 # largest stop.
|
jpayne@68
|
1049 exon_iter = s
|
jpayne@68
|
1050 gene_iter = None
|
jpayne@68
|
1051 raise NotImplementedError(
|
jpayne@68
|
1052 ".introns() only supported for bed12" "and GFF"
|
jpayne@68
|
1053 )
|
jpayne@68
|
1054
|
jpayne@68
|
1055 else:
|
jpayne@68
|
1056 raise NotImplementedError(".introns() only supported for BED and GFF")
|
jpayne@68
|
1057
|
jpayne@68
|
1058 with open(BedTool._tmp(), "w") as fh:
|
jpayne@68
|
1059 # group on the name.
|
jpayne@68
|
1060 exon_intervals = IntervalFile(exon_iter.fn)
|
jpayne@68
|
1061 for g in gene_iter:
|
jpayne@68
|
1062 # search finds all, but we just want the ones that completely
|
jpayne@68
|
1063 # overlap this gene.
|
jpayne@68
|
1064 exons = [
|
jpayne@68
|
1065 e
|
jpayne@68
|
1066 for e in exon_intervals.search(g, same_strand=True)
|
jpayne@68
|
1067 if e.start >= g.start and e.end <= g.end
|
jpayne@68
|
1068 ]
|
jpayne@68
|
1069
|
jpayne@68
|
1070 for i, exon_instance in enumerate(exons):
|
jpayne@68
|
1071 exon_instance: pybedtools.Interval
|
jpayne@68
|
1072 # 5' utr between gene start and first intron
|
jpayne@68
|
1073 if i == 0 and exon_instance.start > g.start:
|
jpayne@68
|
1074 utr = {"+": "utr5", "-": "utr3"}[g.strand]
|
jpayne@68
|
1075 print(
|
jpayne@68
|
1076 "%s\t%i\t%i\t%s\t%s\t%s"
|
jpayne@68
|
1077 % (g.chrom, g.start, exon_instance.start, g.name, utr, g.strand),
|
jpayne@68
|
1078 file=fh,
|
jpayne@68
|
1079 )
|
jpayne@68
|
1080 elif i == len(exons) - 1 and exon_instance.end < g.end:
|
jpayne@68
|
1081 utr = {"+": "utr3", "-": "utr5"}[g.strand]
|
jpayne@68
|
1082 print(
|
jpayne@68
|
1083 "%s\t%i\t%i\t%s\t%s\t%s"
|
jpayne@68
|
1084 % (g.chrom, exon_instance.end, g.end, g.name, utr, g.strand),
|
jpayne@68
|
1085 file=fh,
|
jpayne@68
|
1086 )
|
jpayne@68
|
1087 elif i != len(exons) - 1:
|
jpayne@68
|
1088 istart = exon_instance.end
|
jpayne@68
|
1089 iend = exons[i + 1].start
|
jpayne@68
|
1090 print(
|
jpayne@68
|
1091 "%s\t%i\t%i\t%s\tintron\t%s"
|
jpayne@68
|
1092 % (g.chrom, istart, iend, g.name, g.strand),
|
jpayne@68
|
1093 file=fh,
|
jpayne@68
|
1094 )
|
jpayne@68
|
1095 return BedTool(fh.name)
|
jpayne@68
|
1096
|
jpayne@68
|
1097 def features(self):
|
jpayne@68
|
1098 """
|
jpayne@68
|
1099 Returns an iterable of features
|
jpayne@68
|
1100 """
|
jpayne@68
|
1101 if hasattr(self, "next") or hasattr(self, "__next__"):
|
jpayne@68
|
1102 return self
|
jpayne@68
|
1103 return iter(self)
|
jpayne@68
|
1104
|
jpayne@68
|
1105 FileType = Literal['bed', 'vcf', 'gff', 'bam', 'sam', 'empty']
|
jpayne@68
|
1106
|
jpayne@68
|
1107 @property
|
jpayne@68
|
1108 def file_type(self) -> Optional[FileType]:
|
jpayne@68
|
1109 """
|
jpayne@68
|
1110 Return the type of the current file. One of ('bed','vcf','gff', 'bam',
|
jpayne@68
|
1111 'sam', 'empty').
|
jpayne@68
|
1112
|
jpayne@68
|
1113 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
1114 >>> print(a.file_type)
|
jpayne@68
|
1115 bed
|
jpayne@68
|
1116 """
|
jpayne@68
|
1117 if not isinstance(self.fn, str):
|
jpayne@68
|
1118 raise ValueError(
|
jpayne@68
|
1119 "Checking file_type not supported for "
|
jpayne@68
|
1120 "non-file BedTools. Use .saveas() to "
|
jpayne@68
|
1121 "save as a temp file first."
|
jpayne@68
|
1122 )
|
jpayne@68
|
1123 if self._isbam:
|
jpayne@68
|
1124 self._file_type = "bam"
|
jpayne@68
|
1125 else:
|
jpayne@68
|
1126 try:
|
jpayne@68
|
1127 self._file_type = next(iter(self)).file_type
|
jpayne@68
|
1128 except StopIteration:
|
jpayne@68
|
1129 self._file_type = "empty"
|
jpayne@68
|
1130
|
jpayne@68
|
1131 return self._file_type
|
jpayne@68
|
1132
|
jpayne@68
|
1133 def cut(self, indexes: list[int], stream: bool = False) -> BedTool:
|
jpayne@68
|
1134 """
|
jpayne@68
|
1135 Analogous to unix `cut`.
|
jpayne@68
|
1136
|
jpayne@68
|
1137 Similar to unix `cut` except indexes are 0-based, must be a list
|
jpayne@68
|
1138 and the columns are returned in the order requested.
|
jpayne@68
|
1139
|
jpayne@68
|
1140 This method returns a BedTool of results, which means that the indexes
|
jpayne@68
|
1141 returned must be valid GFF/GTF/BED/SAM features.
|
jpayne@68
|
1142
|
jpayne@68
|
1143 If you would like arbitrary columns -- say, just chrom and featuretype
|
jpayne@68
|
1144 of a GFF, which would not comprise a valid feature -- then instead of
|
jpayne@68
|
1145 this method, simply use indexes on each feature, e.g,
|
jpayne@68
|
1146
|
jpayne@68
|
1147 >>> gff = pybedtools.example_bedtool('d.gff')
|
jpayne@68
|
1148 >>> results = [(f[0], f[2]) for f in gff]
|
jpayne@68
|
1149
|
jpayne@68
|
1150 In addition, `indexes` can contain keys of the GFF/GTF attributes, in
|
jpayne@68
|
1151 which case the values are returned. e.g. 'gene_name' will return the
|
jpayne@68
|
1152 corresponding name from a GTF, or 'start' will return the start
|
jpayne@68
|
1153 attribute of a BED Interval.
|
jpayne@68
|
1154 """
|
jpayne@68
|
1155 if stream:
|
jpayne@68
|
1156 return BedTool(([f[attr] for attr in indexes] for f in self))
|
jpayne@68
|
1157 else:
|
jpayne@68
|
1158 with open(self._tmp(), "w") as fh:
|
jpayne@68
|
1159 for f in self:
|
jpayne@68
|
1160 print("\t".join(map(str, [f[attr] for attr in indexes])), file=fh)
|
jpayne@68
|
1161 return BedTool(fh.name)
|
jpayne@68
|
1162
|
jpayne@68
|
1163 @classmethod
|
jpayne@68
|
1164 def _tmp(cls) -> str:
|
jpayne@68
|
1165 """
|
jpayne@68
|
1166 Makes a tempfile and registers it in the BedTool.TEMPFILES class
|
jpayne@68
|
1167 variable. Adds a "pybedtools." prefix and ".tmp" extension for easy
|
jpayne@68
|
1168 deletion if you forget to call pybedtools.cleanup().
|
jpayne@68
|
1169 """
|
jpayne@68
|
1170 tmpfn = tempfile.NamedTemporaryFile(
|
jpayne@68
|
1171 prefix=settings.tempfile_prefix,
|
jpayne@68
|
1172 suffix=settings.tempfile_suffix,
|
jpayne@68
|
1173 delete=False,
|
jpayne@68
|
1174 )
|
jpayne@68
|
1175 tmpfn = tmpfn.name
|
jpayne@68
|
1176 cls.TEMPFILES.append(tmpfn)
|
jpayne@68
|
1177 return tmpfn
|
jpayne@68
|
1178
|
jpayne@68
|
1179 def __iter__(self):
|
jpayne@68
|
1180 """
|
jpayne@68
|
1181 Dispatches the right iterator depending on how this BedTool was
|
jpayne@68
|
1182 created
|
jpayne@68
|
1183 """
|
jpayne@68
|
1184 if self._isbam:
|
jpayne@68
|
1185 # Note: BAM class takes filename or stream, so self.fn is OK
|
jpayne@68
|
1186 # here
|
jpayne@68
|
1187 return BAM(self.fn)
|
jpayne@68
|
1188
|
jpayne@68
|
1189 # Plain ol' filename
|
jpayne@68
|
1190 if isinstance(self.fn, str):
|
jpayne@68
|
1191 if not os.path.exists(self.fn):
|
jpayne@68
|
1192 raise BedToolsFileError("{0} does not exist".format(self.fn))
|
jpayne@68
|
1193 if isGZIP(self.fn):
|
jpayne@68
|
1194 return IntervalIterator(gzip.open(self.fn, "rt"))
|
jpayne@68
|
1195 else:
|
jpayne@68
|
1196 return IntervalIterator(open(self.fn, "r"))
|
jpayne@68
|
1197 # Any other kind of input (streaming string from stdout; iterable of
|
jpayne@68
|
1198 # Intervals, iterable of (chrom, start, stop) tuples, etc are handled
|
jpayne@68
|
1199 # appropriately by IntervalIterator.
|
jpayne@68
|
1200 else:
|
jpayne@68
|
1201 return IntervalIterator(self.fn)
|
jpayne@68
|
1202
|
jpayne@68
|
1203 @property
|
jpayne@68
|
1204 def intervals(self):
|
jpayne@68
|
1205 if isinstance(self.fn, str):
|
jpayne@68
|
1206 return IntervalFile(self.fn)
|
jpayne@68
|
1207 else:
|
jpayne@68
|
1208 raise ValueError("Please convert to a file-based BedTool using saveas")
|
jpayne@68
|
1209
|
jpayne@68
|
1210 def __repr__(self):
|
jpayne@68
|
1211 if isinstance(self.fn, str):
|
jpayne@68
|
1212 if os.path.exists(self.fn) or self.remote:
|
jpayne@68
|
1213 return "<BedTool(%s)>" % self.fn
|
jpayne@68
|
1214 else:
|
jpayne@68
|
1215 return "<BedTool(MISSING FILE: %s)>" % self.fn
|
jpayne@68
|
1216 elif isinstance(self.fn, BedTool):
|
jpayne@68
|
1217 return repr(self.fn)
|
jpayne@68
|
1218 else:
|
jpayne@68
|
1219 return "<BedTool(%s)>" % repr(self.fn)
|
jpayne@68
|
1220
|
jpayne@68
|
1221 def __str__(self):
|
jpayne@68
|
1222 """
|
jpayne@68
|
1223 Returns the string representation of the whole `BedTool`
|
jpayne@68
|
1224 """
|
jpayne@68
|
1225 items = []
|
jpayne@68
|
1226 for i in iter(self):
|
jpayne@68
|
1227 i = str(i)
|
jpayne@68
|
1228 if isinstance(i, bytes):
|
jpayne@68
|
1229 i = i.decode("UTF-8")
|
jpayne@68
|
1230 items.append(i)
|
jpayne@68
|
1231 return "".join(items)
|
jpayne@68
|
1232
|
jpayne@68
|
1233 def __len__(self):
|
jpayne@68
|
1234 return self.count()
|
jpayne@68
|
1235
|
jpayne@68
|
1236 def __eq__(self, other: object) -> bool:
|
jpayne@68
|
1237 if isinstance(other, BedTool):
|
jpayne@68
|
1238 if not isinstance(self.fn, str) or not isinstance(
|
jpayne@68
|
1239 other.fn, str
|
jpayne@68
|
1240 ):
|
jpayne@68
|
1241 raise NotImplementedError(
|
jpayne@68
|
1242 "Testing equality only supported for"
|
jpayne@68
|
1243 " BedTools that point to files"
|
jpayne@68
|
1244 )
|
jpayne@68
|
1245 elif not isinstance(other, str):
|
jpayne@68
|
1246 raise NotImplementedError(
|
jpayne@68
|
1247 "Testing equality only supported for"
|
jpayne@68
|
1248 " BedTools that point to files or str of content"
|
jpayne@68
|
1249 )
|
jpayne@68
|
1250 return str(self) == str(other)
|
jpayne@68
|
1251
|
jpayne@68
|
1252 def __ne__(self, other:object):
|
jpayne@68
|
1253 return not self.__eq__(other)
|
jpayne@68
|
1254
|
jpayne@68
|
1255 def __getitem__(self, key: slice|int):
|
jpayne@68
|
1256 if isinstance(key, slice):
|
jpayne@68
|
1257 return islice(self, key.start, key.stop, key.step)
|
jpayne@68
|
1258 elif isinstance(key, int):
|
jpayne@68
|
1259 return list(islice(self, key, key + 1))[0]
|
jpayne@68
|
1260 else:
|
jpayne@68
|
1261 raise ValueError(
|
jpayne@68
|
1262 "Only slices or integers allowed for indexing " "into a BedTool"
|
jpayne@68
|
1263 )
|
jpayne@68
|
1264
|
jpayne@68
|
1265 def __add__(self, other: BedTool) -> BedTool:
|
jpayne@68
|
1266 try:
|
jpayne@68
|
1267 result = self.intersect(other, u=True)
|
jpayne@68
|
1268 except BEDToolsError as e:
|
jpayne@68
|
1269 # BEDTools versions <2.20 would raise BEDToolsError
|
jpayne@68
|
1270 if (self.file_type == "empty") or (other.file_type == "empty"):
|
jpayne@68
|
1271 result = pybedtools.BedTool("", from_string=True)
|
jpayne@68
|
1272 else:
|
jpayne@68
|
1273 raise e
|
jpayne@68
|
1274 return result
|
jpayne@68
|
1275
|
jpayne@68
|
1276 def __sub__(self, other: BedTool) -> BedTool:
|
jpayne@68
|
1277 result = None
|
jpayne@68
|
1278
|
jpayne@68
|
1279 try:
|
jpayne@68
|
1280 result = self.intersect(other, v=True)
|
jpayne@68
|
1281 except BEDToolsError:
|
jpayne@68
|
1282 # BEDTools versions <2.20 would raise BEDToolsError
|
jpayne@68
|
1283
|
jpayne@68
|
1284 if (self.file_type == "empty") and (other.file_type == "empty"):
|
jpayne@68
|
1285 result = pybedtools.BedTool("", from_string=True)
|
jpayne@68
|
1286 elif other.file_type == "empty":
|
jpayne@68
|
1287 result = self.saveas()
|
jpayne@68
|
1288 elif self.file_type == "empty":
|
jpayne@68
|
1289 result = pybedtools.BedTool("", from_string=True)
|
jpayne@68
|
1290 if result is None:
|
jpayne@68
|
1291 raise ValueError("Subtraction operation failed.")
|
jpayne@68
|
1292
|
jpayne@68
|
1293 return result
|
jpayne@68
|
1294
|
jpayne@68
|
1295 def head(self, n: int = 10, as_string: bool = False):
|
jpayne@68
|
1296 """
|
jpayne@68
|
1297 Prints the first *n* lines or returns them if as_string is True
|
jpayne@68
|
1298
|
jpayne@68
|
1299 Note that this only opens the underlying file (gzipped or not), so it
|
jpayne@68
|
1300 does not check to see if the file is a valid BED file.
|
jpayne@68
|
1301
|
jpayne@68
|
1302 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
1303 >>> a.head(2) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
1304 chr1 1 100 feature1 0 +
|
jpayne@68
|
1305 chr1 100 200 feature2 0 +
|
jpayne@68
|
1306 <BLANKLINE>
|
jpayne@68
|
1307
|
jpayne@68
|
1308 """
|
jpayne@68
|
1309 if not isinstance(self.fn, str):
|
jpayne@68
|
1310 raise NotImplementedError(
|
jpayne@68
|
1311 "head() not supported for non file-based BedTools"
|
jpayne@68
|
1312 )
|
jpayne@68
|
1313 if as_string:
|
jpayne@68
|
1314 return "".join(str(line) for line in self[:n])
|
jpayne@68
|
1315 if self._isbam:
|
jpayne@68
|
1316 raise NotImplementedError("head() not supported for BAM")
|
jpayne@68
|
1317 else:
|
jpayne@68
|
1318 if isGZIP(self.fn):
|
jpayne@68
|
1319 openfunc = gzip.open
|
jpayne@68
|
1320 openmode = "rt"
|
jpayne@68
|
1321 else:
|
jpayne@68
|
1322 openfunc = open
|
jpayne@68
|
1323 openmode = "r"
|
jpayne@68
|
1324 with openfunc(self.fn, openmode) as fin:
|
jpayne@68
|
1325 for i, line in enumerate(fin):
|
jpayne@68
|
1326 if i == (n):
|
jpayne@68
|
1327 break
|
jpayne@68
|
1328 print(line, end=" ")
|
jpayne@68
|
1329
|
jpayne@68
|
1330 def set_chromsizes(self, chromsizes: str | dict):
|
jpayne@68
|
1331 """
|
jpayne@68
|
1332 Prepare BedTool for operations that require chromosome coords.
|
jpayne@68
|
1333
|
jpayne@68
|
1334 Set the chromsizes for this genome. If *chromsizes* is a string, it
|
jpayne@68
|
1335 will be considered a genome assembly name. If that assembly name is
|
jpayne@68
|
1336 not available in pybedtools.genome_registry, then it will be searched
|
jpayne@68
|
1337 for on the UCSC Genome Browser.
|
jpayne@68
|
1338
|
jpayne@68
|
1339 Example usage:
|
jpayne@68
|
1340
|
jpayne@68
|
1341 >>> hg19 = pybedtools.chromsizes('hg19')
|
jpayne@68
|
1342 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
1343 >>> a = a.set_chromsizes(hg19)
|
jpayne@68
|
1344 >>> print(a.chromsizes['chr1'])
|
jpayne@68
|
1345 (0, 249250621)
|
jpayne@68
|
1346
|
jpayne@68
|
1347 """
|
jpayne@68
|
1348 if isinstance(chromsizes, str):
|
jpayne@68
|
1349 self.chromsizes = pybedtools.chromsizes(chromsizes)
|
jpayne@68
|
1350 elif isinstance(chromsizes, dict):
|
jpayne@68
|
1351 self.chromsizes = chromsizes
|
jpayne@68
|
1352 else:
|
jpayne@68
|
1353 raise ValueError(
|
jpayne@68
|
1354 "Need to specify chromsizes either as a string"
|
jpayne@68
|
1355 " (assembly name) or a dictionary"
|
jpayne@68
|
1356 )
|
jpayne@68
|
1357 return self
|
jpayne@68
|
1358
|
jpayne@68
|
1359 def _collapse(
|
jpayne@68
|
1360 self,
|
jpayne@68
|
1361 iterable: Iterable,
|
jpayne@68
|
1362 fn: Optional[str] = None,
|
jpayne@68
|
1363 trackline: Optional[str] = None,
|
jpayne@68
|
1364 in_compressed: bool = False,
|
jpayne@68
|
1365 out_compressed: bool = False,
|
jpayne@68
|
1366 ) -> str:
|
jpayne@68
|
1367 """
|
jpayne@68
|
1368 Collapses an iterable into file *fn* (or a new tempfile if *fn* is
|
jpayne@68
|
1369 None).
|
jpayne@68
|
1370
|
jpayne@68
|
1371 Returns the newly created filename.
|
jpayne@68
|
1372
|
jpayne@68
|
1373 Parameters
|
jpayne@68
|
1374 ----------
|
jpayne@68
|
1375
|
jpayne@68
|
1376 iterable : iter
|
jpayne@68
|
1377 Any iterable object whose items can be converted to an Interval.
|
jpayne@68
|
1378
|
jpayne@68
|
1379 fn : str
|
jpayne@68
|
1380 Output filename, if None then creates a temp file for output
|
jpayne@68
|
1381
|
jpayne@68
|
1382 trackline : str
|
jpayne@68
|
1383 If not None, string to be added to the top of the output. Newline
|
jpayne@68
|
1384 will be added.
|
jpayne@68
|
1385
|
jpayne@68
|
1386 in_compressed : bool
|
jpayne@68
|
1387 Indicates whether the input is compressed
|
jpayne@68
|
1388
|
jpayne@68
|
1389 out_compressed : bool
|
jpayne@68
|
1390 Indicates whether the output should be compressed
|
jpayne@68
|
1391 """
|
jpayne@68
|
1392 if fn is None:
|
jpayne@68
|
1393 fn = self._tmp()
|
jpayne@68
|
1394
|
jpayne@68
|
1395 in_open_func = gzip.open if in_compressed else open
|
jpayne@68
|
1396 out_open_func = gzip.open if out_compressed else open
|
jpayne@68
|
1397
|
jpayne@68
|
1398 # special case: if BAM-format BedTool is provided, no trackline should
|
jpayne@68
|
1399 # be supplied, and don't iterate -- copy the file wholesale
|
jpayne@68
|
1400 if isinstance(iterable, BedTool) and iterable._isbam:
|
jpayne@68
|
1401 if trackline:
|
jpayne@68
|
1402 raise ValueError(
|
jpayne@68
|
1403 "trackline provided, but input is a BAM "
|
jpayne@68
|
1404 "file, which takes no track line"
|
jpayne@68
|
1405 )
|
jpayne@68
|
1406 with open(fn, "wb") as out_:
|
jpayne@68
|
1407 out_.write(open(self.fn, "rb").read())
|
jpayne@68
|
1408 return fn
|
jpayne@68
|
1409
|
jpayne@68
|
1410 # If we're just working with filename-based BedTool objects, just copy
|
jpayne@68
|
1411 # the files directly
|
jpayne@68
|
1412 if isinstance(iterable, BedTool) and isinstance(iterable.fn, str):
|
jpayne@68
|
1413 with out_open_func(fn, "wt") as out_:
|
jpayne@68
|
1414 if sys.version_info > (3,0):
|
jpayne@68
|
1415 in_ = in_open_func(iterable.fn, "rt", errors="ignore")
|
jpayne@68
|
1416 else:
|
jpayne@68
|
1417 in_ = in_open_func(iterable.fn, "rt")
|
jpayne@68
|
1418 if trackline:
|
jpayne@68
|
1419 out_.write(trackline.strip() + "\n")
|
jpayne@68
|
1420 out_.writelines(in_)
|
jpayne@68
|
1421 in_.close()
|
jpayne@68
|
1422 else:
|
jpayne@68
|
1423 with out_open_func(fn, "wt") as out_:
|
jpayne@68
|
1424 for i in iterable:
|
jpayne@68
|
1425 if isinstance(i, (list, tuple)):
|
jpayne@68
|
1426 i = create_interval_from_list(list(i))
|
jpayne@68
|
1427 out_.write(str(i))
|
jpayne@68
|
1428 return fn
|
jpayne@68
|
1429
|
jpayne@68
|
1430 def handle_kwargs(self, prog:str, arg_order: Optional[list[str]] = None, **kwargs):
|
jpayne@68
|
1431 """
|
jpayne@68
|
1432 Handle most cases of BEDTool program calls, but leave the specifics
|
jpayne@68
|
1433 up to individual methods.
|
jpayne@68
|
1434
|
jpayne@68
|
1435 *prog* is a BEDTools program name, e.g., 'intersectBed'.
|
jpayne@68
|
1436
|
jpayne@68
|
1437 *arg_order* lists any arguments that are sensitive to order. Everything
|
jpayne@68
|
1438 else will be reverse-sorted.
|
jpayne@68
|
1439
|
jpayne@68
|
1440 *kwargs* are passed directly from the calling method (like
|
jpayne@68
|
1441 self.intersect).
|
jpayne@68
|
1442
|
jpayne@68
|
1443 This method figures out, given how this BedTool was constructed, what
|
jpayne@68
|
1444 to send to BEDTools programs -- for example, an open file to stdin with
|
jpayne@68
|
1445 the `-` argument, or a filename with the `-a` argument.
|
jpayne@68
|
1446 """
|
jpayne@68
|
1447 pybedtools.logger.debug(
|
jpayne@68
|
1448 "BedTool.handle_kwargs() got these kwargs:\n%s", pprint.pformat(kwargs)
|
jpayne@68
|
1449 )
|
jpayne@68
|
1450
|
jpayne@68
|
1451 # If you pass in a list, how should it be converted to a BedTools arg?
|
jpayne@68
|
1452 default_list_delimiter = " "
|
jpayne@68
|
1453 list_delimiters = {
|
jpayne@68
|
1454 "annotateBed": " ",
|
jpayne@68
|
1455 "getOverlap": ",",
|
jpayne@68
|
1456 "groupBy": ",",
|
jpayne@68
|
1457 "multiIntersectBed": " ",
|
jpayne@68
|
1458 "mergeBed": ",",
|
jpayne@68
|
1459 "intersectBed": " ",
|
jpayne@68
|
1460 "mapBed": ",",
|
jpayne@68
|
1461 }
|
jpayne@68
|
1462 stdin = None
|
jpayne@68
|
1463
|
jpayne@68
|
1464 # -----------------------------------------------------------------
|
jpayne@68
|
1465 # Decide how to send instream1 to BEDTools. If there's no implicit
|
jpayne@68
|
1466 # instream1 arg, then do nothing.
|
jpayne@68
|
1467 #
|
jpayne@68
|
1468 try:
|
jpayne@68
|
1469 # e.g., 'a' for intersectBed
|
jpayne@68
|
1470 if self._isbam:
|
jpayne@68
|
1471 inarg1 = _bam_registry[prog]
|
jpayne@68
|
1472 else:
|
jpayne@68
|
1473 inarg1 = _implicit_registry[prog]
|
jpayne@68
|
1474
|
jpayne@68
|
1475 # e.g., self.fn or 'a.bed' or an iterator...
|
jpayne@68
|
1476 instream1 = kwargs[inarg1]
|
jpayne@68
|
1477
|
jpayne@68
|
1478 # If it's a BedTool, then get underlying stream
|
jpayne@68
|
1479 if isinstance(instream1, BedTool):
|
jpayne@68
|
1480 instream1 = instream1.fn
|
jpayne@68
|
1481
|
jpayne@68
|
1482 # Filename? No pipe, just provide the file
|
jpayne@68
|
1483 if isinstance(instream1, str):
|
jpayne@68
|
1484 kwargs[inarg1] = instream1
|
jpayne@68
|
1485 stdin = None
|
jpayne@68
|
1486
|
jpayne@68
|
1487 # Open file? Pipe it
|
jpayne@68
|
1488 # elif isinstance(instream1, file):
|
jpayne@68
|
1489 # kwargs[inarg1] = 'stdin'
|
jpayne@68
|
1490 # stdin = instream1
|
jpayne@68
|
1491
|
jpayne@68
|
1492 # A generator or iterator: pipe it as a generator of lines
|
jpayne@68
|
1493 else:
|
jpayne@68
|
1494 kwargs[inarg1] = "stdin"
|
jpayne@68
|
1495 stdin = (str(i) for i in instream1)
|
jpayne@68
|
1496 except KeyError:
|
jpayne@68
|
1497 pass
|
jpayne@68
|
1498
|
jpayne@68
|
1499 # -----------------------------------------------------------------
|
jpayne@68
|
1500 # Decide how to send instream2 to BEDTools.
|
jpayne@68
|
1501 try:
|
jpayne@68
|
1502 # e.g., 'b' for intersectBed
|
jpayne@68
|
1503 inarg2 = _other_registry[prog]
|
jpayne@68
|
1504
|
jpayne@68
|
1505 # e.g., another BedTool
|
jpayne@68
|
1506 instream2 = kwargs[inarg2]
|
jpayne@68
|
1507
|
jpayne@68
|
1508 # Get stream if BedTool
|
jpayne@68
|
1509 if isinstance(instream2, BedTool):
|
jpayne@68
|
1510 instream2 = instream2.fn
|
jpayne@68
|
1511
|
jpayne@68
|
1512 # Filename
|
jpayne@68
|
1513 if isinstance(instream2, str):
|
jpayne@68
|
1514 kwargs[inarg2] = instream2
|
jpayne@68
|
1515
|
jpayne@68
|
1516 # If it's a list of strings, then we need to figure out if it's
|
jpayne@68
|
1517 # a list of filenames or a list of intervals (see issue #156)
|
jpayne@68
|
1518 #
|
jpayne@68
|
1519 # Several options:
|
jpayne@68
|
1520 #
|
jpayne@68
|
1521 # - assume intervals have tabs but filenames don't
|
jpayne@68
|
1522 # - assume that, upon being split on tabs, an interval is >=3 fields
|
jpayne@68
|
1523 # - try creating an interval out of the first thing, success means interval
|
jpayne@68
|
1524 #
|
jpayne@68
|
1525 # The last seems the most robust. It does allow filenames with
|
jpayne@68
|
1526 # tabs; deciding whether or not such filenames are a good idea is
|
jpayne@68
|
1527 # left to the user.
|
jpayne@68
|
1528 #
|
jpayne@68
|
1529 elif isinstance(instream2, (list, tuple)) and isinstance(
|
jpayne@68
|
1530 instream2[0], str
|
jpayne@68
|
1531 ):
|
jpayne@68
|
1532 try:
|
jpayne@68
|
1533 _ = create_interval_from_list(instream2[0].split("\t"))
|
jpayne@68
|
1534 kwargs[inarg2] = self._collapse(instream2)
|
jpayne@68
|
1535 except IndexError:
|
jpayne@68
|
1536 kwargs[inarg2] = instream2
|
jpayne@68
|
1537
|
jpayne@68
|
1538 # Otherwise we need to collapse it in order to send to BEDTools
|
jpayne@68
|
1539 # programs
|
jpayne@68
|
1540 else:
|
jpayne@68
|
1541 kwargs[inarg2] = self._collapse(instream2)
|
jpayne@68
|
1542
|
jpayne@68
|
1543 except KeyError:
|
jpayne@68
|
1544 pass
|
jpayne@68
|
1545
|
jpayne@68
|
1546 # If stream not specified, then a tempfile will be created
|
jpayne@68
|
1547 if kwargs.pop("stream", None):
|
jpayne@68
|
1548 tmp = None
|
jpayne@68
|
1549 else:
|
jpayne@68
|
1550 output = kwargs.pop("output", None)
|
jpayne@68
|
1551 if output:
|
jpayne@68
|
1552 tmp = output
|
jpayne@68
|
1553 else:
|
jpayne@68
|
1554 tmp = self._tmp()
|
jpayne@68
|
1555
|
jpayne@68
|
1556 additional_args = kwargs.pop("additional_args", None)
|
jpayne@68
|
1557
|
jpayne@68
|
1558 # Parse the kwargs into BEDTools-ready args
|
jpayne@68
|
1559 cmds = [prog]
|
jpayne@68
|
1560
|
jpayne@68
|
1561 # arg_order mechanism added to fix #345
|
jpayne@68
|
1562 if arg_order is None:
|
jpayne@68
|
1563 arg_order = []
|
jpayne@68
|
1564
|
jpayne@68
|
1565 for arg in arg_order:
|
jpayne@68
|
1566 if arg in kwargs:
|
jpayne@68
|
1567 val = kwargs.pop(arg)
|
jpayne@68
|
1568 cmds.append("-" + arg)
|
jpayne@68
|
1569 cmds.append(val)
|
jpayne@68
|
1570
|
jpayne@68
|
1571 # The reverse-sort is a temp fix for issue #81
|
jpayne@68
|
1572 for key, value in sorted(list(kwargs.items()), reverse=True):
|
jpayne@68
|
1573 if isinstance(value, bool):
|
jpayne@68
|
1574 if value:
|
jpayne@68
|
1575 cmds.append("-" + key)
|
jpayne@68
|
1576 else:
|
jpayne@68
|
1577 continue
|
jpayne@68
|
1578 elif isinstance(value, list) or isinstance(value, tuple):
|
jpayne@68
|
1579 value = list(map(str, value))
|
jpayne@68
|
1580 try:
|
jpayne@68
|
1581 delim = list_delimiters[prog]
|
jpayne@68
|
1582 except KeyError:
|
jpayne@68
|
1583 delim = default_list_delimiter
|
jpayne@68
|
1584
|
jpayne@68
|
1585 if delim == " ":
|
jpayne@68
|
1586 cmds.append("-" + key)
|
jpayne@68
|
1587 cmds.extend(value)
|
jpayne@68
|
1588
|
jpayne@68
|
1589 # make comma-separated list if that's what's needed
|
jpayne@68
|
1590 else:
|
jpayne@68
|
1591 cmds.append("-" + key)
|
jpayne@68
|
1592 cmds.append(delim.join(value))
|
jpayne@68
|
1593
|
jpayne@68
|
1594 else:
|
jpayne@68
|
1595 cmds.append("-" + key)
|
jpayne@68
|
1596 cmds.append(str(value))
|
jpayne@68
|
1597
|
jpayne@68
|
1598 if additional_args:
|
jpayne@68
|
1599 cmds.append(additional_args)
|
jpayne@68
|
1600
|
jpayne@68
|
1601 return cmds, tmp, stdin
|
jpayne@68
|
1602
|
jpayne@68
|
1603 def check_genome(self, **kwargs):
|
jpayne@68
|
1604 """
|
jpayne@68
|
1605 Handles the different ways of specifying a genome in kwargs:
|
jpayne@68
|
1606
|
jpayne@68
|
1607 g='genome.file' specifies a file directly
|
jpayne@68
|
1608 genome='dm3' gets the file from genome registry
|
jpayne@68
|
1609 self.chromsizes could be a dict.\
|
jpayne@68
|
1610 """
|
jpayne@68
|
1611
|
jpayne@68
|
1612 # If both g and genome are missing, assume self.chromsizes
|
jpayne@68
|
1613 if ("g" not in kwargs) and ("genome" not in kwargs):
|
jpayne@68
|
1614 if hasattr(self, "chromsizes"):
|
jpayne@68
|
1615 kwargs["g"] = self.chromsizes
|
jpayne@68
|
1616 else:
|
jpayne@68
|
1617 raise ValueError(
|
jpayne@68
|
1618 'No genome specified. Use the "g" or '
|
jpayne@68
|
1619 '"genome" kwargs, or use the '
|
jpayne@68
|
1620 ".set_chromsizes() method"
|
jpayne@68
|
1621 )
|
jpayne@68
|
1622
|
jpayne@68
|
1623 # If both specified, rather than make an implicit decision, raise an
|
jpayne@68
|
1624 # exception
|
jpayne@68
|
1625 if "g" in kwargs and "genome" in kwargs:
|
jpayne@68
|
1626 raise ValueError('Cannot specify both "g" and "genome"')
|
jpayne@68
|
1627
|
jpayne@68
|
1628 # Something like genome='dm3' was specified
|
jpayne@68
|
1629 if "g" not in kwargs and "genome" in kwargs:
|
jpayne@68
|
1630 if isinstance(kwargs["genome"], dict):
|
jpayne@68
|
1631 genome_dict = kwargs["genome"]
|
jpayne@68
|
1632 else:
|
jpayne@68
|
1633 genome_dict = pybedtools.chromsizes(kwargs["genome"])
|
jpayne@68
|
1634 genome_file = pybedtools.chromsizes_to_file(genome_dict)
|
jpayne@68
|
1635 kwargs["g"] = genome_file
|
jpayne@68
|
1636 del kwargs["genome"]
|
jpayne@68
|
1637
|
jpayne@68
|
1638 # By the time we get here, 'g' is specified.
|
jpayne@68
|
1639
|
jpayne@68
|
1640 # If a dict was provided, convert to tempfile here
|
jpayne@68
|
1641 if isinstance(kwargs["g"], dict):
|
jpayne@68
|
1642 kwargs["g"] = pybedtools.chromsizes_to_file(kwargs["g"])
|
jpayne@68
|
1643
|
jpayne@68
|
1644 if not os.path.exists(kwargs["g"]):
|
jpayne@68
|
1645 msg = 'Genome file "%s" does not exist' % (kwargs["g"])
|
jpayne@68
|
1646 raise FileNotFoundError(msg)
|
jpayne@68
|
1647
|
jpayne@68
|
1648 return kwargs
|
jpayne@68
|
1649
|
jpayne@68
|
1650 @_log_to_history
|
jpayne@68
|
1651 def remove_invalid(self):
|
jpayne@68
|
1652 """
|
jpayne@68
|
1653 Remove invalid features that may break BEDTools programs.
|
jpayne@68
|
1654
|
jpayne@68
|
1655 >>> a = pybedtools.BedTool("chr1 10 100\\nchr1 10 1",
|
jpayne@68
|
1656 ... from_string=True)
|
jpayne@68
|
1657 >>> print(a.remove_invalid()) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
1658 chr1 10 100
|
jpayne@68
|
1659 <BLANKLINE>
|
jpayne@68
|
1660
|
jpayne@68
|
1661 """
|
jpayne@68
|
1662 tmp = self._tmp()
|
jpayne@68
|
1663
|
jpayne@68
|
1664 # If it's a file-based BedTool -- which is likely, if we're trying to
|
jpayne@68
|
1665 # remove invalid features -- then we need to parse it line by line.
|
jpayne@68
|
1666 if isinstance(self.fn, str):
|
jpayne@68
|
1667 i = IntervalIterator(open(self.fn, "r"))
|
jpayne@68
|
1668 else:
|
jpayne@68
|
1669 tmp = self.saveas()
|
jpayne@68
|
1670 i = IntervalIterator(open(tmp.fn, "r"))
|
jpayne@68
|
1671
|
jpayne@68
|
1672 def _generator():
|
jpayne@68
|
1673 while True:
|
jpayne@68
|
1674 try:
|
jpayne@68
|
1675 feature = next(i)
|
jpayne@68
|
1676 if feature.start <= feature.stop:
|
jpayne@68
|
1677 yield feature
|
jpayne@68
|
1678 else:
|
jpayne@68
|
1679 continue
|
jpayne@68
|
1680 except pybedtools.MalformedBedLineError:
|
jpayne@68
|
1681 continue
|
jpayne@68
|
1682 except OverflowError:
|
jpayne@68
|
1683 # This can happen if coords are negative
|
jpayne@68
|
1684 continue
|
jpayne@68
|
1685 except IndexError:
|
jpayne@68
|
1686 continue
|
jpayne@68
|
1687 except StopIteration:
|
jpayne@68
|
1688 break
|
jpayne@68
|
1689
|
jpayne@68
|
1690 return BedTool(_generator())
|
jpayne@68
|
1691
|
jpayne@68
|
1692 def all_hits(self, interval: Interval, same_strand: bool = False, overlap: float = 0.0):
|
jpayne@68
|
1693 """
|
jpayne@68
|
1694 Return all intervals that overlap `interval`.
|
jpayne@68
|
1695
|
jpayne@68
|
1696 Calls the `all_hits` method of an IntervalFile to return all intervals
|
jpayne@68
|
1697 in this current BedTool that overlap `interval`.
|
jpayne@68
|
1698
|
jpayne@68
|
1699 Require that overlaps have the same strand with same_strand=True.
|
jpayne@68
|
1700
|
jpayne@68
|
1701 Notes:
|
jpayne@68
|
1702 If this current BedTool is generator-based, it will be
|
jpayne@68
|
1703 converted into a file first.
|
jpayne@68
|
1704
|
jpayne@68
|
1705 If this current BedTool refers to a BAM file, it will be
|
jpayne@68
|
1706 converted to a BED file first using default arguments. If you
|
jpayne@68
|
1707 don't want this to happen, please convert to BED first before
|
jpayne@68
|
1708 using this method.
|
jpayne@68
|
1709 """
|
jpayne@68
|
1710 if not isinstance(interval, Interval):
|
jpayne@68
|
1711 raise ValueError("Need an Interval instance")
|
jpayne@68
|
1712 fn = self.fn
|
jpayne@68
|
1713 if not isinstance(fn, str):
|
jpayne@68
|
1714 fn = self.saveas().fn
|
jpayne@68
|
1715 if self._isbam:
|
jpayne@68
|
1716 fn = self.bam_to_bed().fn
|
jpayne@68
|
1717 interval_file = pybedtools.IntervalFile(fn)
|
jpayne@68
|
1718 return interval_file.all_hits(interval, same_strand, overlap)
|
jpayne@68
|
1719
|
jpayne@68
|
1720 def any_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0):
|
jpayne@68
|
1721 """
|
jpayne@68
|
1722 Return whether or not any intervals overlap `interval`.
|
jpayne@68
|
1723
|
jpayne@68
|
1724 Calls the `any_hits` method of an IntervalFile. If there were any hits
|
jpayne@68
|
1725 within `interval` in this BedTool, then return 1; otherwise 0.
|
jpayne@68
|
1726
|
jpayne@68
|
1727 Require that overlaps have the same strand with same_strand=True.
|
jpayne@68
|
1728
|
jpayne@68
|
1729 Notes:
|
jpayne@68
|
1730 If this current BedTool is generator-based, it will be
|
jpayne@68
|
1731 converted into a file first.
|
jpayne@68
|
1732
|
jpayne@68
|
1733 If this current BedTool refers to a BAM file, it will be
|
jpayne@68
|
1734 converted to a BED file first using default arguments. If you
|
jpayne@68
|
1735 don't want this to happen, please convert to BED first before
|
jpayne@68
|
1736 using this method.
|
jpayne@68
|
1737 """
|
jpayne@68
|
1738 if not isinstance(interval, Interval):
|
jpayne@68
|
1739 raise ValueError("Need an Interval instance")
|
jpayne@68
|
1740 fn = self.fn
|
jpayne@68
|
1741 if not isinstance(fn, str):
|
jpayne@68
|
1742 fn = self.saveas().fn
|
jpayne@68
|
1743 if self._isbam:
|
jpayne@68
|
1744 fn = self.bam_to_bed().fn
|
jpayne@68
|
1745 interval_file = pybedtools.IntervalFile(fn)
|
jpayne@68
|
1746 return interval_file.any_hits(interval, same_strand, overlap)
|
jpayne@68
|
1747
|
jpayne@68
|
1748 def count_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0) -> int:
|
jpayne@68
|
1749 """
|
jpayne@68
|
1750 Return the number of intervals that overlap `interval`.
|
jpayne@68
|
1751
|
jpayne@68
|
1752 Calls the `count_hits` method of an IntervalFile. Returns the number
|
jpayne@68
|
1753 of valid hits in this BedTool that overlap `interval`.
|
jpayne@68
|
1754
|
jpayne@68
|
1755 Require that overlaps have the same strand with same_strand=True.
|
jpayne@68
|
1756
|
jpayne@68
|
1757 Notes:
|
jpayne@68
|
1758 If this current BedTool is generator-based, it will be
|
jpayne@68
|
1759 converted into a file first.
|
jpayne@68
|
1760
|
jpayne@68
|
1761 If this current BedTool refers to a BAM file, it will be
|
jpayne@68
|
1762 converted to a BED file first using default arguments. If you
|
jpayne@68
|
1763 don't want this to happen, please convert to BED first before
|
jpayne@68
|
1764 using this method.
|
jpayne@68
|
1765 """
|
jpayne@68
|
1766 if not isinstance(interval, Interval):
|
jpayne@68
|
1767 raise ValueError("Need an Interval instance")
|
jpayne@68
|
1768 fn = self.fn
|
jpayne@68
|
1769 if not isinstance(fn, str):
|
jpayne@68
|
1770 fn = self.saveas().fn
|
jpayne@68
|
1771 if self._isbam:
|
jpayne@68
|
1772 fn = self.bam_to_bed().fn
|
jpayne@68
|
1773 interval_file = pybedtools.IntervalFile(fn)
|
jpayne@68
|
1774 return interval_file.count_hits(interval, same_strand, overlap)
|
jpayne@68
|
1775
|
jpayne@68
|
1776 @_log_to_history
|
jpayne@68
|
1777 @_wraps(prog="bed12ToBed6", implicit="i", bam=None, other=None)
|
jpayne@68
|
1778 def bed6(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
1779 """
|
jpayne@68
|
1780 Wraps `bedtools bed12tobed6`.
|
jpayne@68
|
1781 """
|
jpayne@68
|
1782
|
jpayne@68
|
1783 # Alias for backward compatibility
|
jpayne@68
|
1784 bed12tobed6 = bed6
|
jpayne@68
|
1785
|
jpayne@68
|
1786 @_log_to_history
|
jpayne@68
|
1787 @_wraps(prog="bamToBed", implicit="i", other=None, nonbam="ALL", bam="i")
|
jpayne@68
|
1788 def bam_to_bed(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
1789 """
|
jpayne@68
|
1790 Wraps `bedtools bamtobed`.
|
jpayne@68
|
1791 """
|
jpayne@68
|
1792
|
jpayne@68
|
1793 # Alias for backward compatibility
|
jpayne@68
|
1794 bamtobed = bam_to_bed
|
jpayne@68
|
1795
|
jpayne@68
|
1796 @_wraps(prog="bedToBam", implicit="i", uses_genome=True, force_bam=True)
|
jpayne@68
|
1797 def _bed_to_bam(self, *args, **kwargs):
|
jpayne@68
|
1798 """
|
jpayne@68
|
1799 Wraps bedToBam and is called internally for BED/GFF/VCF files by
|
jpayne@68
|
1800 self.to_bam (which needs to do something different for SAM files...)
|
jpayne@68
|
1801 """
|
jpayne@68
|
1802
|
jpayne@68
|
1803 @_log_to_history
|
jpayne@68
|
1804 def to_bam(self, **kwargs):
|
jpayne@68
|
1805 """
|
jpayne@68
|
1806 Wraps `bedtools bedtobam`
|
jpayne@68
|
1807
|
jpayne@68
|
1808 If self.fn is in BED/VCF/GFF format, call BEDTools' bedToBam. If
|
jpayne@68
|
1809 self.fn is in SAM format, then create a header out of the genome file
|
jpayne@68
|
1810 and then convert using `samtools`.
|
jpayne@68
|
1811 """
|
jpayne@68
|
1812 if self.file_type == "bam":
|
jpayne@68
|
1813 return self
|
jpayne@68
|
1814 if self.file_type in ("bed", "gff", "vcf"):
|
jpayne@68
|
1815 return self._bed_to_bam(**kwargs)
|
jpayne@68
|
1816
|
jpayne@68
|
1817 # TODO: to maintain backwards compatibility we go from Interval to
|
jpayne@68
|
1818 # AlignedSegment.
|
jpayne@68
|
1819 if self.file_type == "sam":
|
jpayne@68
|
1820
|
jpayne@68
|
1821 # Use pysam, but construct the header out of a provided genome
|
jpayne@68
|
1822 # file.
|
jpayne@68
|
1823
|
jpayne@68
|
1824 # construct a genome out of whatever kwargs were passed in
|
jpayne@68
|
1825 kwargs = self.check_genome(**kwargs)
|
jpayne@68
|
1826
|
jpayne@68
|
1827 # Build a header that we can use for the output BAM file.
|
jpayne@68
|
1828 genome = dict(i.split() for i in open(kwargs["g"]))
|
jpayne@68
|
1829 SQ = []
|
jpayne@68
|
1830 ref_ids = {}
|
jpayne@68
|
1831 text_header = ["@HD\tVN:1.0"]
|
jpayne@68
|
1832
|
jpayne@68
|
1833 for i, (k, v) in enumerate(genome.items()):
|
jpayne@68
|
1834 SQ.append(dict(SN=k, LN=int(v)))
|
jpayne@68
|
1835 ref_ids[k] = i
|
jpayne@68
|
1836 text_header.append("@SQ\tSN:{0}\tLN:{1}".format(k, v))
|
jpayne@68
|
1837
|
jpayne@68
|
1838 # And the text-format header
|
jpayne@68
|
1839 text_header = "\n".join(text_header) + "\n"
|
jpayne@68
|
1840
|
jpayne@68
|
1841 # The strategy is to write an actual SAM file to disk, along with
|
jpayne@68
|
1842 # a header, and then read that back in.
|
jpayne@68
|
1843 #
|
jpayne@68
|
1844 # Painfully inefficient, but this will change once all py2 tests
|
jpayne@68
|
1845 # pass.
|
jpayne@68
|
1846 sam_tmp = self._tmp()
|
jpayne@68
|
1847 bam_tmp = self._tmp()
|
jpayne@68
|
1848 with open(sam_tmp, "w") as fout:
|
jpayne@68
|
1849 fout.write(text_header)
|
jpayne@68
|
1850 for interval in self:
|
jpayne@68
|
1851 fout.write("\t".join(map(str, interval.fields)) + "\n")
|
jpayne@68
|
1852
|
jpayne@68
|
1853 samfile = pysam.AlignmentFile(sam_tmp, "r")
|
jpayne@68
|
1854 bamfile = pysam.AlignmentFile(bam_tmp, "wb", template=samfile)
|
jpayne@68
|
1855 for alignment in samfile:
|
jpayne@68
|
1856 bamfile.write(alignment)
|
jpayne@68
|
1857
|
jpayne@68
|
1858 samfile.close()
|
jpayne@68
|
1859 bamfile.close()
|
jpayne@68
|
1860 new_bedtool = BedTool(bam_tmp)
|
jpayne@68
|
1861 new_bedtool._isbam = True
|
jpayne@68
|
1862 return new_bedtool
|
jpayne@68
|
1863
|
jpayne@68
|
1864 # Alias for backward compatibility
|
jpayne@68
|
1865 bedtobam = to_bam
|
jpayne@68
|
1866
|
jpayne@68
|
1867 @_log_to_history
|
jpayne@68
|
1868 @_wraps(prog="intersectBed", implicit="a", other="b", bam="abam",
|
jpayne@68
|
1869 nonbam="bed", arg_order=["a", "abam"])
|
jpayne@68
|
1870 def intersect(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
1871 """
|
jpayne@68
|
1872 Wraps `bedtools intersect`.
|
jpayne@68
|
1873 """
|
jpayne@68
|
1874
|
jpayne@68
|
1875 @_log_to_history
|
jpayne@68
|
1876 @_wraps(
|
jpayne@68
|
1877 prog="fastaFromBed",
|
jpayne@68
|
1878 implicit="bed",
|
jpayne@68
|
1879 bam=None,
|
jpayne@68
|
1880 other="fi",
|
jpayne@68
|
1881 make_tempfile_for="fo",
|
jpayne@68
|
1882 check_stderr=_check_sequence_stderr,
|
jpayne@68
|
1883 add_to_bedtool={"fo": "seqfn"},
|
jpayne@68
|
1884 )
|
jpayne@68
|
1885 def sequence(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
1886 '''
|
jpayne@68
|
1887 Wraps `bedtools getfasta`.
|
jpayne@68
|
1888
|
jpayne@68
|
1889 *fi* is passed in by the user; *bed* is automatically passed in as the
|
jpayne@68
|
1890 bedfile of this object; *fo* by default is a temp file. Use
|
jpayne@68
|
1891 save_seqs() to save as a file.
|
jpayne@68
|
1892
|
jpayne@68
|
1893 The end result is that this BedTool will assign a value to the attribute , self.seqfn,
|
jpayne@68
|
1894 that points to the new fasta file.
|
jpayne@68
|
1895
|
jpayne@68
|
1896 Example usage:
|
jpayne@68
|
1897
|
jpayne@68
|
1898 >>> a = pybedtools.BedTool("""
|
jpayne@68
|
1899 ... chr1 1 10
|
jpayne@68
|
1900 ... chr1 50 55""", from_string=True)
|
jpayne@68
|
1901 >>> fasta = pybedtools.example_filename('test.fa')
|
jpayne@68
|
1902 >>> a = a.sequence(fi=fasta)
|
jpayne@68
|
1903 >>> print(open(a.seqfn).read())
|
jpayne@68
|
1904 >chr1:1-10
|
jpayne@68
|
1905 GATGAGTCT
|
jpayne@68
|
1906 >chr1:50-55
|
jpayne@68
|
1907 CCATC
|
jpayne@68
|
1908 <BLANKLINE>
|
jpayne@68
|
1909
|
jpayne@68
|
1910 '''
|
jpayne@68
|
1911
|
jpayne@68
|
1912 # Alias for backwards compatibility
|
jpayne@68
|
1913 getfasta = sequence
|
jpayne@68
|
1914
|
jpayne@68
|
1915 @staticmethod
|
jpayne@68
|
1916 def seq(loc, fasta) -> str:
|
jpayne@68
|
1917 """
|
jpayne@68
|
1918 Return just the sequence from a region string or a single location
|
jpayne@68
|
1919 >>> fn = pybedtools.example_filename('test.fa')
|
jpayne@68
|
1920 >>> BedTool.seq('chr1:2-10', fn)
|
jpayne@68
|
1921 'GATGAGTCT'
|
jpayne@68
|
1922 >>> BedTool.seq(('chr1', 1, 10), fn)
|
jpayne@68
|
1923 'GATGAGTCT'
|
jpayne@68
|
1924 """
|
jpayne@68
|
1925 if isinstance(loc, str):
|
jpayne@68
|
1926 chrom, start_end = loc.split(":")
|
jpayne@68
|
1927 start, end = list(map(int, start_end.split("-")))
|
jpayne@68
|
1928 start -= 1
|
jpayne@68
|
1929 else:
|
jpayne@68
|
1930 chrom, start, end = loc[0], loc[1], loc[2]
|
jpayne@68
|
1931
|
jpayne@68
|
1932 loc = BedTool("%s\t%i\t%i" % (chrom, start, end), from_string=True)
|
jpayne@68
|
1933 lseq = loc.sequence(fi=fasta)
|
jpayne@68
|
1934 return "".join([l.rstrip() for l in open(lseq.seqfn, "r") if l[0] != ">"])
|
jpayne@68
|
1935
|
jpayne@68
|
1936 @_log_to_history
|
jpayne@68
|
1937 @_wraps(
|
jpayne@68
|
1938 prog="nucBed", implicit="bed", other="fi", check_stderr=_check_sequence_stderr
|
jpayne@68
|
1939 )
|
jpayne@68
|
1940 def nucleotide_content(self) -> BedTool: # type: ignore
|
jpayne@68
|
1941 """
|
jpayne@68
|
1942 Wraps `bedtools nuc`.
|
jpayne@68
|
1943
|
jpayne@68
|
1944 Profiles nucleotide content. The returned BED file contains extra
|
jpayne@68
|
1945 information about the nucleotide content
|
jpayne@68
|
1946 """
|
jpayne@68
|
1947
|
jpayne@68
|
1948 # Alias for backwards compatibility
|
jpayne@68
|
1949 nuc = nucleotide_content
|
jpayne@68
|
1950
|
jpayne@68
|
1951 @_log_to_history
|
jpayne@68
|
1952 @_wraps(prog="multiBamCov", implicit="bed")
|
jpayne@68
|
1953 def multi_bam_coverage(self) -> BedTool: # type: ignore
|
jpayne@68
|
1954 """
|
jpayne@68
|
1955 Wraps `bedtools multicov`.
|
jpayne@68
|
1956
|
jpayne@68
|
1957 Pass a list of sorted and indexed BAM files as `bams`
|
jpayne@68
|
1958 """
|
jpayne@68
|
1959
|
jpayne@68
|
1960 # Alias for backwards compatibility
|
jpayne@68
|
1961 multicov = multi_bam_coverage
|
jpayne@68
|
1962
|
jpayne@68
|
1963 @_log_to_history
|
jpayne@68
|
1964 @_wraps(prog="subtractBed", implicit="a", other="b", bam=None)
|
jpayne@68
|
1965 def subtract(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
1966 """
|
jpayne@68
|
1967 Wraps `bedtools subtract`.
|
jpayne@68
|
1968
|
jpayne@68
|
1969 Subtracts from another BED file and returns a new BedTool object.
|
jpayne@68
|
1970
|
jpayne@68
|
1971 Example usage:
|
jpayne@68
|
1972
|
jpayne@68
|
1973 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
1974 >>> b = pybedtools.example_bedtool('b.bed')
|
jpayne@68
|
1975
|
jpayne@68
|
1976 Do a "stranded" subtraction:
|
jpayne@68
|
1977
|
jpayne@68
|
1978 >>> c = a.subtract(b, s=True)
|
jpayne@68
|
1979
|
jpayne@68
|
1980 Require 50% of features in `a` to overlap:
|
jpayne@68
|
1981
|
jpayne@68
|
1982 >>> c = a.subtract(b, f=0.5)
|
jpayne@68
|
1983 """
|
jpayne@68
|
1984 if "a" not in kwargs:
|
jpayne@68
|
1985 kwargs["a"] = self.fn
|
jpayne@68
|
1986
|
jpayne@68
|
1987 if "b" not in kwargs:
|
jpayne@68
|
1988 if len(args) > 0:
|
jpayne@68
|
1989 kwargs["b"] = args[0]
|
jpayne@68
|
1990 else:
|
jpayne@68
|
1991 raise ValueError("Must specify a BED file to subtract, either as a positional argument or as the 'b' keyword argument.")
|
jpayne@68
|
1992
|
jpayne@68
|
1993 cmds, tmp, stdin = self.handle_kwargs(prog="subtractBed", **kwargs)
|
jpayne@68
|
1994 stream = call_bedtools(cmds, tmp, stdin=stdin)
|
jpayne@68
|
1995 return BedTool(stream)
|
jpayne@68
|
1996
|
jpayne@68
|
1997 @_log_to_history
|
jpayne@68
|
1998 @_wraps(prog="slopBed", implicit="i", other=None, bam=None, uses_genome=True)
|
jpayne@68
|
1999 def slop(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2000 """
|
jpayne@68
|
2001 Wraps `bedtools slop`.
|
jpayne@68
|
2002 """
|
jpayne@68
|
2003
|
jpayne@68
|
2004 @_log_to_history
|
jpayne@68
|
2005 @_wraps(prog="shiftBed", implicit="i", other=None, bam=None, uses_genome=True)
|
jpayne@68
|
2006 def shift(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2007 """
|
jpayne@68
|
2008 Wraps `bedtools shift`.
|
jpayne@68
|
2009
|
jpayne@68
|
2010 Shift each feature by user-defined number of bases. Returns a new BedTool object.
|
jpayne@68
|
2011
|
jpayne@68
|
2012 Example usage:
|
jpayne@68
|
2013
|
jpayne@68
|
2014 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2015
|
jpayne@68
|
2016 Shift every feature by 5bp:
|
jpayne@68
|
2017
|
jpayne@68
|
2018 >>> b = a.shift(genome='hg19', s=5)
|
jpayne@68
|
2019 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2020 chr1 6 105 feature1 0 +
|
jpayne@68
|
2021 chr1 105 205 feature2 0 +
|
jpayne@68
|
2022 chr1 155 505 feature3 0 -
|
jpayne@68
|
2023 chr1 905 955 feature4 0 +
|
jpayne@68
|
2024 <BLANKLINE>
|
jpayne@68
|
2025
|
jpayne@68
|
2026 Shift features on the '+' strand by -1bp and on '-' strand by +3bp:
|
jpayne@68
|
2027
|
jpayne@68
|
2028 >>> b = a.shift(genome='hg19', p=-1, m=3)
|
jpayne@68
|
2029 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2030 chr1 0 99 feature1 0 +
|
jpayne@68
|
2031 chr1 99 199 feature2 0 +
|
jpayne@68
|
2032 chr1 153 503 feature3 0 -
|
jpayne@68
|
2033 chr1 899 949 feature4 0 +
|
jpayne@68
|
2034 <BLANKLINE>
|
jpayne@68
|
2035
|
jpayne@68
|
2036 # Disabling, see https://github.com/arq5x/bedtools2/issues/807
|
jpayne@68
|
2037 Shift features by a fraction of their length (0.50):
|
jpayne@68
|
2038
|
jpayne@68
|
2039 #>>> b = a.shift(genome='hg19', pct=True, s=0.50)
|
jpayne@68
|
2040 #>>> print(b) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2041 #chr1 50 149 feature1 0 +
|
jpayne@68
|
2042 #chr1 150 250 feature2 0 +
|
jpayne@68
|
2043 #chr1 325 675 feature3 0 -
|
jpayne@68
|
2044 #chr1 925 975 feature4 0 +
|
jpayne@68
|
2045 #<BLANKLINE>
|
jpayne@68
|
2046
|
jpayne@68
|
2047 """
|
jpayne@68
|
2048
|
jpayne@68
|
2049 @_log_to_history
|
jpayne@68
|
2050 @_wraps(prog="mergeBed", implicit="i", other=None, bam=None)
|
jpayne@68
|
2051 def merge(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2052 """
|
jpayne@68
|
2053 Wraps `bedtools merge`.
|
jpayne@68
|
2054
|
jpayne@68
|
2055 Merge overlapping features together. Returns a new BedTool object.
|
jpayne@68
|
2056
|
jpayne@68
|
2057 Example usage:
|
jpayne@68
|
2058
|
jpayne@68
|
2059 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2060
|
jpayne@68
|
2061 Merge:
|
jpayne@68
|
2062
|
jpayne@68
|
2063 >>> c = a.merge()
|
jpayne@68
|
2064
|
jpayne@68
|
2065 Allow merging of features 500 bp apart:
|
jpayne@68
|
2066
|
jpayne@68
|
2067 >>> c = a.merge(d=500)
|
jpayne@68
|
2068
|
jpayne@68
|
2069 """
|
jpayne@68
|
2070
|
jpayne@68
|
2071 @_log_to_history
|
jpayne@68
|
2072 @_wraps(prog="closestBed", implicit="a", other="b", bam=None)
|
jpayne@68
|
2073 def closest(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2074 """
|
jpayne@68
|
2075 Wraps `bedtools closest`.
|
jpayne@68
|
2076
|
jpayne@68
|
2077 Return a new BedTool object containing closest features in *b*. Note
|
jpayne@68
|
2078 that the resulting file is no longer a valid BED format; use the
|
jpayne@68
|
2079 special "_closest" methods to work with the resulting file.
|
jpayne@68
|
2080
|
jpayne@68
|
2081 Example usage::
|
jpayne@68
|
2082
|
jpayne@68
|
2083 a = BedTool('in.bed')
|
jpayne@68
|
2084
|
jpayne@68
|
2085 # get the closest feature in 'other.bed' on the same strand
|
jpayne@68
|
2086 b = a.closest('other.bed', s=True)
|
jpayne@68
|
2087
|
jpayne@68
|
2088 """
|
jpayne@68
|
2089
|
jpayne@68
|
2090 @_log_to_history
|
jpayne@68
|
2091 @_wraps(prog="windowBed", implicit="a", other="b", bam="abam", nonbam="bed")
|
jpayne@68
|
2092 def window(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2093 """
|
jpayne@68
|
2094 Wraps `bedtools window`.
|
jpayne@68
|
2095
|
jpayne@68
|
2096 Example usage::
|
jpayne@68
|
2097
|
jpayne@68
|
2098 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2099 >>> b = pybedtools.example_bedtool('b.bed')
|
jpayne@68
|
2100 >>> print(a.window(b, w=1000)) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2101 chr1 1 100 feature1 0 + chr1 155 200 feature5 0 -
|
jpayne@68
|
2102 chr1 1 100 feature1 0 + chr1 800 901 feature6 0 +
|
jpayne@68
|
2103 chr1 100 200 feature2 0 + chr1 155 200 feature5 0 -
|
jpayne@68
|
2104 chr1 100 200 feature2 0 + chr1 800 901 feature6 0 +
|
jpayne@68
|
2105 chr1 150 500 feature3 0 - chr1 155 200 feature5 0 -
|
jpayne@68
|
2106 chr1 150 500 feature3 0 - chr1 800 901 feature6 0 +
|
jpayne@68
|
2107 chr1 900 950 feature4 0 + chr1 155 200 feature5 0 -
|
jpayne@68
|
2108 chr1 900 950 feature4 0 + chr1 800 901 feature6 0 +
|
jpayne@68
|
2109 <BLANKLINE>
|
jpayne@68
|
2110 """
|
jpayne@68
|
2111
|
jpayne@68
|
2112 @_log_to_history
|
jpayne@68
|
2113 @_wraps(prog="shuffleBed", implicit="i", other=None, bam=None, uses_genome=True)
|
jpayne@68
|
2114 def shuffle(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2115 """
|
jpayne@68
|
2116 Wraps `bedtools shuffle`.
|
jpayne@68
|
2117
|
jpayne@68
|
2118 Example usage:
|
jpayne@68
|
2119
|
jpayne@68
|
2120 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2121 >>> seed = 1 # so this test always returns the same results
|
jpayne@68
|
2122 >>> b = a.shuffle(genome='hg19', chrom=True, seed=seed)
|
jpayne@68
|
2123 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2124 chr1 123081365 123081464 feature1 0 +
|
jpayne@68
|
2125 chr1 243444570 243444670 feature2 0 +
|
jpayne@68
|
2126 chr1 194620241 194620591 feature3 0 -
|
jpayne@68
|
2127 chr1 172792873 172792923 feature4 0 +
|
jpayne@68
|
2128 <BLANKLINE>
|
jpayne@68
|
2129 """
|
jpayne@68
|
2130
|
jpayne@68
|
2131 @_log_to_history
|
jpayne@68
|
2132 @_wraps(prog="sortBed", implicit="i", uses_genome=True, genome_if=["g", "genome"])
|
jpayne@68
|
2133 def sort(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2134 """
|
jpayne@68
|
2135 Wraps `bedtools sort`.
|
jpayne@68
|
2136
|
jpayne@68
|
2137 Note that chromosomes are sorted lexicographically, so chr12 will come
|
jpayne@68
|
2138 before chr9.
|
jpayne@68
|
2139
|
jpayne@68
|
2140 Example usage:
|
jpayne@68
|
2141
|
jpayne@68
|
2142 >>> a = pybedtools.BedTool('''
|
jpayne@68
|
2143 ... chr9 300 400
|
jpayne@68
|
2144 ... chr1 100 200
|
jpayne@68
|
2145 ... chr1 1 50
|
jpayne@68
|
2146 ... chr12 1 100
|
jpayne@68
|
2147 ... chr9 500 600
|
jpayne@68
|
2148 ... ''', from_string=True)
|
jpayne@68
|
2149 >>> print(a.sort()) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2150 chr1 1 50
|
jpayne@68
|
2151 chr1 100 200
|
jpayne@68
|
2152 chr12 1 100
|
jpayne@68
|
2153 chr9 300 400
|
jpayne@68
|
2154 chr9 500 600
|
jpayne@68
|
2155 <BLANKLINE>
|
jpayne@68
|
2156 """
|
jpayne@68
|
2157
|
jpayne@68
|
2158 @_log_to_history
|
jpayne@68
|
2159 @_wraps(prog="annotateBed", implicit="i")
|
jpayne@68
|
2160 def annotate(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2161 """
|
jpayne@68
|
2162 Wraps `bedtools annotate`.
|
jpayne@68
|
2163
|
jpayne@68
|
2164 Annotate this BedTool with a list of other files.
|
jpayne@68
|
2165 Example usage:
|
jpayne@68
|
2166
|
jpayne@68
|
2167 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2168 >>> b_fn = pybedtools.example_filename('b.bed')
|
jpayne@68
|
2169 >>> print(a.annotate(files=b_fn)) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2170 chr1 1 100 feature1 0 + 0.000000
|
jpayne@68
|
2171 chr1 100 200 feature2 0 + 0.450000
|
jpayne@68
|
2172 chr1 150 500 feature3 0 - 0.128571
|
jpayne@68
|
2173 chr1 900 950 feature4 0 + 0.020000
|
jpayne@68
|
2174 <BLANKLINE>
|
jpayne@68
|
2175 """
|
jpayne@68
|
2176
|
jpayne@68
|
2177 @_log_to_history
|
jpayne@68
|
2178 @_wraps(prog="flankBed", implicit="i", uses_genome=True)
|
jpayne@68
|
2179 def flank(self, *args, **kwargs) -> BedTool:
|
jpayne@68
|
2180 """
|
jpayne@68
|
2181 Wraps `bedtools flank`.
|
jpayne@68
|
2182
|
jpayne@68
|
2183 Example usage:
|
jpayne@68
|
2184
|
jpayne@68
|
2185 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2186 >>> print(a.flank(genome='hg19', b=100)) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2187 chr1 0 1 feature1 0 +
|
jpayne@68
|
2188 chr1 100 200 feature1 0 +
|
jpayne@68
|
2189 chr1 0 100 feature2 0 +
|
jpayne@68
|
2190 chr1 200 300 feature2 0 +
|
jpayne@68
|
2191 chr1 50 150 feature3 0 -
|
jpayne@68
|
2192 chr1 500 600 feature3 0 -
|
jpayne@68
|
2193 chr1 800 900 feature4 0 +
|
jpayne@68
|
2194 chr1 950 1050 feature4 0 +
|
jpayne@68
|
2195 <BLANKLINE>
|
jpayne@68
|
2196
|
jpayne@68
|
2197 """
|
jpayne@68
|
2198 kwargs = self.check_genome(**kwargs)
|
jpayne@68
|
2199
|
jpayne@68
|
2200 if "i" not in kwargs:
|
jpayne@68
|
2201 kwargs["i"] = self.fn
|
jpayne@68
|
2202
|
jpayne@68
|
2203 cmds, tmp, stdin = self.handle_kwargs(prog="flankBed", **kwargs)
|
jpayne@68
|
2204 stream = call_bedtools(cmds, tmp, stdin=stdin)
|
jpayne@68
|
2205 return BedTool(stream)
|
jpayne@68
|
2206
|
jpayne@68
|
2207 @_log_to_history
|
jpayne@68
|
2208 @_wraps(
|
jpayne@68
|
2209 prog="genomeCoverageBed",
|
jpayne@68
|
2210 implicit="i",
|
jpayne@68
|
2211 bam="ibam",
|
jpayne@68
|
2212 genome_none_if=["ibam"],
|
jpayne@68
|
2213 genome_ok_if=["ibam"],
|
jpayne@68
|
2214 uses_genome=True,
|
jpayne@68
|
2215 nonbam="ALL",
|
jpayne@68
|
2216 )
|
jpayne@68
|
2217 def genome_coverage(self, *args, **kwargs):
|
jpayne@68
|
2218 """
|
jpayne@68
|
2219 Wraps `bedtools genomecov`.
|
jpayne@68
|
2220
|
jpayne@68
|
2221 Note that some invocations of `bedtools genomecov` do not result in
|
jpayne@68
|
2222 a properly-formatted BED file. For example, the default behavior is to
|
jpayne@68
|
2223 report a histogram of coverage. Iterating over the resulting,
|
jpayne@68
|
2224 non-BED-format file will raise exceptions in pybedtools' parser.
|
jpayne@68
|
2225
|
jpayne@68
|
2226 Consider using the `BedTool.to_dataframe` method to convert these
|
jpayne@68
|
2227 non-BED files into a pandas DataFrame for further use.
|
jpayne@68
|
2228
|
jpayne@68
|
2229 Example usage:
|
jpayne@68
|
2230
|
jpayne@68
|
2231 BAM file input does not require a genome:
|
jpayne@68
|
2232
|
jpayne@68
|
2233 >>> a = pybedtools.example_bedtool('x.bam')
|
jpayne@68
|
2234 >>> b = a.genome_coverage(bg=True)
|
jpayne@68
|
2235 >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2236 chr2L 9329 9365 1
|
jpayne@68
|
2237 chr2L 10212 10248 1
|
jpayne@68
|
2238 chr2L 10255 10291 1
|
jpayne@68
|
2239
|
jpayne@68
|
2240 Other input does require a genome:
|
jpayne@68
|
2241
|
jpayne@68
|
2242 >>> a = pybedtools.example_bedtool('x.bed')
|
jpayne@68
|
2243 >>> b = a.genome_coverage(bg=True, genome='dm3')
|
jpayne@68
|
2244 >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2245 chr2L 9329 9365 1
|
jpayne@68
|
2246 chr2L 10212 10248 1
|
jpayne@68
|
2247 chr2L 10255 10291 1
|
jpayne@68
|
2248
|
jpayne@68
|
2249 Non-BED format results:
|
jpayne@68
|
2250 >>> a = pybedtools.example_bedtool('x.bed')
|
jpayne@68
|
2251 >>> b = a.genome_coverage(genome='dm3')
|
jpayne@68
|
2252 >>> df = b.to_dataframe(names=['chrom', 'depth', 'n', 'chromsize', 'fraction'])
|
jpayne@68
|
2253 """
|
jpayne@68
|
2254
|
jpayne@68
|
2255 # Alias for backwards compatibility
|
jpayne@68
|
2256 genomecov = genome_coverage
|
jpayne@68
|
2257
|
jpayne@68
|
2258 @_log_to_history
|
jpayne@68
|
2259 @_wraps(prog="coverageBed", implicit="a", other="b", bam="abam", nonbam="ALL")
|
jpayne@68
|
2260 def coverage(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2261 """
|
jpayne@68
|
2262 Wraps `bedtools coverage`.
|
jpayne@68
|
2263
|
jpayne@68
|
2264 Note that starting in version 2.24.0, BEDTools swapped the semantics of
|
jpayne@68
|
2265 the "a" and "b" files.
|
jpayne@68
|
2266
|
jpayne@68
|
2267 Example usage:
|
jpayne@68
|
2268
|
jpayne@68
|
2269 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2270 >>> b = pybedtools.example_bedtool('b.bed')
|
jpayne@68
|
2271 >>> c = b.coverage(a)
|
jpayne@68
|
2272 >>> c.head(3) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2273 chr1 155 200 feature5 0 - 2 45 45 1.0000000
|
jpayne@68
|
2274 chr1 800 901 feature6 0 + 1 1 101 0.0099010
|
jpayne@68
|
2275 """
|
jpayne@68
|
2276
|
jpayne@68
|
2277 @_log_to_history
|
jpayne@68
|
2278 @_wraps(
|
jpayne@68
|
2279 prog="maskFastaFromBed",
|
jpayne@68
|
2280 implicit="bed",
|
jpayne@68
|
2281 other="fi",
|
jpayne@68
|
2282 make_tempfile_for="fo",
|
jpayne@68
|
2283 add_to_bedtool={"fo": "seqfn"},
|
jpayne@68
|
2284 check_stderr=_check_sequence_stderr,
|
jpayne@68
|
2285 )
|
jpayne@68
|
2286 def mask_fasta(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2287 """
|
jpayne@68
|
2288 Wraps `bedtools maskfasta`.
|
jpayne@68
|
2289
|
jpayne@68
|
2290 Masks a fasta file at the positions in a BED file and saves result as
|
jpayne@68
|
2291 'out' and stores the filename in seqfn.
|
jpayne@68
|
2292
|
jpayne@68
|
2293 >>> a = pybedtools.BedTool('chr1 100 110', from_string=True)
|
jpayne@68
|
2294 >>> fasta_fn = pybedtools.example_filename('test.fa')
|
jpayne@68
|
2295 >>> a = a.mask_fasta(fi=fasta_fn, fo='masked.fa.example')
|
jpayne@68
|
2296 >>> b = a.slop(b=2, genome='hg19')
|
jpayne@68
|
2297 >>> b = b.sequence(fi=a.seqfn)
|
jpayne@68
|
2298 >>> print(open(b.seqfn).read())
|
jpayne@68
|
2299 >chr1:98-112
|
jpayne@68
|
2300 TTNNNNNNNNNNAT
|
jpayne@68
|
2301 <BLANKLINE>
|
jpayne@68
|
2302 >>> os.unlink('masked.fa.example')
|
jpayne@68
|
2303 >>> if os.path.exists('masked.fa.example.fai'):
|
jpayne@68
|
2304 ... os.unlink('masked.fa.example.fai')
|
jpayne@68
|
2305 """
|
jpayne@68
|
2306
|
jpayne@68
|
2307 # Alias for backwards compatibility
|
jpayne@68
|
2308 maskfasta = mask_fasta
|
jpayne@68
|
2309
|
jpayne@68
|
2310 @_log_to_history
|
jpayne@68
|
2311 @_wraps(prog="complementBed", implicit="i", uses_genome=True)
|
jpayne@68
|
2312 def complement(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2313 """
|
jpayne@68
|
2314 Wraps `bedtools complement`.
|
jpayne@68
|
2315 Example usage:
|
jpayne@68
|
2316
|
jpayne@68
|
2317 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2318 >>> a.complement(genome='hg19').head(5) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2319 chr1 0 1
|
jpayne@68
|
2320 chr1 500 900
|
jpayne@68
|
2321 chr1 950 249250621
|
jpayne@68
|
2322 chr2 0 243199373
|
jpayne@68
|
2323 chr3 0 198022430
|
jpayne@68
|
2324 """
|
jpayne@68
|
2325
|
jpayne@68
|
2326 @_log_to_history
|
jpayne@68
|
2327 @_wraps(prog="getOverlap", implicit="i")
|
jpayne@68
|
2328 def overlap(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2329 """
|
jpayne@68
|
2330 Wraps `bedtools overlap`.
|
jpayne@68
|
2331
|
jpayne@68
|
2332 Example usage:
|
jpayne@68
|
2333
|
jpayne@68
|
2334 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2335 >>> b = pybedtools.example_bedtool('b.bed')
|
jpayne@68
|
2336 >>> c = a.window(b, w=10).overlap(cols=[2,3,8,9])
|
jpayne@68
|
2337 >>> print(c) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2338 chr1 100 200 feature2 0 + chr1 155 200 feature5 0 - 45
|
jpayne@68
|
2339 chr1 150 500 feature3 0 - chr1 155 200 feature5 0 - 45
|
jpayne@68
|
2340 chr1 900 950 feature4 0 + chr1 800 901 feature6 0 + 1
|
jpayne@68
|
2341 <BLANKLINE>
|
jpayne@68
|
2342 """
|
jpayne@68
|
2343
|
jpayne@68
|
2344 # TODO: needs test files and doctests written
|
jpayne@68
|
2345 @_log_to_history
|
jpayne@68
|
2346 @_wraps(prog="pairToBed", implicit="a", other="b", bam="abam", nonbam="bedpe")
|
jpayne@68
|
2347 def pair_to_bed(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2348 """
|
jpayne@68
|
2349 Wraps `bedtools pairtobed`.
|
jpayne@68
|
2350 """
|
jpayne@68
|
2351
|
jpayne@68
|
2352 # Alias for backwards compatibility
|
jpayne@68
|
2353 pairtobed = pair_to_bed
|
jpayne@68
|
2354
|
jpayne@68
|
2355 @_log_to_history
|
jpayne@68
|
2356 @_wraps(prog="pairToPair", implicit="a", other="b")
|
jpayne@68
|
2357 def pair_to_pair(self) -> BedTool: # type: ignore
|
jpayne@68
|
2358 """
|
jpayne@68
|
2359 Wraps `bedtools pairtopair`.
|
jpayne@68
|
2360 """
|
jpayne@68
|
2361
|
jpayne@68
|
2362 # Alias for backwards compatibility
|
jpayne@68
|
2363 pairtopair = pair_to_pair
|
jpayne@68
|
2364
|
jpayne@68
|
2365 @_log_to_history
|
jpayne@68
|
2366 @_wraps(prog="groupBy", implicit="i")
|
jpayne@68
|
2367 def groupby(self, *args, **kwargs) -> BedTool:
|
jpayne@68
|
2368 """
|
jpayne@68
|
2369 Wraps `bedtools groupby`.
|
jpayne@68
|
2370
|
jpayne@68
|
2371 Example usage:
|
jpayne@68
|
2372
|
jpayne@68
|
2373 >>> a = pybedtools.example_bedtool('gdc.gff')
|
jpayne@68
|
2374 >>> b = pybedtools.example_bedtool('gdc.bed')
|
jpayne@68
|
2375 >>> c = a.intersect(b, c=True)
|
jpayne@68
|
2376 >>> d = c.groupby(g=[1, 4, 5], c=10, o=['sum'])
|
jpayne@68
|
2377 >>> print(d) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2378 chr2L 41 70 0
|
jpayne@68
|
2379 chr2L 71 130 2
|
jpayne@68
|
2380 chr2L 131 170 4
|
jpayne@68
|
2381 chr2L 171 200 0
|
jpayne@68
|
2382 chr2L 201 220 1
|
jpayne@68
|
2383 chr2L 41 130 2
|
jpayne@68
|
2384 chr2L 171 220 1
|
jpayne@68
|
2385 chr2L 41 220 7
|
jpayne@68
|
2386 chr2L 161 230 6
|
jpayne@68
|
2387 chr2L 41 220 7
|
jpayne@68
|
2388 <BLANKLINE>
|
jpayne@68
|
2389
|
jpayne@68
|
2390 """
|
jpayne@68
|
2391
|
jpayne@68
|
2392 @_log_to_history
|
jpayne@68
|
2393 @_wraps(prog="tagBam", implicit="i", bam="i")
|
jpayne@68
|
2394 def tag_bam(self, *args, **kwargs) -> pysam.AlignmentFile: # type: ignore
|
jpayne@68
|
2395 """
|
jpayne@68
|
2396 Wraps `bedtools tag`.
|
jpayne@68
|
2397
|
jpayne@68
|
2398 `files` and `labels` should lists of equal length.
|
jpayne@68
|
2399
|
jpayne@68
|
2400 """
|
jpayne@68
|
2401 if "i" in kwargs:
|
jpayne@68
|
2402 if "labels" in kwargs:
|
jpayne@68
|
2403 if len(kwargs["i"]) != len(kwargs["labels"]):
|
jpayne@68
|
2404 raise ValueError("files and labels must be lists of equal length")
|
jpayne@68
|
2405
|
jpayne@68
|
2406
|
jpayne@68
|
2407 # Alias for backwards compatibility
|
jpayne@68
|
2408 tag = tag_bam
|
jpayne@68
|
2409
|
jpayne@68
|
2410 @_log_to_history
|
jpayne@68
|
2411 @_wraps(prog="mapBed", implicit="a", other="b")
|
jpayne@68
|
2412 def map(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2413 """
|
jpayne@68
|
2414 Wraps `bedtools map`; See also :meth:`BedTool.each`.
|
jpayne@68
|
2415 """
|
jpayne@68
|
2416
|
jpayne@68
|
2417 @_log_to_history
|
jpayne@68
|
2418 @_wraps(prog="multiIntersectBed", uses_genome=True, genome_if=["empty"])
|
jpayne@68
|
2419 def multi_intersect(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2420 """
|
jpayne@68
|
2421 Wraps `bedtools multiintersect`.
|
jpayne@68
|
2422
|
jpayne@68
|
2423 Provide a list of filenames as the "i" argument. e.g. if you already
|
jpayne@68
|
2424 have BedTool objects then use their `.fn` attribute, like this::
|
jpayne@68
|
2425
|
jpayne@68
|
2426 >>> x = pybedtools.BedTool()
|
jpayne@68
|
2427 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2428 >>> b = pybedtools.example_bedtool('b.bed')
|
jpayne@68
|
2429 >>> result = x.multi_intersect(i=[a.fn, b.fn])
|
jpayne@68
|
2430 >>> print(result) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2431 chr1 1 155 1 1 1 0
|
jpayne@68
|
2432 chr1 155 200 2 1,2 1 1
|
jpayne@68
|
2433 chr1 200 500 1 1 1 0
|
jpayne@68
|
2434 chr1 800 900 1 2 0 1
|
jpayne@68
|
2435 chr1 900 901 2 1,2 1 1
|
jpayne@68
|
2436 chr1 901 950 1 1 1 0
|
jpayne@68
|
2437 <BLANKLINE>
|
jpayne@68
|
2438
|
jpayne@68
|
2439 """
|
jpayne@68
|
2440
|
jpayne@68
|
2441 # Alias for backwards compatibility
|
jpayne@68
|
2442 multiinter = multi_intersect
|
jpayne@68
|
2443
|
jpayne@68
|
2444 @_log_to_history
|
jpayne@68
|
2445 @_wraps(prog="randomBed", uses_genome=True)
|
jpayne@68
|
2446 def random(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2447 """
|
jpayne@68
|
2448 Wraps `bedtools random`.
|
jpayne@68
|
2449
|
jpayne@68
|
2450 Since this method does not operate on an existing file, create
|
jpayne@68
|
2451 a BedTool with no arguments and then call this method, e.g.,
|
jpayne@68
|
2452
|
jpayne@68
|
2453 >>> x = BedTool()
|
jpayne@68
|
2454 >>> y = x.random(l=100, n=10, genome='hg19')
|
jpayne@68
|
2455 """
|
jpayne@68
|
2456
|
jpayne@68
|
2457 @_log_to_history
|
jpayne@68
|
2458 @_wraps("bedpeToBam", implicit="i", uses_genome=True, force_bam=True)
|
jpayne@68
|
2459 def bedpe_to_bam(self, *args, **kwargs) -> pysam.AlignmentFile: # type: ignore
|
jpayne@68
|
2460 """
|
jpayne@68
|
2461 Wraps `bedtools bedpetobam`.
|
jpayne@68
|
2462 """
|
jpayne@68
|
2463
|
jpayne@68
|
2464 # Alias for backwards compatibility
|
jpayne@68
|
2465 bedpetobam = bedpe_to_bam
|
jpayne@68
|
2466
|
jpayne@68
|
2467 @_log_to_history
|
jpayne@68
|
2468 @_wraps(prog="clusterBed", implicit="i")
|
jpayne@68
|
2469 def cluster(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2470 """
|
jpayne@68
|
2471 Wraps `bedtools cluster`.
|
jpayne@68
|
2472 """
|
jpayne@68
|
2473
|
jpayne@68
|
2474 @_log_to_history
|
jpayne@68
|
2475 @_wraps(prog="unionBedGraphs")
|
jpayne@68
|
2476 def union_bedgraphs(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2477 """
|
jpayne@68
|
2478 Wraps `bedtools unionbedg`.
|
jpayne@68
|
2479
|
jpayne@68
|
2480 Warning: using the `header=True` kwarg will result in a file that is
|
jpayne@68
|
2481 not in true BED format, which may break downstream analysis.
|
jpayne@68
|
2482 """
|
jpayne@68
|
2483 if "header" in kwargs:
|
jpayne@68
|
2484 if kwargs["header"] is True:
|
jpayne@68
|
2485 warn("Using header=True with unionbedg will result in a file that is not in true BED format, which may break downstream analysis.")
|
jpayne@68
|
2486
|
jpayne@68
|
2487
|
jpayne@68
|
2488 # Alias for backwards compatibility
|
jpayne@68
|
2489 unionbedg = union_bedgraphs
|
jpayne@68
|
2490
|
jpayne@68
|
2491 @_log_to_history
|
jpayne@68
|
2492 @_wraps(prog="windowMaker", uses_genome=True, genome_none_if=["b"], other="b", arg_order=["w"])
|
jpayne@68
|
2493 def window_maker(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2494 """
|
jpayne@68
|
2495 Wraps `bedtools makewindows`.
|
jpayne@68
|
2496 """
|
jpayne@68
|
2497
|
jpayne@68
|
2498 # Alias for backwards compatibility
|
jpayne@68
|
2499 makewindows = window_maker
|
jpayne@68
|
2500
|
jpayne@68
|
2501 @_log_to_history
|
jpayne@68
|
2502 @_wraps(prog="expandCols", implicit="i")
|
jpayne@68
|
2503 def expand(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2504 """
|
jpayne@68
|
2505 Wraps `bedtools expand`
|
jpayne@68
|
2506 """
|
jpayne@68
|
2507
|
jpayne@68
|
2508 @_log_to_history
|
jpayne@68
|
2509 @_wraps(prog="linksBed", implicit="i", add_to_bedtool={"stdout": "links_html"})
|
jpayne@68
|
2510 def links(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2511 """
|
jpayne@68
|
2512 Wraps `linksBed`.
|
jpayne@68
|
2513
|
jpayne@68
|
2514 The resulting BedTool will assign a value to the attribute `links_html`. This
|
jpayne@68
|
2515 attribute is a temp filename containing the HTML links.
|
jpayne@68
|
2516 """
|
jpayne@68
|
2517
|
jpayne@68
|
2518 @_log_to_history
|
jpayne@68
|
2519 @_wraps(prog="bedToIgv", implicit="i", add_to_bedtool={"stdout": "igv_script"})
|
jpayne@68
|
2520 def igv(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2521 """
|
jpayne@68
|
2522 Wraps `bedtools igv`.
|
jpayne@68
|
2523
|
jpayne@68
|
2524 The resulting BedTool will assign a value to the attribute `igv_script`. This
|
jpayne@68
|
2525 attribute is a temp filename containing the IGV script.
|
jpayne@68
|
2526 """
|
jpayne@68
|
2527
|
jpayne@68
|
2528 @_log_to_history
|
jpayne@68
|
2529 @_wraps(
|
jpayne@68
|
2530 prog="bamToFastq",
|
jpayne@68
|
2531 implicit="i",
|
jpayne@68
|
2532 bam="i",
|
jpayne@68
|
2533 make_tempfile_for="fq",
|
jpayne@68
|
2534 add_to_bedtool={"fq": "fastq"},
|
jpayne@68
|
2535 )
|
jpayne@68
|
2536 def bam_to_fastq(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2537 """
|
jpayne@68
|
2538 Wraps `bedtools bamtofastq`.
|
jpayne@68
|
2539
|
jpayne@68
|
2540 The `fq` argument is required.
|
jpayne@68
|
2541
|
jpayne@68
|
2542 The resulting BedTool will assign a value to the attribute `fastq`.
|
jpayne@68
|
2543 """
|
jpayne@68
|
2544
|
jpayne@68
|
2545 # Alias for backwards compatibility
|
jpayne@68
|
2546 bamtofastq = bam_to_fastq
|
jpayne@68
|
2547
|
jpayne@68
|
2548 @_wraps(
|
jpayne@68
|
2549 prog="jaccard",
|
jpayne@68
|
2550 implicit="a",
|
jpayne@68
|
2551 other="b",
|
jpayne@68
|
2552 does_not_return_bedtool=_jaccard_output_to_dict,
|
jpayne@68
|
2553 )
|
jpayne@68
|
2554 def jaccard(self, *args, **kwargs) -> dict[str, Any]: # type: ignore
|
jpayne@68
|
2555 """
|
jpayne@68
|
2556 Returns a dictionary with keys (intersection, union, jaccard).
|
jpayne@68
|
2557 """
|
jpayne@68
|
2558
|
jpayne@68
|
2559 @_wraps(
|
jpayne@68
|
2560 prog="reldist",
|
jpayne@68
|
2561 implicit="a",
|
jpayne@68
|
2562 other="b",
|
jpayne@68
|
2563 does_not_return_bedtool=_reldist_output_handler,
|
jpayne@68
|
2564 )
|
jpayne@68
|
2565 def reldist(self, *args, **kwargs) -> BedTool | dict[str, Any]: # type: ignore
|
jpayne@68
|
2566 """
|
jpayne@68
|
2567 If detail=False, then return a dictionary with keys (reldist, count,
|
jpayne@68
|
2568 total, fraction), which is the summary of the bedtools reldist.
|
jpayne@68
|
2569
|
jpayne@68
|
2570 Otherwise return a BedTool, with the relative distance for each
|
jpayne@68
|
2571 interval in A in the last column.
|
jpayne@68
|
2572 """
|
jpayne@68
|
2573 if "detail" in kwargs:
|
jpayne@68
|
2574 if kwargs["detail"] is False:
|
jpayne@68
|
2575 warn("Using detail=False with reldist will return a dictionary with keys (reldist, count, total, fraction), which is the summary of the bedtools reldist."
|
jpayne@68
|
2576 "Not a BedTool object.")
|
jpayne@68
|
2577
|
jpayne@68
|
2578
|
jpayne@68
|
2579 @_wraps(prog="sample", implicit="i", bam="i")
|
jpayne@68
|
2580 def sample(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2581 """
|
jpayne@68
|
2582 Wraps 'sample'.
|
jpayne@68
|
2583 """
|
jpayne@68
|
2584
|
jpayne@68
|
2585 @_wraps(
|
jpayne@68
|
2586 prog="fisher",
|
jpayne@68
|
2587 implicit="a",
|
jpayne@68
|
2588 other="b",
|
jpayne@68
|
2589 uses_genome=True,
|
jpayne@68
|
2590 does_not_return_bedtool=FisherOutput,
|
jpayne@68
|
2591 )
|
jpayne@68
|
2592 def fisher(self, *args, **kwargs) -> FisherOutput: # type: ignore
|
jpayne@68
|
2593 """
|
jpayne@68
|
2594 Wraps 'fisher'. Returns an object representing the output.
|
jpayne@68
|
2595
|
jpayne@68
|
2596 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2597 >>> b = pybedtools.example_bedtool('b.bed')
|
jpayne@68
|
2598 >>> f = a.fisher(b, genome='hg19')
|
jpayne@68
|
2599 >>> print(f) # doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2600 # Number of query intervals: 4
|
jpayne@68
|
2601 # Number of db intervals: 2
|
jpayne@68
|
2602 # Number of overlaps: 3
|
jpayne@68
|
2603 # Number of possible intervals (estimated): 13958448
|
jpayne@68
|
2604 # phyper(3 - 1, 4, 13958448 - 4, 2, lower.tail=F)
|
jpayne@68
|
2605 # Contingency Table Of Counts
|
jpayne@68
|
2606 #_________________________________________
|
jpayne@68
|
2607 # | in -b | not in -b |
|
jpayne@68
|
2608 # in -a | 3 | 1 |
|
jpayne@68
|
2609 # not in -a | 0 | 13958444 |
|
jpayne@68
|
2610 #_________________________________________
|
jpayne@68
|
2611 # p-values for fisher's exact test
|
jpayne@68
|
2612 left right two-tail ratio
|
jpayne@68
|
2613 1 8.8247e-21 8.8247e-21 inf
|
jpayne@68
|
2614 <BLANKLINE>
|
jpayne@68
|
2615
|
jpayne@68
|
2616
|
jpayne@68
|
2617 >>> f.table['not in -a']['in -b']
|
jpayne@68
|
2618 0
|
jpayne@68
|
2619
|
jpayne@68
|
2620 >>> f.table['not in -a']['not in -b']
|
jpayne@68
|
2621 13958444
|
jpayne@68
|
2622
|
jpayne@68
|
2623 >>> f.table['in -a']['in -b']
|
jpayne@68
|
2624 3
|
jpayne@68
|
2625
|
jpayne@68
|
2626 >>> f.table['in -a']['not in -b']
|
jpayne@68
|
2627 1
|
jpayne@68
|
2628
|
jpayne@68
|
2629 >>> f.two_tail
|
jpayne@68
|
2630 8.8247e-21
|
jpayne@68
|
2631 """
|
jpayne@68
|
2632
|
jpayne@68
|
2633 @_wraps(prog="split", implicit="i", does_not_return_bedtool=SplitOutput)
|
jpayne@68
|
2634 def splitbed(self, *args, **kwargs) -> SplitOutput: # type: ignore
|
jpayne@68
|
2635 """
|
jpayne@68
|
2636 Wraps 'bedtools split'.
|
jpayne@68
|
2637
|
jpayne@68
|
2638 BedTool objects have long had a `split` method which splits intervals
|
jpayne@68
|
2639 according to a custom function. Now that BEDTools has a `split` tool,
|
jpayne@68
|
2640 the method name conflicts. To maintain backwards compatibility, the
|
jpayne@68
|
2641 method wrapping the BEDTools command is called `splitbed`.
|
jpayne@68
|
2642
|
jpayne@68
|
2643 Since this tool does not return a single BED file, the method parses
|
jpayne@68
|
2644 the output and returns a SplitOutput object, which includes an
|
jpayne@68
|
2645 attribute, `bedtools`, that is a list of BedTool objects created from
|
jpayne@68
|
2646 the split files.
|
jpayne@68
|
2647
|
jpayne@68
|
2648 To keep the working directory clean, you may want to consider using
|
jpayne@68
|
2649 `prefix=BedTool._tmp()` to get a temp file that will be deleted when
|
jpayne@68
|
2650 Python exits cleanly.
|
jpayne@68
|
2651
|
jpayne@68
|
2652 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2653 >>> s = a.splitbed(n=2, p="split")
|
jpayne@68
|
2654 >>> assert len(a) == 4, len(a)
|
jpayne@68
|
2655 >>> assert len(s.bedtools) == 2
|
jpayne@68
|
2656 >>> print(s.bedtools[0]) # doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2657 chr1 150 500 feature3 0 -
|
jpayne@68
|
2658 <BLANKLINE>
|
jpayne@68
|
2659 >>> print(s.bedtools[1]) # doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2660 chr1 100 200 feature2 0 +
|
jpayne@68
|
2661 chr1 1 100 feature1 0 +
|
jpayne@68
|
2662 chr1 900 950 feature4 0 +
|
jpayne@68
|
2663 <BLANKLINE>
|
jpayne@68
|
2664 """
|
jpayne@68
|
2665
|
jpayne@68
|
2666 @_wraps(prog="spacing", implicit="i")
|
jpayne@68
|
2667 def spacing(self, *args, **kwargs) -> BedTool: # type: ignore
|
jpayne@68
|
2668 """
|
jpayne@68
|
2669 Wraps `bedtools spacing`
|
jpayne@68
|
2670
|
jpayne@68
|
2671 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2672 >>> print(a.spacing()) # doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
2673 chr1 1 100 feature1 0 + .
|
jpayne@68
|
2674 chr1 100 200 feature2 0 + 0
|
jpayne@68
|
2675 chr1 150 500 feature3 0 - -1
|
jpayne@68
|
2676 chr1 900 950 feature4 0 + 400
|
jpayne@68
|
2677 """
|
jpayne@68
|
2678
|
jpayne@68
|
2679 def count(self) -> int:
|
jpayne@68
|
2680 """
|
jpayne@68
|
2681 Count the number features in this BedTool.
|
jpayne@68
|
2682
|
jpayne@68
|
2683 Number of features in BED file. Does the same thing as len(self), which
|
jpayne@68
|
2684 actually just calls this method.
|
jpayne@68
|
2685
|
jpayne@68
|
2686 Only counts the actual features. Ignores any track lines, browser
|
jpayne@68
|
2687 lines, lines starting with a "#", or blank lines.
|
jpayne@68
|
2688
|
jpayne@68
|
2689 Example usage:
|
jpayne@68
|
2690
|
jpayne@68
|
2691 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
2692 >>> a.count()
|
jpayne@68
|
2693 4
|
jpayne@68
|
2694 """
|
jpayne@68
|
2695 if hasattr(self, "next") or hasattr(self, "__next__"):
|
jpayne@68
|
2696 return sum(1 for _ in self)
|
jpayne@68
|
2697 return sum(1 for _ in iter(self))
|
jpayne@68
|
2698
|
jpayne@68
|
2699 def print_sequence(self) -> str:
|
jpayne@68
|
2700 """
|
jpayne@68
|
2701 Print the sequence that was retrieved by BedTool.sequence.
|
jpayne@68
|
2702 See usage example in :meth:`BedTool.sequence`.
|
jpayne@68
|
2703
|
jpayne@68
|
2704 Returns:
|
jpayne@68
|
2705 str: The sequence as a string.
|
jpayne@68
|
2706
|
jpayne@68
|
2707 Raises:
|
jpayne@68
|
2708 ValueError: If the sequence has not been generated using .sequence(fasta).
|
jpayne@68
|
2709 """
|
jpayne@68
|
2710 if self.seqfn is None:
|
jpayne@68
|
2711 raise ValueError("Use .sequence(fasta) to get the sequence first")
|
jpayne@68
|
2712 f = open(self.seqfn)
|
jpayne@68
|
2713 s = f.read()
|
jpayne@68
|
2714 f.close()
|
jpayne@68
|
2715 return s
|
jpayne@68
|
2716
|
jpayne@68
|
2717 def save_seqs(self, fn:str) -> BedTool:
|
jpayne@68
|
2718 """
|
jpayne@68
|
2719 Save sequences, after calling BedTool.sequence.
|
jpayne@68
|
2720
|
jpayne@68
|
2721 In order to use this function, you need to have called
|
jpayne@68
|
2722 the :meth:`BedTool.sequence()` method.
|
jpayne@68
|
2723
|
jpayne@68
|
2724 A new BedTool object is returned which references the newly saved file.
|
jpayne@68
|
2725
|
jpayne@68
|
2726 Example usage:
|
jpayne@68
|
2727
|
jpayne@68
|
2728 >>> a = pybedtools.BedTool('''
|
jpayne@68
|
2729 ... chr1 1 10
|
jpayne@68
|
2730 ... chr1 50 55''', from_string=True)
|
jpayne@68
|
2731 >>> fasta = pybedtools.example_filename('test.fa')
|
jpayne@68
|
2732 >>> a = a.sequence(fi=fasta)
|
jpayne@68
|
2733 >>> print(open(a.seqfn).read())
|
jpayne@68
|
2734 >chr1:1-10
|
jpayne@68
|
2735 GATGAGTCT
|
jpayne@68
|
2736 >chr1:50-55
|
jpayne@68
|
2737 CCATC
|
jpayne@68
|
2738 <BLANKLINE>
|
jpayne@68
|
2739 >>> b = a.save_seqs('example.fa')
|
jpayne@68
|
2740 >>> assert open(b.fn).read() == open(a.fn).read()
|
jpayne@68
|
2741 >>> if os.path.exists('example.fa'):
|
jpayne@68
|
2742 ... os.unlink('example.fa')
|
jpayne@68
|
2743 """
|
jpayne@68
|
2744
|
jpayne@68
|
2745 if self.seqfn is None:
|
jpayne@68
|
2746 raise ValueError("Use .sequence(fasta) to get the sequence first")
|
jpayne@68
|
2747
|
jpayne@68
|
2748 with open(fn, "w") as fout:
|
jpayne@68
|
2749 if self.seqfn is None:
|
jpayne@68
|
2750 raise ValueError("Use .sequence(fasta) to get the sequence first")
|
jpayne@68
|
2751 with open(self.seqfn) as seqfile:
|
jpayne@68
|
2752 fout.write(seqfile.read())
|
jpayne@68
|
2753
|
jpayne@68
|
2754 new_bedtool = BedTool(self.fn)
|
jpayne@68
|
2755 new_bedtool.seqfn = fn
|
jpayne@68
|
2756 return new_bedtool
|
jpayne@68
|
2757
|
jpayne@68
|
2758 def randomstats(
|
jpayne@68
|
2759 self,
|
jpayne@68
|
2760 other: BedTool,
|
jpayne@68
|
2761 iterations: int,
|
jpayne@68
|
2762 new: bool = False,
|
jpayne@68
|
2763 genome_fn: Optional[str] = None,
|
jpayne@68
|
2764 include_distribution: bool = False,
|
jpayne@68
|
2765 **kwargs
|
jpayne@68
|
2766 ) -> dict[str, Any]:
|
jpayne@68
|
2767 """
|
jpayne@68
|
2768 Dictionary of results from many randomly shuffled intersections.
|
jpayne@68
|
2769
|
jpayne@68
|
2770 Sends args and kwargs to :meth:`BedTool.randomintersection` and
|
jpayne@68
|
2771 compiles results into a dictionary with useful stats. Requires
|
jpayne@68
|
2772 numpy.
|
jpayne@68
|
2773
|
jpayne@68
|
2774 If `include_distribution` is True, then the dictionary will include the
|
jpayne@68
|
2775 full distribution; otherwise, the distribution is deleted and cleaned
|
jpayne@68
|
2776 up to save on memory usage.
|
jpayne@68
|
2777
|
jpayne@68
|
2778 This is one possible way of assigning significance to overlaps between
|
jpayne@68
|
2779 two files. See, for example:
|
jpayne@68
|
2780
|
jpayne@68
|
2781 Negre N, Brown CD, Shah PK, Kheradpour P, Morrison CA, et al. 2010
|
jpayne@68
|
2782 A Comprehensive Map of Insulator Elements for the Drosophila
|
jpayne@68
|
2783 Genome. PLoS Genet 6(1): e1000814. doi:10.1371/journal.pgen.1000814
|
jpayne@68
|
2784
|
jpayne@68
|
2785 Example usage:
|
jpayne@68
|
2786
|
jpayne@68
|
2787 Make chromsizes a very small genome for this example:
|
jpayne@68
|
2788
|
jpayne@68
|
2789 >>> chromsizes = {'chr1':(1,1000)}
|
jpayne@68
|
2790 >>> a = pybedtools.example_bedtool('a.bed').set_chromsizes(chromsizes)
|
jpayne@68
|
2791 >>> b = pybedtools.example_bedtool('b.bed')
|
jpayne@68
|
2792 >>> try:
|
jpayne@68
|
2793 ... results = a.randomstats(b, 100, debug=True)
|
jpayne@68
|
2794 ... except ImportError:
|
jpayne@68
|
2795 ... pass
|
jpayne@68
|
2796
|
jpayne@68
|
2797 *results* is a dictionary that you can inspect.
|
jpayne@68
|
2798
|
jpayne@68
|
2799 (Note that the following examples are not run as part of the doctests
|
jpayne@68
|
2800 to avoid forcing users to install NumPy just to pass tests)
|
jpayne@68
|
2801
|
jpayne@68
|
2802 The actual overlap::
|
jpayne@68
|
2803
|
jpayne@68
|
2804 print(results['actual'])
|
jpayne@68
|
2805 3
|
jpayne@68
|
2806
|
jpayne@68
|
2807 The median of all randomized overlaps::
|
jpayne@68
|
2808
|
jpayne@68
|
2809 print(results['median randomized'])
|
jpayne@68
|
2810 2.0
|
jpayne@68
|
2811
|
jpayne@68
|
2812 The percentile of the actual overlap in the distribution of randomized
|
jpayne@68
|
2813 overlaps, which can be used to get an empirical p-value::
|
jpayne@68
|
2814
|
jpayne@68
|
2815 print(results['percentile'])
|
jpayne@68
|
2816 90.0
|
jpayne@68
|
2817 """
|
jpayne@68
|
2818 if ("intersect_kwargs" not in kwargs) or (kwargs["intersect_kwargs"] is None):
|
jpayne@68
|
2819 kwargs["intersect_kwargs"] = {"u": True}
|
jpayne@68
|
2820 try:
|
jpayne@68
|
2821 import numpy as np
|
jpayne@68
|
2822 except ImportError:
|
jpayne@68
|
2823 raise ImportError("Need to install NumPy for stats...")
|
jpayne@68
|
2824
|
jpayne@68
|
2825 def percentileofscore(a, score):
|
jpayne@68
|
2826 """
|
jpayne@68
|
2827 copied from scipy.stats.percentileofscore, to avoid dependency on
|
jpayne@68
|
2828 scipy.
|
jpayne@68
|
2829 """
|
jpayne@68
|
2830 a = np.array(a)
|
jpayne@68
|
2831 n = len(a)
|
jpayne@68
|
2832
|
jpayne@68
|
2833 if not (np.any(a == score)):
|
jpayne@68
|
2834 a = np.append(a, score)
|
jpayne@68
|
2835 a_len = np.array(list(range(len(a))))
|
jpayne@68
|
2836 else:
|
jpayne@68
|
2837 a_len = np.array(list(range(len(a)))) + 1.0
|
jpayne@68
|
2838
|
jpayne@68
|
2839 a = np.sort(a)
|
jpayne@68
|
2840 idx = tuple([a == score])
|
jpayne@68
|
2841 pct = (np.mean(a_len[idx]) / n) * 100.0
|
jpayne@68
|
2842 return pct
|
jpayne@68
|
2843
|
jpayne@68
|
2844 if isinstance(other, str):
|
jpayne@68
|
2845 other = BedTool(other)
|
jpayne@68
|
2846 else:
|
jpayne@68
|
2847 assert isinstance(
|
jpayne@68
|
2848 other, BedTool
|
jpayne@68
|
2849 ), "Either filename or another BedTool instance required"
|
jpayne@68
|
2850
|
jpayne@68
|
2851 # Actual (unshuffled) counts.
|
jpayne@68
|
2852 i_kwargs = kwargs["intersect_kwargs"]
|
jpayne@68
|
2853 actual = len(self.intersect(other, **i_kwargs))
|
jpayne@68
|
2854
|
jpayne@68
|
2855 # List of counts from randomly shuffled versions.
|
jpayne@68
|
2856 # Length of counts == *iterations*.
|
jpayne@68
|
2857
|
jpayne@68
|
2858 if not new:
|
jpayne@68
|
2859 distribution = self.randomintersection(
|
jpayne@68
|
2860 other, iterations=iterations, **kwargs
|
jpayne@68
|
2861 )
|
jpayne@68
|
2862 else:
|
jpayne@68
|
2863 # use new mechanism
|
jpayne@68
|
2864 if genome_fn is None:
|
jpayne@68
|
2865 raise ValueError(
|
jpayne@68
|
2866 "`genome_fn` must be provided if using the "
|
jpayne@68
|
2867 "new _randomintersection mechanism"
|
jpayne@68
|
2868 )
|
jpayne@68
|
2869 distribution = self._randomintersection(
|
jpayne@68
|
2870 other, iterations=iterations, genome_fn=genome_fn, **kwargs
|
jpayne@68
|
2871 )
|
jpayne@68
|
2872
|
jpayne@68
|
2873 distribution = np.array(list(distribution))
|
jpayne@68
|
2874
|
jpayne@68
|
2875 # Median of distribution
|
jpayne@68
|
2876 med_count = np.median(distribution)
|
jpayne@68
|
2877
|
jpayne@68
|
2878 n = float(len(distribution))
|
jpayne@68
|
2879
|
jpayne@68
|
2880 frac_above = sum(distribution > actual) / n
|
jpayne@68
|
2881 frac_below = sum(distribution < actual) / n
|
jpayne@68
|
2882
|
jpayne@68
|
2883 normalized = actual / med_count
|
jpayne@68
|
2884
|
jpayne@68
|
2885 lower_thresh = 2.5
|
jpayne@68
|
2886 upper_thresh = 97.5
|
jpayne@68
|
2887 lower, upper = np.percentile(distribution, [lower_thresh, upper_thresh])
|
jpayne@68
|
2888
|
jpayne@68
|
2889 actual_percentile = percentileofscore(distribution, actual)
|
jpayne@68
|
2890 d = {
|
jpayne@68
|
2891 "iterations": iterations,
|
jpayne@68
|
2892 "actual": actual,
|
jpayne@68
|
2893 "file_a": self.fn,
|
jpayne@68
|
2894 "file_b": other.fn,
|
jpayne@68
|
2895 self.fn: len(self),
|
jpayne@68
|
2896 other.fn: len(other),
|
jpayne@68
|
2897 "self": len(self),
|
jpayne@68
|
2898 "other": len(other),
|
jpayne@68
|
2899 "frac randomized above actual": frac_above,
|
jpayne@68
|
2900 "frac randomized below actual": frac_below,
|
jpayne@68
|
2901 "median randomized": med_count,
|
jpayne@68
|
2902 "normalized": normalized,
|
jpayne@68
|
2903 "percentile": actual_percentile,
|
jpayne@68
|
2904 "lower_%sth" % lower_thresh: lower,
|
jpayne@68
|
2905 "upper_%sth" % upper_thresh: upper,
|
jpayne@68
|
2906 }
|
jpayne@68
|
2907 if include_distribution:
|
jpayne@68
|
2908 d["distribution"] = distribution
|
jpayne@68
|
2909 else:
|
jpayne@68
|
2910 del distribution
|
jpayne@68
|
2911 return d
|
jpayne@68
|
2912
|
jpayne@68
|
2913 def random_op(self, *args, **kwargs):
|
jpayne@68
|
2914 """
|
jpayne@68
|
2915 For backwards compatibility; see BedTool.parallel_apply instead.
|
jpayne@68
|
2916 """
|
jpayne@68
|
2917 return self.parallel_apply(*args, **kwargs)
|
jpayne@68
|
2918
|
jpayne@68
|
2919 def parallel_apply(
|
jpayne@68
|
2920 self,
|
jpayne@68
|
2921 iterations: int,
|
jpayne@68
|
2922 func: Callable,
|
jpayne@68
|
2923 func_args: Iterable,
|
jpayne@68
|
2924 func_kwargs: dict,
|
jpayne@68
|
2925 processes: int=1,
|
jpayne@68
|
2926 _orig_pool: Optional[Pool] = None # type: ignore
|
jpayne@68
|
2927 ):
|
jpayne@68
|
2928 """
|
jpayne@68
|
2929 Generalized method for applying a function in parallel.
|
jpayne@68
|
2930
|
jpayne@68
|
2931 Typically used when having to do many random shufflings.
|
jpayne@68
|
2932
|
jpayne@68
|
2933 `func_args` and `func_kwargs` will be passed to `func` each time in
|
jpayne@68
|
2934 `iterations`, and these iterations will be split across `processes`
|
jpayne@68
|
2935 processes.
|
jpayne@68
|
2936
|
jpayne@68
|
2937 Notes on the function, `func`:
|
jpayne@68
|
2938
|
jpayne@68
|
2939 * the function should manually remove any tempfiles created. This
|
jpayne@68
|
2940 is because the BedTool.TEMPFILES list of auto-created tempfiles
|
jpayne@68
|
2941 does not share state across processes, so things will not get
|
jpayne@68
|
2942 cleaned up automatically as they do in a single-process
|
jpayne@68
|
2943 pybedtools session.
|
jpayne@68
|
2944
|
jpayne@68
|
2945 * this includes deleting any "chromsizes" or genome files --
|
jpayne@68
|
2946 generally it will be best to require a genome filename in
|
jpayne@68
|
2947 `func_kwargs` if you'll be using any BedTool methods that accept
|
jpayne@68
|
2948 the `g` kwarg.
|
jpayne@68
|
2949
|
jpayne@68
|
2950 * the function should be a module-level function (rather than a
|
jpayne@68
|
2951 class method) because class methods can't be pickled across
|
jpayne@68
|
2952 process boundaries
|
jpayne@68
|
2953
|
jpayne@68
|
2954 * the function can have any signature and have any return value
|
jpayne@68
|
2955
|
jpayne@68
|
2956 `_orig_pool` can be a previously-created multiprocessing.Pool instance;
|
jpayne@68
|
2957 otherwise, a new Pool will be created with `processes`
|
jpayne@68
|
2958 """
|
jpayne@68
|
2959 if processes == 1:
|
jpayne@68
|
2960 for _ in range(iterations):
|
jpayne@68
|
2961 yield func(*func_args, **func_kwargs)
|
jpayne@68
|
2962 raise StopIteration
|
jpayne@68
|
2963
|
jpayne@68
|
2964 if _orig_pool:
|
jpayne@68
|
2965 p = _orig_pool
|
jpayne@68
|
2966 else:
|
jpayne@68
|
2967 p = Pool(processes)
|
jpayne@68
|
2968 iterations_each = [iterations / processes] * processes
|
jpayne@68
|
2969 iterations_each[-1] += iterations % processes
|
jpayne@68
|
2970
|
jpayne@68
|
2971 # FYI some useful info on apply_async:
|
jpayne@68
|
2972 # http://stackoverflow.com/questions/8533318/
|
jpayne@68
|
2973 # python-multiprocessing-pool-when-to-use-apply-apply-async-or-map
|
jpayne@68
|
2974 #
|
jpayne@68
|
2975 # Here, we don't care about the order, and don't want the subprocesses
|
jpayne@68
|
2976 # to block.
|
jpayne@68
|
2977 results = [
|
jpayne@68
|
2978 p.apply_async(func, func_args, func_kwargs) for _ in range(iterations)
|
jpayne@68
|
2979 ]
|
jpayne@68
|
2980 for r in results:
|
jpayne@68
|
2981 yield r.get()
|
jpayne@68
|
2982 raise StopIteration
|
jpayne@68
|
2983
|
jpayne@68
|
2984 def random_jaccard(
|
jpayne@68
|
2985 self,
|
jpayne@68
|
2986 other: BedTool,
|
jpayne@68
|
2987 genome_fn: Optional[str] = None,
|
jpayne@68
|
2988 iterations: Optional[int] = None,
|
jpayne@68
|
2989 processes: int = 1,
|
jpayne@68
|
2990 _orig_pool: Optional[Pool] = None,
|
jpayne@68
|
2991 shuffle_kwargs: Optional[dict[str, Any]] = None,
|
jpayne@68
|
2992 jaccard_kwargs: Optional[dict[str, Any]] = None,
|
jpayne@68
|
2993 ) -> list:
|
jpayne@68
|
2994 """
|
jpayne@68
|
2995 Computes the naive Jaccard statistic (intersection divided by union).
|
jpayne@68
|
2996
|
jpayne@68
|
2997 .. note::
|
jpayne@68
|
2998
|
jpayne@68
|
2999 If you don't need the randomization functionality of this method,
|
jpayne@68
|
3000 you can use the simpler BedTool.jaccard method instead.
|
jpayne@68
|
3001
|
jpayne@68
|
3002 See Favorov et al. (2012) PLoS Comput Biol 8(5): e1002529 for more
|
jpayne@68
|
3003 info on the Jaccard statistic for intersections.
|
jpayne@68
|
3004
|
jpayne@68
|
3005 If `iterations` is None, then do not perform random shufflings.
|
jpayne@68
|
3006
|
jpayne@68
|
3007 If `iterations` is an integer, perform `iterations` random shufflings,
|
jpayne@68
|
3008 each time computing the Jaccard statistic to build an empirical
|
jpayne@68
|
3009 distribution. `genome_fn` will also be needed; optional `processes`
|
jpayne@68
|
3010 will split the iterations across multiple CPUs.
|
jpayne@68
|
3011
|
jpayne@68
|
3012 Returns a tuple of the observed Jaccard statistic and a list of the
|
jpayne@68
|
3013 randomized statistics (which will be an empty list if `iterations` was
|
jpayne@68
|
3014 None).
|
jpayne@68
|
3015 """
|
jpayne@68
|
3016 if shuffle_kwargs is None:
|
jpayne@68
|
3017 shuffle_kwargs = {}
|
jpayne@68
|
3018 if jaccard_kwargs is None:
|
jpayne@68
|
3019 jaccard_kwargs = {}
|
jpayne@68
|
3020 if not genome_fn:
|
jpayne@68
|
3021 raise ValueError("Need a genome filename in order to perform randomization")
|
jpayne@68
|
3022 return list(
|
jpayne@68
|
3023 self.parallel_apply(
|
jpayne@68
|
3024 iterations=iterations if iterations else 1,
|
jpayne@68
|
3025 func=pybedtools.stats.random_jaccard,
|
jpayne@68
|
3026 func_args=(self, other),
|
jpayne@68
|
3027 func_kwargs=dict(
|
jpayne@68
|
3028 genome_fn=genome_fn,
|
jpayne@68
|
3029 shuffle_kwargs=shuffle_kwargs,
|
jpayne@68
|
3030 jaccard_kwargs=jaccard_kwargs,
|
jpayne@68
|
3031 ),
|
jpayne@68
|
3032 processes=processes,
|
jpayne@68
|
3033 _orig_pool=_orig_pool,
|
jpayne@68
|
3034 )
|
jpayne@68
|
3035 )
|
jpayne@68
|
3036
|
jpayne@68
|
3037 def _randomintersection(
|
jpayne@68
|
3038 self,
|
jpayne@68
|
3039 other: BedTool,
|
jpayne@68
|
3040 iterations: int,
|
jpayne@68
|
3041 genome_fn: str,
|
jpayne@68
|
3042 intersect_kwargs: Optional[dict[str, Any]] = None,
|
jpayne@68
|
3043 _orig_pool:Optional[Pool] = None,
|
jpayne@68
|
3044 shuffle_kwargs: Optional[dict[str, Any]] = None,
|
jpayne@68
|
3045 processes: int = 1,
|
jpayne@68
|
3046 ):
|
jpayne@68
|
3047 """
|
jpayne@68
|
3048 Re-implementation of BedTool.randomintersection using the new
|
jpayne@68
|
3049 `random_op` method
|
jpayne@68
|
3050 """
|
jpayne@68
|
3051 if shuffle_kwargs is None:
|
jpayne@68
|
3052 shuffle_kwargs = {}
|
jpayne@68
|
3053 if intersect_kwargs is None:
|
jpayne@68
|
3054 intersect_kwargs = dict(u=True)
|
jpayne@68
|
3055 if not genome_fn:
|
jpayne@68
|
3056 raise ValueError("Need a genome filename in order to perform randomization")
|
jpayne@68
|
3057 return list(
|
jpayne@68
|
3058 self.parallel_apply(
|
jpayne@68
|
3059 iterations=iterations,
|
jpayne@68
|
3060 func=pybedtools.stats.random_intersection,
|
jpayne@68
|
3061 func_args=(self, other),
|
jpayne@68
|
3062 func_kwargs=dict(
|
jpayne@68
|
3063 genome_fn=genome_fn,
|
jpayne@68
|
3064 shuffle_kwargs=shuffle_kwargs,
|
jpayne@68
|
3065 intersect_kwargs=intersect_kwargs,
|
jpayne@68
|
3066 ),
|
jpayne@68
|
3067 processes=processes,
|
jpayne@68
|
3068 _orig_pool=_orig_pool,
|
jpayne@68
|
3069 )
|
jpayne@68
|
3070 )
|
jpayne@68
|
3071
|
jpayne@68
|
3072 def randomintersection_bp(
|
jpayne@68
|
3073 self,
|
jpayne@68
|
3074 other: BedTool,
|
jpayne@68
|
3075 iterations: int,
|
jpayne@68
|
3076 genome_fn: str,
|
jpayne@68
|
3077 intersect_kwargs: Optional[dict[str, Any]] = None,
|
jpayne@68
|
3078 shuffle_kwargs: Optional[dict[str, Any]] = None,
|
jpayne@68
|
3079 processes: int = 1,
|
jpayne@68
|
3080 _orig_pool:Optional[Pool] = None,
|
jpayne@68
|
3081 ) -> list[int]:
|
jpayne@68
|
3082 """
|
jpayne@68
|
3083 Like randomintersection, but return the bp overlap instead of the
|
jpayne@68
|
3084 number of intersecting intervals.
|
jpayne@68
|
3085 """
|
jpayne@68
|
3086 if shuffle_kwargs is None:
|
jpayne@68
|
3087 shuffle_kwargs = {}
|
jpayne@68
|
3088 if intersect_kwargs is None:
|
jpayne@68
|
3089 intersect_kwargs = {}
|
jpayne@68
|
3090 if not genome_fn:
|
jpayne@68
|
3091 raise ValueError("Need a genome filename in order to perform randomization")
|
jpayne@68
|
3092 return list(
|
jpayne@68
|
3093 self.parallel_apply(
|
jpayne@68
|
3094 iterations=iterations,
|
jpayne@68
|
3095 func=pybedtools.stats.random_intersection_bp,
|
jpayne@68
|
3096 func_args=(self, other),
|
jpayne@68
|
3097 func_kwargs=dict(
|
jpayne@68
|
3098 genome_fn=genome_fn,
|
jpayne@68
|
3099 shuffle_kwargs=shuffle_kwargs,
|
jpayne@68
|
3100 intersect_kwargs=intersect_kwargs,
|
jpayne@68
|
3101 ),
|
jpayne@68
|
3102 processes=processes,
|
jpayne@68
|
3103 _orig_pool=_orig_pool,
|
jpayne@68
|
3104 )
|
jpayne@68
|
3105 )
|
jpayne@68
|
3106
|
jpayne@68
|
3107 def randomintersection(
|
jpayne@68
|
3108 self,
|
jpayne@68
|
3109 other: BedTool,
|
jpayne@68
|
3110 iterations: int,
|
jpayne@68
|
3111 intersect_kwargs: Optional[dict[str, Any]] = None,
|
jpayne@68
|
3112 shuffle_kwargs: Optional[dict[str, Any]] = None,
|
jpayne@68
|
3113 debug: bool = False,
|
jpayne@68
|
3114 report_iterations: bool = False,
|
jpayne@68
|
3115 processes: Optional[int] = None,
|
jpayne@68
|
3116 _orig_processes: Optional[int] = None,
|
jpayne@68
|
3117 ) -> Iterator[int]:
|
jpayne@68
|
3118 """
|
jpayne@68
|
3119 Perform `iterations` shufflings, each time intersecting with `other`.
|
jpayne@68
|
3120
|
jpayne@68
|
3121 Returns a generator of integers where each integer is the number of
|
jpayne@68
|
3122 intersections of a shuffled file with *other*. This distribution can
|
jpayne@68
|
3123 be used in downstream analysis for things like empirical p-values.
|
jpayne@68
|
3124
|
jpayne@68
|
3125 *intersect_kwargs* and *shuffle_kwargs* are passed to self.intersect()
|
jpayne@68
|
3126 and self.shuffle() respectively. By default for intersect, u=True is
|
jpayne@68
|
3127 specified -- but s=True might be a useful option for strand-specific
|
jpayne@68
|
3128 work.
|
jpayne@68
|
3129
|
jpayne@68
|
3130 Useful kwargs for *shuffle_kwargs* are chrom, excl, or incl. If you
|
jpayne@68
|
3131 use the "seed" kwarg, that seed will be used *each* time shuffleBed is
|
jpayne@68
|
3132 called -- so all your randomization results will be identical for each
|
jpayne@68
|
3133 iteration. To get around this and to allow for tests, debug=True will
|
jpayne@68
|
3134 set the seed to the iteration number. You may also break up the
|
jpayne@68
|
3135 intersections across multiple processes with *processes* > 1.
|
jpayne@68
|
3136
|
jpayne@68
|
3137 Example usage:
|
jpayne@68
|
3138
|
jpayne@68
|
3139 >>> chromsizes = {'chr1':(0, 1000)}
|
jpayne@68
|
3140 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
3141 >>> a = a.set_chromsizes(chromsizes)
|
jpayne@68
|
3142 >>> b = pybedtools.example_bedtool('b.bed')
|
jpayne@68
|
3143 >>> results = a.randomintersection(b, 10, debug=True)
|
jpayne@68
|
3144 >>> print(list(results))
|
jpayne@68
|
3145 [1, 0, 1, 2, 4, 2, 2, 1, 2, 4]
|
jpayne@68
|
3146
|
jpayne@68
|
3147 """
|
jpayne@68
|
3148 if processes is not None:
|
jpayne@68
|
3149 p = Pool(processes)
|
jpayne@68
|
3150 iterations_each = [iterations // processes] * processes
|
jpayne@68
|
3151 iterations_each[-1] += iterations % processes
|
jpayne@68
|
3152 results = [
|
jpayne@68
|
3153 p.apply_async(
|
jpayne@68
|
3154 _call_randomintersect,
|
jpayne@68
|
3155 (self, other, it),
|
jpayne@68
|
3156 dict(
|
jpayne@68
|
3157 intersect_kwargs=intersect_kwargs,
|
jpayne@68
|
3158 shuffle_kwargs=shuffle_kwargs,
|
jpayne@68
|
3159 debug=debug,
|
jpayne@68
|
3160 report_iterations=report_iterations,
|
jpayne@68
|
3161 _orig_processes=processes,
|
jpayne@68
|
3162 ),
|
jpayne@68
|
3163 )
|
jpayne@68
|
3164 for it in iterations_each
|
jpayne@68
|
3165 ]
|
jpayne@68
|
3166 for r in results:
|
jpayne@68
|
3167 for value in r.get():
|
jpayne@68
|
3168 yield value
|
jpayne@68
|
3169 raise StopIteration
|
jpayne@68
|
3170
|
jpayne@68
|
3171 if shuffle_kwargs is None:
|
jpayne@68
|
3172 shuffle_kwargs = {}
|
jpayne@68
|
3173 if intersect_kwargs is None:
|
jpayne@68
|
3174 intersect_kwargs = {"u": True}
|
jpayne@68
|
3175
|
jpayne@68
|
3176 if "u" not in intersect_kwargs:
|
jpayne@68
|
3177 intersect_kwargs["u"] = True
|
jpayne@68
|
3178
|
jpayne@68
|
3179 resort = intersect_kwargs.get("sorted", False)
|
jpayne@68
|
3180
|
jpayne@68
|
3181 for i in range(iterations):
|
jpayne@68
|
3182 if debug:
|
jpayne@68
|
3183 shuffle_kwargs["seed"] = i
|
jpayne@68
|
3184 if report_iterations:
|
jpayne@68
|
3185 if _orig_processes > 1:
|
jpayne@68
|
3186 msg = "\rapprox (total across %s processes): %s" % (
|
jpayne@68
|
3187 _orig_processes,
|
jpayne@68
|
3188 i * _orig_processes,
|
jpayne@68
|
3189 )
|
jpayne@68
|
3190 else:
|
jpayne@68
|
3191 msg = "\r%s" % i
|
jpayne@68
|
3192 sys.stderr.write(msg)
|
jpayne@68
|
3193 sys.stderr.flush()
|
jpayne@68
|
3194
|
jpayne@68
|
3195 # Re-sort if sorted=True in kwargs
|
jpayne@68
|
3196 if resort:
|
jpayne@68
|
3197 tmp0 = self.shuffle(**shuffle_kwargs)
|
jpayne@68
|
3198 tmp = tmp0.sort()
|
jpayne@68
|
3199 else:
|
jpayne@68
|
3200 tmp = self.shuffle(**shuffle_kwargs)
|
jpayne@68
|
3201
|
jpayne@68
|
3202 tmp2 = tmp.intersect(other, stream=True, **intersect_kwargs)
|
jpayne@68
|
3203
|
jpayne@68
|
3204 yield len(tmp2)
|
jpayne@68
|
3205
|
jpayne@68
|
3206 # Close the open stdouts from subprocess.Popen calls. Note: doing
|
jpayne@68
|
3207 # this in self.__del__ doesn't fix the open file limit bug; it
|
jpayne@68
|
3208 # needs to be done here.
|
jpayne@68
|
3209 # if resort:
|
jpayne@68
|
3210 # tmp0.fn.close()
|
jpayne@68
|
3211 # tmp.fn.close()
|
jpayne@68
|
3212 tmp2.fn.close()
|
jpayne@68
|
3213 del tmp
|
jpayne@68
|
3214 del tmp2
|
jpayne@68
|
3215
|
jpayne@68
|
3216 @_log_to_history
|
jpayne@68
|
3217 def cat(self, *others: Iterable[str|Path|BedTool], **kwargs) -> BedTool:
|
jpayne@68
|
3218 """
|
jpayne@68
|
3219 Concatenate interval files together.
|
jpayne@68
|
3220
|
jpayne@68
|
3221 Concatenates two BedTool objects (or an object and a file) and does an
|
jpayne@68
|
3222 optional post-merge of the features.
|
jpayne@68
|
3223
|
jpayne@68
|
3224 *postmerge=True* by default; use *postmerge=False* if you want to keep
|
jpayne@68
|
3225 features separate.
|
jpayne@68
|
3226
|
jpayne@68
|
3227 *force_truncate=False* by default; *force_truncate=True* to truncate
|
jpayne@68
|
3228 all files to chrom, start, stop.
|
jpayne@68
|
3229
|
jpayne@68
|
3230 When *force_truncate=False* and *postmerge=False*, the output will
|
jpayne@68
|
3231 contain the smallest number of fields observed across all inputs. This
|
jpayne@68
|
3232 maintains compatibility with BEDTools programs, which assume constant
|
jpayne@68
|
3233 number of fields in all lines of a file.
|
jpayne@68
|
3234
|
jpayne@68
|
3235 Other kwargs are sent to :meth:`BedTool.merge` (and assuming that
|
jpayne@68
|
3236 *postmerge=True*).
|
jpayne@68
|
3237
|
jpayne@68
|
3238 Example usage:
|
jpayne@68
|
3239
|
jpayne@68
|
3240 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
3241 >>> b = pybedtools.example_bedtool('b.bed')
|
jpayne@68
|
3242 >>> print(a.cat(b)) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
3243 chr1 1 500
|
jpayne@68
|
3244 chr1 800 950
|
jpayne@68
|
3245 <BLANKLINE>
|
jpayne@68
|
3246 >>> print(a.cat(*[b,b],
|
jpayne@68
|
3247 ... postmerge=False)) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
3248 chr1 1 100 feature1 0 +
|
jpayne@68
|
3249 chr1 100 200 feature2 0 +
|
jpayne@68
|
3250 chr1 150 500 feature3 0 -
|
jpayne@68
|
3251 chr1 900 950 feature4 0 +
|
jpayne@68
|
3252 chr1 155 200 feature5 0 -
|
jpayne@68
|
3253 chr1 800 901 feature6 0 +
|
jpayne@68
|
3254 chr1 155 200 feature5 0 -
|
jpayne@68
|
3255 chr1 800 901 feature6 0 +
|
jpayne@68
|
3256 <BLANKLINE>
|
jpayne@68
|
3257 """
|
jpayne@68
|
3258 same_type = None
|
jpayne@68
|
3259 same_field_num = None
|
jpayne@68
|
3260 field_nums = set()
|
jpayne@68
|
3261
|
jpayne@68
|
3262 assert len(others) > 0, "You must specify at least one other bedfile!"
|
jpayne@68
|
3263 other_beds = []
|
jpayne@68
|
3264 for other in others:
|
jpayne@68
|
3265 if isinstance(other, (str, Path)):
|
jpayne@68
|
3266 other = BedTool(other)
|
jpayne@68
|
3267 else:
|
jpayne@68
|
3268 assert isinstance(
|
jpayne@68
|
3269 other, BedTool
|
jpayne@68
|
3270 ), "Either filename or another BedTool instance required"
|
jpayne@68
|
3271 other_beds.append(other)
|
jpayne@68
|
3272
|
jpayne@68
|
3273 # postmerge and force_truncate don't get passed on to merge
|
jpayne@68
|
3274 postmerge = kwargs.pop("postmerge", True)
|
jpayne@68
|
3275 force_truncate = kwargs.pop("force_truncate", False)
|
jpayne@68
|
3276 stream_merge = kwargs.get("stream", False)
|
jpayne@68
|
3277 if stream_merge and postmerge:
|
jpayne@68
|
3278 raise ValueError(
|
jpayne@68
|
3279 "The post-merge step in the `cat()` method "
|
jpayne@68
|
3280 "performs a sort, which uses stream=True. Using "
|
jpayne@68
|
3281 "stream=True for the merge as well will result in a "
|
jpayne@68
|
3282 "deadlock!"
|
jpayne@68
|
3283 )
|
jpayne@68
|
3284
|
jpayne@68
|
3285 # if filetypes and field counts are the same, don't truncate
|
jpayne@68
|
3286 if not force_truncate:
|
jpayne@68
|
3287 try:
|
jpayne@68
|
3288 filetypes = set(
|
jpayne@68
|
3289 [self.file_type] + [i.file_type for i in other_beds]
|
jpayne@68
|
3290 ).difference(["empty"])
|
jpayne@68
|
3291 field_nums = (
|
jpayne@68
|
3292 set([self.field_count()] + [i.field_count() for i in other_beds])
|
jpayne@68
|
3293 .difference([None])
|
jpayne@68
|
3294 .difference([0])
|
jpayne@68
|
3295 )
|
jpayne@68
|
3296 same_field_num = len(field_nums) == 1
|
jpayne@68
|
3297 same_type = len(set(filetypes)) == 1
|
jpayne@68
|
3298 except ValueError:
|
jpayne@68
|
3299 raise ValueError(
|
jpayne@68
|
3300 "Can't check filetype or field count -- "
|
jpayne@68
|
3301 "is one of the files you're merging a 'streaming' "
|
jpayne@68
|
3302 "BedTool? If so, use .saveas() to save to file first"
|
jpayne@68
|
3303 )
|
jpayne@68
|
3304
|
jpayne@68
|
3305 tmp = self._tmp()
|
jpayne@68
|
3306
|
jpayne@68
|
3307 if not force_truncate and same_type and same_field_num:
|
jpayne@68
|
3308 with open(tmp, "w") as TMP:
|
jpayne@68
|
3309 for f in self:
|
jpayne@68
|
3310 TMP.write(str(f))
|
jpayne@68
|
3311 for other in other_beds:
|
jpayne@68
|
3312 for f in other:
|
jpayne@68
|
3313 TMP.write(str(f))
|
jpayne@68
|
3314
|
jpayne@68
|
3315 # Types match, so we can use the min number of fields observed across
|
jpayne@68
|
3316 # all inputs
|
jpayne@68
|
3317 elif not force_truncate and same_type:
|
jpayne@68
|
3318 minfields = min(field_nums)
|
jpayne@68
|
3319 with open(tmp, "w") as TMP:
|
jpayne@68
|
3320 for f in self:
|
jpayne@68
|
3321 TMP.write("\t".join(f.fields[:minfields]) + "\n")
|
jpayne@68
|
3322 for other in other_beds:
|
jpayne@68
|
3323 for f in other:
|
jpayne@68
|
3324 TMP.write("\t".join(f.fields[:minfields]) + "\n")
|
jpayne@68
|
3325
|
jpayne@68
|
3326 # Otherwise, use the zero-based chrom/start/stop to create a BED3,
|
jpayne@68
|
3327 # which will work when catting a GFF and a BED together.
|
jpayne@68
|
3328 else:
|
jpayne@68
|
3329 with open(tmp, "w") as TMP:
|
jpayne@68
|
3330 for f in self:
|
jpayne@68
|
3331 TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end))
|
jpayne@68
|
3332 for other in other_beds:
|
jpayne@68
|
3333 for f in other:
|
jpayne@68
|
3334 TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end))
|
jpayne@68
|
3335
|
jpayne@68
|
3336 c = BedTool(tmp)
|
jpayne@68
|
3337 if postmerge:
|
jpayne@68
|
3338 d = c.sort(stream=True).merge(**kwargs)
|
jpayne@68
|
3339
|
jpayne@68
|
3340 # Explicitly delete -- needed when using multiprocessing
|
jpayne@68
|
3341 os.unlink(tmp)
|
jpayne@68
|
3342 return d
|
jpayne@68
|
3343 else:
|
jpayne@68
|
3344 return c
|
jpayne@68
|
3345
|
jpayne@68
|
3346 @_log_to_history
|
jpayne@68
|
3347 def saveas(self, fn: Optional[str] = None, trackline: Optional[str] = None, compressed: Optional[bool] = None) -> BedTool:
|
jpayne@68
|
3348 """
|
jpayne@68
|
3349 Make a copy of the BedTool.
|
jpayne@68
|
3350
|
jpayne@68
|
3351 Optionally adds `trackline` to the beginning of the file.
|
jpayne@68
|
3352
|
jpayne@68
|
3353 Optionally compresses output using gzip.
|
jpayne@68
|
3354
|
jpayne@68
|
3355 if the filename extension is .gz, or compressed=True,
|
jpayne@68
|
3356 the output is compressed using gzip
|
jpayne@68
|
3357
|
jpayne@68
|
3358 Returns a new BedTool for the newly saved file.
|
jpayne@68
|
3359
|
jpayne@68
|
3360 A newline is automatically added to the trackline if it does not
|
jpayne@68
|
3361 already have one.
|
jpayne@68
|
3362
|
jpayne@68
|
3363 Example usage:
|
jpayne@68
|
3364
|
jpayne@68
|
3365 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
3366 >>> b = a.saveas('other.bed')
|
jpayne@68
|
3367 >>> b.fn
|
jpayne@68
|
3368 'other.bed'
|
jpayne@68
|
3369 >>> print(b == a)
|
jpayne@68
|
3370 True
|
jpayne@68
|
3371
|
jpayne@68
|
3372 >>> b = a.saveas('other.bed', trackline="name='test run' color=0,55,0")
|
jpayne@68
|
3373 >>> open(b.fn).readline()
|
jpayne@68
|
3374 "name='test run' color=0,55,0\\n"
|
jpayne@68
|
3375 >>> if os.path.exists('other.bed'):
|
jpayne@68
|
3376 ... os.unlink('other.bed')
|
jpayne@68
|
3377 """
|
jpayne@68
|
3378 if fn is None:
|
jpayne@68
|
3379 fn = self._tmp()
|
jpayne@68
|
3380
|
jpayne@68
|
3381 # Default to compressed if extension is .gz
|
jpayne@68
|
3382 if compressed is None:
|
jpayne@68
|
3383 __, extension = os.path.splitext(fn)
|
jpayne@68
|
3384 if extension == ".gz":
|
jpayne@68
|
3385 compressed = True
|
jpayne@68
|
3386 else:
|
jpayne@68
|
3387 compressed = False
|
jpayne@68
|
3388
|
jpayne@68
|
3389 in_compressed = isinstance(self.fn, str) and isGZIP(self.fn)
|
jpayne@68
|
3390
|
jpayne@68
|
3391 fn = self._collapse(
|
jpayne@68
|
3392 self,
|
jpayne@68
|
3393 fn=fn,
|
jpayne@68
|
3394 trackline=trackline,
|
jpayne@68
|
3395 in_compressed=in_compressed,
|
jpayne@68
|
3396 out_compressed=compressed,
|
jpayne@68
|
3397 )
|
jpayne@68
|
3398 return BedTool(fn)
|
jpayne@68
|
3399
|
jpayne@68
|
3400 @_log_to_history
|
jpayne@68
|
3401 def moveto(self, fn: Optional[str]=None) -> BedTool:
|
jpayne@68
|
3402 """
|
jpayne@68
|
3403 Move to a new filename (can be much quicker than BedTool.saveas())
|
jpayne@68
|
3404
|
jpayne@68
|
3405 Move BED file to new filename, `fn`.
|
jpayne@68
|
3406
|
jpayne@68
|
3407 Returns a new BedTool for the new file.
|
jpayne@68
|
3408
|
jpayne@68
|
3409 Example usage:
|
jpayne@68
|
3410
|
jpayne@68
|
3411 >>> # make a copy so we don't mess up the example file
|
jpayne@68
|
3412 >>> a = pybedtools.example_bedtool('a.bed').saveas()
|
jpayne@68
|
3413 >>> a_contents = str(a)
|
jpayne@68
|
3414 >>> b = a.moveto('other.bed')
|
jpayne@68
|
3415 >>> b.fn
|
jpayne@68
|
3416 'other.bed'
|
jpayne@68
|
3417 >>> b == a_contents
|
jpayne@68
|
3418 True
|
jpayne@68
|
3419 """
|
jpayne@68
|
3420 if not isinstance(self.fn, str):
|
jpayne@68
|
3421 fn = self._collapse(self, fn=fn)
|
jpayne@68
|
3422 else:
|
jpayne@68
|
3423 shutil.move(self.fn, fn)
|
jpayne@68
|
3424 return BedTool(fn)
|
jpayne@68
|
3425
|
jpayne@68
|
3426 @_log_to_history
|
jpayne@68
|
3427 def random_subset(self, n: Optional[int] = None, f: Optional[float] = None, seed: Optional[float|int]=None) -> BedTool:
|
jpayne@68
|
3428 """
|
jpayne@68
|
3429 Return a BedTool containing a random subset.
|
jpayne@68
|
3430
|
jpayne@68
|
3431 NOTE: using `n` will be slower and use more memory than using `f`.
|
jpayne@68
|
3432
|
jpayne@68
|
3433 Parameters
|
jpayne@68
|
3434 ----------
|
jpayne@68
|
3435
|
jpayne@68
|
3436 n : int
|
jpayne@68
|
3437 Number of features to return. Only one of `n` or `f` can be provided.
|
jpayne@68
|
3438
|
jpayne@68
|
3439 f : float, 0 <= f <= 1
|
jpayne@68
|
3440 Fraction of features to return. Cannot be provided with `n`.
|
jpayne@68
|
3441
|
jpayne@68
|
3442 seed : float or int
|
jpayne@68
|
3443 Set random.seed
|
jpayne@68
|
3444
|
jpayne@68
|
3445 Example
|
jpayne@68
|
3446 -------
|
jpayne@68
|
3447
|
jpayne@68
|
3448 >>> seed = 0 # only for test, otherwise use None
|
jpayne@68
|
3449
|
jpayne@68
|
3450 `n` will always give the same number of returned features, but will be
|
jpayne@68
|
3451 slower since it is creating an index and then shuffling it.
|
jpayne@68
|
3452
|
jpayne@68
|
3453 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
3454 >>> b = a.random_subset(n=2)
|
jpayne@68
|
3455 >>> len(b)
|
jpayne@68
|
3456 2
|
jpayne@68
|
3457
|
jpayne@68
|
3458 Using a fraction `f` will be faster but depending on seed will result
|
jpayne@68
|
3459 in slightly different total numbers.
|
jpayne@68
|
3460
|
jpayne@68
|
3461 >>> a = pybedtools.example_bedtool('x.bam')
|
jpayne@68
|
3462 >>> len(a)
|
jpayne@68
|
3463 45593
|
jpayne@68
|
3464 >>> b = a.random_subset(f=0.4, seed=seed)
|
jpayne@68
|
3465 >>> len(b)
|
jpayne@68
|
3466 18316
|
jpayne@68
|
3467
|
jpayne@68
|
3468 Check that we have approximately the right fraction
|
jpayne@68
|
3469 >>> print('{0:.2f}'.format(len(b) / len(a)))
|
jpayne@68
|
3470 0.40
|
jpayne@68
|
3471
|
jpayne@68
|
3472 """
|
jpayne@68
|
3473 if ((n is None) and (f is None)) or ((n is not None) and (f is not None)):
|
jpayne@68
|
3474 raise ValueError("Exactly one of `n` or `f` must be provided")
|
jpayne@68
|
3475
|
jpayne@68
|
3476 tmpfn = self._tmp()
|
jpayne@68
|
3477 if seed is not None:
|
jpayne@68
|
3478 random.seed(seed)
|
jpayne@68
|
3479
|
jpayne@68
|
3480 if n:
|
jpayne@68
|
3481 idxs = list(range(len(self)))
|
jpayne@68
|
3482 random.shuffle(idxs)
|
jpayne@68
|
3483 idxs = idxs[:n]
|
jpayne@68
|
3484 with open(tmpfn, "w") as tmp:
|
jpayne@68
|
3485 for i, feature in enumerate(self):
|
jpayne@68
|
3486 if i in idxs:
|
jpayne@68
|
3487 tmp.write(str(feature))
|
jpayne@68
|
3488
|
jpayne@68
|
3489 elif f:
|
jpayne@68
|
3490 with open(tmpfn, "w") as tmp:
|
jpayne@68
|
3491 for i in self:
|
jpayne@68
|
3492 if random.random() <= f:
|
jpayne@68
|
3493 tmp.write(str(i))
|
jpayne@68
|
3494
|
jpayne@68
|
3495 return BedTool(tmpfn)
|
jpayne@68
|
3496
|
jpayne@68
|
3497 def total_coverage(self) -> int:
|
jpayne@68
|
3498 """
|
jpayne@68
|
3499 Return the total number of bases covered by this interval file.
|
jpayne@68
|
3500
|
jpayne@68
|
3501 Does a self.merge() first to remove potentially multiple-counting
|
jpayne@68
|
3502 bases.
|
jpayne@68
|
3503
|
jpayne@68
|
3504 Example usage:
|
jpayne@68
|
3505
|
jpayne@68
|
3506 >>> a = pybedtools.example_bedtool('a.bed')
|
jpayne@68
|
3507
|
jpayne@68
|
3508 This does a merge() first, so this is what the total coverage is
|
jpayne@68
|
3509 counting:
|
jpayne@68
|
3510
|
jpayne@68
|
3511 >>> print(a.merge()) #doctest: +NORMALIZE_WHITESPACE
|
jpayne@68
|
3512 chr1 1 500
|
jpayne@68
|
3513 chr1 900 950
|
jpayne@68
|
3514 <BLANKLINE>
|
jpayne@68
|
3515
|
jpayne@68
|
3516 >>> print(a.total_coverage())
|
jpayne@68
|
3517 549
|
jpayne@68
|
3518 """
|
jpayne@68
|
3519 b = self.merge()
|
jpayne@68
|
3520 total_bp = 0
|
jpayne@68
|
3521 for feature in b.features():
|
jpayne@68
|
3522 total_bp += len(feature)
|
jpayne@68
|
3523 return total_bp
|
jpayne@68
|
3524
|
jpayne@68
|
3525 @_log_to_history
|
jpayne@68
|
3526 def with_attrs(self, **kwargs) -> BedTool:
|
jpayne@68
|
3527 """
|
jpayne@68
|
3528 Helper method for adding attributes in the middle of a pipeline.
|
jpayne@68
|
3529
|
jpayne@68
|
3530 Given arbitrary keyword arguments, turns the keys and values into
|
jpayne@68
|
3531 attributes. Useful for labeling BedTools at creation time.
|
jpayne@68
|
3532
|
jpayne@68
|
3533 Example usage:
|
jpayne@68
|
3534
|
jpayne@68
|
3535 >>> # add a "label" attribute to each BedTool
|
jpayne@68
|
3536 >>> a = pybedtools.example_bedtool('a.bed')\
|
jpayne@68
|
3537 .with_attrs(label='transcription factor 1')
|
jpayne@68
|
3538 >>> b = pybedtools.example_bedtool('b.bed')\
|
jpayne@68
|
3539 .with_attrs(label='transcription factor 2')
|
jpayne@68
|
3540 >>> for i in [a, b]:
|
jpayne@68
|
3541 ... print('{0} features for {1}'.format(i.count(), i.label))
|
jpayne@68
|
3542 4 features for transcription factor 1
|
jpayne@68
|
3543 2 features for transcription factor 2
|
jpayne@68
|
3544
|
jpayne@68
|
3545 """
|
jpayne@68
|
3546 for key, value in list(kwargs.items()):
|
jpayne@68
|
3547 setattr(self, key, value)
|
jpayne@68
|
3548 return self
|
jpayne@68
|
3549
|
jpayne@68
|
3550 def as_intervalfile(self) -> IntervalFile:
|
jpayne@68
|
3551 """
|
jpayne@68
|
3552 Returns an IntervalFile of this BedTool for low-level interface.
|
jpayne@68
|
3553 """
|
jpayne@68
|
3554 if not isinstance(self.fn, str):
|
jpayne@68
|
3555 fn = self._collapse(self.fn)
|
jpayne@68
|
3556 else:
|
jpayne@68
|
3557 fn = self.fn
|
jpayne@68
|
3558 return IntervalFile(fn)
|
jpayne@68
|
3559
|
jpayne@68
|
3560 def liftover(self, chainfile: str, unmapped: Optional[str] = None, liftover_args: str = "") -> BedTool:
|
jpayne@68
|
3561 """
|
jpayne@68
|
3562 Returns a new BedTool of the liftedOver features, saving the unmapped
|
jpayne@68
|
3563 ones as `unmapped`. If `unmapped` is None, then discards the unmapped
|
jpayne@68
|
3564 features.
|
jpayne@68
|
3565
|
jpayne@68
|
3566 `liftover_args` is a string of additional args that is passed,
|
jpayne@68
|
3567 verbatim, to liftOver.
|
jpayne@68
|
3568
|
jpayne@68
|
3569 Needs `liftOver` from UCSC to be on the path and a `chainfile`
|
jpayne@68
|
3570 downloaded from UCSC.
|
jpayne@68
|
3571 """
|
jpayne@68
|
3572 result = BedTool._tmp()
|
jpayne@68
|
3573 if unmapped is None:
|
jpayne@68
|
3574 unmapped = BedTool._tmp()
|
jpayne@68
|
3575 cmds = ["liftOver", liftover_args, self.fn, chainfile, result, unmapped]
|
jpayne@68
|
3576 os.system(" ".join(cmds))
|
jpayne@68
|
3577 return BedTool(result)
|
jpayne@68
|
3578
|
jpayne@68
|
3579 def absolute_distance(self, other: BedTool, closest_kwargs: Optional[dict[str, Any]]=None, use_midpoints: bool=False) -> Iterator[int]:
|
jpayne@68
|
3580 """
|
jpayne@68
|
3581 Returns an iterator of the *absolute* distances between features in
|
jpayne@68
|
3582 self and other.
|
jpayne@68
|
3583
|
jpayne@68
|
3584 If `use_midpoints` is True, then only use the midpoints of features
|
jpayne@68
|
3585 (which will return values where features are overlapping). Otherwise,
|
jpayne@68
|
3586 when features overlap the value will always be zero.
|
jpayne@68
|
3587
|
jpayne@68
|
3588 `closest_kwargs` are passed to self.closest(); either `d` or
|
jpayne@68
|
3589 'D` are required in order to get back distance values (`d=True` is
|
jpayne@68
|
3590 default)
|
jpayne@68
|
3591 """
|
jpayne@68
|
3592 from .featurefuncs import midpoint
|
jpayne@68
|
3593
|
jpayne@68
|
3594 if closest_kwargs is None:
|
jpayne@68
|
3595 closest_kwargs = {"d": True}
|
jpayne@68
|
3596
|
jpayne@68
|
3597 if "D" not in closest_kwargs:
|
jpayne@68
|
3598 closest_kwargs.update(dict(d=True))
|
jpayne@68
|
3599
|
jpayne@68
|
3600 if use_midpoints:
|
jpayne@68
|
3601 mid_self = self.each(midpoint).saveas()
|
jpayne@68
|
3602 mid_other = other.each(midpoint).saveas()
|
jpayne@68
|
3603 c = mid_self.closest(mid_other, stream=True, **closest_kwargs)
|
jpayne@68
|
3604 else:
|
jpayne@68
|
3605 c = self.closest(other, stream=True, **closest_kwargs)
|
jpayne@68
|
3606 for i in c:
|
jpayne@68
|
3607 yield int(i[-1])
|
jpayne@68
|
3608
|
jpayne@68
|
3609 def relative_distance(self, other: BedTool, genome:Optional[dict|str] =None, g: Optional[str]=None) -> Iterator[float]:
|
jpayne@68
|
3610 """
|
jpayne@68
|
3611 Returns an iterator of relative distances between features in self and
|
jpayne@68
|
3612 other.
|
jpayne@68
|
3613
|
jpayne@68
|
3614 First computes the midpoints of self and other, then returns distances
|
jpayne@68
|
3615 of each feature in `other` relative to the distance between `self`
|
jpayne@68
|
3616 features.
|
jpayne@68
|
3617
|
jpayne@68
|
3618 Requires either `genome` (dictionary of chromsizes or assembly name) or
|
jpayne@68
|
3619 `g` (filename of chromsizes file).
|
jpayne@68
|
3620 """
|
jpayne@68
|
3621 g_dict = {}
|
jpayne@68
|
3622 if (genome is None) and (g is None):
|
jpayne@68
|
3623 raise ValueError("Need either `genome` or `g` arg for relative distance")
|
jpayne@68
|
3624 elif genome and g:
|
jpayne@68
|
3625 raise ValueError("Please specify only one of `genome` or `g`")
|
jpayne@68
|
3626 elif genome:
|
jpayne@68
|
3627 g_dict = dict(genome=genome)
|
jpayne@68
|
3628 elif g:
|
jpayne@68
|
3629 g_dict = dict(g=g)
|
jpayne@68
|
3630
|
jpayne@68
|
3631 from .featurefuncs import midpoint
|
jpayne@68
|
3632
|
jpayne@68
|
3633 # This gets the space between features in self.
|
jpayne@68
|
3634 c = self.each(midpoint).complement(**g_dict)
|
jpayne@68
|
3635
|
jpayne@68
|
3636 hits = c.intersect(other, wao=True, stream=True) # TODO: should this be other or mid_other?
|
jpayne@68
|
3637 for i in hits:
|
jpayne@68
|
3638 yield float(i[-1]) / len(i)
|
jpayne@68
|
3639
|
jpayne@68
|
3640 def colormap_normalize(self,
|
jpayne@68
|
3641 vmin: Optional[float|int]=None,
|
jpayne@68
|
3642 vmax: Optional[float|int]=None,
|
jpayne@68
|
3643 percentile: bool=False,
|
jpayne@68
|
3644 log: bool=False) -> mcolors.LogNorm|mcolors.Normalize:
|
jpayne@68
|
3645 """
|
jpayne@68
|
3646 Returns a normalization instance for use by featurefuncs.add_color().
|
jpayne@68
|
3647
|
jpayne@68
|
3648 Parameters
|
jpayne@68
|
3649 ----------
|
jpayne@68
|
3650 vmin, vmax : float, int, or None
|
jpayne@68
|
3651 `vmin` and `vmax` set the colormap bounds; if None then
|
jpayne@68
|
3652 these will be determined from the scores in the BED file.
|
jpayne@68
|
3653
|
jpayne@68
|
3654 log : bool
|
jpayne@68
|
3655 If True, put the scores on a log scale; of course be careful
|
jpayne@68
|
3656 if you have negative scores
|
jpayne@68
|
3657
|
jpayne@68
|
3658 percentile : bool
|
jpayne@68
|
3659 If True, interpret vmin and vmax as a percentile in the range
|
jpayne@68
|
3660 [0,100] rather than absolute values.
|
jpayne@68
|
3661 """
|
jpayne@68
|
3662 field_count = self.field_count()
|
jpayne@68
|
3663 if (self.file_type != "bed") or (field_count < 5):
|
jpayne@68
|
3664 raise ValueError("colorizing only works for BED files with score " "fields")
|
jpayne@68
|
3665 try:
|
jpayne@68
|
3666 import matplotlib.colors as mcolors
|
jpayne@68
|
3667 except ImportError:
|
jpayne@68
|
3668 raise ImportError("matplotlib.colors must be installed to use colormap_normalize")
|
jpayne@68
|
3669
|
jpayne@68
|
3670 try:
|
jpayne@68
|
3671 import numpy as np
|
jpayne@68
|
3672 except ImportError:
|
jpayne@68
|
3673 raise ImportError("numpy must be installed to use colormap_normalize")
|
jpayne@68
|
3674
|
jpayne@68
|
3675 if log:
|
jpayne@68
|
3676 norm = mcolors.LogNorm()
|
jpayne@68
|
3677 else:
|
jpayne@68
|
3678 norm = mcolors.Normalize()
|
jpayne@68
|
3679
|
jpayne@68
|
3680 scores = np.array([i.score for i in self], dtype=float)
|
jpayne@68
|
3681 scores = scores[np.isfinite(scores)]
|
jpayne@68
|
3682 norm.autoscale(scores)
|
jpayne@68
|
3683
|
jpayne@68
|
3684 if vmin is not None:
|
jpayne@68
|
3685 if percentile:
|
jpayne@68
|
3686 vmin = float(np.percentile(scores, vmin))
|
jpayne@68
|
3687 norm.vmin = vmin
|
jpayne@68
|
3688 if vmax is not None:
|
jpayne@68
|
3689 if percentile:
|
jpayne@68
|
3690 vmax = float(np.percentile(scores, vmax))
|
jpayne@68
|
3691 norm.vmax = vmax
|
jpayne@68
|
3692
|
jpayne@68
|
3693 return norm
|
jpayne@68
|
3694
|
jpayne@68
|
3695 def at(self, inds: list[int]) -> BedTool:
|
jpayne@68
|
3696 """
|
jpayne@68
|
3697 Returns a new BedTool with only intervals at lines `inds`
|
jpayne@68
|
3698
|
jpayne@68
|
3699 Parameters
|
jpayne@68
|
3700 ----------
|
jpayne@68
|
3701 inds : List[int]
|
jpayne@68
|
3702 List of line numbers
|
jpayne@68
|
3703
|
jpayne@68
|
3704 Returns
|
jpayne@68
|
3705 -------
|
jpayne@68
|
3706 BedTool
|
jpayne@68
|
3707 New BedTool with only intervals at `inds`
|
jpayne@68
|
3708 """
|
jpayne@68
|
3709 length = len(inds)
|
jpayne@68
|
3710
|
jpayne@68
|
3711 def _gen():
|
jpayne@68
|
3712 k = 0
|
jpayne@68
|
3713 for i, feature in enumerate(self):
|
jpayne@68
|
3714 if i == inds[k]:
|
jpayne@68
|
3715 yield feature
|
jpayne@68
|
3716 k += 1
|
jpayne@68
|
3717 if k == length:
|
jpayne@68
|
3718 break
|
jpayne@68
|
3719
|
jpayne@68
|
3720 return BedTool(_gen()).saveas()
|
jpayne@68
|
3721
|
jpayne@68
|
3722 def to_dataframe(self, disable_auto_names: bool = False, *args, **kwargs) -> pd.DataFrame:
|
jpayne@68
|
3723 """
|
jpayne@68
|
3724 Create a pandas.DataFrame, passing args and kwargs to pandas.read_csv
|
jpayne@68
|
3725 The separator kwarg `sep` is given a tab `\\t` as value by default.
|
jpayne@68
|
3726
|
jpayne@68
|
3727 Parameters
|
jpayne@68
|
3728 ----------
|
jpayne@68
|
3729 disable_auto_names : bool
|
jpayne@68
|
3730 By default, the created dataframe fills in column names
|
jpayne@68
|
3731 automatically according to the detected filetype (e.g., "chrom",
|
jpayne@68
|
3732 "start", "end" for a BED3 file). Set this argument to True to
|
jpayne@68
|
3733 disable this behavior.
|
jpayne@68
|
3734 """
|
jpayne@68
|
3735 # Complain if BAM or if not a file
|
jpayne@68
|
3736 if self._isbam:
|
jpayne@68
|
3737 raise ValueError("BAM not supported for converting to DataFrame")
|
jpayne@68
|
3738 if not isinstance(self.fn, str):
|
jpayne@68
|
3739 raise ValueError("use .saveas() to make sure self.fn is a file")
|
jpayne@68
|
3740
|
jpayne@68
|
3741 try:
|
jpayne@68
|
3742 import pandas
|
jpayne@68
|
3743 except ImportError:
|
jpayne@68
|
3744 raise ImportError("pandas must be installed to convert to pandas.DataFrame")
|
jpayne@68
|
3745 # Otherwise we're good:
|
jpayne@68
|
3746 names = kwargs.get("names", None)
|
jpayne@68
|
3747 if names is None and not disable_auto_names:
|
jpayne@68
|
3748 try:
|
jpayne@68
|
3749 _names = settings._column_names[self.file_type][: self.field_count()]
|
jpayne@68
|
3750 if len(_names) < self.field_count():
|
jpayne@68
|
3751 warn(
|
jpayne@68
|
3752 "Default names for filetype %s are:\n%s\nbut file has "
|
jpayne@68
|
3753 "%s fields; you can supply custom names with the "
|
jpayne@68
|
3754 "`names` kwarg" % (self.file_type, _names, self.field_count())
|
jpayne@68
|
3755 )
|
jpayne@68
|
3756 _names = None
|
jpayne@68
|
3757 except KeyError:
|
jpayne@68
|
3758 _names = None
|
jpayne@68
|
3759 kwargs["names"] = _names
|
jpayne@68
|
3760
|
jpayne@68
|
3761 if os.path.isfile(self.fn) and os.path.getsize(self.fn) > 0:
|
jpayne@68
|
3762 return pandas.read_csv(self.fn, *args, sep="\t", **kwargs) # type: ignore
|
jpayne@68
|
3763 else:
|
jpayne@68
|
3764 return pandas.DataFrame()
|
jpayne@68
|
3765
|
jpayne@68
|
3766 def tail(self, lines:int = 10, as_string: bool = False) -> Optional[str]:
|
jpayne@68
|
3767 """
|
jpayne@68
|
3768 Like `head`, but prints last 10 lines of the file by default.
|
jpayne@68
|
3769
|
jpayne@68
|
3770 To avoid consuming iterables, this only works with file-based, non-BAM
|
jpayne@68
|
3771 BedTool objects.
|
jpayne@68
|
3772
|
jpayne@68
|
3773 Use `as_string=True` to return a string.
|
jpayne@68
|
3774 """
|
jpayne@68
|
3775 if self._isbam:
|
jpayne@68
|
3776 raise ValueError("tail() not yet implemented for BAM files")
|
jpayne@68
|
3777 if not isinstance(self.fn, str):
|
jpayne@68
|
3778 raise ValueError(
|
jpayne@68
|
3779 "tail() not implemented for non-file-based "
|
jpayne@68
|
3780 "BedTool objects. Please use saveas() first."
|
jpayne@68
|
3781 )
|
jpayne@68
|
3782 bufsize = 8192
|
jpayne@68
|
3783 offset = bufsize
|
jpayne@68
|
3784 f = open(self.fn, "rb")
|
jpayne@68
|
3785
|
jpayne@68
|
3786 # whence=2 arg means relative to end (i.e., go to the end)
|
jpayne@68
|
3787 f.seek(0, 2)
|
jpayne@68
|
3788 file_size = f.tell()
|
jpayne@68
|
3789 data = []
|
jpayne@68
|
3790 while True:
|
jpayne@68
|
3791 if file_size < bufsize:
|
jpayne@68
|
3792 offset = file_size
|
jpayne@68
|
3793 f.seek(-offset, 2)
|
jpayne@68
|
3794 chunk = f.read(offset)
|
jpayne@68
|
3795 data.extend(chunk.splitlines(True))
|
jpayne@68
|
3796 if len(data) >= lines or offset == file_size:
|
jpayne@68
|
3797 break
|
jpayne@68
|
3798 offset += bufsize
|
jpayne@68
|
3799
|
jpayne@68
|
3800 result = "".join([i.decode() for i in data[-lines:]])
|
jpayne@68
|
3801 if as_string:
|
jpayne@68
|
3802 return result
|
jpayne@68
|
3803 else:
|
jpayne@68
|
3804 print(result)
|
jpayne@68
|
3805
|
jpayne@68
|
3806
|
jpayne@68
|
3807 class BAM(object):
|
jpayne@68
|
3808 def __init__(self, stream):
|
jpayne@68
|
3809 """
|
jpayne@68
|
3810 Wraps pysam.Samfile so that it yields pybedtools.Interval objects when
|
jpayne@68
|
3811 iterated over.
|
jpayne@68
|
3812
|
jpayne@68
|
3813 The pysam.Samfile can be accessed via the .pysam_bamfile attribute.
|
jpayne@68
|
3814 """
|
jpayne@68
|
3815 self.stream = stream
|
jpayne@68
|
3816 if not isinstance(self.stream, str):
|
jpayne@68
|
3817 raise ValueError("Only files are supported, not streams")
|
jpayne@68
|
3818 self.pysam_bamfile = pysam.Samfile(self.stream)
|
jpayne@68
|
3819
|
jpayne@68
|
3820 def _aligned_segment_to_interval(self, r):
|
jpayne@68
|
3821
|
jpayne@68
|
3822 if r.rname >= 0:
|
jpayne@68
|
3823 rname = self.pysam_bamfile.get_reference_name(r.rname)
|
jpayne@68
|
3824 else:
|
jpayne@68
|
3825 rname = "*"
|
jpayne@68
|
3826
|
jpayne@68
|
3827 if r.rnext >= 0:
|
jpayne@68
|
3828 if r.rnext == r.rname:
|
jpayne@68
|
3829 rnext = "="
|
jpayne@68
|
3830 else:
|
jpayne@68
|
3831 rnext = self.pysam_bamfile.get_reference_name(r.rnext)
|
jpayne@68
|
3832 else:
|
jpayne@68
|
3833 rnext = "*"
|
jpayne@68
|
3834
|
jpayne@68
|
3835 # SAM spec says if unavailable should be set to 0. Pysam sets to -1.
|
jpayne@68
|
3836
|
jpayne@68
|
3837 if r.pnext <= 0:
|
jpayne@68
|
3838 pnext = "0"
|
jpayne@68
|
3839 else:
|
jpayne@68
|
3840 # +1 here because cbedtools.pyx expects SAM -- which is 1-based --
|
jpayne@68
|
3841 # but pysam uses 0-based.
|
jpayne@68
|
3842 pnext = str(r.pnext + 1)
|
jpayne@68
|
3843
|
jpayne@68
|
3844 if r.cigarstring:
|
jpayne@68
|
3845 cigarstring = r.cigarstring
|
jpayne@68
|
3846 else:
|
jpayne@68
|
3847 cigarstring = "*"
|
jpayne@68
|
3848
|
jpayne@68
|
3849 # Rudimentary support.
|
jpayne@68
|
3850 # TODO: remove when refactoring to new BAM iterating
|
jpayne@68
|
3851 tags = []
|
jpayne@68
|
3852 for k, v in r.tags:
|
jpayne@68
|
3853 if isinstance(v, int):
|
jpayne@68
|
3854 t = "i"
|
jpayne@68
|
3855 elif isinstance(v, float):
|
jpayne@68
|
3856 t = "f"
|
jpayne@68
|
3857 else:
|
jpayne@68
|
3858 t = "Z"
|
jpayne@68
|
3859 tags.append("{0}:{1}:{2}".format(k, t, v))
|
jpayne@68
|
3860
|
jpayne@68
|
3861 tags = "\t".join(tags)
|
jpayne@68
|
3862
|
jpayne@68
|
3863 if r.seq:
|
jpayne@68
|
3864 seq = r.seq
|
jpayne@68
|
3865 else:
|
jpayne@68
|
3866 seq = "*"
|
jpayne@68
|
3867
|
jpayne@68
|
3868 if r.qual:
|
jpayne@68
|
3869 qual = r.qual
|
jpayne@68
|
3870 else:
|
jpayne@68
|
3871 qual = "*"
|
jpayne@68
|
3872
|
jpayne@68
|
3873 fields = [
|
jpayne@68
|
3874 r.qname,
|
jpayne@68
|
3875 str(r.flag),
|
jpayne@68
|
3876 rname,
|
jpayne@68
|
3877 # +1 here because cbedtools.pyx expects SAM -- which is 1-based --
|
jpayne@68
|
3878 # but pysam uses 0-based.
|
jpayne@68
|
3879 str(r.pos + 1),
|
jpayne@68
|
3880 str(r.mapq),
|
jpayne@68
|
3881 cigarstring,
|
jpayne@68
|
3882 rnext,
|
jpayne@68
|
3883 pnext,
|
jpayne@68
|
3884 str(r.tlen),
|
jpayne@68
|
3885 seq,
|
jpayne@68
|
3886 qual,
|
jpayne@68
|
3887 ]
|
jpayne@68
|
3888 if tags:
|
jpayne@68
|
3889 fields.append(tags)
|
jpayne@68
|
3890
|
jpayne@68
|
3891 if None in fields:
|
jpayne@68
|
3892 raise ValueError("Found 'None' in fields: %s" % fields)
|
jpayne@68
|
3893 return create_interval_from_list(fields)
|
jpayne@68
|
3894
|
jpayne@68
|
3895 def __iter__(self):
|
jpayne@68
|
3896 return self
|
jpayne@68
|
3897
|
jpayne@68
|
3898 # TODO: this is PAINFUL but it ensures that existing tests work. Once all
|
jpayne@68
|
3899 # tests work, the new behavior will be to yield pysam AlignedSegment
|
jpayne@68
|
3900 # objects directly.
|
jpayne@68
|
3901 def __next__(self):
|
jpayne@68
|
3902 return self._aligned_segment_to_interval(next(self.pysam_bamfile))
|
jpayne@68
|
3903
|
jpayne@68
|
3904 def next(self):
|
jpayne@68
|
3905 return self.__next__()
|
jpayne@68
|
3906
|
jpayne@68
|
3907
|
jpayne@68
|
3908 class History(list):
|
jpayne@68
|
3909 def __init__(self):
|
jpayne@68
|
3910 """
|
jpayne@68
|
3911 Represents one or many HistorySteps. Mostly used for nicely formatting
|
jpayne@68
|
3912 a series of HistorySteps.
|
jpayne@68
|
3913 """
|
jpayne@68
|
3914 list.__init__(self)
|
jpayne@68
|
3915
|
jpayne@68
|
3916
|
jpayne@68
|
3917 class HistoryStep(object):
|
jpayne@68
|
3918 def __init__(self, method, args, kwargs, bedtool_instance, parent_tag, result_tag):
|
jpayne@68
|
3919 """
|
jpayne@68
|
3920 Class to represent one step in the history.
|
jpayne@68
|
3921
|
jpayne@68
|
3922 Mostly used for its __repr__ method, to try and exactly replicate code
|
jpayne@68
|
3923 that can be pasted to re-do history steps
|
jpayne@68
|
3924 """
|
jpayne@68
|
3925 try:
|
jpayne@68
|
3926 self.method = method._name
|
jpayne@68
|
3927 except AttributeError:
|
jpayne@68
|
3928 self.method = method.__name__
|
jpayne@68
|
3929
|
jpayne@68
|
3930 self.args = args
|
jpayne@68
|
3931 self.kwargs = kwargs
|
jpayne@68
|
3932 self.fn = bedtool_instance.fn
|
jpayne@68
|
3933 self.parent_tag = parent_tag
|
jpayne@68
|
3934 self.result_tag = result_tag
|
jpayne@68
|
3935
|
jpayne@68
|
3936 def _clean_arg(self, arg: str|BedTool) -> str:
|
jpayne@68
|
3937 """
|
jpayne@68
|
3938 Wrap strings in quotes and convert bedtool instances to filenames.
|
jpayne@68
|
3939 """
|
jpayne@68
|
3940 if isinstance(arg, BedTool):
|
jpayne@68
|
3941 arg = arg.fn
|
jpayne@68
|
3942 if isinstance(arg, str):
|
jpayne@68
|
3943 arg = '"%s"' % arg
|
jpayne@68
|
3944 return arg
|
jpayne@68
|
3945
|
jpayne@68
|
3946 def __repr__(self):
|
jpayne@68
|
3947 # Still not sure whether to use pybedtools.bedtool() or bedtool()
|
jpayne@68
|
3948 s = ""
|
jpayne@68
|
3949 s += "<HistoryStep> "
|
jpayne@68
|
3950 if os.path.exists(self.fn):
|
jpayne@68
|
3951 s += 'BedTool("%(fn)s").%(method)s(%%s%%s)' % self.__dict__
|
jpayne@68
|
3952 else:
|
jpayne@68
|
3953 s += 'BedTool("MISSING FILE: %(fn)s")' % self.__dict__
|
jpayne@68
|
3954 s += ".%(method)s(%%s%%s)" % self.__dict__
|
jpayne@68
|
3955
|
jpayne@68
|
3956 # Format args and kwargs
|
jpayne@68
|
3957 args_string = ",".join(map(self._clean_arg, self.args))
|
jpayne@68
|
3958 kwargs_string = ",".join(
|
jpayne@68
|
3959 ["%s=%s" % (i[0], self._clean_arg(i[1])) for i in list(self.kwargs.items())]
|
jpayne@68
|
3960 )
|
jpayne@68
|
3961 # stick a comma on the end if there's something here
|
jpayne@68
|
3962 if len(args_string) > 0:
|
jpayne@68
|
3963 args_string += ", "
|
jpayne@68
|
3964
|
jpayne@68
|
3965 s = s % (args_string, kwargs_string)
|
jpayne@68
|
3966 s += ", parent tag: %s" % self.parent_tag
|
jpayne@68
|
3967 s += ", result tag: %s" % self.result_tag
|
jpayne@68
|
3968 return s
|
jpayne@68
|
3969
|
jpayne@68
|
3970
|
jpayne@68
|
3971 def example_bedtool(fn):
|
jpayne@68
|
3972 """
|
jpayne@68
|
3973 Return a bedtool using a bed file from the pybedtools examples directory.
|
jpayne@68
|
3974 Use :func:`list_example_files` to see a list of files that are included.
|
jpayne@68
|
3975 """
|
jpayne@68
|
3976 fn = os.path.join(filenames.data_dir(), fn)
|
jpayne@68
|
3977 if not os.path.exists(fn):
|
jpayne@68
|
3978 msg = "%s does not exist" % fn
|
jpayne@68
|
3979 raise FileNotFoundError(msg)
|
jpayne@68
|
3980 return BedTool(fn)
|
jpayne@68
|
3981
|
jpayne@68
|
3982
|
jpayne@68
|
3983 if __name__ == "__main__":
|
jpayne@68
|
3984 import doctest
|
jpayne@68
|
3985
|
jpayne@68
|
3986 doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
|