comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/bedtool.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 from __future__ import annotations
2 import tempfile
3 from textwrap import dedent
4 import shutil
5 import subprocess
6 import operator
7 import os
8 import sys
9 import random
10 import string
11 import pprint
12 from itertools import islice
13 from multiprocessing import Pool
14 import gzip
15 from typing import Any, Callable, Iterable, Iterator, Literal, Optional, TYPE_CHECKING, cast
16 import pysam
17 from warnings import warn
18 import pathlib
19 from pathlib import Path
20
21 from .helpers import (
22 get_tempdir,
23 _tags,
24 call_bedtools,
25 _flatten_list,
26 _check_sequence_stderr,
27 isBAM,
28 isBGZIP,
29 isGZIP,
30 BEDToolsError,
31 pybedtoolsError,
32 _call_randomintersect,
33 SplitOutput,
34 FisherOutput,
35 )
36 from . import helpers
37 from .cbedtools import (
38 IntervalFile,
39 IntervalIterator,
40 Interval,
41 create_interval_from_list,
42 BedToolsFileError,
43 )
44 import pybedtools
45 from . import settings
46 from . import filenames
47
48 if TYPE_CHECKING:
49 import pandas as pd
50 import matplotlib.colors as mcolors
51
52 _implicit_registry = {}
53 _other_registry = {}
54 _bam_registry = {}
55
56
57 def _jaccard_output_to_dict(s, **kwargs) -> dict:
58 """
59 jaccard method doesn't return an interval file, rather, it returns a short
60 summary of results. Here, we simply parse it into a dict for convenience.
61 """
62 if isinstance(s, str):
63 _s = open(s).read()
64 elif hasattr(s, "next") or hasattr(s, "__next__"):
65 _s = "".join([i for i in s])
66 else:
67 raise ValueError("Unexpected object %r" % s)
68 header, data = _s.splitlines()
69 header = header.split()
70 data = data.split()
71 data[0] = int(data[0])
72 data[1] = int(data[1])
73 data[2] = float(data[2])
74 data[3] = int(data[3])
75 return dict(list(zip(header, data)))
76
77
78 def _reldist_output_handler(s, **kwargs):
79 """
80 reldist, if called with -detail, returns a valid BED file with the relative
81 distance as the last field. In that case, return the BedTool immediately.
82 If not -detail, then the results are a table, in which case here we parse
83 into a dict for convenience.
84 """
85 if "detail" in kwargs:
86 return BedTool(s)
87 if isinstance(s, str):
88 iterable = open(s)
89 if hasattr(s, "next"):
90 iterable = s
91 header = next(iterable).split()
92 results = {}
93 for h in header:
94 results[h] = []
95 for i in iterable:
96 reldist, count, total, fraction = i.split()
97 data = [float(reldist), int(count), int(total), float(fraction)]
98 for h, d in zip(header, data):
99 results[h].append(d)
100 return results
101
102
103 def _wraps(
104 prog: Optional[str] = None,
105 implicit: Optional[str] = None,
106 bam: Optional[str] = None,
107 other: Optional[str] = None,
108 uses_genome: bool = False,
109 make_tempfile_for: Optional[str] = None,
110 check_stderr:Optional[Callable]=None,
111 add_to_bedtool:Optional[dict] = None,
112 nonbam: Optional[Literal["ALL"] | str | list[str]] = None,
113 force_bam: bool = False,
114 genome_none_if: Optional[list[str]] =None,
115 genome_if: Optional[list[str]] = None,
116 genome_ok_if: Optional[list[str]] = None,
117 does_not_return_bedtool: Optional[Callable] =None,
118 arg_order: Optional[list[str]] = None,
119 ):
120 """
121 Do-it-all wrapper, to be used as a decorator.
122
123 *prog* is the name of the BEDTools program that will be called. The help
124 for this program will also be added to the decorated method's docstring.
125
126 *implicit* is the BEDTools program arg that should be filled in
127 automatically.
128
129 *bam* will disable the implicit substitution if *bam* is in the kwargs.
130 This is typically 'abam' or 'ibam' if the program accepts BAM input.
131
132 *other* is the BEDTools program arg that is passed in as the second input,
133 if supported. Within the semantics of BEDTools, the typical case will be
134 that if implicit='a' then other='b'; if implicit='i' then other=None.
135
136 *uses_genome*, if True, will check for 'g' and/or 'genome' args and
137 retrieve the corresponding genome files as needed.
138
139 *make_tempfile_for* is used for the sequence methods and indicates which
140 kwarg should have a tempfile made for it if it's not provided ('fo' for the
141 sequence methods)
142
143 *check_stderr*, if not None, is a function that accepts a string (which
144 will be anything written to stdout when calling the wrapped program). This
145 function should return True if the string is OK, and False if it should
146 truly be considered an error. This is needed for wrapping fastaFromBed,
147 which will report to stderr that it's creating an index file.
148
149 *add_to_bedtool* is used for sequence methods. It is a dictionary mapping
150 kwargs to attributes to be created in the resulting BedTool. Typically it
151 is {'fo':'seqfn'} which will add the resulting sequence name to the
152 BedTool's .seqfn attribute. If *add_to_bedtool* is not None, then the
153 returned BedTool will be *self* with the added attribute. If a key is
154 "stdout" (e.g., {"stdout": attr_name}), then save the stdout of the command
155 as a tempfile and store the tempfile's name in the attribute. This is
156 required for linksBed and bedToIgv.
157
158 *nonbam* is a kwarg that even if the input file was a BAM, the output will
159 *not* be BAM format. For example, the `-bed` arg for intersectBed will
160 cause the output to be in BED format, not BAM. If not None, this can be a
161 string, a list of strings, or the special string "ALL", which means that
162 the wrapped program will never return BAM output.
163
164 *force_bam*, if True, will force the output to be BAM. This is used for
165 bedToBam.
166
167 *genome_none_if* is a list of arguments that will ignore the requirement
168 for a genome. This is needed for window_maker, where -b and -g are
169 mutually exclusive.
170
171 *genome_ok_if* is a list of arguments that, if they are in
172 *genome_none_if*, are still OK to pass in. This is needed for bedtool
173 genomecov, where -g is not needed if -ibam is specified...but it's still OK
174 if the user passes a genome arg.
175
176 *genome_if* is a list of arguments that will trigger the requirement for
177 a genome; otherwise no genome needs to be specified.
178
179 *does_not_return_bedtool*, if not None, should be a function that handles
180 the returned output. Its signature should be ``func(output, kwargs)``,
181 where `output` is the output from the [possibly streaming] call to BEDTools
182 and `kwargs` are passed verbatim from the wrapped method call. Some
183 examples of methods that use this are jaccard, reldist, fisher, and split
184 methods.
185
186 *arg_order*, if not None, is a sorted list of arguments. This is used by
187 handle_kwargs() to deal with things like issues 81 and 345, where some
188 BEDTools programs are sensitive to argument order.
189 """
190
191 # NOTE: We are calling each BEDTools program to get its help and adding
192 # that to the docstring of each method. This is run at import time. However
193 # if BEDTools is not on the path at import time, `not_implemented` is set
194 # to True and isn't reset later until the module is reloaded.
195 #
196 # helpers.set_bedtools_path therefore will trigger a module reload.
197 not_implemented = False
198
199 # Call the program with -h to get help, which prints to stderr.
200 try:
201 p = subprocess.Popen(
202 helpers._version_2_15_plus_names(prog) + ["-h"],
203 stdout=subprocess.PIPE,
204 stderr=subprocess.PIPE,
205 )
206 help_str = p.communicate()[1].decode()
207
208 # underscores throw off ReStructuredText syntax of docstrings, so
209 # replace 'em
210 help_str = help_str.replace("_", "**")
211
212 # indent
213 help_str = help_str.split("\n")
214 help_str = ["\n\n**Original BEDTools help:**::"] + ["\t" + i for i in help_str]
215 help_str = "\n".join(help_str) + "\n"
216
217 # If the program can't be found, then we'll eventually replace the method
218 # with a version that does nothing but raise a NotImplementedError (plus
219 # a helpful message).
220 except OSError:
221 help_str = (
222 '"%s" does not appear to be installed '
223 "or on the path, so this method is "
224 "disabled. Please install a more recent "
225 "version of BEDTools and re-import to "
226 "use this method." % prog
227 )
228 not_implemented = True
229
230 def decorator(func):
231 """
232 Accepts a function to be wrapped; discards the original and returns a
233 new, rebuilt-from-scratch function based on the kwargs passed to
234 _wraps().
235 """
236 # Register the implicit (as well as bam and other) args in the global
237 # registry. BedTool.handle_kwargs() will access these at runtime. The
238 # registry is keyed by program name (like intersectBed).
239 _implicit_registry[prog] = implicit
240 if other is not None:
241 _other_registry[prog] = other
242 if bam is not None:
243 _bam_registry[prog] = bam
244
245 # Here's where we replace an unable-to-be-found program's method with
246 # one that only returns a NotImplementedError
247 if not_implemented:
248
249 def not_implemented_func(*args, **kwargs):
250 raise NotImplementedError(help_str)
251
252 return not_implemented_func
253
254 _add_doc = []
255 if implicit:
256 _add_doc.append(
257 dedent(
258 """
259 For convenience, the file or stream this BedTool points to
260 is implicitly passed as the `-%s` argument to `%s`
261 """
262 % (implicit, prog)
263 )
264 )
265
266 if uses_genome:
267 _add_doc.append(
268 dedent(
269 """
270 There are two alternatives for supplying a genome. Use
271 `g="genome.filename"` if you have a genome's chrom sizes
272 saved as a file. This is the what BEDTools expects when
273 using it from the command line. Alternatively, use the
274 `genome="assembly.name"` (for example, `genome="hg19"`) to
275 use chrom sizes for that assembly without having to manage
276 a separate file. The `genome` argument triggers a call
277 `pybedtools.chromsizes`, so see that method for more
278 details.
279 """
280 )
281 )
282
283 def wrapped(self, *args, **kwargs):
284 """
285 A newly created function that will be returned by the _wraps()
286 decorator
287 """
288
289 # Only one non-keyword argument is supported; this is then assumed
290 # to be "other" (e.g., `-b` for intersectBed)
291 if len(args) > 0:
292 assert len(args) == 1
293 kwargs[other] = args[0]
294
295 # Add the implicit values to kwargs. If the current BedTool is
296 # BAM, it will automatically be passed to the appropriate
297 # BAM-support arg (like `-abam`). But this also allows the user to
298 # explicitly specify the abam kwarg, which will override the
299 # auto-substitution.
300 # Note: here, `implicit` is something like "a"; `bam` is something
301 # like "abam"
302 if (
303 (implicit not in kwargs)
304 and (bam not in kwargs)
305 and (implicit is not None)
306 ):
307 if not self._isbam:
308 kwargs[implicit] = self.fn
309 else:
310 # It is a bam file. If this program supports BAM as the
311 # first input, then we set it here
312 if bam is not None:
313 kwargs[bam] = self.fn
314
315 # Otherwise, BEDTools can't currently handle it, so raise
316 # an exception.
317 else:
318 raise pybedtoolsError(
319 '"%s" currently can\'t handle BAM '
320 "input, please use bam_to_bed() first." % prog
321 )
322
323 # Should this function handle genome files?
324 check_for_genome = uses_genome
325 if uses_genome:
326 if genome_none_if:
327 for i in genome_none_if:
328 if i in kwargs or i == implicit:
329 check_for_genome = False
330
331 # for genomecov, if -ibam then -g is optional. So it's OK
332 # for the user to provide genome or g kwargs, even if
333 # -ibam.
334 if genome_ok_if:
335 for i in genome_ok_if:
336 if i in kwargs or i == implicit:
337 if ("g" in kwargs) or ("genome" in kwargs):
338 check_for_genome = True
339 if genome_if:
340 check_for_genome = False
341 for i in genome_if:
342 if (i in kwargs) or (i == implicit):
343 check_for_genome = True
344 if check_for_genome:
345 kwargs = self.check_genome(**kwargs)
346
347 # For sequence methods, we may need to make a tempfile that will
348 # hold the resulting sequence. For example, fastaFromBed needs to
349 # make a tempfile for 'fo' if no 'fo' was explicitly specified by
350 # the user.
351 if make_tempfile_for is not None:
352 if make_tempfile_for not in kwargs:
353 kwargs[make_tempfile_for] = self._tmp()
354
355 # At runtime, this will parse the kwargs, convert streams to
356 # tempfiles if needed, and return all the goodies
357 cmds, tmp, stdin = self.handle_kwargs(prog=prog,
358 arg_order=arg_order,
359 **kwargs)
360
361 # Decide whether the output is BAM format or not.
362 result_is_bam = False
363
364 # By default, if the current BedTool is BAM, then the result should
365 # be, too.
366 if self._isbam:
367 result_is_bam = True
368
369 # If nonbam is "ALL", then this method will never return BAM
370 # output.
371 if nonbam == "ALL":
372 result_is_bam = False
373
374 # If any of the `nonbam` args are found in kwargs, then result is
375 # not a BAM. Side note: the _nonbam name mangling is necessary to
376 # keep the nonbam arg passed into the original _wraps() decorator
377 # in scope.
378 if nonbam is not None and nonbam != "ALL":
379 if isinstance(nonbam, str):
380 _nonbam = [nonbam]
381 else:
382 _nonbam = nonbam
383 for i in _nonbam:
384 if i in kwargs:
385 result_is_bam = False
386 break
387
388 if force_bam:
389 result_is_bam = True
390
391 decode_output = not result_is_bam
392
393 # Do the actual call
394 stream = call_bedtools(
395 cmds,
396 tmp,
397 stdin=stdin,
398 check_stderr=check_stderr,
399 decode_output=decode_output,
400 )
401
402 if does_not_return_bedtool:
403 return does_not_return_bedtool(stream, **kwargs)
404
405 # Post-hoc editing of the BedTool -- for example, this is used for
406 # the sequence methods to add a `seqfn` attribute to the resulting
407 # BedTool.
408 if add_to_bedtool is not None:
409 for kw, attr in list(add_to_bedtool.items()):
410 if kw == "stdout":
411 value = stream
412 else:
413 value = kwargs[kw]
414 setattr(self, attr, value)
415 result = self
416 else:
417 result = BedTool(stream)
418
419 result._isbam = result_is_bam
420 result._cmds = cmds
421 del kwargs
422 return result
423
424 # Now add the edited docstring (original Python docstring plus BEDTools
425 # help) to the newly created method above
426 if func.__doc__ is None:
427 orig = ""
428 else:
429 orig = func.__doc__
430
431 wrapped.__doc__ = orig + "\n".join(_add_doc) + help_str
432
433 # Add the original method's name to a new attribute so we can access it
434 # when logging history
435 wrapped._name = func.__name__ # type: ignore
436
437 return wrapped
438
439 return decorator
440
441
442 def _log_to_history(method: Callable):
443 """
444 Decorator to add a method and its kwargs to the history.
445
446 Assumes that you only add this decorator to bedtool instances that
447 return other bedtool instances
448 """
449
450 def decorated(self, *args, **kwargs):
451
452 # this calls the actual method in the first place; *result* is
453 # whatever you get back
454 result = method(self, *args, **kwargs)
455
456 # add appropriate tags
457 parent_tag = self._tag
458 result_tag = result._tag
459
460 history_step = HistoryStep(
461 method, args, kwargs, self, parent_tag, result_tag
462 )
463
464 # only add the current history to the new bedtool if there's
465 # something to add
466 if len(self.history) > 0:
467 result.history.append(self.history)
468
469 # but either way, add this history step to the result.
470 result.history.append(history_step)
471
472 return result
473
474 decorated.__doc__ = method.__doc__
475 return decorated
476
477
478 class BedTool(object):
479 TEMPFILES = filenames.TEMPFILES
480
481 def __init__(self, fn: Optional[Any] = None,
482 from_string: bool = False,
483 remote: bool = False):
484 """
485 Wrapper around Aaron Quinlan's ``BEDtools`` suite of programs
486 (https://github.com/arq5x/bedtools); also contains many useful
487 methods for more detailed work with BED files.
488
489 *fn* is typically the name of a BED-like file, but can also be
490 one of the following:
491
492 * a string filename
493 * another BedTool object
494 * an iterable of Interval objects
495 * an open file object
496 * a "file contents" string (see below)
497
498 If *from_string* is True, then you can pass a string that contains
499 the contents of the BedTool you want to create. This will treat all
500 spaces as TABs and write to tempfile, treating whatever you pass as
501 *fn* as the contents of the bed file. This also strips empty lines.
502
503 Typical usage is to point to an existing file::
504
505 a = BedTool('a.bed')
506
507 But you can also create one from scratch from a string::
508
509 >>> s = '''
510 ... chrX 1 100
511 ... chrX 25 800
512 ... '''
513 >>> a = BedTool(s, from_string=True)
514
515 Or use examples that come with pybedtools::
516
517 >>> example_files = pybedtools.list_example_files()
518 >>> assert 'a.bed' in example_files
519 >>> a = pybedtools.example_bedtool('a.bed')
520
521 """
522 if remote:
523 raise ValueError(
524 "Remote BAM no longer supported (since BEDTools does not " "support it)"
525 )
526 self.remote = remote
527 self._isbam = False
528 self._bam_header = ""
529 self._cmds = []
530 if from_string:
531 if fn is None or not isinstance(fn, str):
532 raise ValueError("from_string=True requires a string to parse")
533 bed_contents = fn
534 fn = self._tmp()
535 fout = open(fn, "w")
536 for line in bed_contents.splitlines():
537 if len(line.strip()) == 0:
538 continue
539 line = "\t".join(line.split()) + "\n"
540 fout.write(line)
541 fout.close()
542
543 else:
544 # if fn is a Path object, we have to use its string representation
545 if isinstance(fn, pathlib.PurePath):
546 fn = str(fn)
547
548 # our work is already done
549 if isinstance(fn, BedTool):
550 fn = fn.fn
551
552 # from_string=False, so assume it's a filename
553 elif isinstance(fn, str):
554 if remote:
555 self._isbam = True
556 else:
557 if not os.path.exists(fn):
558 msg = 'File "%s" does not exist' % fn
559 raise FileNotFoundError(msg)
560 self._isbam = isBAM(fn)
561
562 # TODO: we dont' really need this, but it's added here for
563 # compatibility with existing tests
564 if self._isbam:
565 header = pysam.Samfile(fn).header.to_dict()
566 # For example:
567 # {
568 # 'HD': {'VN': '1.0', 'SO': 'coordinate'},
569 # 'SQ': [
570 # {'LN': 23011544,
571 # 'SN': 'chr2L'},
572 # {'LN': 21146708,
573 # 'SN': 'chr2R'},
574 # {'LN': 24543557,
575 # 'SN': 'chr3L'},
576 # {'LN': 27905053,
577 # 'SN': 'chr3R'},
578 # {'LN': 1351857,
579 # 'SN': 'chr4'},
580 # {'LN': 22422827,
581 # 'SN': 'chrX'}
582 # ]
583 # }
584
585 txt_header = []
586 for k, v in header.items():
587 if isinstance(v, list):
588 for i in v:
589 if isinstance(i, dict):
590 txt_header.append(
591 "\t".join(
592 ["@" + k]
593 + [
594 ":".join(map(str, j))
595 for j in sorted(i.items(), reverse=True)
596 ]
597 )
598 )
599 elif isinstance(i, str):
600 txt_header.append(i)
601
602 elif isinstance(v, dict):
603 txt_header.append(
604 "\t".join(
605 ["@" + k]
606 + [
607 ":".join(map(str, j))
608 for j in sorted(v.items(), reverse=True)
609 ]
610 )
611 )
612 else:
613 raise ValueError("unhandled type in BAM header")
614 self._bam_header = "\n".join(txt_header) + "\n"
615
616 # If tuple or list, then save as file first
617 # (fixes #73)
618 elif isinstance(fn, (list, tuple)):
619 fn = BedTool(iter(fn)).saveas().fn
620
621 # Otherwise assume iterator, say an open file as from
622 # subprocess.PIPE
623 else:
624 fn = fn
625
626 self.fn = fn
627 tag = "".join([random.choice(string.ascii_lowercase) for _ in range(8)])
628 self._tag = tag
629 _tags[tag] = self
630 self._hascounts = False
631 self._file_type = None
632 self.seqfn = None
633 self.fastq = None
634 self.igv_script = None
635 self.links_html = None
636 self.history = History()
637
638 @classmethod
639 def from_dataframe(
640 cls,
641 df: pd.DataFrame,
642 outfile: Optional[str] =None,
643 sep: str ="\t",
644 header: bool =False,
645 na_rep:str =".",
646 index:bool =False,
647 **kwargs
648 ) -> BedTool:
649 """
650 Creates a BedTool from a pandas.DataFrame.
651
652 If `outfile` is None, a temporary file will be used. Otherwise it can
653 be a specific filename or an open file handle. Additional kwargs will
654 be passed to `pandas.DataFrame.to_csv`.
655
656 The fields of the resulting BedTool will match the order of columns in
657 the dataframe.
658 """
659 try:
660 import pandas
661 except ImportError:
662 raise ImportError("pandas must be installed to use dataframes")
663 if outfile is None:
664 outfile = cls._tmp()
665 default_kwargs = dict(sep=sep, header=header, na_rep=na_rep, index=index)
666 default_kwargs.update(kwargs)
667 df.to_csv(outfile, **default_kwargs)
668
669 if isinstance(outfile, str):
670 fn = outfile
671 else:
672 try:
673 fn = outfile.name
674 except AttributeError:
675 raise ValueError(
676 "`outfile` is not a string and doesn't have a `name` attribute. "
677 "Unable to determine filename."
678 )
679 return BedTool(fn)
680
681 def split(self, func: Callable, *args, **kwargs) -> BedTool:
682 """
683 Split each feature using a user-defined function.
684
685 Calls the provided function `func` with each interval. In contrast to
686 `each` (which does something similar), this method expects `func` to
687 return an *iterable* of Interval objects.
688
689 args and kwargs are passed directly to `func`.
690
691 Returns a new BedTool.
692 """
693
694 def generator():
695 for orig_interval in self:
696 for interval in func(orig_interval, *args, **kwargs):
697 yield interval
698
699 return BedTool(generator())
700
701 def truncate_to_chrom(self, genome: str | dict) -> BedTool:
702 """
703 Ensure all features fall within chromosome limits.
704
705 Some peak-callers extend peaks such that the boundaries overstep
706 chromosome coordinates. Upon uploading such a file to a genome browser
707 like UCSC, this results in an error like::
708
709 Error line 101 of custom track: chromEnd larger than chrom chr2
710 size
711
712 Use this method to clean your file, truncating any out-of-bounds
713 features to fit within the chromosome coordinates of `genome`.
714
715 `genome` can be either an assembly name ('dm3') or a dictionary where
716 keys are chrom and values are (start, stop) tuples.
717 """
718 if isinstance(genome, dict):
719 chromdict = genome
720 else:
721 assert isinstance(genome, str)
722 chromdict = helpers.chromsizes(genome)
723
724 tmp = self._tmp()
725 with open(tmp, "w") as fout:
726 for chrom, coords in list(chromdict.items()):
727 start, stop = coords
728 start = str(start)
729 stop = str(stop)
730 fout.write("\t".join([chrom, start, stop]) + "\n")
731 return self.intersect(tmp)
732
733 def tabix_intervals(self, interval_or_string: Interval | str, check_coordinates: bool=False) -> BedTool:
734 """
735 Retrieve all intervals within coordinates from a "tabixed" BedTool.
736
737 Given either a string in "chrom:start-stop" format, or an interval-like
738 object with chrom, start, stop attributes, return a *streaming* BedTool
739 of the features in this BedTool that overlap the provided interval.
740
741 If the coordinates are invalid, an empty generator is returned unless
742 `check_coordinates=True` in which case a ValueError will be raised.
743 """
744 if not self._tabixed():
745 raise ValueError(
746 "This BedTool has not been indexed for tabix "
747 "-- please use the .tabix() method"
748 )
749
750 # tabix expects 1-based coords, but BEDTools works with
751 # zero-based. pybedtools and pysam also work with zero-based. So we can
752 # pass zero-based directly to the pysam tabix interface.
753 tbx = pysam.TabixFile(self.fn)
754
755 # If an interval is passed, use its coordinates directly
756 if isinstance(interval_or_string, Interval):
757 interval: Interval = interval_or_string
758 chrom, start, end = interval.chrom, interval.start, interval.stop
759 # Parse string directly instead of relying on Interval, in order to
760 # permit full chromosome fetching
761 else:
762 match = helpers.coord_re.search(interval_or_string)
763 # Assume string is contig if it doesn't fit chrom:start-end format
764 if match is None:
765 chrom = interval_or_string
766 start, end = None, None
767 # Otherwise parse the coordinates
768 else:
769 chrom, start, end = match.group(1, 2, 3)
770 start, end = int(start), int(end)
771
772 # Fetch results.
773 try:
774 results = tbx.fetch(str(chrom), start, end)
775 except ValueError:
776 if check_coordinates:
777 raise
778 else:
779 results = []
780
781 # pysam.ctabix.TabixIterator does not include newlines when yielding so
782 # we need to add them.
783 def gen():
784 for i in results:
785 yield i + "\n"
786
787 # xref #190
788 x = BedTool(gen()).saveas()
789 tbx.close()
790 return x
791
792 def tabix_contigs(self):
793 """
794 Returns a list of contigs from the tabix index.
795 """
796 if not self._tabixed():
797 raise ValueError(
798 "This BedTool has not been indexed for tabix "
799 "-- please use the .tabix() method"
800 )
801
802 tbx = pysam.TabixFile(self.fn)
803 return tbx.contigs
804
805 def tabix(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> BedTool:
806 """
807 Prepare a BedTool for use with Tabix.
808
809 Returns a new BedTool that has been BGZIP compressed
810 and indexed by tabix.
811
812 Parameters
813 ----------
814
815 in_place : bool
816 If True (default), then assume the file is already sorted and
817 replace the existing file with the BGZIPed version.
818
819 force : bool
820 If True (default is False), then overwrite both the index and the
821 BGZIP file.
822
823 is_sorted : bool
824 If True (default is False), then assume the file is already sorted
825 so that BedTool.bgzip() doesn't have to do that work.
826 """
827 # Return quickly if nothing to do
828 if self._tabixed() and not force:
829 return self
830
831 # Make sure it's BGZIPed
832 fn = self.bgzip(in_place=in_place, force=force, is_sorted=is_sorted)
833 if self.file_type is not None and self.file_type not in ["bam", "empty"]:
834 pysam.tabix_index(fn, force=force, preset=self.file_type) # type: ignore
835 return BedTool(fn)
836
837 def _tabixed(self):
838 """
839 Verifies that we're working with a tabixed file: a string filename
840 pointing to a BGZIPed file with a .tbi file in the same dir.
841 """
842 if (
843 isinstance(self.fn, str)
844 and isBGZIP(self.fn)
845 and os.path.exists(self.fn + ".tbi")
846 ):
847 return True
848
849 def bgzip(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> str:
850 """
851 Helper function for more control over "tabixed" BedTools.
852
853 Checks to see if we already have a BGZIP file; if not then prepare one.
854 Always leaves the original file alone. You can always just make a
855 BedTool out of an already sorted and BGZIPed file to avoid this step.
856
857 `in_place` will put the BGZIPed file in the same dir (possibly after
858 sorting to tempfile).
859
860 If `is_sorted`, then assume the file is already sorted. Otherwise call
861 bedtools sort with the `-header` option.
862
863 `force` will overwrite without asking.
864 """
865 # It may already be BGZIPed...
866 if isinstance(self.fn, str) and not force:
867 if isBGZIP(self.fn):
868 return self.fn
869
870 # If not in_place, then make a tempfile for the BGZIPed version
871 if not in_place:
872 # Get tempfile name, sorted or not
873 if not is_sorted:
874 fn = self.sort(header=True).fn
875 else:
876 fn = self._tmp()
877
878 # Register for later deletion
879 outfn = fn + ".gz"
880 BedTool.TEMPFILES.append(outfn)
881
882 # Creates tempfile.gz
883 pysam.tabix_compress(fn, outfn, force=force)
884 return outfn
885
886 # Otherwise, make sure the BGZIPed version has a similar name to the
887 # current BedTool's file
888 if in_place:
889 if not is_sorted:
890 fn = self.sort(header=True).saveas().fn
891 else:
892 fn = self.fn
893 outfn = self.fn + ".gz"
894 pysam.tabix_compress(fn, outfn, force=force)
895 return outfn
896
897 def delete_temporary_history(self, ask: bool = True, raw_input_func=None):
898 """
899 Use at your own risk! This method will delete temp files. You will be
900 prompted for deletion of files unless you specify *ask=False*.
901
902 Deletes all temporary files created during the history of this BedTool
903 up to but not including the file this current BedTool points to.
904
905 Any filenames that are in the history and have the following pattern
906 will be deleted::
907
908 <TEMP_DIR>/pybedtools.*.tmp
909
910 (where <TEMP_DIR> is the result from get_tempdir() and is by default
911 "/tmp")
912
913 Any files that don't have this format will be left alone.
914
915 (*raw_input_func* is used for testing)
916 """
917 flattened_history = _flatten_list(self.history)
918 to_delete = []
919 tempdir = get_tempdir()
920 for i in flattened_history:
921 fn = i.fn
922 if fn.startswith(os.path.join(os.path.abspath(tempdir), "pybedtools")):
923 if fn.endswith(".tmp"):
924 to_delete.append(fn)
925
926 if raw_input_func is None:
927 raw_input_func = input
928
929 str_fns = "\n\t".join(to_delete)
930 if ask:
931 answer = raw_input_func("Delete these files?\n\t%s\n(y/N) " % str_fns)
932
933 if not answer.lower()[0] == "y":
934 print("OK, not deleting.")
935 return
936 for fn in to_delete:
937 os.unlink(fn)
938 return
939
940 def filter(self, func: Callable, *args, **kwargs) -> BedTool:
941 """
942 Filter features by user-defined function.
943
944 Takes a function *func* that is called for each feature in the
945 `BedTool` object and returns only those for which the function returns
946 True.
947
948 *args and **kwargs are passed directly to *func*.
949
950 Returns a streaming BedTool; if you want the filename then use the
951 .saveas() method.
952
953 >>> a = pybedtools.example_bedtool('a.bed')
954 >>> subset = a.filter(lambda b: b.chrom == 'chr1' and b.start < 150)
955 >>> len(a), len(subset)
956 (4, 2)
957
958 so it has extracted 2 records from the original 4.
959
960 """
961 return BedTool((f for f in self if func(f, *args, **kwargs)))
962
963 def field_count(self, n:int=10) -> int:
964 """
965 Number of fields in each line of this BedTool (checks `n` lines)
966
967 Return the number of fields in the features this file contains. Checks
968 the first *n* features.
969
970 >>> a = pybedtools.example_bedtool('a.bed')
971 >>> a.field_count()
972 6
973 """
974 if self.file_type == "empty":
975 return 0
976 i = 0
977 fields = set([])
978 for feat in self:
979 if i > n:
980 break
981 i += 1
982 # TODO: make this more efficient.
983 fields.update([len(feat.fields)])
984 assert len(fields) == 1, fields
985 return list(fields)[0]
986
987 def each(self, func: Callable, *args, **kwargs) -> BedTool:
988 """
989 Modify each feature with a user-defined function.
990
991 Applies user-defined function *func* to each feature. *func* must
992 accept an Interval as its first argument; *args and **kwargs will be
993 passed to *func*.
994
995 *func* must return an Interval object OR a value that evaluates to
996 False, in which case the original feature will be removed from the
997 output. This way, an additional "filter" call is not necessary.
998
999 >>> def truncate_feature(feature, limit=0):
1000 ... feature.score = str(len(feature))
1001 ... if len(feature) > limit:
1002 ... feature.stop = feature.start + limit
1003 ... feature.name = feature.name + '.short'
1004 ... return feature
1005
1006 >>> a = pybedtools.example_bedtool('a.bed')
1007 >>> b = a.each(truncate_feature, limit=100)
1008 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
1009 chr1 1 100 feature1 99 +
1010 chr1 100 200 feature2 100 +
1011 chr1 150 250 feature3.short 350 -
1012 chr1 900 950 feature4 50 +
1013 <BLANKLINE>
1014
1015 """
1016
1017 def _generator():
1018 for f in self:
1019 result = func(f, *args, **kwargs)
1020 if result:
1021 yield result
1022
1023 return BedTool(_generator())
1024
1025 def introns(self, gene: str = "gene", exon: str = "exon") -> BedTool:
1026 """
1027 Create intron features (requires specific input format).
1028
1029 NOTE: this method assumes a simple file with non-overlapping exons. For
1030 more sophisticated features, consider the gffutils package instead.
1031
1032 Given a BED12 or a GFF with exons, create a new `BedTool` with just
1033 introns. The output is a bed6 file with the score column (5) being one
1034 of 'intron'/'utr5'/'utr3'
1035 """
1036 # iterate over all the features in the gene.
1037 s = self.sort()
1038 if self.file_type == "gff":
1039 exon_iter = BedTool((f for f in s if f[2] == exon)).saveas()
1040 gene_iter = BedTool((f for f in s if f[2] == gene)).saveas()
1041
1042 elif self.file_type == "bed":
1043 if s.field_count() == 12:
1044 exon_iter = s.bed6().saveas()
1045 gene_iter = s.saveas()
1046 else:
1047 # TODO: bed6. groupby on name and find smallest start,
1048 # largest stop.
1049 exon_iter = s
1050 gene_iter = None
1051 raise NotImplementedError(
1052 ".introns() only supported for bed12" "and GFF"
1053 )
1054
1055 else:
1056 raise NotImplementedError(".introns() only supported for BED and GFF")
1057
1058 with open(BedTool._tmp(), "w") as fh:
1059 # group on the name.
1060 exon_intervals = IntervalFile(exon_iter.fn)
1061 for g in gene_iter:
1062 # search finds all, but we just want the ones that completely
1063 # overlap this gene.
1064 exons = [
1065 e
1066 for e in exon_intervals.search(g, same_strand=True)
1067 if e.start >= g.start and e.end <= g.end
1068 ]
1069
1070 for i, exon_instance in enumerate(exons):
1071 exon_instance: pybedtools.Interval
1072 # 5' utr between gene start and first intron
1073 if i == 0 and exon_instance.start > g.start:
1074 utr = {"+": "utr5", "-": "utr3"}[g.strand]
1075 print(
1076 "%s\t%i\t%i\t%s\t%s\t%s"
1077 % (g.chrom, g.start, exon_instance.start, g.name, utr, g.strand),
1078 file=fh,
1079 )
1080 elif i == len(exons) - 1 and exon_instance.end < g.end:
1081 utr = {"+": "utr3", "-": "utr5"}[g.strand]
1082 print(
1083 "%s\t%i\t%i\t%s\t%s\t%s"
1084 % (g.chrom, exon_instance.end, g.end, g.name, utr, g.strand),
1085 file=fh,
1086 )
1087 elif i != len(exons) - 1:
1088 istart = exon_instance.end
1089 iend = exons[i + 1].start
1090 print(
1091 "%s\t%i\t%i\t%s\tintron\t%s"
1092 % (g.chrom, istart, iend, g.name, g.strand),
1093 file=fh,
1094 )
1095 return BedTool(fh.name)
1096
1097 def features(self):
1098 """
1099 Returns an iterable of features
1100 """
1101 if hasattr(self, "next") or hasattr(self, "__next__"):
1102 return self
1103 return iter(self)
1104
1105 FileType = Literal['bed', 'vcf', 'gff', 'bam', 'sam', 'empty']
1106
1107 @property
1108 def file_type(self) -> Optional[FileType]:
1109 """
1110 Return the type of the current file. One of ('bed','vcf','gff', 'bam',
1111 'sam', 'empty').
1112
1113 >>> a = pybedtools.example_bedtool('a.bed')
1114 >>> print(a.file_type)
1115 bed
1116 """
1117 if not isinstance(self.fn, str):
1118 raise ValueError(
1119 "Checking file_type not supported for "
1120 "non-file BedTools. Use .saveas() to "
1121 "save as a temp file first."
1122 )
1123 if self._isbam:
1124 self._file_type = "bam"
1125 else:
1126 try:
1127 self._file_type = next(iter(self)).file_type
1128 except StopIteration:
1129 self._file_type = "empty"
1130
1131 return self._file_type
1132
1133 def cut(self, indexes: list[int], stream: bool = False) -> BedTool:
1134 """
1135 Analogous to unix `cut`.
1136
1137 Similar to unix `cut` except indexes are 0-based, must be a list
1138 and the columns are returned in the order requested.
1139
1140 This method returns a BedTool of results, which means that the indexes
1141 returned must be valid GFF/GTF/BED/SAM features.
1142
1143 If you would like arbitrary columns -- say, just chrom and featuretype
1144 of a GFF, which would not comprise a valid feature -- then instead of
1145 this method, simply use indexes on each feature, e.g,
1146
1147 >>> gff = pybedtools.example_bedtool('d.gff')
1148 >>> results = [(f[0], f[2]) for f in gff]
1149
1150 In addition, `indexes` can contain keys of the GFF/GTF attributes, in
1151 which case the values are returned. e.g. 'gene_name' will return the
1152 corresponding name from a GTF, or 'start' will return the start
1153 attribute of a BED Interval.
1154 """
1155 if stream:
1156 return BedTool(([f[attr] for attr in indexes] for f in self))
1157 else:
1158 with open(self._tmp(), "w") as fh:
1159 for f in self:
1160 print("\t".join(map(str, [f[attr] for attr in indexes])), file=fh)
1161 return BedTool(fh.name)
1162
1163 @classmethod
1164 def _tmp(cls) -> str:
1165 """
1166 Makes a tempfile and registers it in the BedTool.TEMPFILES class
1167 variable. Adds a "pybedtools." prefix and ".tmp" extension for easy
1168 deletion if you forget to call pybedtools.cleanup().
1169 """
1170 tmpfn = tempfile.NamedTemporaryFile(
1171 prefix=settings.tempfile_prefix,
1172 suffix=settings.tempfile_suffix,
1173 delete=False,
1174 )
1175 tmpfn = tmpfn.name
1176 cls.TEMPFILES.append(tmpfn)
1177 return tmpfn
1178
1179 def __iter__(self):
1180 """
1181 Dispatches the right iterator depending on how this BedTool was
1182 created
1183 """
1184 if self._isbam:
1185 # Note: BAM class takes filename or stream, so self.fn is OK
1186 # here
1187 return BAM(self.fn)
1188
1189 # Plain ol' filename
1190 if isinstance(self.fn, str):
1191 if not os.path.exists(self.fn):
1192 raise BedToolsFileError("{0} does not exist".format(self.fn))
1193 if isGZIP(self.fn):
1194 return IntervalIterator(gzip.open(self.fn, "rt"))
1195 else:
1196 return IntervalIterator(open(self.fn, "r"))
1197 # Any other kind of input (streaming string from stdout; iterable of
1198 # Intervals, iterable of (chrom, start, stop) tuples, etc are handled
1199 # appropriately by IntervalIterator.
1200 else:
1201 return IntervalIterator(self.fn)
1202
1203 @property
1204 def intervals(self):
1205 if isinstance(self.fn, str):
1206 return IntervalFile(self.fn)
1207 else:
1208 raise ValueError("Please convert to a file-based BedTool using saveas")
1209
1210 def __repr__(self):
1211 if isinstance(self.fn, str):
1212 if os.path.exists(self.fn) or self.remote:
1213 return "<BedTool(%s)>" % self.fn
1214 else:
1215 return "<BedTool(MISSING FILE: %s)>" % self.fn
1216 elif isinstance(self.fn, BedTool):
1217 return repr(self.fn)
1218 else:
1219 return "<BedTool(%s)>" % repr(self.fn)
1220
1221 def __str__(self):
1222 """
1223 Returns the string representation of the whole `BedTool`
1224 """
1225 items = []
1226 for i in iter(self):
1227 i = str(i)
1228 if isinstance(i, bytes):
1229 i = i.decode("UTF-8")
1230 items.append(i)
1231 return "".join(items)
1232
1233 def __len__(self):
1234 return self.count()
1235
1236 def __eq__(self, other: object) -> bool:
1237 if isinstance(other, BedTool):
1238 if not isinstance(self.fn, str) or not isinstance(
1239 other.fn, str
1240 ):
1241 raise NotImplementedError(
1242 "Testing equality only supported for"
1243 " BedTools that point to files"
1244 )
1245 elif not isinstance(other, str):
1246 raise NotImplementedError(
1247 "Testing equality only supported for"
1248 " BedTools that point to files or str of content"
1249 )
1250 return str(self) == str(other)
1251
1252 def __ne__(self, other:object):
1253 return not self.__eq__(other)
1254
1255 def __getitem__(self, key: slice|int):
1256 if isinstance(key, slice):
1257 return islice(self, key.start, key.stop, key.step)
1258 elif isinstance(key, int):
1259 return list(islice(self, key, key + 1))[0]
1260 else:
1261 raise ValueError(
1262 "Only slices or integers allowed for indexing " "into a BedTool"
1263 )
1264
1265 def __add__(self, other: BedTool) -> BedTool:
1266 try:
1267 result = self.intersect(other, u=True)
1268 except BEDToolsError as e:
1269 # BEDTools versions <2.20 would raise BEDToolsError
1270 if (self.file_type == "empty") or (other.file_type == "empty"):
1271 result = pybedtools.BedTool("", from_string=True)
1272 else:
1273 raise e
1274 return result
1275
1276 def __sub__(self, other: BedTool) -> BedTool:
1277 result = None
1278
1279 try:
1280 result = self.intersect(other, v=True)
1281 except BEDToolsError:
1282 # BEDTools versions <2.20 would raise BEDToolsError
1283
1284 if (self.file_type == "empty") and (other.file_type == "empty"):
1285 result = pybedtools.BedTool("", from_string=True)
1286 elif other.file_type == "empty":
1287 result = self.saveas()
1288 elif self.file_type == "empty":
1289 result = pybedtools.BedTool("", from_string=True)
1290 if result is None:
1291 raise ValueError("Subtraction operation failed.")
1292
1293 return result
1294
1295 def head(self, n: int = 10, as_string: bool = False):
1296 """
1297 Prints the first *n* lines or returns them if as_string is True
1298
1299 Note that this only opens the underlying file (gzipped or not), so it
1300 does not check to see if the file is a valid BED file.
1301
1302 >>> a = pybedtools.example_bedtool('a.bed')
1303 >>> a.head(2) #doctest: +NORMALIZE_WHITESPACE
1304 chr1 1 100 feature1 0 +
1305 chr1 100 200 feature2 0 +
1306 <BLANKLINE>
1307
1308 """
1309 if not isinstance(self.fn, str):
1310 raise NotImplementedError(
1311 "head() not supported for non file-based BedTools"
1312 )
1313 if as_string:
1314 return "".join(str(line) for line in self[:n])
1315 if self._isbam:
1316 raise NotImplementedError("head() not supported for BAM")
1317 else:
1318 if isGZIP(self.fn):
1319 openfunc = gzip.open
1320 openmode = "rt"
1321 else:
1322 openfunc = open
1323 openmode = "r"
1324 with openfunc(self.fn, openmode) as fin:
1325 for i, line in enumerate(fin):
1326 if i == (n):
1327 break
1328 print(line, end=" ")
1329
1330 def set_chromsizes(self, chromsizes: str | dict):
1331 """
1332 Prepare BedTool for operations that require chromosome coords.
1333
1334 Set the chromsizes for this genome. If *chromsizes* is a string, it
1335 will be considered a genome assembly name. If that assembly name is
1336 not available in pybedtools.genome_registry, then it will be searched
1337 for on the UCSC Genome Browser.
1338
1339 Example usage:
1340
1341 >>> hg19 = pybedtools.chromsizes('hg19')
1342 >>> a = pybedtools.example_bedtool('a.bed')
1343 >>> a = a.set_chromsizes(hg19)
1344 >>> print(a.chromsizes['chr1'])
1345 (0, 249250621)
1346
1347 """
1348 if isinstance(chromsizes, str):
1349 self.chromsizes = pybedtools.chromsizes(chromsizes)
1350 elif isinstance(chromsizes, dict):
1351 self.chromsizes = chromsizes
1352 else:
1353 raise ValueError(
1354 "Need to specify chromsizes either as a string"
1355 " (assembly name) or a dictionary"
1356 )
1357 return self
1358
1359 def _collapse(
1360 self,
1361 iterable: Iterable,
1362 fn: Optional[str] = None,
1363 trackline: Optional[str] = None,
1364 in_compressed: bool = False,
1365 out_compressed: bool = False,
1366 ) -> str:
1367 """
1368 Collapses an iterable into file *fn* (or a new tempfile if *fn* is
1369 None).
1370
1371 Returns the newly created filename.
1372
1373 Parameters
1374 ----------
1375
1376 iterable : iter
1377 Any iterable object whose items can be converted to an Interval.
1378
1379 fn : str
1380 Output filename, if None then creates a temp file for output
1381
1382 trackline : str
1383 If not None, string to be added to the top of the output. Newline
1384 will be added.
1385
1386 in_compressed : bool
1387 Indicates whether the input is compressed
1388
1389 out_compressed : bool
1390 Indicates whether the output should be compressed
1391 """
1392 if fn is None:
1393 fn = self._tmp()
1394
1395 in_open_func = gzip.open if in_compressed else open
1396 out_open_func = gzip.open if out_compressed else open
1397
1398 # special case: if BAM-format BedTool is provided, no trackline should
1399 # be supplied, and don't iterate -- copy the file wholesale
1400 if isinstance(iterable, BedTool) and iterable._isbam:
1401 if trackline:
1402 raise ValueError(
1403 "trackline provided, but input is a BAM "
1404 "file, which takes no track line"
1405 )
1406 with open(fn, "wb") as out_:
1407 out_.write(open(self.fn, "rb").read())
1408 return fn
1409
1410 # If we're just working with filename-based BedTool objects, just copy
1411 # the files directly
1412 if isinstance(iterable, BedTool) and isinstance(iterable.fn, str):
1413 with out_open_func(fn, "wt") as out_:
1414 if sys.version_info > (3,0):
1415 in_ = in_open_func(iterable.fn, "rt", errors="ignore")
1416 else:
1417 in_ = in_open_func(iterable.fn, "rt")
1418 if trackline:
1419 out_.write(trackline.strip() + "\n")
1420 out_.writelines(in_)
1421 in_.close()
1422 else:
1423 with out_open_func(fn, "wt") as out_:
1424 for i in iterable:
1425 if isinstance(i, (list, tuple)):
1426 i = create_interval_from_list(list(i))
1427 out_.write(str(i))
1428 return fn
1429
1430 def handle_kwargs(self, prog:str, arg_order: Optional[list[str]] = None, **kwargs):
1431 """
1432 Handle most cases of BEDTool program calls, but leave the specifics
1433 up to individual methods.
1434
1435 *prog* is a BEDTools program name, e.g., 'intersectBed'.
1436
1437 *arg_order* lists any arguments that are sensitive to order. Everything
1438 else will be reverse-sorted.
1439
1440 *kwargs* are passed directly from the calling method (like
1441 self.intersect).
1442
1443 This method figures out, given how this BedTool was constructed, what
1444 to send to BEDTools programs -- for example, an open file to stdin with
1445 the `-` argument, or a filename with the `-a` argument.
1446 """
1447 pybedtools.logger.debug(
1448 "BedTool.handle_kwargs() got these kwargs:\n%s", pprint.pformat(kwargs)
1449 )
1450
1451 # If you pass in a list, how should it be converted to a BedTools arg?
1452 default_list_delimiter = " "
1453 list_delimiters = {
1454 "annotateBed": " ",
1455 "getOverlap": ",",
1456 "groupBy": ",",
1457 "multiIntersectBed": " ",
1458 "mergeBed": ",",
1459 "intersectBed": " ",
1460 "mapBed": ",",
1461 }
1462 stdin = None
1463
1464 # -----------------------------------------------------------------
1465 # Decide how to send instream1 to BEDTools. If there's no implicit
1466 # instream1 arg, then do nothing.
1467 #
1468 try:
1469 # e.g., 'a' for intersectBed
1470 if self._isbam:
1471 inarg1 = _bam_registry[prog]
1472 else:
1473 inarg1 = _implicit_registry[prog]
1474
1475 # e.g., self.fn or 'a.bed' or an iterator...
1476 instream1 = kwargs[inarg1]
1477
1478 # If it's a BedTool, then get underlying stream
1479 if isinstance(instream1, BedTool):
1480 instream1 = instream1.fn
1481
1482 # Filename? No pipe, just provide the file
1483 if isinstance(instream1, str):
1484 kwargs[inarg1] = instream1
1485 stdin = None
1486
1487 # Open file? Pipe it
1488 # elif isinstance(instream1, file):
1489 # kwargs[inarg1] = 'stdin'
1490 # stdin = instream1
1491
1492 # A generator or iterator: pipe it as a generator of lines
1493 else:
1494 kwargs[inarg1] = "stdin"
1495 stdin = (str(i) for i in instream1)
1496 except KeyError:
1497 pass
1498
1499 # -----------------------------------------------------------------
1500 # Decide how to send instream2 to BEDTools.
1501 try:
1502 # e.g., 'b' for intersectBed
1503 inarg2 = _other_registry[prog]
1504
1505 # e.g., another BedTool
1506 instream2 = kwargs[inarg2]
1507
1508 # Get stream if BedTool
1509 if isinstance(instream2, BedTool):
1510 instream2 = instream2.fn
1511
1512 # Filename
1513 if isinstance(instream2, str):
1514 kwargs[inarg2] = instream2
1515
1516 # If it's a list of strings, then we need to figure out if it's
1517 # a list of filenames or a list of intervals (see issue #156)
1518 #
1519 # Several options:
1520 #
1521 # - assume intervals have tabs but filenames don't
1522 # - assume that, upon being split on tabs, an interval is >=3 fields
1523 # - try creating an interval out of the first thing, success means interval
1524 #
1525 # The last seems the most robust. It does allow filenames with
1526 # tabs; deciding whether or not such filenames are a good idea is
1527 # left to the user.
1528 #
1529 elif isinstance(instream2, (list, tuple)) and isinstance(
1530 instream2[0], str
1531 ):
1532 try:
1533 _ = create_interval_from_list(instream2[0].split("\t"))
1534 kwargs[inarg2] = self._collapse(instream2)
1535 except IndexError:
1536 kwargs[inarg2] = instream2
1537
1538 # Otherwise we need to collapse it in order to send to BEDTools
1539 # programs
1540 else:
1541 kwargs[inarg2] = self._collapse(instream2)
1542
1543 except KeyError:
1544 pass
1545
1546 # If stream not specified, then a tempfile will be created
1547 if kwargs.pop("stream", None):
1548 tmp = None
1549 else:
1550 output = kwargs.pop("output", None)
1551 if output:
1552 tmp = output
1553 else:
1554 tmp = self._tmp()
1555
1556 additional_args = kwargs.pop("additional_args", None)
1557
1558 # Parse the kwargs into BEDTools-ready args
1559 cmds = [prog]
1560
1561 # arg_order mechanism added to fix #345
1562 if arg_order is None:
1563 arg_order = []
1564
1565 for arg in arg_order:
1566 if arg in kwargs:
1567 val = kwargs.pop(arg)
1568 cmds.append("-" + arg)
1569 cmds.append(val)
1570
1571 # The reverse-sort is a temp fix for issue #81
1572 for key, value in sorted(list(kwargs.items()), reverse=True):
1573 if isinstance(value, bool):
1574 if value:
1575 cmds.append("-" + key)
1576 else:
1577 continue
1578 elif isinstance(value, list) or isinstance(value, tuple):
1579 value = list(map(str, value))
1580 try:
1581 delim = list_delimiters[prog]
1582 except KeyError:
1583 delim = default_list_delimiter
1584
1585 if delim == " ":
1586 cmds.append("-" + key)
1587 cmds.extend(value)
1588
1589 # make comma-separated list if that's what's needed
1590 else:
1591 cmds.append("-" + key)
1592 cmds.append(delim.join(value))
1593
1594 else:
1595 cmds.append("-" + key)
1596 cmds.append(str(value))
1597
1598 if additional_args:
1599 cmds.append(additional_args)
1600
1601 return cmds, tmp, stdin
1602
1603 def check_genome(self, **kwargs):
1604 """
1605 Handles the different ways of specifying a genome in kwargs:
1606
1607 g='genome.file' specifies a file directly
1608 genome='dm3' gets the file from genome registry
1609 self.chromsizes could be a dict.\
1610 """
1611
1612 # If both g and genome are missing, assume self.chromsizes
1613 if ("g" not in kwargs) and ("genome" not in kwargs):
1614 if hasattr(self, "chromsizes"):
1615 kwargs["g"] = self.chromsizes
1616 else:
1617 raise ValueError(
1618 'No genome specified. Use the "g" or '
1619 '"genome" kwargs, or use the '
1620 ".set_chromsizes() method"
1621 )
1622
1623 # If both specified, rather than make an implicit decision, raise an
1624 # exception
1625 if "g" in kwargs and "genome" in kwargs:
1626 raise ValueError('Cannot specify both "g" and "genome"')
1627
1628 # Something like genome='dm3' was specified
1629 if "g" not in kwargs and "genome" in kwargs:
1630 if isinstance(kwargs["genome"], dict):
1631 genome_dict = kwargs["genome"]
1632 else:
1633 genome_dict = pybedtools.chromsizes(kwargs["genome"])
1634 genome_file = pybedtools.chromsizes_to_file(genome_dict)
1635 kwargs["g"] = genome_file
1636 del kwargs["genome"]
1637
1638 # By the time we get here, 'g' is specified.
1639
1640 # If a dict was provided, convert to tempfile here
1641 if isinstance(kwargs["g"], dict):
1642 kwargs["g"] = pybedtools.chromsizes_to_file(kwargs["g"])
1643
1644 if not os.path.exists(kwargs["g"]):
1645 msg = 'Genome file "%s" does not exist' % (kwargs["g"])
1646 raise FileNotFoundError(msg)
1647
1648 return kwargs
1649
1650 @_log_to_history
1651 def remove_invalid(self):
1652 """
1653 Remove invalid features that may break BEDTools programs.
1654
1655 >>> a = pybedtools.BedTool("chr1 10 100\\nchr1 10 1",
1656 ... from_string=True)
1657 >>> print(a.remove_invalid()) #doctest: +NORMALIZE_WHITESPACE
1658 chr1 10 100
1659 <BLANKLINE>
1660
1661 """
1662 tmp = self._tmp()
1663
1664 # If it's a file-based BedTool -- which is likely, if we're trying to
1665 # remove invalid features -- then we need to parse it line by line.
1666 if isinstance(self.fn, str):
1667 i = IntervalIterator(open(self.fn, "r"))
1668 else:
1669 tmp = self.saveas()
1670 i = IntervalIterator(open(tmp.fn, "r"))
1671
1672 def _generator():
1673 while True:
1674 try:
1675 feature = next(i)
1676 if feature.start <= feature.stop:
1677 yield feature
1678 else:
1679 continue
1680 except pybedtools.MalformedBedLineError:
1681 continue
1682 except OverflowError:
1683 # This can happen if coords are negative
1684 continue
1685 except IndexError:
1686 continue
1687 except StopIteration:
1688 break
1689
1690 return BedTool(_generator())
1691
1692 def all_hits(self, interval: Interval, same_strand: bool = False, overlap: float = 0.0):
1693 """
1694 Return all intervals that overlap `interval`.
1695
1696 Calls the `all_hits` method of an IntervalFile to return all intervals
1697 in this current BedTool that overlap `interval`.
1698
1699 Require that overlaps have the same strand with same_strand=True.
1700
1701 Notes:
1702 If this current BedTool is generator-based, it will be
1703 converted into a file first.
1704
1705 If this current BedTool refers to a BAM file, it will be
1706 converted to a BED file first using default arguments. If you
1707 don't want this to happen, please convert to BED first before
1708 using this method.
1709 """
1710 if not isinstance(interval, Interval):
1711 raise ValueError("Need an Interval instance")
1712 fn = self.fn
1713 if not isinstance(fn, str):
1714 fn = self.saveas().fn
1715 if self._isbam:
1716 fn = self.bam_to_bed().fn
1717 interval_file = pybedtools.IntervalFile(fn)
1718 return interval_file.all_hits(interval, same_strand, overlap)
1719
1720 def any_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0):
1721 """
1722 Return whether or not any intervals overlap `interval`.
1723
1724 Calls the `any_hits` method of an IntervalFile. If there were any hits
1725 within `interval` in this BedTool, then return 1; otherwise 0.
1726
1727 Require that overlaps have the same strand with same_strand=True.
1728
1729 Notes:
1730 If this current BedTool is generator-based, it will be
1731 converted into a file first.
1732
1733 If this current BedTool refers to a BAM file, it will be
1734 converted to a BED file first using default arguments. If you
1735 don't want this to happen, please convert to BED first before
1736 using this method.
1737 """
1738 if not isinstance(interval, Interval):
1739 raise ValueError("Need an Interval instance")
1740 fn = self.fn
1741 if not isinstance(fn, str):
1742 fn = self.saveas().fn
1743 if self._isbam:
1744 fn = self.bam_to_bed().fn
1745 interval_file = pybedtools.IntervalFile(fn)
1746 return interval_file.any_hits(interval, same_strand, overlap)
1747
1748 def count_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0) -> int:
1749 """
1750 Return the number of intervals that overlap `interval`.
1751
1752 Calls the `count_hits` method of an IntervalFile. Returns the number
1753 of valid hits in this BedTool that overlap `interval`.
1754
1755 Require that overlaps have the same strand with same_strand=True.
1756
1757 Notes:
1758 If this current BedTool is generator-based, it will be
1759 converted into a file first.
1760
1761 If this current BedTool refers to a BAM file, it will be
1762 converted to a BED file first using default arguments. If you
1763 don't want this to happen, please convert to BED first before
1764 using this method.
1765 """
1766 if not isinstance(interval, Interval):
1767 raise ValueError("Need an Interval instance")
1768 fn = self.fn
1769 if not isinstance(fn, str):
1770 fn = self.saveas().fn
1771 if self._isbam:
1772 fn = self.bam_to_bed().fn
1773 interval_file = pybedtools.IntervalFile(fn)
1774 return interval_file.count_hits(interval, same_strand, overlap)
1775
1776 @_log_to_history
1777 @_wraps(prog="bed12ToBed6", implicit="i", bam=None, other=None)
1778 def bed6(self, *args, **kwargs) -> BedTool: # type: ignore
1779 """
1780 Wraps `bedtools bed12tobed6`.
1781 """
1782
1783 # Alias for backward compatibility
1784 bed12tobed6 = bed6
1785
1786 @_log_to_history
1787 @_wraps(prog="bamToBed", implicit="i", other=None, nonbam="ALL", bam="i")
1788 def bam_to_bed(self, *args, **kwargs) -> BedTool: # type: ignore
1789 """
1790 Wraps `bedtools bamtobed`.
1791 """
1792
1793 # Alias for backward compatibility
1794 bamtobed = bam_to_bed
1795
1796 @_wraps(prog="bedToBam", implicit="i", uses_genome=True, force_bam=True)
1797 def _bed_to_bam(self, *args, **kwargs):
1798 """
1799 Wraps bedToBam and is called internally for BED/GFF/VCF files by
1800 self.to_bam (which needs to do something different for SAM files...)
1801 """
1802
1803 @_log_to_history
1804 def to_bam(self, **kwargs):
1805 """
1806 Wraps `bedtools bedtobam`
1807
1808 If self.fn is in BED/VCF/GFF format, call BEDTools' bedToBam. If
1809 self.fn is in SAM format, then create a header out of the genome file
1810 and then convert using `samtools`.
1811 """
1812 if self.file_type == "bam":
1813 return self
1814 if self.file_type in ("bed", "gff", "vcf"):
1815 return self._bed_to_bam(**kwargs)
1816
1817 # TODO: to maintain backwards compatibility we go from Interval to
1818 # AlignedSegment.
1819 if self.file_type == "sam":
1820
1821 # Use pysam, but construct the header out of a provided genome
1822 # file.
1823
1824 # construct a genome out of whatever kwargs were passed in
1825 kwargs = self.check_genome(**kwargs)
1826
1827 # Build a header that we can use for the output BAM file.
1828 genome = dict(i.split() for i in open(kwargs["g"]))
1829 SQ = []
1830 ref_ids = {}
1831 text_header = ["@HD\tVN:1.0"]
1832
1833 for i, (k, v) in enumerate(genome.items()):
1834 SQ.append(dict(SN=k, LN=int(v)))
1835 ref_ids[k] = i
1836 text_header.append("@SQ\tSN:{0}\tLN:{1}".format(k, v))
1837
1838 # And the text-format header
1839 text_header = "\n".join(text_header) + "\n"
1840
1841 # The strategy is to write an actual SAM file to disk, along with
1842 # a header, and then read that back in.
1843 #
1844 # Painfully inefficient, but this will change once all py2 tests
1845 # pass.
1846 sam_tmp = self._tmp()
1847 bam_tmp = self._tmp()
1848 with open(sam_tmp, "w") as fout:
1849 fout.write(text_header)
1850 for interval in self:
1851 fout.write("\t".join(map(str, interval.fields)) + "\n")
1852
1853 samfile = pysam.AlignmentFile(sam_tmp, "r")
1854 bamfile = pysam.AlignmentFile(bam_tmp, "wb", template=samfile)
1855 for alignment in samfile:
1856 bamfile.write(alignment)
1857
1858 samfile.close()
1859 bamfile.close()
1860 new_bedtool = BedTool(bam_tmp)
1861 new_bedtool._isbam = True
1862 return new_bedtool
1863
1864 # Alias for backward compatibility
1865 bedtobam = to_bam
1866
1867 @_log_to_history
1868 @_wraps(prog="intersectBed", implicit="a", other="b", bam="abam",
1869 nonbam="bed", arg_order=["a", "abam"])
1870 def intersect(self, *args, **kwargs) -> BedTool: # type: ignore
1871 """
1872 Wraps `bedtools intersect`.
1873 """
1874
1875 @_log_to_history
1876 @_wraps(
1877 prog="fastaFromBed",
1878 implicit="bed",
1879 bam=None,
1880 other="fi",
1881 make_tempfile_for="fo",
1882 check_stderr=_check_sequence_stderr,
1883 add_to_bedtool={"fo": "seqfn"},
1884 )
1885 def sequence(self, *args, **kwargs) -> BedTool: # type: ignore
1886 '''
1887 Wraps `bedtools getfasta`.
1888
1889 *fi* is passed in by the user; *bed* is automatically passed in as the
1890 bedfile of this object; *fo* by default is a temp file. Use
1891 save_seqs() to save as a file.
1892
1893 The end result is that this BedTool will assign a value to the attribute , self.seqfn,
1894 that points to the new fasta file.
1895
1896 Example usage:
1897
1898 >>> a = pybedtools.BedTool("""
1899 ... chr1 1 10
1900 ... chr1 50 55""", from_string=True)
1901 >>> fasta = pybedtools.example_filename('test.fa')
1902 >>> a = a.sequence(fi=fasta)
1903 >>> print(open(a.seqfn).read())
1904 >chr1:1-10
1905 GATGAGTCT
1906 >chr1:50-55
1907 CCATC
1908 <BLANKLINE>
1909
1910 '''
1911
1912 # Alias for backwards compatibility
1913 getfasta = sequence
1914
1915 @staticmethod
1916 def seq(loc, fasta) -> str:
1917 """
1918 Return just the sequence from a region string or a single location
1919 >>> fn = pybedtools.example_filename('test.fa')
1920 >>> BedTool.seq('chr1:2-10', fn)
1921 'GATGAGTCT'
1922 >>> BedTool.seq(('chr1', 1, 10), fn)
1923 'GATGAGTCT'
1924 """
1925 if isinstance(loc, str):
1926 chrom, start_end = loc.split(":")
1927 start, end = list(map(int, start_end.split("-")))
1928 start -= 1
1929 else:
1930 chrom, start, end = loc[0], loc[1], loc[2]
1931
1932 loc = BedTool("%s\t%i\t%i" % (chrom, start, end), from_string=True)
1933 lseq = loc.sequence(fi=fasta)
1934 return "".join([l.rstrip() for l in open(lseq.seqfn, "r") if l[0] != ">"])
1935
1936 @_log_to_history
1937 @_wraps(
1938 prog="nucBed", implicit="bed", other="fi", check_stderr=_check_sequence_stderr
1939 )
1940 def nucleotide_content(self) -> BedTool: # type: ignore
1941 """
1942 Wraps `bedtools nuc`.
1943
1944 Profiles nucleotide content. The returned BED file contains extra
1945 information about the nucleotide content
1946 """
1947
1948 # Alias for backwards compatibility
1949 nuc = nucleotide_content
1950
1951 @_log_to_history
1952 @_wraps(prog="multiBamCov", implicit="bed")
1953 def multi_bam_coverage(self) -> BedTool: # type: ignore
1954 """
1955 Wraps `bedtools multicov`.
1956
1957 Pass a list of sorted and indexed BAM files as `bams`
1958 """
1959
1960 # Alias for backwards compatibility
1961 multicov = multi_bam_coverage
1962
1963 @_log_to_history
1964 @_wraps(prog="subtractBed", implicit="a", other="b", bam=None)
1965 def subtract(self, *args, **kwargs) -> BedTool: # type: ignore
1966 """
1967 Wraps `bedtools subtract`.
1968
1969 Subtracts from another BED file and returns a new BedTool object.
1970
1971 Example usage:
1972
1973 >>> a = pybedtools.example_bedtool('a.bed')
1974 >>> b = pybedtools.example_bedtool('b.bed')
1975
1976 Do a "stranded" subtraction:
1977
1978 >>> c = a.subtract(b, s=True)
1979
1980 Require 50% of features in `a` to overlap:
1981
1982 >>> c = a.subtract(b, f=0.5)
1983 """
1984 if "a" not in kwargs:
1985 kwargs["a"] = self.fn
1986
1987 if "b" not in kwargs:
1988 if len(args) > 0:
1989 kwargs["b"] = args[0]
1990 else:
1991 raise ValueError("Must specify a BED file to subtract, either as a positional argument or as the 'b' keyword argument.")
1992
1993 cmds, tmp, stdin = self.handle_kwargs(prog="subtractBed", **kwargs)
1994 stream = call_bedtools(cmds, tmp, stdin=stdin)
1995 return BedTool(stream)
1996
1997 @_log_to_history
1998 @_wraps(prog="slopBed", implicit="i", other=None, bam=None, uses_genome=True)
1999 def slop(self, *args, **kwargs) -> BedTool: # type: ignore
2000 """
2001 Wraps `bedtools slop`.
2002 """
2003
2004 @_log_to_history
2005 @_wraps(prog="shiftBed", implicit="i", other=None, bam=None, uses_genome=True)
2006 def shift(self, *args, **kwargs) -> BedTool: # type: ignore
2007 """
2008 Wraps `bedtools shift`.
2009
2010 Shift each feature by user-defined number of bases. Returns a new BedTool object.
2011
2012 Example usage:
2013
2014 >>> a = pybedtools.example_bedtool('a.bed')
2015
2016 Shift every feature by 5bp:
2017
2018 >>> b = a.shift(genome='hg19', s=5)
2019 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
2020 chr1 6 105 feature1 0 +
2021 chr1 105 205 feature2 0 +
2022 chr1 155 505 feature3 0 -
2023 chr1 905 955 feature4 0 +
2024 <BLANKLINE>
2025
2026 Shift features on the '+' strand by -1bp and on '-' strand by +3bp:
2027
2028 >>> b = a.shift(genome='hg19', p=-1, m=3)
2029 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
2030 chr1 0 99 feature1 0 +
2031 chr1 99 199 feature2 0 +
2032 chr1 153 503 feature3 0 -
2033 chr1 899 949 feature4 0 +
2034 <BLANKLINE>
2035
2036 # Disabling, see https://github.com/arq5x/bedtools2/issues/807
2037 Shift features by a fraction of their length (0.50):
2038
2039 #>>> b = a.shift(genome='hg19', pct=True, s=0.50)
2040 #>>> print(b) #doctest: +NORMALIZE_WHITESPACE
2041 #chr1 50 149 feature1 0 +
2042 #chr1 150 250 feature2 0 +
2043 #chr1 325 675 feature3 0 -
2044 #chr1 925 975 feature4 0 +
2045 #<BLANKLINE>
2046
2047 """
2048
2049 @_log_to_history
2050 @_wraps(prog="mergeBed", implicit="i", other=None, bam=None)
2051 def merge(self, *args, **kwargs) -> BedTool: # type: ignore
2052 """
2053 Wraps `bedtools merge`.
2054
2055 Merge overlapping features together. Returns a new BedTool object.
2056
2057 Example usage:
2058
2059 >>> a = pybedtools.example_bedtool('a.bed')
2060
2061 Merge:
2062
2063 >>> c = a.merge()
2064
2065 Allow merging of features 500 bp apart:
2066
2067 >>> c = a.merge(d=500)
2068
2069 """
2070
2071 @_log_to_history
2072 @_wraps(prog="closestBed", implicit="a", other="b", bam=None)
2073 def closest(self, *args, **kwargs) -> BedTool: # type: ignore
2074 """
2075 Wraps `bedtools closest`.
2076
2077 Return a new BedTool object containing closest features in *b*. Note
2078 that the resulting file is no longer a valid BED format; use the
2079 special "_closest" methods to work with the resulting file.
2080
2081 Example usage::
2082
2083 a = BedTool('in.bed')
2084
2085 # get the closest feature in 'other.bed' on the same strand
2086 b = a.closest('other.bed', s=True)
2087
2088 """
2089
2090 @_log_to_history
2091 @_wraps(prog="windowBed", implicit="a", other="b", bam="abam", nonbam="bed")
2092 def window(self, *args, **kwargs) -> BedTool: # type: ignore
2093 """
2094 Wraps `bedtools window`.
2095
2096 Example usage::
2097
2098 >>> a = pybedtools.example_bedtool('a.bed')
2099 >>> b = pybedtools.example_bedtool('b.bed')
2100 >>> print(a.window(b, w=1000)) #doctest: +NORMALIZE_WHITESPACE
2101 chr1 1 100 feature1 0 + chr1 155 200 feature5 0 -
2102 chr1 1 100 feature1 0 + chr1 800 901 feature6 0 +
2103 chr1 100 200 feature2 0 + chr1 155 200 feature5 0 -
2104 chr1 100 200 feature2 0 + chr1 800 901 feature6 0 +
2105 chr1 150 500 feature3 0 - chr1 155 200 feature5 0 -
2106 chr1 150 500 feature3 0 - chr1 800 901 feature6 0 +
2107 chr1 900 950 feature4 0 + chr1 155 200 feature5 0 -
2108 chr1 900 950 feature4 0 + chr1 800 901 feature6 0 +
2109 <BLANKLINE>
2110 """
2111
2112 @_log_to_history
2113 @_wraps(prog="shuffleBed", implicit="i", other=None, bam=None, uses_genome=True)
2114 def shuffle(self, *args, **kwargs) -> BedTool: # type: ignore
2115 """
2116 Wraps `bedtools shuffle`.
2117
2118 Example usage:
2119
2120 >>> a = pybedtools.example_bedtool('a.bed')
2121 >>> seed = 1 # so this test always returns the same results
2122 >>> b = a.shuffle(genome='hg19', chrom=True, seed=seed)
2123 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
2124 chr1 123081365 123081464 feature1 0 +
2125 chr1 243444570 243444670 feature2 0 +
2126 chr1 194620241 194620591 feature3 0 -
2127 chr1 172792873 172792923 feature4 0 +
2128 <BLANKLINE>
2129 """
2130
2131 @_log_to_history
2132 @_wraps(prog="sortBed", implicit="i", uses_genome=True, genome_if=["g", "genome"])
2133 def sort(self, *args, **kwargs) -> BedTool: # type: ignore
2134 """
2135 Wraps `bedtools sort`.
2136
2137 Note that chromosomes are sorted lexicographically, so chr12 will come
2138 before chr9.
2139
2140 Example usage:
2141
2142 >>> a = pybedtools.BedTool('''
2143 ... chr9 300 400
2144 ... chr1 100 200
2145 ... chr1 1 50
2146 ... chr12 1 100
2147 ... chr9 500 600
2148 ... ''', from_string=True)
2149 >>> print(a.sort()) #doctest: +NORMALIZE_WHITESPACE
2150 chr1 1 50
2151 chr1 100 200
2152 chr12 1 100
2153 chr9 300 400
2154 chr9 500 600
2155 <BLANKLINE>
2156 """
2157
2158 @_log_to_history
2159 @_wraps(prog="annotateBed", implicit="i")
2160 def annotate(self, *args, **kwargs) -> BedTool: # type: ignore
2161 """
2162 Wraps `bedtools annotate`.
2163
2164 Annotate this BedTool with a list of other files.
2165 Example usage:
2166
2167 >>> a = pybedtools.example_bedtool('a.bed')
2168 >>> b_fn = pybedtools.example_filename('b.bed')
2169 >>> print(a.annotate(files=b_fn)) #doctest: +NORMALIZE_WHITESPACE
2170 chr1 1 100 feature1 0 + 0.000000
2171 chr1 100 200 feature2 0 + 0.450000
2172 chr1 150 500 feature3 0 - 0.128571
2173 chr1 900 950 feature4 0 + 0.020000
2174 <BLANKLINE>
2175 """
2176
2177 @_log_to_history
2178 @_wraps(prog="flankBed", implicit="i", uses_genome=True)
2179 def flank(self, *args, **kwargs) -> BedTool:
2180 """
2181 Wraps `bedtools flank`.
2182
2183 Example usage:
2184
2185 >>> a = pybedtools.example_bedtool('a.bed')
2186 >>> print(a.flank(genome='hg19', b=100)) #doctest: +NORMALIZE_WHITESPACE
2187 chr1 0 1 feature1 0 +
2188 chr1 100 200 feature1 0 +
2189 chr1 0 100 feature2 0 +
2190 chr1 200 300 feature2 0 +
2191 chr1 50 150 feature3 0 -
2192 chr1 500 600 feature3 0 -
2193 chr1 800 900 feature4 0 +
2194 chr1 950 1050 feature4 0 +
2195 <BLANKLINE>
2196
2197 """
2198 kwargs = self.check_genome(**kwargs)
2199
2200 if "i" not in kwargs:
2201 kwargs["i"] = self.fn
2202
2203 cmds, tmp, stdin = self.handle_kwargs(prog="flankBed", **kwargs)
2204 stream = call_bedtools(cmds, tmp, stdin=stdin)
2205 return BedTool(stream)
2206
2207 @_log_to_history
2208 @_wraps(
2209 prog="genomeCoverageBed",
2210 implicit="i",
2211 bam="ibam",
2212 genome_none_if=["ibam"],
2213 genome_ok_if=["ibam"],
2214 uses_genome=True,
2215 nonbam="ALL",
2216 )
2217 def genome_coverage(self, *args, **kwargs):
2218 """
2219 Wraps `bedtools genomecov`.
2220
2221 Note that some invocations of `bedtools genomecov` do not result in
2222 a properly-formatted BED file. For example, the default behavior is to
2223 report a histogram of coverage. Iterating over the resulting,
2224 non-BED-format file will raise exceptions in pybedtools' parser.
2225
2226 Consider using the `BedTool.to_dataframe` method to convert these
2227 non-BED files into a pandas DataFrame for further use.
2228
2229 Example usage:
2230
2231 BAM file input does not require a genome:
2232
2233 >>> a = pybedtools.example_bedtool('x.bam')
2234 >>> b = a.genome_coverage(bg=True)
2235 >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE
2236 chr2L 9329 9365 1
2237 chr2L 10212 10248 1
2238 chr2L 10255 10291 1
2239
2240 Other input does require a genome:
2241
2242 >>> a = pybedtools.example_bedtool('x.bed')
2243 >>> b = a.genome_coverage(bg=True, genome='dm3')
2244 >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE
2245 chr2L 9329 9365 1
2246 chr2L 10212 10248 1
2247 chr2L 10255 10291 1
2248
2249 Non-BED format results:
2250 >>> a = pybedtools.example_bedtool('x.bed')
2251 >>> b = a.genome_coverage(genome='dm3')
2252 >>> df = b.to_dataframe(names=['chrom', 'depth', 'n', 'chromsize', 'fraction'])
2253 """
2254
2255 # Alias for backwards compatibility
2256 genomecov = genome_coverage
2257
2258 @_log_to_history
2259 @_wraps(prog="coverageBed", implicit="a", other="b", bam="abam", nonbam="ALL")
2260 def coverage(self, *args, **kwargs) -> BedTool: # type: ignore
2261 """
2262 Wraps `bedtools coverage`.
2263
2264 Note that starting in version 2.24.0, BEDTools swapped the semantics of
2265 the "a" and "b" files.
2266
2267 Example usage:
2268
2269 >>> a = pybedtools.example_bedtool('a.bed')
2270 >>> b = pybedtools.example_bedtool('b.bed')
2271 >>> c = b.coverage(a)
2272 >>> c.head(3) #doctest: +NORMALIZE_WHITESPACE
2273 chr1 155 200 feature5 0 - 2 45 45 1.0000000
2274 chr1 800 901 feature6 0 + 1 1 101 0.0099010
2275 """
2276
2277 @_log_to_history
2278 @_wraps(
2279 prog="maskFastaFromBed",
2280 implicit="bed",
2281 other="fi",
2282 make_tempfile_for="fo",
2283 add_to_bedtool={"fo": "seqfn"},
2284 check_stderr=_check_sequence_stderr,
2285 )
2286 def mask_fasta(self, *args, **kwargs) -> BedTool: # type: ignore
2287 """
2288 Wraps `bedtools maskfasta`.
2289
2290 Masks a fasta file at the positions in a BED file and saves result as
2291 'out' and stores the filename in seqfn.
2292
2293 >>> a = pybedtools.BedTool('chr1 100 110', from_string=True)
2294 >>> fasta_fn = pybedtools.example_filename('test.fa')
2295 >>> a = a.mask_fasta(fi=fasta_fn, fo='masked.fa.example')
2296 >>> b = a.slop(b=2, genome='hg19')
2297 >>> b = b.sequence(fi=a.seqfn)
2298 >>> print(open(b.seqfn).read())
2299 >chr1:98-112
2300 TTNNNNNNNNNNAT
2301 <BLANKLINE>
2302 >>> os.unlink('masked.fa.example')
2303 >>> if os.path.exists('masked.fa.example.fai'):
2304 ... os.unlink('masked.fa.example.fai')
2305 """
2306
2307 # Alias for backwards compatibility
2308 maskfasta = mask_fasta
2309
2310 @_log_to_history
2311 @_wraps(prog="complementBed", implicit="i", uses_genome=True)
2312 def complement(self, *args, **kwargs) -> BedTool: # type: ignore
2313 """
2314 Wraps `bedtools complement`.
2315 Example usage:
2316
2317 >>> a = pybedtools.example_bedtool('a.bed')
2318 >>> a.complement(genome='hg19').head(5) #doctest: +NORMALIZE_WHITESPACE
2319 chr1 0 1
2320 chr1 500 900
2321 chr1 950 249250621
2322 chr2 0 243199373
2323 chr3 0 198022430
2324 """
2325
2326 @_log_to_history
2327 @_wraps(prog="getOverlap", implicit="i")
2328 def overlap(self, *args, **kwargs) -> BedTool: # type: ignore
2329 """
2330 Wraps `bedtools overlap`.
2331
2332 Example usage:
2333
2334 >>> a = pybedtools.example_bedtool('a.bed')
2335 >>> b = pybedtools.example_bedtool('b.bed')
2336 >>> c = a.window(b, w=10).overlap(cols=[2,3,8,9])
2337 >>> print(c) #doctest: +NORMALIZE_WHITESPACE
2338 chr1 100 200 feature2 0 + chr1 155 200 feature5 0 - 45
2339 chr1 150 500 feature3 0 - chr1 155 200 feature5 0 - 45
2340 chr1 900 950 feature4 0 + chr1 800 901 feature6 0 + 1
2341 <BLANKLINE>
2342 """
2343
2344 # TODO: needs test files and doctests written
2345 @_log_to_history
2346 @_wraps(prog="pairToBed", implicit="a", other="b", bam="abam", nonbam="bedpe")
2347 def pair_to_bed(self, *args, **kwargs) -> BedTool: # type: ignore
2348 """
2349 Wraps `bedtools pairtobed`.
2350 """
2351
2352 # Alias for backwards compatibility
2353 pairtobed = pair_to_bed
2354
2355 @_log_to_history
2356 @_wraps(prog="pairToPair", implicit="a", other="b")
2357 def pair_to_pair(self) -> BedTool: # type: ignore
2358 """
2359 Wraps `bedtools pairtopair`.
2360 """
2361
2362 # Alias for backwards compatibility
2363 pairtopair = pair_to_pair
2364
2365 @_log_to_history
2366 @_wraps(prog="groupBy", implicit="i")
2367 def groupby(self, *args, **kwargs) -> BedTool:
2368 """
2369 Wraps `bedtools groupby`.
2370
2371 Example usage:
2372
2373 >>> a = pybedtools.example_bedtool('gdc.gff')
2374 >>> b = pybedtools.example_bedtool('gdc.bed')
2375 >>> c = a.intersect(b, c=True)
2376 >>> d = c.groupby(g=[1, 4, 5], c=10, o=['sum'])
2377 >>> print(d) #doctest: +NORMALIZE_WHITESPACE
2378 chr2L 41 70 0
2379 chr2L 71 130 2
2380 chr2L 131 170 4
2381 chr2L 171 200 0
2382 chr2L 201 220 1
2383 chr2L 41 130 2
2384 chr2L 171 220 1
2385 chr2L 41 220 7
2386 chr2L 161 230 6
2387 chr2L 41 220 7
2388 <BLANKLINE>
2389
2390 """
2391
2392 @_log_to_history
2393 @_wraps(prog="tagBam", implicit="i", bam="i")
2394 def tag_bam(self, *args, **kwargs) -> pysam.AlignmentFile: # type: ignore
2395 """
2396 Wraps `bedtools tag`.
2397
2398 `files` and `labels` should lists of equal length.
2399
2400 """
2401 if "i" in kwargs:
2402 if "labels" in kwargs:
2403 if len(kwargs["i"]) != len(kwargs["labels"]):
2404 raise ValueError("files and labels must be lists of equal length")
2405
2406
2407 # Alias for backwards compatibility
2408 tag = tag_bam
2409
2410 @_log_to_history
2411 @_wraps(prog="mapBed", implicit="a", other="b")
2412 def map(self, *args, **kwargs) -> BedTool: # type: ignore
2413 """
2414 Wraps `bedtools map`; See also :meth:`BedTool.each`.
2415 """
2416
2417 @_log_to_history
2418 @_wraps(prog="multiIntersectBed", uses_genome=True, genome_if=["empty"])
2419 def multi_intersect(self, *args, **kwargs) -> BedTool: # type: ignore
2420 """
2421 Wraps `bedtools multiintersect`.
2422
2423 Provide a list of filenames as the "i" argument. e.g. if you already
2424 have BedTool objects then use their `.fn` attribute, like this::
2425
2426 >>> x = pybedtools.BedTool()
2427 >>> a = pybedtools.example_bedtool('a.bed')
2428 >>> b = pybedtools.example_bedtool('b.bed')
2429 >>> result = x.multi_intersect(i=[a.fn, b.fn])
2430 >>> print(result) #doctest: +NORMALIZE_WHITESPACE
2431 chr1 1 155 1 1 1 0
2432 chr1 155 200 2 1,2 1 1
2433 chr1 200 500 1 1 1 0
2434 chr1 800 900 1 2 0 1
2435 chr1 900 901 2 1,2 1 1
2436 chr1 901 950 1 1 1 0
2437 <BLANKLINE>
2438
2439 """
2440
2441 # Alias for backwards compatibility
2442 multiinter = multi_intersect
2443
2444 @_log_to_history
2445 @_wraps(prog="randomBed", uses_genome=True)
2446 def random(self, *args, **kwargs) -> BedTool: # type: ignore
2447 """
2448 Wraps `bedtools random`.
2449
2450 Since this method does not operate on an existing file, create
2451 a BedTool with no arguments and then call this method, e.g.,
2452
2453 >>> x = BedTool()
2454 >>> y = x.random(l=100, n=10, genome='hg19')
2455 """
2456
2457 @_log_to_history
2458 @_wraps("bedpeToBam", implicit="i", uses_genome=True, force_bam=True)
2459 def bedpe_to_bam(self, *args, **kwargs) -> pysam.AlignmentFile: # type: ignore
2460 """
2461 Wraps `bedtools bedpetobam`.
2462 """
2463
2464 # Alias for backwards compatibility
2465 bedpetobam = bedpe_to_bam
2466
2467 @_log_to_history
2468 @_wraps(prog="clusterBed", implicit="i")
2469 def cluster(self, *args, **kwargs) -> BedTool: # type: ignore
2470 """
2471 Wraps `bedtools cluster`.
2472 """
2473
2474 @_log_to_history
2475 @_wraps(prog="unionBedGraphs")
2476 def union_bedgraphs(self, *args, **kwargs) -> BedTool: # type: ignore
2477 """
2478 Wraps `bedtools unionbedg`.
2479
2480 Warning: using the `header=True` kwarg will result in a file that is
2481 not in true BED format, which may break downstream analysis.
2482 """
2483 if "header" in kwargs:
2484 if kwargs["header"] is True:
2485 warn("Using header=True with unionbedg will result in a file that is not in true BED format, which may break downstream analysis.")
2486
2487
2488 # Alias for backwards compatibility
2489 unionbedg = union_bedgraphs
2490
2491 @_log_to_history
2492 @_wraps(prog="windowMaker", uses_genome=True, genome_none_if=["b"], other="b", arg_order=["w"])
2493 def window_maker(self, *args, **kwargs) -> BedTool: # type: ignore
2494 """
2495 Wraps `bedtools makewindows`.
2496 """
2497
2498 # Alias for backwards compatibility
2499 makewindows = window_maker
2500
2501 @_log_to_history
2502 @_wraps(prog="expandCols", implicit="i")
2503 def expand(self, *args, **kwargs) -> BedTool: # type: ignore
2504 """
2505 Wraps `bedtools expand`
2506 """
2507
2508 @_log_to_history
2509 @_wraps(prog="linksBed", implicit="i", add_to_bedtool={"stdout": "links_html"})
2510 def links(self, *args, **kwargs) -> BedTool: # type: ignore
2511 """
2512 Wraps `linksBed`.
2513
2514 The resulting BedTool will assign a value to the attribute `links_html`. This
2515 attribute is a temp filename containing the HTML links.
2516 """
2517
2518 @_log_to_history
2519 @_wraps(prog="bedToIgv", implicit="i", add_to_bedtool={"stdout": "igv_script"})
2520 def igv(self, *args, **kwargs) -> BedTool: # type: ignore
2521 """
2522 Wraps `bedtools igv`.
2523
2524 The resulting BedTool will assign a value to the attribute `igv_script`. This
2525 attribute is a temp filename containing the IGV script.
2526 """
2527
2528 @_log_to_history
2529 @_wraps(
2530 prog="bamToFastq",
2531 implicit="i",
2532 bam="i",
2533 make_tempfile_for="fq",
2534 add_to_bedtool={"fq": "fastq"},
2535 )
2536 def bam_to_fastq(self, *args, **kwargs) -> BedTool: # type: ignore
2537 """
2538 Wraps `bedtools bamtofastq`.
2539
2540 The `fq` argument is required.
2541
2542 The resulting BedTool will assign a value to the attribute `fastq`.
2543 """
2544
2545 # Alias for backwards compatibility
2546 bamtofastq = bam_to_fastq
2547
2548 @_wraps(
2549 prog="jaccard",
2550 implicit="a",
2551 other="b",
2552 does_not_return_bedtool=_jaccard_output_to_dict,
2553 )
2554 def jaccard(self, *args, **kwargs) -> dict[str, Any]: # type: ignore
2555 """
2556 Returns a dictionary with keys (intersection, union, jaccard).
2557 """
2558
2559 @_wraps(
2560 prog="reldist",
2561 implicit="a",
2562 other="b",
2563 does_not_return_bedtool=_reldist_output_handler,
2564 )
2565 def reldist(self, *args, **kwargs) -> BedTool | dict[str, Any]: # type: ignore
2566 """
2567 If detail=False, then return a dictionary with keys (reldist, count,
2568 total, fraction), which is the summary of the bedtools reldist.
2569
2570 Otherwise return a BedTool, with the relative distance for each
2571 interval in A in the last column.
2572 """
2573 if "detail" in kwargs:
2574 if kwargs["detail"] is False:
2575 warn("Using detail=False with reldist will return a dictionary with keys (reldist, count, total, fraction), which is the summary of the bedtools reldist."
2576 "Not a BedTool object.")
2577
2578
2579 @_wraps(prog="sample", implicit="i", bam="i")
2580 def sample(self, *args, **kwargs) -> BedTool: # type: ignore
2581 """
2582 Wraps 'sample'.
2583 """
2584
2585 @_wraps(
2586 prog="fisher",
2587 implicit="a",
2588 other="b",
2589 uses_genome=True,
2590 does_not_return_bedtool=FisherOutput,
2591 )
2592 def fisher(self, *args, **kwargs) -> FisherOutput: # type: ignore
2593 """
2594 Wraps 'fisher'. Returns an object representing the output.
2595
2596 >>> a = pybedtools.example_bedtool('a.bed')
2597 >>> b = pybedtools.example_bedtool('b.bed')
2598 >>> f = a.fisher(b, genome='hg19')
2599 >>> print(f) # doctest: +NORMALIZE_WHITESPACE
2600 # Number of query intervals: 4
2601 # Number of db intervals: 2
2602 # Number of overlaps: 3
2603 # Number of possible intervals (estimated): 13958448
2604 # phyper(3 - 1, 4, 13958448 - 4, 2, lower.tail=F)
2605 # Contingency Table Of Counts
2606 #_________________________________________
2607 # | in -b | not in -b |
2608 # in -a | 3 | 1 |
2609 # not in -a | 0 | 13958444 |
2610 #_________________________________________
2611 # p-values for fisher's exact test
2612 left right two-tail ratio
2613 1 8.8247e-21 8.8247e-21 inf
2614 <BLANKLINE>
2615
2616
2617 >>> f.table['not in -a']['in -b']
2618 0
2619
2620 >>> f.table['not in -a']['not in -b']
2621 13958444
2622
2623 >>> f.table['in -a']['in -b']
2624 3
2625
2626 >>> f.table['in -a']['not in -b']
2627 1
2628
2629 >>> f.two_tail
2630 8.8247e-21
2631 """
2632
2633 @_wraps(prog="split", implicit="i", does_not_return_bedtool=SplitOutput)
2634 def splitbed(self, *args, **kwargs) -> SplitOutput: # type: ignore
2635 """
2636 Wraps 'bedtools split'.
2637
2638 BedTool objects have long had a `split` method which splits intervals
2639 according to a custom function. Now that BEDTools has a `split` tool,
2640 the method name conflicts. To maintain backwards compatibility, the
2641 method wrapping the BEDTools command is called `splitbed`.
2642
2643 Since this tool does not return a single BED file, the method parses
2644 the output and returns a SplitOutput object, which includes an
2645 attribute, `bedtools`, that is a list of BedTool objects created from
2646 the split files.
2647
2648 To keep the working directory clean, you may want to consider using
2649 `prefix=BedTool._tmp()` to get a temp file that will be deleted when
2650 Python exits cleanly.
2651
2652 >>> a = pybedtools.example_bedtool('a.bed')
2653 >>> s = a.splitbed(n=2, p="split")
2654 >>> assert len(a) == 4, len(a)
2655 >>> assert len(s.bedtools) == 2
2656 >>> print(s.bedtools[0]) # doctest: +NORMALIZE_WHITESPACE
2657 chr1 150 500 feature3 0 -
2658 <BLANKLINE>
2659 >>> print(s.bedtools[1]) # doctest: +NORMALIZE_WHITESPACE
2660 chr1 100 200 feature2 0 +
2661 chr1 1 100 feature1 0 +
2662 chr1 900 950 feature4 0 +
2663 <BLANKLINE>
2664 """
2665
2666 @_wraps(prog="spacing", implicit="i")
2667 def spacing(self, *args, **kwargs) -> BedTool: # type: ignore
2668 """
2669 Wraps `bedtools spacing`
2670
2671 >>> a = pybedtools.example_bedtool('a.bed')
2672 >>> print(a.spacing()) # doctest: +NORMALIZE_WHITESPACE
2673 chr1 1 100 feature1 0 + .
2674 chr1 100 200 feature2 0 + 0
2675 chr1 150 500 feature3 0 - -1
2676 chr1 900 950 feature4 0 + 400
2677 """
2678
2679 def count(self) -> int:
2680 """
2681 Count the number features in this BedTool.
2682
2683 Number of features in BED file. Does the same thing as len(self), which
2684 actually just calls this method.
2685
2686 Only counts the actual features. Ignores any track lines, browser
2687 lines, lines starting with a "#", or blank lines.
2688
2689 Example usage:
2690
2691 >>> a = pybedtools.example_bedtool('a.bed')
2692 >>> a.count()
2693 4
2694 """
2695 if hasattr(self, "next") or hasattr(self, "__next__"):
2696 return sum(1 for _ in self)
2697 return sum(1 for _ in iter(self))
2698
2699 def print_sequence(self) -> str:
2700 """
2701 Print the sequence that was retrieved by BedTool.sequence.
2702 See usage example in :meth:`BedTool.sequence`.
2703
2704 Returns:
2705 str: The sequence as a string.
2706
2707 Raises:
2708 ValueError: If the sequence has not been generated using .sequence(fasta).
2709 """
2710 if self.seqfn is None:
2711 raise ValueError("Use .sequence(fasta) to get the sequence first")
2712 f = open(self.seqfn)
2713 s = f.read()
2714 f.close()
2715 return s
2716
2717 def save_seqs(self, fn:str) -> BedTool:
2718 """
2719 Save sequences, after calling BedTool.sequence.
2720
2721 In order to use this function, you need to have called
2722 the :meth:`BedTool.sequence()` method.
2723
2724 A new BedTool object is returned which references the newly saved file.
2725
2726 Example usage:
2727
2728 >>> a = pybedtools.BedTool('''
2729 ... chr1 1 10
2730 ... chr1 50 55''', from_string=True)
2731 >>> fasta = pybedtools.example_filename('test.fa')
2732 >>> a = a.sequence(fi=fasta)
2733 >>> print(open(a.seqfn).read())
2734 >chr1:1-10
2735 GATGAGTCT
2736 >chr1:50-55
2737 CCATC
2738 <BLANKLINE>
2739 >>> b = a.save_seqs('example.fa')
2740 >>> assert open(b.fn).read() == open(a.fn).read()
2741 >>> if os.path.exists('example.fa'):
2742 ... os.unlink('example.fa')
2743 """
2744
2745 if self.seqfn is None:
2746 raise ValueError("Use .sequence(fasta) to get the sequence first")
2747
2748 with open(fn, "w") as fout:
2749 if self.seqfn is None:
2750 raise ValueError("Use .sequence(fasta) to get the sequence first")
2751 with open(self.seqfn) as seqfile:
2752 fout.write(seqfile.read())
2753
2754 new_bedtool = BedTool(self.fn)
2755 new_bedtool.seqfn = fn
2756 return new_bedtool
2757
2758 def randomstats(
2759 self,
2760 other: BedTool,
2761 iterations: int,
2762 new: bool = False,
2763 genome_fn: Optional[str] = None,
2764 include_distribution: bool = False,
2765 **kwargs
2766 ) -> dict[str, Any]:
2767 """
2768 Dictionary of results from many randomly shuffled intersections.
2769
2770 Sends args and kwargs to :meth:`BedTool.randomintersection` and
2771 compiles results into a dictionary with useful stats. Requires
2772 numpy.
2773
2774 If `include_distribution` is True, then the dictionary will include the
2775 full distribution; otherwise, the distribution is deleted and cleaned
2776 up to save on memory usage.
2777
2778 This is one possible way of assigning significance to overlaps between
2779 two files. See, for example:
2780
2781 Negre N, Brown CD, Shah PK, Kheradpour P, Morrison CA, et al. 2010
2782 A Comprehensive Map of Insulator Elements for the Drosophila
2783 Genome. PLoS Genet 6(1): e1000814. doi:10.1371/journal.pgen.1000814
2784
2785 Example usage:
2786
2787 Make chromsizes a very small genome for this example:
2788
2789 >>> chromsizes = {'chr1':(1,1000)}
2790 >>> a = pybedtools.example_bedtool('a.bed').set_chromsizes(chromsizes)
2791 >>> b = pybedtools.example_bedtool('b.bed')
2792 >>> try:
2793 ... results = a.randomstats(b, 100, debug=True)
2794 ... except ImportError:
2795 ... pass
2796
2797 *results* is a dictionary that you can inspect.
2798
2799 (Note that the following examples are not run as part of the doctests
2800 to avoid forcing users to install NumPy just to pass tests)
2801
2802 The actual overlap::
2803
2804 print(results['actual'])
2805 3
2806
2807 The median of all randomized overlaps::
2808
2809 print(results['median randomized'])
2810 2.0
2811
2812 The percentile of the actual overlap in the distribution of randomized
2813 overlaps, which can be used to get an empirical p-value::
2814
2815 print(results['percentile'])
2816 90.0
2817 """
2818 if ("intersect_kwargs" not in kwargs) or (kwargs["intersect_kwargs"] is None):
2819 kwargs["intersect_kwargs"] = {"u": True}
2820 try:
2821 import numpy as np
2822 except ImportError:
2823 raise ImportError("Need to install NumPy for stats...")
2824
2825 def percentileofscore(a, score):
2826 """
2827 copied from scipy.stats.percentileofscore, to avoid dependency on
2828 scipy.
2829 """
2830 a = np.array(a)
2831 n = len(a)
2832
2833 if not (np.any(a == score)):
2834 a = np.append(a, score)
2835 a_len = np.array(list(range(len(a))))
2836 else:
2837 a_len = np.array(list(range(len(a)))) + 1.0
2838
2839 a = np.sort(a)
2840 idx = tuple([a == score])
2841 pct = (np.mean(a_len[idx]) / n) * 100.0
2842 return pct
2843
2844 if isinstance(other, str):
2845 other = BedTool(other)
2846 else:
2847 assert isinstance(
2848 other, BedTool
2849 ), "Either filename or another BedTool instance required"
2850
2851 # Actual (unshuffled) counts.
2852 i_kwargs = kwargs["intersect_kwargs"]
2853 actual = len(self.intersect(other, **i_kwargs))
2854
2855 # List of counts from randomly shuffled versions.
2856 # Length of counts == *iterations*.
2857
2858 if not new:
2859 distribution = self.randomintersection(
2860 other, iterations=iterations, **kwargs
2861 )
2862 else:
2863 # use new mechanism
2864 if genome_fn is None:
2865 raise ValueError(
2866 "`genome_fn` must be provided if using the "
2867 "new _randomintersection mechanism"
2868 )
2869 distribution = self._randomintersection(
2870 other, iterations=iterations, genome_fn=genome_fn, **kwargs
2871 )
2872
2873 distribution = np.array(list(distribution))
2874
2875 # Median of distribution
2876 med_count = np.median(distribution)
2877
2878 n = float(len(distribution))
2879
2880 frac_above = sum(distribution > actual) / n
2881 frac_below = sum(distribution < actual) / n
2882
2883 normalized = actual / med_count
2884
2885 lower_thresh = 2.5
2886 upper_thresh = 97.5
2887 lower, upper = np.percentile(distribution, [lower_thresh, upper_thresh])
2888
2889 actual_percentile = percentileofscore(distribution, actual)
2890 d = {
2891 "iterations": iterations,
2892 "actual": actual,
2893 "file_a": self.fn,
2894 "file_b": other.fn,
2895 self.fn: len(self),
2896 other.fn: len(other),
2897 "self": len(self),
2898 "other": len(other),
2899 "frac randomized above actual": frac_above,
2900 "frac randomized below actual": frac_below,
2901 "median randomized": med_count,
2902 "normalized": normalized,
2903 "percentile": actual_percentile,
2904 "lower_%sth" % lower_thresh: lower,
2905 "upper_%sth" % upper_thresh: upper,
2906 }
2907 if include_distribution:
2908 d["distribution"] = distribution
2909 else:
2910 del distribution
2911 return d
2912
2913 def random_op(self, *args, **kwargs):
2914 """
2915 For backwards compatibility; see BedTool.parallel_apply instead.
2916 """
2917 return self.parallel_apply(*args, **kwargs)
2918
2919 def parallel_apply(
2920 self,
2921 iterations: int,
2922 func: Callable,
2923 func_args: Iterable,
2924 func_kwargs: dict,
2925 processes: int=1,
2926 _orig_pool: Optional[Pool] = None # type: ignore
2927 ):
2928 """
2929 Generalized method for applying a function in parallel.
2930
2931 Typically used when having to do many random shufflings.
2932
2933 `func_args` and `func_kwargs` will be passed to `func` each time in
2934 `iterations`, and these iterations will be split across `processes`
2935 processes.
2936
2937 Notes on the function, `func`:
2938
2939 * the function should manually remove any tempfiles created. This
2940 is because the BedTool.TEMPFILES list of auto-created tempfiles
2941 does not share state across processes, so things will not get
2942 cleaned up automatically as they do in a single-process
2943 pybedtools session.
2944
2945 * this includes deleting any "chromsizes" or genome files --
2946 generally it will be best to require a genome filename in
2947 `func_kwargs` if you'll be using any BedTool methods that accept
2948 the `g` kwarg.
2949
2950 * the function should be a module-level function (rather than a
2951 class method) because class methods can't be pickled across
2952 process boundaries
2953
2954 * the function can have any signature and have any return value
2955
2956 `_orig_pool` can be a previously-created multiprocessing.Pool instance;
2957 otherwise, a new Pool will be created with `processes`
2958 """
2959 if processes == 1:
2960 for _ in range(iterations):
2961 yield func(*func_args, **func_kwargs)
2962 raise StopIteration
2963
2964 if _orig_pool:
2965 p = _orig_pool
2966 else:
2967 p = Pool(processes)
2968 iterations_each = [iterations / processes] * processes
2969 iterations_each[-1] += iterations % processes
2970
2971 # FYI some useful info on apply_async:
2972 # http://stackoverflow.com/questions/8533318/
2973 # python-multiprocessing-pool-when-to-use-apply-apply-async-or-map
2974 #
2975 # Here, we don't care about the order, and don't want the subprocesses
2976 # to block.
2977 results = [
2978 p.apply_async(func, func_args, func_kwargs) for _ in range(iterations)
2979 ]
2980 for r in results:
2981 yield r.get()
2982 raise StopIteration
2983
2984 def random_jaccard(
2985 self,
2986 other: BedTool,
2987 genome_fn: Optional[str] = None,
2988 iterations: Optional[int] = None,
2989 processes: int = 1,
2990 _orig_pool: Optional[Pool] = None,
2991 shuffle_kwargs: Optional[dict[str, Any]] = None,
2992 jaccard_kwargs: Optional[dict[str, Any]] = None,
2993 ) -> list:
2994 """
2995 Computes the naive Jaccard statistic (intersection divided by union).
2996
2997 .. note::
2998
2999 If you don't need the randomization functionality of this method,
3000 you can use the simpler BedTool.jaccard method instead.
3001
3002 See Favorov et al. (2012) PLoS Comput Biol 8(5): e1002529 for more
3003 info on the Jaccard statistic for intersections.
3004
3005 If `iterations` is None, then do not perform random shufflings.
3006
3007 If `iterations` is an integer, perform `iterations` random shufflings,
3008 each time computing the Jaccard statistic to build an empirical
3009 distribution. `genome_fn` will also be needed; optional `processes`
3010 will split the iterations across multiple CPUs.
3011
3012 Returns a tuple of the observed Jaccard statistic and a list of the
3013 randomized statistics (which will be an empty list if `iterations` was
3014 None).
3015 """
3016 if shuffle_kwargs is None:
3017 shuffle_kwargs = {}
3018 if jaccard_kwargs is None:
3019 jaccard_kwargs = {}
3020 if not genome_fn:
3021 raise ValueError("Need a genome filename in order to perform randomization")
3022 return list(
3023 self.parallel_apply(
3024 iterations=iterations if iterations else 1,
3025 func=pybedtools.stats.random_jaccard,
3026 func_args=(self, other),
3027 func_kwargs=dict(
3028 genome_fn=genome_fn,
3029 shuffle_kwargs=shuffle_kwargs,
3030 jaccard_kwargs=jaccard_kwargs,
3031 ),
3032 processes=processes,
3033 _orig_pool=_orig_pool,
3034 )
3035 )
3036
3037 def _randomintersection(
3038 self,
3039 other: BedTool,
3040 iterations: int,
3041 genome_fn: str,
3042 intersect_kwargs: Optional[dict[str, Any]] = None,
3043 _orig_pool:Optional[Pool] = None,
3044 shuffle_kwargs: Optional[dict[str, Any]] = None,
3045 processes: int = 1,
3046 ):
3047 """
3048 Re-implementation of BedTool.randomintersection using the new
3049 `random_op` method
3050 """
3051 if shuffle_kwargs is None:
3052 shuffle_kwargs = {}
3053 if intersect_kwargs is None:
3054 intersect_kwargs = dict(u=True)
3055 if not genome_fn:
3056 raise ValueError("Need a genome filename in order to perform randomization")
3057 return list(
3058 self.parallel_apply(
3059 iterations=iterations,
3060 func=pybedtools.stats.random_intersection,
3061 func_args=(self, other),
3062 func_kwargs=dict(
3063 genome_fn=genome_fn,
3064 shuffle_kwargs=shuffle_kwargs,
3065 intersect_kwargs=intersect_kwargs,
3066 ),
3067 processes=processes,
3068 _orig_pool=_orig_pool,
3069 )
3070 )
3071
3072 def randomintersection_bp(
3073 self,
3074 other: BedTool,
3075 iterations: int,
3076 genome_fn: str,
3077 intersect_kwargs: Optional[dict[str, Any]] = None,
3078 shuffle_kwargs: Optional[dict[str, Any]] = None,
3079 processes: int = 1,
3080 _orig_pool:Optional[Pool] = None,
3081 ) -> list[int]:
3082 """
3083 Like randomintersection, but return the bp overlap instead of the
3084 number of intersecting intervals.
3085 """
3086 if shuffle_kwargs is None:
3087 shuffle_kwargs = {}
3088 if intersect_kwargs is None:
3089 intersect_kwargs = {}
3090 if not genome_fn:
3091 raise ValueError("Need a genome filename in order to perform randomization")
3092 return list(
3093 self.parallel_apply(
3094 iterations=iterations,
3095 func=pybedtools.stats.random_intersection_bp,
3096 func_args=(self, other),
3097 func_kwargs=dict(
3098 genome_fn=genome_fn,
3099 shuffle_kwargs=shuffle_kwargs,
3100 intersect_kwargs=intersect_kwargs,
3101 ),
3102 processes=processes,
3103 _orig_pool=_orig_pool,
3104 )
3105 )
3106
3107 def randomintersection(
3108 self,
3109 other: BedTool,
3110 iterations: int,
3111 intersect_kwargs: Optional[dict[str, Any]] = None,
3112 shuffle_kwargs: Optional[dict[str, Any]] = None,
3113 debug: bool = False,
3114 report_iterations: bool = False,
3115 processes: Optional[int] = None,
3116 _orig_processes: Optional[int] = None,
3117 ) -> Iterator[int]:
3118 """
3119 Perform `iterations` shufflings, each time intersecting with `other`.
3120
3121 Returns a generator of integers where each integer is the number of
3122 intersections of a shuffled file with *other*. This distribution can
3123 be used in downstream analysis for things like empirical p-values.
3124
3125 *intersect_kwargs* and *shuffle_kwargs* are passed to self.intersect()
3126 and self.shuffle() respectively. By default for intersect, u=True is
3127 specified -- but s=True might be a useful option for strand-specific
3128 work.
3129
3130 Useful kwargs for *shuffle_kwargs* are chrom, excl, or incl. If you
3131 use the "seed" kwarg, that seed will be used *each* time shuffleBed is
3132 called -- so all your randomization results will be identical for each
3133 iteration. To get around this and to allow for tests, debug=True will
3134 set the seed to the iteration number. You may also break up the
3135 intersections across multiple processes with *processes* > 1.
3136
3137 Example usage:
3138
3139 >>> chromsizes = {'chr1':(0, 1000)}
3140 >>> a = pybedtools.example_bedtool('a.bed')
3141 >>> a = a.set_chromsizes(chromsizes)
3142 >>> b = pybedtools.example_bedtool('b.bed')
3143 >>> results = a.randomintersection(b, 10, debug=True)
3144 >>> print(list(results))
3145 [1, 0, 1, 2, 4, 2, 2, 1, 2, 4]
3146
3147 """
3148 if processes is not None:
3149 p = Pool(processes)
3150 iterations_each = [iterations // processes] * processes
3151 iterations_each[-1] += iterations % processes
3152 results = [
3153 p.apply_async(
3154 _call_randomintersect,
3155 (self, other, it),
3156 dict(
3157 intersect_kwargs=intersect_kwargs,
3158 shuffle_kwargs=shuffle_kwargs,
3159 debug=debug,
3160 report_iterations=report_iterations,
3161 _orig_processes=processes,
3162 ),
3163 )
3164 for it in iterations_each
3165 ]
3166 for r in results:
3167 for value in r.get():
3168 yield value
3169 raise StopIteration
3170
3171 if shuffle_kwargs is None:
3172 shuffle_kwargs = {}
3173 if intersect_kwargs is None:
3174 intersect_kwargs = {"u": True}
3175
3176 if "u" not in intersect_kwargs:
3177 intersect_kwargs["u"] = True
3178
3179 resort = intersect_kwargs.get("sorted", False)
3180
3181 for i in range(iterations):
3182 if debug:
3183 shuffle_kwargs["seed"] = i
3184 if report_iterations:
3185 if _orig_processes > 1:
3186 msg = "\rapprox (total across %s processes): %s" % (
3187 _orig_processes,
3188 i * _orig_processes,
3189 )
3190 else:
3191 msg = "\r%s" % i
3192 sys.stderr.write(msg)
3193 sys.stderr.flush()
3194
3195 # Re-sort if sorted=True in kwargs
3196 if resort:
3197 tmp0 = self.shuffle(**shuffle_kwargs)
3198 tmp = tmp0.sort()
3199 else:
3200 tmp = self.shuffle(**shuffle_kwargs)
3201
3202 tmp2 = tmp.intersect(other, stream=True, **intersect_kwargs)
3203
3204 yield len(tmp2)
3205
3206 # Close the open stdouts from subprocess.Popen calls. Note: doing
3207 # this in self.__del__ doesn't fix the open file limit bug; it
3208 # needs to be done here.
3209 # if resort:
3210 # tmp0.fn.close()
3211 # tmp.fn.close()
3212 tmp2.fn.close()
3213 del tmp
3214 del tmp2
3215
3216 @_log_to_history
3217 def cat(self, *others: Iterable[str|Path|BedTool], **kwargs) -> BedTool:
3218 """
3219 Concatenate interval files together.
3220
3221 Concatenates two BedTool objects (or an object and a file) and does an
3222 optional post-merge of the features.
3223
3224 *postmerge=True* by default; use *postmerge=False* if you want to keep
3225 features separate.
3226
3227 *force_truncate=False* by default; *force_truncate=True* to truncate
3228 all files to chrom, start, stop.
3229
3230 When *force_truncate=False* and *postmerge=False*, the output will
3231 contain the smallest number of fields observed across all inputs. This
3232 maintains compatibility with BEDTools programs, which assume constant
3233 number of fields in all lines of a file.
3234
3235 Other kwargs are sent to :meth:`BedTool.merge` (and assuming that
3236 *postmerge=True*).
3237
3238 Example usage:
3239
3240 >>> a = pybedtools.example_bedtool('a.bed')
3241 >>> b = pybedtools.example_bedtool('b.bed')
3242 >>> print(a.cat(b)) #doctest: +NORMALIZE_WHITESPACE
3243 chr1 1 500
3244 chr1 800 950
3245 <BLANKLINE>
3246 >>> print(a.cat(*[b,b],
3247 ... postmerge=False)) #doctest: +NORMALIZE_WHITESPACE
3248 chr1 1 100 feature1 0 +
3249 chr1 100 200 feature2 0 +
3250 chr1 150 500 feature3 0 -
3251 chr1 900 950 feature4 0 +
3252 chr1 155 200 feature5 0 -
3253 chr1 800 901 feature6 0 +
3254 chr1 155 200 feature5 0 -
3255 chr1 800 901 feature6 0 +
3256 <BLANKLINE>
3257 """
3258 same_type = None
3259 same_field_num = None
3260 field_nums = set()
3261
3262 assert len(others) > 0, "You must specify at least one other bedfile!"
3263 other_beds = []
3264 for other in others:
3265 if isinstance(other, (str, Path)):
3266 other = BedTool(other)
3267 else:
3268 assert isinstance(
3269 other, BedTool
3270 ), "Either filename or another BedTool instance required"
3271 other_beds.append(other)
3272
3273 # postmerge and force_truncate don't get passed on to merge
3274 postmerge = kwargs.pop("postmerge", True)
3275 force_truncate = kwargs.pop("force_truncate", False)
3276 stream_merge = kwargs.get("stream", False)
3277 if stream_merge and postmerge:
3278 raise ValueError(
3279 "The post-merge step in the `cat()` method "
3280 "performs a sort, which uses stream=True. Using "
3281 "stream=True for the merge as well will result in a "
3282 "deadlock!"
3283 )
3284
3285 # if filetypes and field counts are the same, don't truncate
3286 if not force_truncate:
3287 try:
3288 filetypes = set(
3289 [self.file_type] + [i.file_type for i in other_beds]
3290 ).difference(["empty"])
3291 field_nums = (
3292 set([self.field_count()] + [i.field_count() for i in other_beds])
3293 .difference([None])
3294 .difference([0])
3295 )
3296 same_field_num = len(field_nums) == 1
3297 same_type = len(set(filetypes)) == 1
3298 except ValueError:
3299 raise ValueError(
3300 "Can't check filetype or field count -- "
3301 "is one of the files you're merging a 'streaming' "
3302 "BedTool? If so, use .saveas() to save to file first"
3303 )
3304
3305 tmp = self._tmp()
3306
3307 if not force_truncate and same_type and same_field_num:
3308 with open(tmp, "w") as TMP:
3309 for f in self:
3310 TMP.write(str(f))
3311 for other in other_beds:
3312 for f in other:
3313 TMP.write(str(f))
3314
3315 # Types match, so we can use the min number of fields observed across
3316 # all inputs
3317 elif not force_truncate and same_type:
3318 minfields = min(field_nums)
3319 with open(tmp, "w") as TMP:
3320 for f in self:
3321 TMP.write("\t".join(f.fields[:minfields]) + "\n")
3322 for other in other_beds:
3323 for f in other:
3324 TMP.write("\t".join(f.fields[:minfields]) + "\n")
3325
3326 # Otherwise, use the zero-based chrom/start/stop to create a BED3,
3327 # which will work when catting a GFF and a BED together.
3328 else:
3329 with open(tmp, "w") as TMP:
3330 for f in self:
3331 TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end))
3332 for other in other_beds:
3333 for f in other:
3334 TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end))
3335
3336 c = BedTool(tmp)
3337 if postmerge:
3338 d = c.sort(stream=True).merge(**kwargs)
3339
3340 # Explicitly delete -- needed when using multiprocessing
3341 os.unlink(tmp)
3342 return d
3343 else:
3344 return c
3345
3346 @_log_to_history
3347 def saveas(self, fn: Optional[str] = None, trackline: Optional[str] = None, compressed: Optional[bool] = None) -> BedTool:
3348 """
3349 Make a copy of the BedTool.
3350
3351 Optionally adds `trackline` to the beginning of the file.
3352
3353 Optionally compresses output using gzip.
3354
3355 if the filename extension is .gz, or compressed=True,
3356 the output is compressed using gzip
3357
3358 Returns a new BedTool for the newly saved file.
3359
3360 A newline is automatically added to the trackline if it does not
3361 already have one.
3362
3363 Example usage:
3364
3365 >>> a = pybedtools.example_bedtool('a.bed')
3366 >>> b = a.saveas('other.bed')
3367 >>> b.fn
3368 'other.bed'
3369 >>> print(b == a)
3370 True
3371
3372 >>> b = a.saveas('other.bed', trackline="name='test run' color=0,55,0")
3373 >>> open(b.fn).readline()
3374 "name='test run' color=0,55,0\\n"
3375 >>> if os.path.exists('other.bed'):
3376 ... os.unlink('other.bed')
3377 """
3378 if fn is None:
3379 fn = self._tmp()
3380
3381 # Default to compressed if extension is .gz
3382 if compressed is None:
3383 __, extension = os.path.splitext(fn)
3384 if extension == ".gz":
3385 compressed = True
3386 else:
3387 compressed = False
3388
3389 in_compressed = isinstance(self.fn, str) and isGZIP(self.fn)
3390
3391 fn = self._collapse(
3392 self,
3393 fn=fn,
3394 trackline=trackline,
3395 in_compressed=in_compressed,
3396 out_compressed=compressed,
3397 )
3398 return BedTool(fn)
3399
3400 @_log_to_history
3401 def moveto(self, fn: Optional[str]=None) -> BedTool:
3402 """
3403 Move to a new filename (can be much quicker than BedTool.saveas())
3404
3405 Move BED file to new filename, `fn`.
3406
3407 Returns a new BedTool for the new file.
3408
3409 Example usage:
3410
3411 >>> # make a copy so we don't mess up the example file
3412 >>> a = pybedtools.example_bedtool('a.bed').saveas()
3413 >>> a_contents = str(a)
3414 >>> b = a.moveto('other.bed')
3415 >>> b.fn
3416 'other.bed'
3417 >>> b == a_contents
3418 True
3419 """
3420 if not isinstance(self.fn, str):
3421 fn = self._collapse(self, fn=fn)
3422 else:
3423 shutil.move(self.fn, fn)
3424 return BedTool(fn)
3425
3426 @_log_to_history
3427 def random_subset(self, n: Optional[int] = None, f: Optional[float] = None, seed: Optional[float|int]=None) -> BedTool:
3428 """
3429 Return a BedTool containing a random subset.
3430
3431 NOTE: using `n` will be slower and use more memory than using `f`.
3432
3433 Parameters
3434 ----------
3435
3436 n : int
3437 Number of features to return. Only one of `n` or `f` can be provided.
3438
3439 f : float, 0 <= f <= 1
3440 Fraction of features to return. Cannot be provided with `n`.
3441
3442 seed : float or int
3443 Set random.seed
3444
3445 Example
3446 -------
3447
3448 >>> seed = 0 # only for test, otherwise use None
3449
3450 `n` will always give the same number of returned features, but will be
3451 slower since it is creating an index and then shuffling it.
3452
3453 >>> a = pybedtools.example_bedtool('a.bed')
3454 >>> b = a.random_subset(n=2)
3455 >>> len(b)
3456 2
3457
3458 Using a fraction `f` will be faster but depending on seed will result
3459 in slightly different total numbers.
3460
3461 >>> a = pybedtools.example_bedtool('x.bam')
3462 >>> len(a)
3463 45593
3464 >>> b = a.random_subset(f=0.4, seed=seed)
3465 >>> len(b)
3466 18316
3467
3468 Check that we have approximately the right fraction
3469 >>> print('{0:.2f}'.format(len(b) / len(a)))
3470 0.40
3471
3472 """
3473 if ((n is None) and (f is None)) or ((n is not None) and (f is not None)):
3474 raise ValueError("Exactly one of `n` or `f` must be provided")
3475
3476 tmpfn = self._tmp()
3477 if seed is not None:
3478 random.seed(seed)
3479
3480 if n:
3481 idxs = list(range(len(self)))
3482 random.shuffle(idxs)
3483 idxs = idxs[:n]
3484 with open(tmpfn, "w") as tmp:
3485 for i, feature in enumerate(self):
3486 if i in idxs:
3487 tmp.write(str(feature))
3488
3489 elif f:
3490 with open(tmpfn, "w") as tmp:
3491 for i in self:
3492 if random.random() <= f:
3493 tmp.write(str(i))
3494
3495 return BedTool(tmpfn)
3496
3497 def total_coverage(self) -> int:
3498 """
3499 Return the total number of bases covered by this interval file.
3500
3501 Does a self.merge() first to remove potentially multiple-counting
3502 bases.
3503
3504 Example usage:
3505
3506 >>> a = pybedtools.example_bedtool('a.bed')
3507
3508 This does a merge() first, so this is what the total coverage is
3509 counting:
3510
3511 >>> print(a.merge()) #doctest: +NORMALIZE_WHITESPACE
3512 chr1 1 500
3513 chr1 900 950
3514 <BLANKLINE>
3515
3516 >>> print(a.total_coverage())
3517 549
3518 """
3519 b = self.merge()
3520 total_bp = 0
3521 for feature in b.features():
3522 total_bp += len(feature)
3523 return total_bp
3524
3525 @_log_to_history
3526 def with_attrs(self, **kwargs) -> BedTool:
3527 """
3528 Helper method for adding attributes in the middle of a pipeline.
3529
3530 Given arbitrary keyword arguments, turns the keys and values into
3531 attributes. Useful for labeling BedTools at creation time.
3532
3533 Example usage:
3534
3535 >>> # add a "label" attribute to each BedTool
3536 >>> a = pybedtools.example_bedtool('a.bed')\
3537 .with_attrs(label='transcription factor 1')
3538 >>> b = pybedtools.example_bedtool('b.bed')\
3539 .with_attrs(label='transcription factor 2')
3540 >>> for i in [a, b]:
3541 ... print('{0} features for {1}'.format(i.count(), i.label))
3542 4 features for transcription factor 1
3543 2 features for transcription factor 2
3544
3545 """
3546 for key, value in list(kwargs.items()):
3547 setattr(self, key, value)
3548 return self
3549
3550 def as_intervalfile(self) -> IntervalFile:
3551 """
3552 Returns an IntervalFile of this BedTool for low-level interface.
3553 """
3554 if not isinstance(self.fn, str):
3555 fn = self._collapse(self.fn)
3556 else:
3557 fn = self.fn
3558 return IntervalFile(fn)
3559
3560 def liftover(self, chainfile: str, unmapped: Optional[str] = None, liftover_args: str = "") -> BedTool:
3561 """
3562 Returns a new BedTool of the liftedOver features, saving the unmapped
3563 ones as `unmapped`. If `unmapped` is None, then discards the unmapped
3564 features.
3565
3566 `liftover_args` is a string of additional args that is passed,
3567 verbatim, to liftOver.
3568
3569 Needs `liftOver` from UCSC to be on the path and a `chainfile`
3570 downloaded from UCSC.
3571 """
3572 result = BedTool._tmp()
3573 if unmapped is None:
3574 unmapped = BedTool._tmp()
3575 cmds = ["liftOver", liftover_args, self.fn, chainfile, result, unmapped]
3576 os.system(" ".join(cmds))
3577 return BedTool(result)
3578
3579 def absolute_distance(self, other: BedTool, closest_kwargs: Optional[dict[str, Any]]=None, use_midpoints: bool=False) -> Iterator[int]:
3580 """
3581 Returns an iterator of the *absolute* distances between features in
3582 self and other.
3583
3584 If `use_midpoints` is True, then only use the midpoints of features
3585 (which will return values where features are overlapping). Otherwise,
3586 when features overlap the value will always be zero.
3587
3588 `closest_kwargs` are passed to self.closest(); either `d` or
3589 'D` are required in order to get back distance values (`d=True` is
3590 default)
3591 """
3592 from .featurefuncs import midpoint
3593
3594 if closest_kwargs is None:
3595 closest_kwargs = {"d": True}
3596
3597 if "D" not in closest_kwargs:
3598 closest_kwargs.update(dict(d=True))
3599
3600 if use_midpoints:
3601 mid_self = self.each(midpoint).saveas()
3602 mid_other = other.each(midpoint).saveas()
3603 c = mid_self.closest(mid_other, stream=True, **closest_kwargs)
3604 else:
3605 c = self.closest(other, stream=True, **closest_kwargs)
3606 for i in c:
3607 yield int(i[-1])
3608
3609 def relative_distance(self, other: BedTool, genome:Optional[dict|str] =None, g: Optional[str]=None) -> Iterator[float]:
3610 """
3611 Returns an iterator of relative distances between features in self and
3612 other.
3613
3614 First computes the midpoints of self and other, then returns distances
3615 of each feature in `other` relative to the distance between `self`
3616 features.
3617
3618 Requires either `genome` (dictionary of chromsizes or assembly name) or
3619 `g` (filename of chromsizes file).
3620 """
3621 g_dict = {}
3622 if (genome is None) and (g is None):
3623 raise ValueError("Need either `genome` or `g` arg for relative distance")
3624 elif genome and g:
3625 raise ValueError("Please specify only one of `genome` or `g`")
3626 elif genome:
3627 g_dict = dict(genome=genome)
3628 elif g:
3629 g_dict = dict(g=g)
3630
3631 from .featurefuncs import midpoint
3632
3633 # This gets the space between features in self.
3634 c = self.each(midpoint).complement(**g_dict)
3635
3636 hits = c.intersect(other, wao=True, stream=True) # TODO: should this be other or mid_other?
3637 for i in hits:
3638 yield float(i[-1]) / len(i)
3639
3640 def colormap_normalize(self,
3641 vmin: Optional[float|int]=None,
3642 vmax: Optional[float|int]=None,
3643 percentile: bool=False,
3644 log: bool=False) -> mcolors.LogNorm|mcolors.Normalize:
3645 """
3646 Returns a normalization instance for use by featurefuncs.add_color().
3647
3648 Parameters
3649 ----------
3650 vmin, vmax : float, int, or None
3651 `vmin` and `vmax` set the colormap bounds; if None then
3652 these will be determined from the scores in the BED file.
3653
3654 log : bool
3655 If True, put the scores on a log scale; of course be careful
3656 if you have negative scores
3657
3658 percentile : bool
3659 If True, interpret vmin and vmax as a percentile in the range
3660 [0,100] rather than absolute values.
3661 """
3662 field_count = self.field_count()
3663 if (self.file_type != "bed") or (field_count < 5):
3664 raise ValueError("colorizing only works for BED files with score " "fields")
3665 try:
3666 import matplotlib.colors as mcolors
3667 except ImportError:
3668 raise ImportError("matplotlib.colors must be installed to use colormap_normalize")
3669
3670 try:
3671 import numpy as np
3672 except ImportError:
3673 raise ImportError("numpy must be installed to use colormap_normalize")
3674
3675 if log:
3676 norm = mcolors.LogNorm()
3677 else:
3678 norm = mcolors.Normalize()
3679
3680 scores = np.array([i.score for i in self], dtype=float)
3681 scores = scores[np.isfinite(scores)]
3682 norm.autoscale(scores)
3683
3684 if vmin is not None:
3685 if percentile:
3686 vmin = float(np.percentile(scores, vmin))
3687 norm.vmin = vmin
3688 if vmax is not None:
3689 if percentile:
3690 vmax = float(np.percentile(scores, vmax))
3691 norm.vmax = vmax
3692
3693 return norm
3694
3695 def at(self, inds: list[int]) -> BedTool:
3696 """
3697 Returns a new BedTool with only intervals at lines `inds`
3698
3699 Parameters
3700 ----------
3701 inds : List[int]
3702 List of line numbers
3703
3704 Returns
3705 -------
3706 BedTool
3707 New BedTool with only intervals at `inds`
3708 """
3709 length = len(inds)
3710
3711 def _gen():
3712 k = 0
3713 for i, feature in enumerate(self):
3714 if i == inds[k]:
3715 yield feature
3716 k += 1
3717 if k == length:
3718 break
3719
3720 return BedTool(_gen()).saveas()
3721
3722 def to_dataframe(self, disable_auto_names: bool = False, *args, **kwargs) -> pd.DataFrame:
3723 """
3724 Create a pandas.DataFrame, passing args and kwargs to pandas.read_csv
3725 The separator kwarg `sep` is given a tab `\\t` as value by default.
3726
3727 Parameters
3728 ----------
3729 disable_auto_names : bool
3730 By default, the created dataframe fills in column names
3731 automatically according to the detected filetype (e.g., "chrom",
3732 "start", "end" for a BED3 file). Set this argument to True to
3733 disable this behavior.
3734 """
3735 # Complain if BAM or if not a file
3736 if self._isbam:
3737 raise ValueError("BAM not supported for converting to DataFrame")
3738 if not isinstance(self.fn, str):
3739 raise ValueError("use .saveas() to make sure self.fn is a file")
3740
3741 try:
3742 import pandas
3743 except ImportError:
3744 raise ImportError("pandas must be installed to convert to pandas.DataFrame")
3745 # Otherwise we're good:
3746 names = kwargs.get("names", None)
3747 if names is None and not disable_auto_names:
3748 try:
3749 _names = settings._column_names[self.file_type][: self.field_count()]
3750 if len(_names) < self.field_count():
3751 warn(
3752 "Default names for filetype %s are:\n%s\nbut file has "
3753 "%s fields; you can supply custom names with the "
3754 "`names` kwarg" % (self.file_type, _names, self.field_count())
3755 )
3756 _names = None
3757 except KeyError:
3758 _names = None
3759 kwargs["names"] = _names
3760
3761 if os.path.isfile(self.fn) and os.path.getsize(self.fn) > 0:
3762 return pandas.read_csv(self.fn, *args, sep="\t", **kwargs) # type: ignore
3763 else:
3764 return pandas.DataFrame()
3765
3766 def tail(self, lines:int = 10, as_string: bool = False) -> Optional[str]:
3767 """
3768 Like `head`, but prints last 10 lines of the file by default.
3769
3770 To avoid consuming iterables, this only works with file-based, non-BAM
3771 BedTool objects.
3772
3773 Use `as_string=True` to return a string.
3774 """
3775 if self._isbam:
3776 raise ValueError("tail() not yet implemented for BAM files")
3777 if not isinstance(self.fn, str):
3778 raise ValueError(
3779 "tail() not implemented for non-file-based "
3780 "BedTool objects. Please use saveas() first."
3781 )
3782 bufsize = 8192
3783 offset = bufsize
3784 f = open(self.fn, "rb")
3785
3786 # whence=2 arg means relative to end (i.e., go to the end)
3787 f.seek(0, 2)
3788 file_size = f.tell()
3789 data = []
3790 while True:
3791 if file_size < bufsize:
3792 offset = file_size
3793 f.seek(-offset, 2)
3794 chunk = f.read(offset)
3795 data.extend(chunk.splitlines(True))
3796 if len(data) >= lines or offset == file_size:
3797 break
3798 offset += bufsize
3799
3800 result = "".join([i.decode() for i in data[-lines:]])
3801 if as_string:
3802 return result
3803 else:
3804 print(result)
3805
3806
3807 class BAM(object):
3808 def __init__(self, stream):
3809 """
3810 Wraps pysam.Samfile so that it yields pybedtools.Interval objects when
3811 iterated over.
3812
3813 The pysam.Samfile can be accessed via the .pysam_bamfile attribute.
3814 """
3815 self.stream = stream
3816 if not isinstance(self.stream, str):
3817 raise ValueError("Only files are supported, not streams")
3818 self.pysam_bamfile = pysam.Samfile(self.stream)
3819
3820 def _aligned_segment_to_interval(self, r):
3821
3822 if r.rname >= 0:
3823 rname = self.pysam_bamfile.get_reference_name(r.rname)
3824 else:
3825 rname = "*"
3826
3827 if r.rnext >= 0:
3828 if r.rnext == r.rname:
3829 rnext = "="
3830 else:
3831 rnext = self.pysam_bamfile.get_reference_name(r.rnext)
3832 else:
3833 rnext = "*"
3834
3835 # SAM spec says if unavailable should be set to 0. Pysam sets to -1.
3836
3837 if r.pnext <= 0:
3838 pnext = "0"
3839 else:
3840 # +1 here because cbedtools.pyx expects SAM -- which is 1-based --
3841 # but pysam uses 0-based.
3842 pnext = str(r.pnext + 1)
3843
3844 if r.cigarstring:
3845 cigarstring = r.cigarstring
3846 else:
3847 cigarstring = "*"
3848
3849 # Rudimentary support.
3850 # TODO: remove when refactoring to new BAM iterating
3851 tags = []
3852 for k, v in r.tags:
3853 if isinstance(v, int):
3854 t = "i"
3855 elif isinstance(v, float):
3856 t = "f"
3857 else:
3858 t = "Z"
3859 tags.append("{0}:{1}:{2}".format(k, t, v))
3860
3861 tags = "\t".join(tags)
3862
3863 if r.seq:
3864 seq = r.seq
3865 else:
3866 seq = "*"
3867
3868 if r.qual:
3869 qual = r.qual
3870 else:
3871 qual = "*"
3872
3873 fields = [
3874 r.qname,
3875 str(r.flag),
3876 rname,
3877 # +1 here because cbedtools.pyx expects SAM -- which is 1-based --
3878 # but pysam uses 0-based.
3879 str(r.pos + 1),
3880 str(r.mapq),
3881 cigarstring,
3882 rnext,
3883 pnext,
3884 str(r.tlen),
3885 seq,
3886 qual,
3887 ]
3888 if tags:
3889 fields.append(tags)
3890
3891 if None in fields:
3892 raise ValueError("Found 'None' in fields: %s" % fields)
3893 return create_interval_from_list(fields)
3894
3895 def __iter__(self):
3896 return self
3897
3898 # TODO: this is PAINFUL but it ensures that existing tests work. Once all
3899 # tests work, the new behavior will be to yield pysam AlignedSegment
3900 # objects directly.
3901 def __next__(self):
3902 return self._aligned_segment_to_interval(next(self.pysam_bamfile))
3903
3904 def next(self):
3905 return self.__next__()
3906
3907
3908 class History(list):
3909 def __init__(self):
3910 """
3911 Represents one or many HistorySteps. Mostly used for nicely formatting
3912 a series of HistorySteps.
3913 """
3914 list.__init__(self)
3915
3916
3917 class HistoryStep(object):
3918 def __init__(self, method, args, kwargs, bedtool_instance, parent_tag, result_tag):
3919 """
3920 Class to represent one step in the history.
3921
3922 Mostly used for its __repr__ method, to try and exactly replicate code
3923 that can be pasted to re-do history steps
3924 """
3925 try:
3926 self.method = method._name
3927 except AttributeError:
3928 self.method = method.__name__
3929
3930 self.args = args
3931 self.kwargs = kwargs
3932 self.fn = bedtool_instance.fn
3933 self.parent_tag = parent_tag
3934 self.result_tag = result_tag
3935
3936 def _clean_arg(self, arg: str|BedTool) -> str:
3937 """
3938 Wrap strings in quotes and convert bedtool instances to filenames.
3939 """
3940 if isinstance(arg, BedTool):
3941 arg = arg.fn
3942 if isinstance(arg, str):
3943 arg = '"%s"' % arg
3944 return arg
3945
3946 def __repr__(self):
3947 # Still not sure whether to use pybedtools.bedtool() or bedtool()
3948 s = ""
3949 s += "<HistoryStep> "
3950 if os.path.exists(self.fn):
3951 s += 'BedTool("%(fn)s").%(method)s(%%s%%s)' % self.__dict__
3952 else:
3953 s += 'BedTool("MISSING FILE: %(fn)s")' % self.__dict__
3954 s += ".%(method)s(%%s%%s)" % self.__dict__
3955
3956 # Format args and kwargs
3957 args_string = ",".join(map(self._clean_arg, self.args))
3958 kwargs_string = ",".join(
3959 ["%s=%s" % (i[0], self._clean_arg(i[1])) for i in list(self.kwargs.items())]
3960 )
3961 # stick a comma on the end if there's something here
3962 if len(args_string) > 0:
3963 args_string += ", "
3964
3965 s = s % (args_string, kwargs_string)
3966 s += ", parent tag: %s" % self.parent_tag
3967 s += ", result tag: %s" % self.result_tag
3968 return s
3969
3970
3971 def example_bedtool(fn):
3972 """
3973 Return a bedtool using a bed file from the pybedtools examples directory.
3974 Use :func:`list_example_files` to see a list of files that are included.
3975 """
3976 fn = os.path.join(filenames.data_dir(), fn)
3977 if not os.path.exists(fn):
3978 msg = "%s does not exist" % fn
3979 raise FileNotFoundError(msg)
3980 return BedTool(fn)
3981
3982
3983 if __name__ == "__main__":
3984 import doctest
3985
3986 doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)