annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/bedtool.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 from __future__ import annotations
jpayne@68 2 import tempfile
jpayne@68 3 from textwrap import dedent
jpayne@68 4 import shutil
jpayne@68 5 import subprocess
jpayne@68 6 import operator
jpayne@68 7 import os
jpayne@68 8 import sys
jpayne@68 9 import random
jpayne@68 10 import string
jpayne@68 11 import pprint
jpayne@68 12 from itertools import islice
jpayne@68 13 from multiprocessing import Pool
jpayne@68 14 import gzip
jpayne@68 15 from typing import Any, Callable, Iterable, Iterator, Literal, Optional, TYPE_CHECKING, cast
jpayne@68 16 import pysam
jpayne@68 17 from warnings import warn
jpayne@68 18 import pathlib
jpayne@68 19 from pathlib import Path
jpayne@68 20
jpayne@68 21 from .helpers import (
jpayne@68 22 get_tempdir,
jpayne@68 23 _tags,
jpayne@68 24 call_bedtools,
jpayne@68 25 _flatten_list,
jpayne@68 26 _check_sequence_stderr,
jpayne@68 27 isBAM,
jpayne@68 28 isBGZIP,
jpayne@68 29 isGZIP,
jpayne@68 30 BEDToolsError,
jpayne@68 31 pybedtoolsError,
jpayne@68 32 _call_randomintersect,
jpayne@68 33 SplitOutput,
jpayne@68 34 FisherOutput,
jpayne@68 35 )
jpayne@68 36 from . import helpers
jpayne@68 37 from .cbedtools import (
jpayne@68 38 IntervalFile,
jpayne@68 39 IntervalIterator,
jpayne@68 40 Interval,
jpayne@68 41 create_interval_from_list,
jpayne@68 42 BedToolsFileError,
jpayne@68 43 )
jpayne@68 44 import pybedtools
jpayne@68 45 from . import settings
jpayne@68 46 from . import filenames
jpayne@68 47
jpayne@68 48 if TYPE_CHECKING:
jpayne@68 49 import pandas as pd
jpayne@68 50 import matplotlib.colors as mcolors
jpayne@68 51
jpayne@68 52 _implicit_registry = {}
jpayne@68 53 _other_registry = {}
jpayne@68 54 _bam_registry = {}
jpayne@68 55
jpayne@68 56
jpayne@68 57 def _jaccard_output_to_dict(s, **kwargs) -> dict:
jpayne@68 58 """
jpayne@68 59 jaccard method doesn't return an interval file, rather, it returns a short
jpayne@68 60 summary of results. Here, we simply parse it into a dict for convenience.
jpayne@68 61 """
jpayne@68 62 if isinstance(s, str):
jpayne@68 63 _s = open(s).read()
jpayne@68 64 elif hasattr(s, "next") or hasattr(s, "__next__"):
jpayne@68 65 _s = "".join([i for i in s])
jpayne@68 66 else:
jpayne@68 67 raise ValueError("Unexpected object %r" % s)
jpayne@68 68 header, data = _s.splitlines()
jpayne@68 69 header = header.split()
jpayne@68 70 data = data.split()
jpayne@68 71 data[0] = int(data[0])
jpayne@68 72 data[1] = int(data[1])
jpayne@68 73 data[2] = float(data[2])
jpayne@68 74 data[3] = int(data[3])
jpayne@68 75 return dict(list(zip(header, data)))
jpayne@68 76
jpayne@68 77
jpayne@68 78 def _reldist_output_handler(s, **kwargs):
jpayne@68 79 """
jpayne@68 80 reldist, if called with -detail, returns a valid BED file with the relative
jpayne@68 81 distance as the last field. In that case, return the BedTool immediately.
jpayne@68 82 If not -detail, then the results are a table, in which case here we parse
jpayne@68 83 into a dict for convenience.
jpayne@68 84 """
jpayne@68 85 if "detail" in kwargs:
jpayne@68 86 return BedTool(s)
jpayne@68 87 if isinstance(s, str):
jpayne@68 88 iterable = open(s)
jpayne@68 89 if hasattr(s, "next"):
jpayne@68 90 iterable = s
jpayne@68 91 header = next(iterable).split()
jpayne@68 92 results = {}
jpayne@68 93 for h in header:
jpayne@68 94 results[h] = []
jpayne@68 95 for i in iterable:
jpayne@68 96 reldist, count, total, fraction = i.split()
jpayne@68 97 data = [float(reldist), int(count), int(total), float(fraction)]
jpayne@68 98 for h, d in zip(header, data):
jpayne@68 99 results[h].append(d)
jpayne@68 100 return results
jpayne@68 101
jpayne@68 102
jpayne@68 103 def _wraps(
jpayne@68 104 prog: Optional[str] = None,
jpayne@68 105 implicit: Optional[str] = None,
jpayne@68 106 bam: Optional[str] = None,
jpayne@68 107 other: Optional[str] = None,
jpayne@68 108 uses_genome: bool = False,
jpayne@68 109 make_tempfile_for: Optional[str] = None,
jpayne@68 110 check_stderr:Optional[Callable]=None,
jpayne@68 111 add_to_bedtool:Optional[dict] = None,
jpayne@68 112 nonbam: Optional[Literal["ALL"] | str | list[str]] = None,
jpayne@68 113 force_bam: bool = False,
jpayne@68 114 genome_none_if: Optional[list[str]] =None,
jpayne@68 115 genome_if: Optional[list[str]] = None,
jpayne@68 116 genome_ok_if: Optional[list[str]] = None,
jpayne@68 117 does_not_return_bedtool: Optional[Callable] =None,
jpayne@68 118 arg_order: Optional[list[str]] = None,
jpayne@68 119 ):
jpayne@68 120 """
jpayne@68 121 Do-it-all wrapper, to be used as a decorator.
jpayne@68 122
jpayne@68 123 *prog* is the name of the BEDTools program that will be called. The help
jpayne@68 124 for this program will also be added to the decorated method's docstring.
jpayne@68 125
jpayne@68 126 *implicit* is the BEDTools program arg that should be filled in
jpayne@68 127 automatically.
jpayne@68 128
jpayne@68 129 *bam* will disable the implicit substitution if *bam* is in the kwargs.
jpayne@68 130 This is typically 'abam' or 'ibam' if the program accepts BAM input.
jpayne@68 131
jpayne@68 132 *other* is the BEDTools program arg that is passed in as the second input,
jpayne@68 133 if supported. Within the semantics of BEDTools, the typical case will be
jpayne@68 134 that if implicit='a' then other='b'; if implicit='i' then other=None.
jpayne@68 135
jpayne@68 136 *uses_genome*, if True, will check for 'g' and/or 'genome' args and
jpayne@68 137 retrieve the corresponding genome files as needed.
jpayne@68 138
jpayne@68 139 *make_tempfile_for* is used for the sequence methods and indicates which
jpayne@68 140 kwarg should have a tempfile made for it if it's not provided ('fo' for the
jpayne@68 141 sequence methods)
jpayne@68 142
jpayne@68 143 *check_stderr*, if not None, is a function that accepts a string (which
jpayne@68 144 will be anything written to stdout when calling the wrapped program). This
jpayne@68 145 function should return True if the string is OK, and False if it should
jpayne@68 146 truly be considered an error. This is needed for wrapping fastaFromBed,
jpayne@68 147 which will report to stderr that it's creating an index file.
jpayne@68 148
jpayne@68 149 *add_to_bedtool* is used for sequence methods. It is a dictionary mapping
jpayne@68 150 kwargs to attributes to be created in the resulting BedTool. Typically it
jpayne@68 151 is {'fo':'seqfn'} which will add the resulting sequence name to the
jpayne@68 152 BedTool's .seqfn attribute. If *add_to_bedtool* is not None, then the
jpayne@68 153 returned BedTool will be *self* with the added attribute. If a key is
jpayne@68 154 "stdout" (e.g., {"stdout": attr_name}), then save the stdout of the command
jpayne@68 155 as a tempfile and store the tempfile's name in the attribute. This is
jpayne@68 156 required for linksBed and bedToIgv.
jpayne@68 157
jpayne@68 158 *nonbam* is a kwarg that even if the input file was a BAM, the output will
jpayne@68 159 *not* be BAM format. For example, the `-bed` arg for intersectBed will
jpayne@68 160 cause the output to be in BED format, not BAM. If not None, this can be a
jpayne@68 161 string, a list of strings, or the special string "ALL", which means that
jpayne@68 162 the wrapped program will never return BAM output.
jpayne@68 163
jpayne@68 164 *force_bam*, if True, will force the output to be BAM. This is used for
jpayne@68 165 bedToBam.
jpayne@68 166
jpayne@68 167 *genome_none_if* is a list of arguments that will ignore the requirement
jpayne@68 168 for a genome. This is needed for window_maker, where -b and -g are
jpayne@68 169 mutually exclusive.
jpayne@68 170
jpayne@68 171 *genome_ok_if* is a list of arguments that, if they are in
jpayne@68 172 *genome_none_if*, are still OK to pass in. This is needed for bedtool
jpayne@68 173 genomecov, where -g is not needed if -ibam is specified...but it's still OK
jpayne@68 174 if the user passes a genome arg.
jpayne@68 175
jpayne@68 176 *genome_if* is a list of arguments that will trigger the requirement for
jpayne@68 177 a genome; otherwise no genome needs to be specified.
jpayne@68 178
jpayne@68 179 *does_not_return_bedtool*, if not None, should be a function that handles
jpayne@68 180 the returned output. Its signature should be ``func(output, kwargs)``,
jpayne@68 181 where `output` is the output from the [possibly streaming] call to BEDTools
jpayne@68 182 and `kwargs` are passed verbatim from the wrapped method call. Some
jpayne@68 183 examples of methods that use this are jaccard, reldist, fisher, and split
jpayne@68 184 methods.
jpayne@68 185
jpayne@68 186 *arg_order*, if not None, is a sorted list of arguments. This is used by
jpayne@68 187 handle_kwargs() to deal with things like issues 81 and 345, where some
jpayne@68 188 BEDTools programs are sensitive to argument order.
jpayne@68 189 """
jpayne@68 190
jpayne@68 191 # NOTE: We are calling each BEDTools program to get its help and adding
jpayne@68 192 # that to the docstring of each method. This is run at import time. However
jpayne@68 193 # if BEDTools is not on the path at import time, `not_implemented` is set
jpayne@68 194 # to True and isn't reset later until the module is reloaded.
jpayne@68 195 #
jpayne@68 196 # helpers.set_bedtools_path therefore will trigger a module reload.
jpayne@68 197 not_implemented = False
jpayne@68 198
jpayne@68 199 # Call the program with -h to get help, which prints to stderr.
jpayne@68 200 try:
jpayne@68 201 p = subprocess.Popen(
jpayne@68 202 helpers._version_2_15_plus_names(prog) + ["-h"],
jpayne@68 203 stdout=subprocess.PIPE,
jpayne@68 204 stderr=subprocess.PIPE,
jpayne@68 205 )
jpayne@68 206 help_str = p.communicate()[1].decode()
jpayne@68 207
jpayne@68 208 # underscores throw off ReStructuredText syntax of docstrings, so
jpayne@68 209 # replace 'em
jpayne@68 210 help_str = help_str.replace("_", "**")
jpayne@68 211
jpayne@68 212 # indent
jpayne@68 213 help_str = help_str.split("\n")
jpayne@68 214 help_str = ["\n\n**Original BEDTools help:**::"] + ["\t" + i for i in help_str]
jpayne@68 215 help_str = "\n".join(help_str) + "\n"
jpayne@68 216
jpayne@68 217 # If the program can't be found, then we'll eventually replace the method
jpayne@68 218 # with a version that does nothing but raise a NotImplementedError (plus
jpayne@68 219 # a helpful message).
jpayne@68 220 except OSError:
jpayne@68 221 help_str = (
jpayne@68 222 '"%s" does not appear to be installed '
jpayne@68 223 "or on the path, so this method is "
jpayne@68 224 "disabled. Please install a more recent "
jpayne@68 225 "version of BEDTools and re-import to "
jpayne@68 226 "use this method." % prog
jpayne@68 227 )
jpayne@68 228 not_implemented = True
jpayne@68 229
jpayne@68 230 def decorator(func):
jpayne@68 231 """
jpayne@68 232 Accepts a function to be wrapped; discards the original and returns a
jpayne@68 233 new, rebuilt-from-scratch function based on the kwargs passed to
jpayne@68 234 _wraps().
jpayne@68 235 """
jpayne@68 236 # Register the implicit (as well as bam and other) args in the global
jpayne@68 237 # registry. BedTool.handle_kwargs() will access these at runtime. The
jpayne@68 238 # registry is keyed by program name (like intersectBed).
jpayne@68 239 _implicit_registry[prog] = implicit
jpayne@68 240 if other is not None:
jpayne@68 241 _other_registry[prog] = other
jpayne@68 242 if bam is not None:
jpayne@68 243 _bam_registry[prog] = bam
jpayne@68 244
jpayne@68 245 # Here's where we replace an unable-to-be-found program's method with
jpayne@68 246 # one that only returns a NotImplementedError
jpayne@68 247 if not_implemented:
jpayne@68 248
jpayne@68 249 def not_implemented_func(*args, **kwargs):
jpayne@68 250 raise NotImplementedError(help_str)
jpayne@68 251
jpayne@68 252 return not_implemented_func
jpayne@68 253
jpayne@68 254 _add_doc = []
jpayne@68 255 if implicit:
jpayne@68 256 _add_doc.append(
jpayne@68 257 dedent(
jpayne@68 258 """
jpayne@68 259 For convenience, the file or stream this BedTool points to
jpayne@68 260 is implicitly passed as the `-%s` argument to `%s`
jpayne@68 261 """
jpayne@68 262 % (implicit, prog)
jpayne@68 263 )
jpayne@68 264 )
jpayne@68 265
jpayne@68 266 if uses_genome:
jpayne@68 267 _add_doc.append(
jpayne@68 268 dedent(
jpayne@68 269 """
jpayne@68 270 There are two alternatives for supplying a genome. Use
jpayne@68 271 `g="genome.filename"` if you have a genome's chrom sizes
jpayne@68 272 saved as a file. This is the what BEDTools expects when
jpayne@68 273 using it from the command line. Alternatively, use the
jpayne@68 274 `genome="assembly.name"` (for example, `genome="hg19"`) to
jpayne@68 275 use chrom sizes for that assembly without having to manage
jpayne@68 276 a separate file. The `genome` argument triggers a call
jpayne@68 277 `pybedtools.chromsizes`, so see that method for more
jpayne@68 278 details.
jpayne@68 279 """
jpayne@68 280 )
jpayne@68 281 )
jpayne@68 282
jpayne@68 283 def wrapped(self, *args, **kwargs):
jpayne@68 284 """
jpayne@68 285 A newly created function that will be returned by the _wraps()
jpayne@68 286 decorator
jpayne@68 287 """
jpayne@68 288
jpayne@68 289 # Only one non-keyword argument is supported; this is then assumed
jpayne@68 290 # to be "other" (e.g., `-b` for intersectBed)
jpayne@68 291 if len(args) > 0:
jpayne@68 292 assert len(args) == 1
jpayne@68 293 kwargs[other] = args[0]
jpayne@68 294
jpayne@68 295 # Add the implicit values to kwargs. If the current BedTool is
jpayne@68 296 # BAM, it will automatically be passed to the appropriate
jpayne@68 297 # BAM-support arg (like `-abam`). But this also allows the user to
jpayne@68 298 # explicitly specify the abam kwarg, which will override the
jpayne@68 299 # auto-substitution.
jpayne@68 300 # Note: here, `implicit` is something like "a"; `bam` is something
jpayne@68 301 # like "abam"
jpayne@68 302 if (
jpayne@68 303 (implicit not in kwargs)
jpayne@68 304 and (bam not in kwargs)
jpayne@68 305 and (implicit is not None)
jpayne@68 306 ):
jpayne@68 307 if not self._isbam:
jpayne@68 308 kwargs[implicit] = self.fn
jpayne@68 309 else:
jpayne@68 310 # It is a bam file. If this program supports BAM as the
jpayne@68 311 # first input, then we set it here
jpayne@68 312 if bam is not None:
jpayne@68 313 kwargs[bam] = self.fn
jpayne@68 314
jpayne@68 315 # Otherwise, BEDTools can't currently handle it, so raise
jpayne@68 316 # an exception.
jpayne@68 317 else:
jpayne@68 318 raise pybedtoolsError(
jpayne@68 319 '"%s" currently can\'t handle BAM '
jpayne@68 320 "input, please use bam_to_bed() first." % prog
jpayne@68 321 )
jpayne@68 322
jpayne@68 323 # Should this function handle genome files?
jpayne@68 324 check_for_genome = uses_genome
jpayne@68 325 if uses_genome:
jpayne@68 326 if genome_none_if:
jpayne@68 327 for i in genome_none_if:
jpayne@68 328 if i in kwargs or i == implicit:
jpayne@68 329 check_for_genome = False
jpayne@68 330
jpayne@68 331 # for genomecov, if -ibam then -g is optional. So it's OK
jpayne@68 332 # for the user to provide genome or g kwargs, even if
jpayne@68 333 # -ibam.
jpayne@68 334 if genome_ok_if:
jpayne@68 335 for i in genome_ok_if:
jpayne@68 336 if i in kwargs or i == implicit:
jpayne@68 337 if ("g" in kwargs) or ("genome" in kwargs):
jpayne@68 338 check_for_genome = True
jpayne@68 339 if genome_if:
jpayne@68 340 check_for_genome = False
jpayne@68 341 for i in genome_if:
jpayne@68 342 if (i in kwargs) or (i == implicit):
jpayne@68 343 check_for_genome = True
jpayne@68 344 if check_for_genome:
jpayne@68 345 kwargs = self.check_genome(**kwargs)
jpayne@68 346
jpayne@68 347 # For sequence methods, we may need to make a tempfile that will
jpayne@68 348 # hold the resulting sequence. For example, fastaFromBed needs to
jpayne@68 349 # make a tempfile for 'fo' if no 'fo' was explicitly specified by
jpayne@68 350 # the user.
jpayne@68 351 if make_tempfile_for is not None:
jpayne@68 352 if make_tempfile_for not in kwargs:
jpayne@68 353 kwargs[make_tempfile_for] = self._tmp()
jpayne@68 354
jpayne@68 355 # At runtime, this will parse the kwargs, convert streams to
jpayne@68 356 # tempfiles if needed, and return all the goodies
jpayne@68 357 cmds, tmp, stdin = self.handle_kwargs(prog=prog,
jpayne@68 358 arg_order=arg_order,
jpayne@68 359 **kwargs)
jpayne@68 360
jpayne@68 361 # Decide whether the output is BAM format or not.
jpayne@68 362 result_is_bam = False
jpayne@68 363
jpayne@68 364 # By default, if the current BedTool is BAM, then the result should
jpayne@68 365 # be, too.
jpayne@68 366 if self._isbam:
jpayne@68 367 result_is_bam = True
jpayne@68 368
jpayne@68 369 # If nonbam is "ALL", then this method will never return BAM
jpayne@68 370 # output.
jpayne@68 371 if nonbam == "ALL":
jpayne@68 372 result_is_bam = False
jpayne@68 373
jpayne@68 374 # If any of the `nonbam` args are found in kwargs, then result is
jpayne@68 375 # not a BAM. Side note: the _nonbam name mangling is necessary to
jpayne@68 376 # keep the nonbam arg passed into the original _wraps() decorator
jpayne@68 377 # in scope.
jpayne@68 378 if nonbam is not None and nonbam != "ALL":
jpayne@68 379 if isinstance(nonbam, str):
jpayne@68 380 _nonbam = [nonbam]
jpayne@68 381 else:
jpayne@68 382 _nonbam = nonbam
jpayne@68 383 for i in _nonbam:
jpayne@68 384 if i in kwargs:
jpayne@68 385 result_is_bam = False
jpayne@68 386 break
jpayne@68 387
jpayne@68 388 if force_bam:
jpayne@68 389 result_is_bam = True
jpayne@68 390
jpayne@68 391 decode_output = not result_is_bam
jpayne@68 392
jpayne@68 393 # Do the actual call
jpayne@68 394 stream = call_bedtools(
jpayne@68 395 cmds,
jpayne@68 396 tmp,
jpayne@68 397 stdin=stdin,
jpayne@68 398 check_stderr=check_stderr,
jpayne@68 399 decode_output=decode_output,
jpayne@68 400 )
jpayne@68 401
jpayne@68 402 if does_not_return_bedtool:
jpayne@68 403 return does_not_return_bedtool(stream, **kwargs)
jpayne@68 404
jpayne@68 405 # Post-hoc editing of the BedTool -- for example, this is used for
jpayne@68 406 # the sequence methods to add a `seqfn` attribute to the resulting
jpayne@68 407 # BedTool.
jpayne@68 408 if add_to_bedtool is not None:
jpayne@68 409 for kw, attr in list(add_to_bedtool.items()):
jpayne@68 410 if kw == "stdout":
jpayne@68 411 value = stream
jpayne@68 412 else:
jpayne@68 413 value = kwargs[kw]
jpayne@68 414 setattr(self, attr, value)
jpayne@68 415 result = self
jpayne@68 416 else:
jpayne@68 417 result = BedTool(stream)
jpayne@68 418
jpayne@68 419 result._isbam = result_is_bam
jpayne@68 420 result._cmds = cmds
jpayne@68 421 del kwargs
jpayne@68 422 return result
jpayne@68 423
jpayne@68 424 # Now add the edited docstring (original Python docstring plus BEDTools
jpayne@68 425 # help) to the newly created method above
jpayne@68 426 if func.__doc__ is None:
jpayne@68 427 orig = ""
jpayne@68 428 else:
jpayne@68 429 orig = func.__doc__
jpayne@68 430
jpayne@68 431 wrapped.__doc__ = orig + "\n".join(_add_doc) + help_str
jpayne@68 432
jpayne@68 433 # Add the original method's name to a new attribute so we can access it
jpayne@68 434 # when logging history
jpayne@68 435 wrapped._name = func.__name__ # type: ignore
jpayne@68 436
jpayne@68 437 return wrapped
jpayne@68 438
jpayne@68 439 return decorator
jpayne@68 440
jpayne@68 441
jpayne@68 442 def _log_to_history(method: Callable):
jpayne@68 443 """
jpayne@68 444 Decorator to add a method and its kwargs to the history.
jpayne@68 445
jpayne@68 446 Assumes that you only add this decorator to bedtool instances that
jpayne@68 447 return other bedtool instances
jpayne@68 448 """
jpayne@68 449
jpayne@68 450 def decorated(self, *args, **kwargs):
jpayne@68 451
jpayne@68 452 # this calls the actual method in the first place; *result* is
jpayne@68 453 # whatever you get back
jpayne@68 454 result = method(self, *args, **kwargs)
jpayne@68 455
jpayne@68 456 # add appropriate tags
jpayne@68 457 parent_tag = self._tag
jpayne@68 458 result_tag = result._tag
jpayne@68 459
jpayne@68 460 history_step = HistoryStep(
jpayne@68 461 method, args, kwargs, self, parent_tag, result_tag
jpayne@68 462 )
jpayne@68 463
jpayne@68 464 # only add the current history to the new bedtool if there's
jpayne@68 465 # something to add
jpayne@68 466 if len(self.history) > 0:
jpayne@68 467 result.history.append(self.history)
jpayne@68 468
jpayne@68 469 # but either way, add this history step to the result.
jpayne@68 470 result.history.append(history_step)
jpayne@68 471
jpayne@68 472 return result
jpayne@68 473
jpayne@68 474 decorated.__doc__ = method.__doc__
jpayne@68 475 return decorated
jpayne@68 476
jpayne@68 477
jpayne@68 478 class BedTool(object):
jpayne@68 479 TEMPFILES = filenames.TEMPFILES
jpayne@68 480
jpayne@68 481 def __init__(self, fn: Optional[Any] = None,
jpayne@68 482 from_string: bool = False,
jpayne@68 483 remote: bool = False):
jpayne@68 484 """
jpayne@68 485 Wrapper around Aaron Quinlan's ``BEDtools`` suite of programs
jpayne@68 486 (https://github.com/arq5x/bedtools); also contains many useful
jpayne@68 487 methods for more detailed work with BED files.
jpayne@68 488
jpayne@68 489 *fn* is typically the name of a BED-like file, but can also be
jpayne@68 490 one of the following:
jpayne@68 491
jpayne@68 492 * a string filename
jpayne@68 493 * another BedTool object
jpayne@68 494 * an iterable of Interval objects
jpayne@68 495 * an open file object
jpayne@68 496 * a "file contents" string (see below)
jpayne@68 497
jpayne@68 498 If *from_string* is True, then you can pass a string that contains
jpayne@68 499 the contents of the BedTool you want to create. This will treat all
jpayne@68 500 spaces as TABs and write to tempfile, treating whatever you pass as
jpayne@68 501 *fn* as the contents of the bed file. This also strips empty lines.
jpayne@68 502
jpayne@68 503 Typical usage is to point to an existing file::
jpayne@68 504
jpayne@68 505 a = BedTool('a.bed')
jpayne@68 506
jpayne@68 507 But you can also create one from scratch from a string::
jpayne@68 508
jpayne@68 509 >>> s = '''
jpayne@68 510 ... chrX 1 100
jpayne@68 511 ... chrX 25 800
jpayne@68 512 ... '''
jpayne@68 513 >>> a = BedTool(s, from_string=True)
jpayne@68 514
jpayne@68 515 Or use examples that come with pybedtools::
jpayne@68 516
jpayne@68 517 >>> example_files = pybedtools.list_example_files()
jpayne@68 518 >>> assert 'a.bed' in example_files
jpayne@68 519 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 520
jpayne@68 521 """
jpayne@68 522 if remote:
jpayne@68 523 raise ValueError(
jpayne@68 524 "Remote BAM no longer supported (since BEDTools does not " "support it)"
jpayne@68 525 )
jpayne@68 526 self.remote = remote
jpayne@68 527 self._isbam = False
jpayne@68 528 self._bam_header = ""
jpayne@68 529 self._cmds = []
jpayne@68 530 if from_string:
jpayne@68 531 if fn is None or not isinstance(fn, str):
jpayne@68 532 raise ValueError("from_string=True requires a string to parse")
jpayne@68 533 bed_contents = fn
jpayne@68 534 fn = self._tmp()
jpayne@68 535 fout = open(fn, "w")
jpayne@68 536 for line in bed_contents.splitlines():
jpayne@68 537 if len(line.strip()) == 0:
jpayne@68 538 continue
jpayne@68 539 line = "\t".join(line.split()) + "\n"
jpayne@68 540 fout.write(line)
jpayne@68 541 fout.close()
jpayne@68 542
jpayne@68 543 else:
jpayne@68 544 # if fn is a Path object, we have to use its string representation
jpayne@68 545 if isinstance(fn, pathlib.PurePath):
jpayne@68 546 fn = str(fn)
jpayne@68 547
jpayne@68 548 # our work is already done
jpayne@68 549 if isinstance(fn, BedTool):
jpayne@68 550 fn = fn.fn
jpayne@68 551
jpayne@68 552 # from_string=False, so assume it's a filename
jpayne@68 553 elif isinstance(fn, str):
jpayne@68 554 if remote:
jpayne@68 555 self._isbam = True
jpayne@68 556 else:
jpayne@68 557 if not os.path.exists(fn):
jpayne@68 558 msg = 'File "%s" does not exist' % fn
jpayne@68 559 raise FileNotFoundError(msg)
jpayne@68 560 self._isbam = isBAM(fn)
jpayne@68 561
jpayne@68 562 # TODO: we dont' really need this, but it's added here for
jpayne@68 563 # compatibility with existing tests
jpayne@68 564 if self._isbam:
jpayne@68 565 header = pysam.Samfile(fn).header.to_dict()
jpayne@68 566 # For example:
jpayne@68 567 # {
jpayne@68 568 # 'HD': {'VN': '1.0', 'SO': 'coordinate'},
jpayne@68 569 # 'SQ': [
jpayne@68 570 # {'LN': 23011544,
jpayne@68 571 # 'SN': 'chr2L'},
jpayne@68 572 # {'LN': 21146708,
jpayne@68 573 # 'SN': 'chr2R'},
jpayne@68 574 # {'LN': 24543557,
jpayne@68 575 # 'SN': 'chr3L'},
jpayne@68 576 # {'LN': 27905053,
jpayne@68 577 # 'SN': 'chr3R'},
jpayne@68 578 # {'LN': 1351857,
jpayne@68 579 # 'SN': 'chr4'},
jpayne@68 580 # {'LN': 22422827,
jpayne@68 581 # 'SN': 'chrX'}
jpayne@68 582 # ]
jpayne@68 583 # }
jpayne@68 584
jpayne@68 585 txt_header = []
jpayne@68 586 for k, v in header.items():
jpayne@68 587 if isinstance(v, list):
jpayne@68 588 for i in v:
jpayne@68 589 if isinstance(i, dict):
jpayne@68 590 txt_header.append(
jpayne@68 591 "\t".join(
jpayne@68 592 ["@" + k]
jpayne@68 593 + [
jpayne@68 594 ":".join(map(str, j))
jpayne@68 595 for j in sorted(i.items(), reverse=True)
jpayne@68 596 ]
jpayne@68 597 )
jpayne@68 598 )
jpayne@68 599 elif isinstance(i, str):
jpayne@68 600 txt_header.append(i)
jpayne@68 601
jpayne@68 602 elif isinstance(v, dict):
jpayne@68 603 txt_header.append(
jpayne@68 604 "\t".join(
jpayne@68 605 ["@" + k]
jpayne@68 606 + [
jpayne@68 607 ":".join(map(str, j))
jpayne@68 608 for j in sorted(v.items(), reverse=True)
jpayne@68 609 ]
jpayne@68 610 )
jpayne@68 611 )
jpayne@68 612 else:
jpayne@68 613 raise ValueError("unhandled type in BAM header")
jpayne@68 614 self._bam_header = "\n".join(txt_header) + "\n"
jpayne@68 615
jpayne@68 616 # If tuple or list, then save as file first
jpayne@68 617 # (fixes #73)
jpayne@68 618 elif isinstance(fn, (list, tuple)):
jpayne@68 619 fn = BedTool(iter(fn)).saveas().fn
jpayne@68 620
jpayne@68 621 # Otherwise assume iterator, say an open file as from
jpayne@68 622 # subprocess.PIPE
jpayne@68 623 else:
jpayne@68 624 fn = fn
jpayne@68 625
jpayne@68 626 self.fn = fn
jpayne@68 627 tag = "".join([random.choice(string.ascii_lowercase) for _ in range(8)])
jpayne@68 628 self._tag = tag
jpayne@68 629 _tags[tag] = self
jpayne@68 630 self._hascounts = False
jpayne@68 631 self._file_type = None
jpayne@68 632 self.seqfn = None
jpayne@68 633 self.fastq = None
jpayne@68 634 self.igv_script = None
jpayne@68 635 self.links_html = None
jpayne@68 636 self.history = History()
jpayne@68 637
jpayne@68 638 @classmethod
jpayne@68 639 def from_dataframe(
jpayne@68 640 cls,
jpayne@68 641 df: pd.DataFrame,
jpayne@68 642 outfile: Optional[str] =None,
jpayne@68 643 sep: str ="\t",
jpayne@68 644 header: bool =False,
jpayne@68 645 na_rep:str =".",
jpayne@68 646 index:bool =False,
jpayne@68 647 **kwargs
jpayne@68 648 ) -> BedTool:
jpayne@68 649 """
jpayne@68 650 Creates a BedTool from a pandas.DataFrame.
jpayne@68 651
jpayne@68 652 If `outfile` is None, a temporary file will be used. Otherwise it can
jpayne@68 653 be a specific filename or an open file handle. Additional kwargs will
jpayne@68 654 be passed to `pandas.DataFrame.to_csv`.
jpayne@68 655
jpayne@68 656 The fields of the resulting BedTool will match the order of columns in
jpayne@68 657 the dataframe.
jpayne@68 658 """
jpayne@68 659 try:
jpayne@68 660 import pandas
jpayne@68 661 except ImportError:
jpayne@68 662 raise ImportError("pandas must be installed to use dataframes")
jpayne@68 663 if outfile is None:
jpayne@68 664 outfile = cls._tmp()
jpayne@68 665 default_kwargs = dict(sep=sep, header=header, na_rep=na_rep, index=index)
jpayne@68 666 default_kwargs.update(kwargs)
jpayne@68 667 df.to_csv(outfile, **default_kwargs)
jpayne@68 668
jpayne@68 669 if isinstance(outfile, str):
jpayne@68 670 fn = outfile
jpayne@68 671 else:
jpayne@68 672 try:
jpayne@68 673 fn = outfile.name
jpayne@68 674 except AttributeError:
jpayne@68 675 raise ValueError(
jpayne@68 676 "`outfile` is not a string and doesn't have a `name` attribute. "
jpayne@68 677 "Unable to determine filename."
jpayne@68 678 )
jpayne@68 679 return BedTool(fn)
jpayne@68 680
jpayne@68 681 def split(self, func: Callable, *args, **kwargs) -> BedTool:
jpayne@68 682 """
jpayne@68 683 Split each feature using a user-defined function.
jpayne@68 684
jpayne@68 685 Calls the provided function `func` with each interval. In contrast to
jpayne@68 686 `each` (which does something similar), this method expects `func` to
jpayne@68 687 return an *iterable* of Interval objects.
jpayne@68 688
jpayne@68 689 args and kwargs are passed directly to `func`.
jpayne@68 690
jpayne@68 691 Returns a new BedTool.
jpayne@68 692 """
jpayne@68 693
jpayne@68 694 def generator():
jpayne@68 695 for orig_interval in self:
jpayne@68 696 for interval in func(orig_interval, *args, **kwargs):
jpayne@68 697 yield interval
jpayne@68 698
jpayne@68 699 return BedTool(generator())
jpayne@68 700
jpayne@68 701 def truncate_to_chrom(self, genome: str | dict) -> BedTool:
jpayne@68 702 """
jpayne@68 703 Ensure all features fall within chromosome limits.
jpayne@68 704
jpayne@68 705 Some peak-callers extend peaks such that the boundaries overstep
jpayne@68 706 chromosome coordinates. Upon uploading such a file to a genome browser
jpayne@68 707 like UCSC, this results in an error like::
jpayne@68 708
jpayne@68 709 Error line 101 of custom track: chromEnd larger than chrom chr2
jpayne@68 710 size
jpayne@68 711
jpayne@68 712 Use this method to clean your file, truncating any out-of-bounds
jpayne@68 713 features to fit within the chromosome coordinates of `genome`.
jpayne@68 714
jpayne@68 715 `genome` can be either an assembly name ('dm3') or a dictionary where
jpayne@68 716 keys are chrom and values are (start, stop) tuples.
jpayne@68 717 """
jpayne@68 718 if isinstance(genome, dict):
jpayne@68 719 chromdict = genome
jpayne@68 720 else:
jpayne@68 721 assert isinstance(genome, str)
jpayne@68 722 chromdict = helpers.chromsizes(genome)
jpayne@68 723
jpayne@68 724 tmp = self._tmp()
jpayne@68 725 with open(tmp, "w") as fout:
jpayne@68 726 for chrom, coords in list(chromdict.items()):
jpayne@68 727 start, stop = coords
jpayne@68 728 start = str(start)
jpayne@68 729 stop = str(stop)
jpayne@68 730 fout.write("\t".join([chrom, start, stop]) + "\n")
jpayne@68 731 return self.intersect(tmp)
jpayne@68 732
jpayne@68 733 def tabix_intervals(self, interval_or_string: Interval | str, check_coordinates: bool=False) -> BedTool:
jpayne@68 734 """
jpayne@68 735 Retrieve all intervals within coordinates from a "tabixed" BedTool.
jpayne@68 736
jpayne@68 737 Given either a string in "chrom:start-stop" format, or an interval-like
jpayne@68 738 object with chrom, start, stop attributes, return a *streaming* BedTool
jpayne@68 739 of the features in this BedTool that overlap the provided interval.
jpayne@68 740
jpayne@68 741 If the coordinates are invalid, an empty generator is returned unless
jpayne@68 742 `check_coordinates=True` in which case a ValueError will be raised.
jpayne@68 743 """
jpayne@68 744 if not self._tabixed():
jpayne@68 745 raise ValueError(
jpayne@68 746 "This BedTool has not been indexed for tabix "
jpayne@68 747 "-- please use the .tabix() method"
jpayne@68 748 )
jpayne@68 749
jpayne@68 750 # tabix expects 1-based coords, but BEDTools works with
jpayne@68 751 # zero-based. pybedtools and pysam also work with zero-based. So we can
jpayne@68 752 # pass zero-based directly to the pysam tabix interface.
jpayne@68 753 tbx = pysam.TabixFile(self.fn)
jpayne@68 754
jpayne@68 755 # If an interval is passed, use its coordinates directly
jpayne@68 756 if isinstance(interval_or_string, Interval):
jpayne@68 757 interval: Interval = interval_or_string
jpayne@68 758 chrom, start, end = interval.chrom, interval.start, interval.stop
jpayne@68 759 # Parse string directly instead of relying on Interval, in order to
jpayne@68 760 # permit full chromosome fetching
jpayne@68 761 else:
jpayne@68 762 match = helpers.coord_re.search(interval_or_string)
jpayne@68 763 # Assume string is contig if it doesn't fit chrom:start-end format
jpayne@68 764 if match is None:
jpayne@68 765 chrom = interval_or_string
jpayne@68 766 start, end = None, None
jpayne@68 767 # Otherwise parse the coordinates
jpayne@68 768 else:
jpayne@68 769 chrom, start, end = match.group(1, 2, 3)
jpayne@68 770 start, end = int(start), int(end)
jpayne@68 771
jpayne@68 772 # Fetch results.
jpayne@68 773 try:
jpayne@68 774 results = tbx.fetch(str(chrom), start, end)
jpayne@68 775 except ValueError:
jpayne@68 776 if check_coordinates:
jpayne@68 777 raise
jpayne@68 778 else:
jpayne@68 779 results = []
jpayne@68 780
jpayne@68 781 # pysam.ctabix.TabixIterator does not include newlines when yielding so
jpayne@68 782 # we need to add them.
jpayne@68 783 def gen():
jpayne@68 784 for i in results:
jpayne@68 785 yield i + "\n"
jpayne@68 786
jpayne@68 787 # xref #190
jpayne@68 788 x = BedTool(gen()).saveas()
jpayne@68 789 tbx.close()
jpayne@68 790 return x
jpayne@68 791
jpayne@68 792 def tabix_contigs(self):
jpayne@68 793 """
jpayne@68 794 Returns a list of contigs from the tabix index.
jpayne@68 795 """
jpayne@68 796 if not self._tabixed():
jpayne@68 797 raise ValueError(
jpayne@68 798 "This BedTool has not been indexed for tabix "
jpayne@68 799 "-- please use the .tabix() method"
jpayne@68 800 )
jpayne@68 801
jpayne@68 802 tbx = pysam.TabixFile(self.fn)
jpayne@68 803 return tbx.contigs
jpayne@68 804
jpayne@68 805 def tabix(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> BedTool:
jpayne@68 806 """
jpayne@68 807 Prepare a BedTool for use with Tabix.
jpayne@68 808
jpayne@68 809 Returns a new BedTool that has been BGZIP compressed
jpayne@68 810 and indexed by tabix.
jpayne@68 811
jpayne@68 812 Parameters
jpayne@68 813 ----------
jpayne@68 814
jpayne@68 815 in_place : bool
jpayne@68 816 If True (default), then assume the file is already sorted and
jpayne@68 817 replace the existing file with the BGZIPed version.
jpayne@68 818
jpayne@68 819 force : bool
jpayne@68 820 If True (default is False), then overwrite both the index and the
jpayne@68 821 BGZIP file.
jpayne@68 822
jpayne@68 823 is_sorted : bool
jpayne@68 824 If True (default is False), then assume the file is already sorted
jpayne@68 825 so that BedTool.bgzip() doesn't have to do that work.
jpayne@68 826 """
jpayne@68 827 # Return quickly if nothing to do
jpayne@68 828 if self._tabixed() and not force:
jpayne@68 829 return self
jpayne@68 830
jpayne@68 831 # Make sure it's BGZIPed
jpayne@68 832 fn = self.bgzip(in_place=in_place, force=force, is_sorted=is_sorted)
jpayne@68 833 if self.file_type is not None and self.file_type not in ["bam", "empty"]:
jpayne@68 834 pysam.tabix_index(fn, force=force, preset=self.file_type) # type: ignore
jpayne@68 835 return BedTool(fn)
jpayne@68 836
jpayne@68 837 def _tabixed(self):
jpayne@68 838 """
jpayne@68 839 Verifies that we're working with a tabixed file: a string filename
jpayne@68 840 pointing to a BGZIPed file with a .tbi file in the same dir.
jpayne@68 841 """
jpayne@68 842 if (
jpayne@68 843 isinstance(self.fn, str)
jpayne@68 844 and isBGZIP(self.fn)
jpayne@68 845 and os.path.exists(self.fn + ".tbi")
jpayne@68 846 ):
jpayne@68 847 return True
jpayne@68 848
jpayne@68 849 def bgzip(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> str:
jpayne@68 850 """
jpayne@68 851 Helper function for more control over "tabixed" BedTools.
jpayne@68 852
jpayne@68 853 Checks to see if we already have a BGZIP file; if not then prepare one.
jpayne@68 854 Always leaves the original file alone. You can always just make a
jpayne@68 855 BedTool out of an already sorted and BGZIPed file to avoid this step.
jpayne@68 856
jpayne@68 857 `in_place` will put the BGZIPed file in the same dir (possibly after
jpayne@68 858 sorting to tempfile).
jpayne@68 859
jpayne@68 860 If `is_sorted`, then assume the file is already sorted. Otherwise call
jpayne@68 861 bedtools sort with the `-header` option.
jpayne@68 862
jpayne@68 863 `force` will overwrite without asking.
jpayne@68 864 """
jpayne@68 865 # It may already be BGZIPed...
jpayne@68 866 if isinstance(self.fn, str) and not force:
jpayne@68 867 if isBGZIP(self.fn):
jpayne@68 868 return self.fn
jpayne@68 869
jpayne@68 870 # If not in_place, then make a tempfile for the BGZIPed version
jpayne@68 871 if not in_place:
jpayne@68 872 # Get tempfile name, sorted or not
jpayne@68 873 if not is_sorted:
jpayne@68 874 fn = self.sort(header=True).fn
jpayne@68 875 else:
jpayne@68 876 fn = self._tmp()
jpayne@68 877
jpayne@68 878 # Register for later deletion
jpayne@68 879 outfn = fn + ".gz"
jpayne@68 880 BedTool.TEMPFILES.append(outfn)
jpayne@68 881
jpayne@68 882 # Creates tempfile.gz
jpayne@68 883 pysam.tabix_compress(fn, outfn, force=force)
jpayne@68 884 return outfn
jpayne@68 885
jpayne@68 886 # Otherwise, make sure the BGZIPed version has a similar name to the
jpayne@68 887 # current BedTool's file
jpayne@68 888 if in_place:
jpayne@68 889 if not is_sorted:
jpayne@68 890 fn = self.sort(header=True).saveas().fn
jpayne@68 891 else:
jpayne@68 892 fn = self.fn
jpayne@68 893 outfn = self.fn + ".gz"
jpayne@68 894 pysam.tabix_compress(fn, outfn, force=force)
jpayne@68 895 return outfn
jpayne@68 896
jpayne@68 897 def delete_temporary_history(self, ask: bool = True, raw_input_func=None):
jpayne@68 898 """
jpayne@68 899 Use at your own risk! This method will delete temp files. You will be
jpayne@68 900 prompted for deletion of files unless you specify *ask=False*.
jpayne@68 901
jpayne@68 902 Deletes all temporary files created during the history of this BedTool
jpayne@68 903 up to but not including the file this current BedTool points to.
jpayne@68 904
jpayne@68 905 Any filenames that are in the history and have the following pattern
jpayne@68 906 will be deleted::
jpayne@68 907
jpayne@68 908 <TEMP_DIR>/pybedtools.*.tmp
jpayne@68 909
jpayne@68 910 (where <TEMP_DIR> is the result from get_tempdir() and is by default
jpayne@68 911 "/tmp")
jpayne@68 912
jpayne@68 913 Any files that don't have this format will be left alone.
jpayne@68 914
jpayne@68 915 (*raw_input_func* is used for testing)
jpayne@68 916 """
jpayne@68 917 flattened_history = _flatten_list(self.history)
jpayne@68 918 to_delete = []
jpayne@68 919 tempdir = get_tempdir()
jpayne@68 920 for i in flattened_history:
jpayne@68 921 fn = i.fn
jpayne@68 922 if fn.startswith(os.path.join(os.path.abspath(tempdir), "pybedtools")):
jpayne@68 923 if fn.endswith(".tmp"):
jpayne@68 924 to_delete.append(fn)
jpayne@68 925
jpayne@68 926 if raw_input_func is None:
jpayne@68 927 raw_input_func = input
jpayne@68 928
jpayne@68 929 str_fns = "\n\t".join(to_delete)
jpayne@68 930 if ask:
jpayne@68 931 answer = raw_input_func("Delete these files?\n\t%s\n(y/N) " % str_fns)
jpayne@68 932
jpayne@68 933 if not answer.lower()[0] == "y":
jpayne@68 934 print("OK, not deleting.")
jpayne@68 935 return
jpayne@68 936 for fn in to_delete:
jpayne@68 937 os.unlink(fn)
jpayne@68 938 return
jpayne@68 939
jpayne@68 940 def filter(self, func: Callable, *args, **kwargs) -> BedTool:
jpayne@68 941 """
jpayne@68 942 Filter features by user-defined function.
jpayne@68 943
jpayne@68 944 Takes a function *func* that is called for each feature in the
jpayne@68 945 `BedTool` object and returns only those for which the function returns
jpayne@68 946 True.
jpayne@68 947
jpayne@68 948 *args and **kwargs are passed directly to *func*.
jpayne@68 949
jpayne@68 950 Returns a streaming BedTool; if you want the filename then use the
jpayne@68 951 .saveas() method.
jpayne@68 952
jpayne@68 953 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 954 >>> subset = a.filter(lambda b: b.chrom == 'chr1' and b.start < 150)
jpayne@68 955 >>> len(a), len(subset)
jpayne@68 956 (4, 2)
jpayne@68 957
jpayne@68 958 so it has extracted 2 records from the original 4.
jpayne@68 959
jpayne@68 960 """
jpayne@68 961 return BedTool((f for f in self if func(f, *args, **kwargs)))
jpayne@68 962
jpayne@68 963 def field_count(self, n:int=10) -> int:
jpayne@68 964 """
jpayne@68 965 Number of fields in each line of this BedTool (checks `n` lines)
jpayne@68 966
jpayne@68 967 Return the number of fields in the features this file contains. Checks
jpayne@68 968 the first *n* features.
jpayne@68 969
jpayne@68 970 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 971 >>> a.field_count()
jpayne@68 972 6
jpayne@68 973 """
jpayne@68 974 if self.file_type == "empty":
jpayne@68 975 return 0
jpayne@68 976 i = 0
jpayne@68 977 fields = set([])
jpayne@68 978 for feat in self:
jpayne@68 979 if i > n:
jpayne@68 980 break
jpayne@68 981 i += 1
jpayne@68 982 # TODO: make this more efficient.
jpayne@68 983 fields.update([len(feat.fields)])
jpayne@68 984 assert len(fields) == 1, fields
jpayne@68 985 return list(fields)[0]
jpayne@68 986
jpayne@68 987 def each(self, func: Callable, *args, **kwargs) -> BedTool:
jpayne@68 988 """
jpayne@68 989 Modify each feature with a user-defined function.
jpayne@68 990
jpayne@68 991 Applies user-defined function *func* to each feature. *func* must
jpayne@68 992 accept an Interval as its first argument; *args and **kwargs will be
jpayne@68 993 passed to *func*.
jpayne@68 994
jpayne@68 995 *func* must return an Interval object OR a value that evaluates to
jpayne@68 996 False, in which case the original feature will be removed from the
jpayne@68 997 output. This way, an additional "filter" call is not necessary.
jpayne@68 998
jpayne@68 999 >>> def truncate_feature(feature, limit=0):
jpayne@68 1000 ... feature.score = str(len(feature))
jpayne@68 1001 ... if len(feature) > limit:
jpayne@68 1002 ... feature.stop = feature.start + limit
jpayne@68 1003 ... feature.name = feature.name + '.short'
jpayne@68 1004 ... return feature
jpayne@68 1005
jpayne@68 1006 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 1007 >>> b = a.each(truncate_feature, limit=100)
jpayne@68 1008 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 1009 chr1 1 100 feature1 99 +
jpayne@68 1010 chr1 100 200 feature2 100 +
jpayne@68 1011 chr1 150 250 feature3.short 350 -
jpayne@68 1012 chr1 900 950 feature4 50 +
jpayne@68 1013 <BLANKLINE>
jpayne@68 1014
jpayne@68 1015 """
jpayne@68 1016
jpayne@68 1017 def _generator():
jpayne@68 1018 for f in self:
jpayne@68 1019 result = func(f, *args, **kwargs)
jpayne@68 1020 if result:
jpayne@68 1021 yield result
jpayne@68 1022
jpayne@68 1023 return BedTool(_generator())
jpayne@68 1024
jpayne@68 1025 def introns(self, gene: str = "gene", exon: str = "exon") -> BedTool:
jpayne@68 1026 """
jpayne@68 1027 Create intron features (requires specific input format).
jpayne@68 1028
jpayne@68 1029 NOTE: this method assumes a simple file with non-overlapping exons. For
jpayne@68 1030 more sophisticated features, consider the gffutils package instead.
jpayne@68 1031
jpayne@68 1032 Given a BED12 or a GFF with exons, create a new `BedTool` with just
jpayne@68 1033 introns. The output is a bed6 file with the score column (5) being one
jpayne@68 1034 of 'intron'/'utr5'/'utr3'
jpayne@68 1035 """
jpayne@68 1036 # iterate over all the features in the gene.
jpayne@68 1037 s = self.sort()
jpayne@68 1038 if self.file_type == "gff":
jpayne@68 1039 exon_iter = BedTool((f for f in s if f[2] == exon)).saveas()
jpayne@68 1040 gene_iter = BedTool((f for f in s if f[2] == gene)).saveas()
jpayne@68 1041
jpayne@68 1042 elif self.file_type == "bed":
jpayne@68 1043 if s.field_count() == 12:
jpayne@68 1044 exon_iter = s.bed6().saveas()
jpayne@68 1045 gene_iter = s.saveas()
jpayne@68 1046 else:
jpayne@68 1047 # TODO: bed6. groupby on name and find smallest start,
jpayne@68 1048 # largest stop.
jpayne@68 1049 exon_iter = s
jpayne@68 1050 gene_iter = None
jpayne@68 1051 raise NotImplementedError(
jpayne@68 1052 ".introns() only supported for bed12" "and GFF"
jpayne@68 1053 )
jpayne@68 1054
jpayne@68 1055 else:
jpayne@68 1056 raise NotImplementedError(".introns() only supported for BED and GFF")
jpayne@68 1057
jpayne@68 1058 with open(BedTool._tmp(), "w") as fh:
jpayne@68 1059 # group on the name.
jpayne@68 1060 exon_intervals = IntervalFile(exon_iter.fn)
jpayne@68 1061 for g in gene_iter:
jpayne@68 1062 # search finds all, but we just want the ones that completely
jpayne@68 1063 # overlap this gene.
jpayne@68 1064 exons = [
jpayne@68 1065 e
jpayne@68 1066 for e in exon_intervals.search(g, same_strand=True)
jpayne@68 1067 if e.start >= g.start and e.end <= g.end
jpayne@68 1068 ]
jpayne@68 1069
jpayne@68 1070 for i, exon_instance in enumerate(exons):
jpayne@68 1071 exon_instance: pybedtools.Interval
jpayne@68 1072 # 5' utr between gene start and first intron
jpayne@68 1073 if i == 0 and exon_instance.start > g.start:
jpayne@68 1074 utr = {"+": "utr5", "-": "utr3"}[g.strand]
jpayne@68 1075 print(
jpayne@68 1076 "%s\t%i\t%i\t%s\t%s\t%s"
jpayne@68 1077 % (g.chrom, g.start, exon_instance.start, g.name, utr, g.strand),
jpayne@68 1078 file=fh,
jpayne@68 1079 )
jpayne@68 1080 elif i == len(exons) - 1 and exon_instance.end < g.end:
jpayne@68 1081 utr = {"+": "utr3", "-": "utr5"}[g.strand]
jpayne@68 1082 print(
jpayne@68 1083 "%s\t%i\t%i\t%s\t%s\t%s"
jpayne@68 1084 % (g.chrom, exon_instance.end, g.end, g.name, utr, g.strand),
jpayne@68 1085 file=fh,
jpayne@68 1086 )
jpayne@68 1087 elif i != len(exons) - 1:
jpayne@68 1088 istart = exon_instance.end
jpayne@68 1089 iend = exons[i + 1].start
jpayne@68 1090 print(
jpayne@68 1091 "%s\t%i\t%i\t%s\tintron\t%s"
jpayne@68 1092 % (g.chrom, istart, iend, g.name, g.strand),
jpayne@68 1093 file=fh,
jpayne@68 1094 )
jpayne@68 1095 return BedTool(fh.name)
jpayne@68 1096
jpayne@68 1097 def features(self):
jpayne@68 1098 """
jpayne@68 1099 Returns an iterable of features
jpayne@68 1100 """
jpayne@68 1101 if hasattr(self, "next") or hasattr(self, "__next__"):
jpayne@68 1102 return self
jpayne@68 1103 return iter(self)
jpayne@68 1104
jpayne@68 1105 FileType = Literal['bed', 'vcf', 'gff', 'bam', 'sam', 'empty']
jpayne@68 1106
jpayne@68 1107 @property
jpayne@68 1108 def file_type(self) -> Optional[FileType]:
jpayne@68 1109 """
jpayne@68 1110 Return the type of the current file. One of ('bed','vcf','gff', 'bam',
jpayne@68 1111 'sam', 'empty').
jpayne@68 1112
jpayne@68 1113 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 1114 >>> print(a.file_type)
jpayne@68 1115 bed
jpayne@68 1116 """
jpayne@68 1117 if not isinstance(self.fn, str):
jpayne@68 1118 raise ValueError(
jpayne@68 1119 "Checking file_type not supported for "
jpayne@68 1120 "non-file BedTools. Use .saveas() to "
jpayne@68 1121 "save as a temp file first."
jpayne@68 1122 )
jpayne@68 1123 if self._isbam:
jpayne@68 1124 self._file_type = "bam"
jpayne@68 1125 else:
jpayne@68 1126 try:
jpayne@68 1127 self._file_type = next(iter(self)).file_type
jpayne@68 1128 except StopIteration:
jpayne@68 1129 self._file_type = "empty"
jpayne@68 1130
jpayne@68 1131 return self._file_type
jpayne@68 1132
jpayne@68 1133 def cut(self, indexes: list[int], stream: bool = False) -> BedTool:
jpayne@68 1134 """
jpayne@68 1135 Analogous to unix `cut`.
jpayne@68 1136
jpayne@68 1137 Similar to unix `cut` except indexes are 0-based, must be a list
jpayne@68 1138 and the columns are returned in the order requested.
jpayne@68 1139
jpayne@68 1140 This method returns a BedTool of results, which means that the indexes
jpayne@68 1141 returned must be valid GFF/GTF/BED/SAM features.
jpayne@68 1142
jpayne@68 1143 If you would like arbitrary columns -- say, just chrom and featuretype
jpayne@68 1144 of a GFF, which would not comprise a valid feature -- then instead of
jpayne@68 1145 this method, simply use indexes on each feature, e.g,
jpayne@68 1146
jpayne@68 1147 >>> gff = pybedtools.example_bedtool('d.gff')
jpayne@68 1148 >>> results = [(f[0], f[2]) for f in gff]
jpayne@68 1149
jpayne@68 1150 In addition, `indexes` can contain keys of the GFF/GTF attributes, in
jpayne@68 1151 which case the values are returned. e.g. 'gene_name' will return the
jpayne@68 1152 corresponding name from a GTF, or 'start' will return the start
jpayne@68 1153 attribute of a BED Interval.
jpayne@68 1154 """
jpayne@68 1155 if stream:
jpayne@68 1156 return BedTool(([f[attr] for attr in indexes] for f in self))
jpayne@68 1157 else:
jpayne@68 1158 with open(self._tmp(), "w") as fh:
jpayne@68 1159 for f in self:
jpayne@68 1160 print("\t".join(map(str, [f[attr] for attr in indexes])), file=fh)
jpayne@68 1161 return BedTool(fh.name)
jpayne@68 1162
jpayne@68 1163 @classmethod
jpayne@68 1164 def _tmp(cls) -> str:
jpayne@68 1165 """
jpayne@68 1166 Makes a tempfile and registers it in the BedTool.TEMPFILES class
jpayne@68 1167 variable. Adds a "pybedtools." prefix and ".tmp" extension for easy
jpayne@68 1168 deletion if you forget to call pybedtools.cleanup().
jpayne@68 1169 """
jpayne@68 1170 tmpfn = tempfile.NamedTemporaryFile(
jpayne@68 1171 prefix=settings.tempfile_prefix,
jpayne@68 1172 suffix=settings.tempfile_suffix,
jpayne@68 1173 delete=False,
jpayne@68 1174 )
jpayne@68 1175 tmpfn = tmpfn.name
jpayne@68 1176 cls.TEMPFILES.append(tmpfn)
jpayne@68 1177 return tmpfn
jpayne@68 1178
jpayne@68 1179 def __iter__(self):
jpayne@68 1180 """
jpayne@68 1181 Dispatches the right iterator depending on how this BedTool was
jpayne@68 1182 created
jpayne@68 1183 """
jpayne@68 1184 if self._isbam:
jpayne@68 1185 # Note: BAM class takes filename or stream, so self.fn is OK
jpayne@68 1186 # here
jpayne@68 1187 return BAM(self.fn)
jpayne@68 1188
jpayne@68 1189 # Plain ol' filename
jpayne@68 1190 if isinstance(self.fn, str):
jpayne@68 1191 if not os.path.exists(self.fn):
jpayne@68 1192 raise BedToolsFileError("{0} does not exist".format(self.fn))
jpayne@68 1193 if isGZIP(self.fn):
jpayne@68 1194 return IntervalIterator(gzip.open(self.fn, "rt"))
jpayne@68 1195 else:
jpayne@68 1196 return IntervalIterator(open(self.fn, "r"))
jpayne@68 1197 # Any other kind of input (streaming string from stdout; iterable of
jpayne@68 1198 # Intervals, iterable of (chrom, start, stop) tuples, etc are handled
jpayne@68 1199 # appropriately by IntervalIterator.
jpayne@68 1200 else:
jpayne@68 1201 return IntervalIterator(self.fn)
jpayne@68 1202
jpayne@68 1203 @property
jpayne@68 1204 def intervals(self):
jpayne@68 1205 if isinstance(self.fn, str):
jpayne@68 1206 return IntervalFile(self.fn)
jpayne@68 1207 else:
jpayne@68 1208 raise ValueError("Please convert to a file-based BedTool using saveas")
jpayne@68 1209
jpayne@68 1210 def __repr__(self):
jpayne@68 1211 if isinstance(self.fn, str):
jpayne@68 1212 if os.path.exists(self.fn) or self.remote:
jpayne@68 1213 return "<BedTool(%s)>" % self.fn
jpayne@68 1214 else:
jpayne@68 1215 return "<BedTool(MISSING FILE: %s)>" % self.fn
jpayne@68 1216 elif isinstance(self.fn, BedTool):
jpayne@68 1217 return repr(self.fn)
jpayne@68 1218 else:
jpayne@68 1219 return "<BedTool(%s)>" % repr(self.fn)
jpayne@68 1220
jpayne@68 1221 def __str__(self):
jpayne@68 1222 """
jpayne@68 1223 Returns the string representation of the whole `BedTool`
jpayne@68 1224 """
jpayne@68 1225 items = []
jpayne@68 1226 for i in iter(self):
jpayne@68 1227 i = str(i)
jpayne@68 1228 if isinstance(i, bytes):
jpayne@68 1229 i = i.decode("UTF-8")
jpayne@68 1230 items.append(i)
jpayne@68 1231 return "".join(items)
jpayne@68 1232
jpayne@68 1233 def __len__(self):
jpayne@68 1234 return self.count()
jpayne@68 1235
jpayne@68 1236 def __eq__(self, other: object) -> bool:
jpayne@68 1237 if isinstance(other, BedTool):
jpayne@68 1238 if not isinstance(self.fn, str) or not isinstance(
jpayne@68 1239 other.fn, str
jpayne@68 1240 ):
jpayne@68 1241 raise NotImplementedError(
jpayne@68 1242 "Testing equality only supported for"
jpayne@68 1243 " BedTools that point to files"
jpayne@68 1244 )
jpayne@68 1245 elif not isinstance(other, str):
jpayne@68 1246 raise NotImplementedError(
jpayne@68 1247 "Testing equality only supported for"
jpayne@68 1248 " BedTools that point to files or str of content"
jpayne@68 1249 )
jpayne@68 1250 return str(self) == str(other)
jpayne@68 1251
jpayne@68 1252 def __ne__(self, other:object):
jpayne@68 1253 return not self.__eq__(other)
jpayne@68 1254
jpayne@68 1255 def __getitem__(self, key: slice|int):
jpayne@68 1256 if isinstance(key, slice):
jpayne@68 1257 return islice(self, key.start, key.stop, key.step)
jpayne@68 1258 elif isinstance(key, int):
jpayne@68 1259 return list(islice(self, key, key + 1))[0]
jpayne@68 1260 else:
jpayne@68 1261 raise ValueError(
jpayne@68 1262 "Only slices or integers allowed for indexing " "into a BedTool"
jpayne@68 1263 )
jpayne@68 1264
jpayne@68 1265 def __add__(self, other: BedTool) -> BedTool:
jpayne@68 1266 try:
jpayne@68 1267 result = self.intersect(other, u=True)
jpayne@68 1268 except BEDToolsError as e:
jpayne@68 1269 # BEDTools versions <2.20 would raise BEDToolsError
jpayne@68 1270 if (self.file_type == "empty") or (other.file_type == "empty"):
jpayne@68 1271 result = pybedtools.BedTool("", from_string=True)
jpayne@68 1272 else:
jpayne@68 1273 raise e
jpayne@68 1274 return result
jpayne@68 1275
jpayne@68 1276 def __sub__(self, other: BedTool) -> BedTool:
jpayne@68 1277 result = None
jpayne@68 1278
jpayne@68 1279 try:
jpayne@68 1280 result = self.intersect(other, v=True)
jpayne@68 1281 except BEDToolsError:
jpayne@68 1282 # BEDTools versions <2.20 would raise BEDToolsError
jpayne@68 1283
jpayne@68 1284 if (self.file_type == "empty") and (other.file_type == "empty"):
jpayne@68 1285 result = pybedtools.BedTool("", from_string=True)
jpayne@68 1286 elif other.file_type == "empty":
jpayne@68 1287 result = self.saveas()
jpayne@68 1288 elif self.file_type == "empty":
jpayne@68 1289 result = pybedtools.BedTool("", from_string=True)
jpayne@68 1290 if result is None:
jpayne@68 1291 raise ValueError("Subtraction operation failed.")
jpayne@68 1292
jpayne@68 1293 return result
jpayne@68 1294
jpayne@68 1295 def head(self, n: int = 10, as_string: bool = False):
jpayne@68 1296 """
jpayne@68 1297 Prints the first *n* lines or returns them if as_string is True
jpayne@68 1298
jpayne@68 1299 Note that this only opens the underlying file (gzipped or not), so it
jpayne@68 1300 does not check to see if the file is a valid BED file.
jpayne@68 1301
jpayne@68 1302 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 1303 >>> a.head(2) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 1304 chr1 1 100 feature1 0 +
jpayne@68 1305 chr1 100 200 feature2 0 +
jpayne@68 1306 <BLANKLINE>
jpayne@68 1307
jpayne@68 1308 """
jpayne@68 1309 if not isinstance(self.fn, str):
jpayne@68 1310 raise NotImplementedError(
jpayne@68 1311 "head() not supported for non file-based BedTools"
jpayne@68 1312 )
jpayne@68 1313 if as_string:
jpayne@68 1314 return "".join(str(line) for line in self[:n])
jpayne@68 1315 if self._isbam:
jpayne@68 1316 raise NotImplementedError("head() not supported for BAM")
jpayne@68 1317 else:
jpayne@68 1318 if isGZIP(self.fn):
jpayne@68 1319 openfunc = gzip.open
jpayne@68 1320 openmode = "rt"
jpayne@68 1321 else:
jpayne@68 1322 openfunc = open
jpayne@68 1323 openmode = "r"
jpayne@68 1324 with openfunc(self.fn, openmode) as fin:
jpayne@68 1325 for i, line in enumerate(fin):
jpayne@68 1326 if i == (n):
jpayne@68 1327 break
jpayne@68 1328 print(line, end=" ")
jpayne@68 1329
jpayne@68 1330 def set_chromsizes(self, chromsizes: str | dict):
jpayne@68 1331 """
jpayne@68 1332 Prepare BedTool for operations that require chromosome coords.
jpayne@68 1333
jpayne@68 1334 Set the chromsizes for this genome. If *chromsizes* is a string, it
jpayne@68 1335 will be considered a genome assembly name. If that assembly name is
jpayne@68 1336 not available in pybedtools.genome_registry, then it will be searched
jpayne@68 1337 for on the UCSC Genome Browser.
jpayne@68 1338
jpayne@68 1339 Example usage:
jpayne@68 1340
jpayne@68 1341 >>> hg19 = pybedtools.chromsizes('hg19')
jpayne@68 1342 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 1343 >>> a = a.set_chromsizes(hg19)
jpayne@68 1344 >>> print(a.chromsizes['chr1'])
jpayne@68 1345 (0, 249250621)
jpayne@68 1346
jpayne@68 1347 """
jpayne@68 1348 if isinstance(chromsizes, str):
jpayne@68 1349 self.chromsizes = pybedtools.chromsizes(chromsizes)
jpayne@68 1350 elif isinstance(chromsizes, dict):
jpayne@68 1351 self.chromsizes = chromsizes
jpayne@68 1352 else:
jpayne@68 1353 raise ValueError(
jpayne@68 1354 "Need to specify chromsizes either as a string"
jpayne@68 1355 " (assembly name) or a dictionary"
jpayne@68 1356 )
jpayne@68 1357 return self
jpayne@68 1358
jpayne@68 1359 def _collapse(
jpayne@68 1360 self,
jpayne@68 1361 iterable: Iterable,
jpayne@68 1362 fn: Optional[str] = None,
jpayne@68 1363 trackline: Optional[str] = None,
jpayne@68 1364 in_compressed: bool = False,
jpayne@68 1365 out_compressed: bool = False,
jpayne@68 1366 ) -> str:
jpayne@68 1367 """
jpayne@68 1368 Collapses an iterable into file *fn* (or a new tempfile if *fn* is
jpayne@68 1369 None).
jpayne@68 1370
jpayne@68 1371 Returns the newly created filename.
jpayne@68 1372
jpayne@68 1373 Parameters
jpayne@68 1374 ----------
jpayne@68 1375
jpayne@68 1376 iterable : iter
jpayne@68 1377 Any iterable object whose items can be converted to an Interval.
jpayne@68 1378
jpayne@68 1379 fn : str
jpayne@68 1380 Output filename, if None then creates a temp file for output
jpayne@68 1381
jpayne@68 1382 trackline : str
jpayne@68 1383 If not None, string to be added to the top of the output. Newline
jpayne@68 1384 will be added.
jpayne@68 1385
jpayne@68 1386 in_compressed : bool
jpayne@68 1387 Indicates whether the input is compressed
jpayne@68 1388
jpayne@68 1389 out_compressed : bool
jpayne@68 1390 Indicates whether the output should be compressed
jpayne@68 1391 """
jpayne@68 1392 if fn is None:
jpayne@68 1393 fn = self._tmp()
jpayne@68 1394
jpayne@68 1395 in_open_func = gzip.open if in_compressed else open
jpayne@68 1396 out_open_func = gzip.open if out_compressed else open
jpayne@68 1397
jpayne@68 1398 # special case: if BAM-format BedTool is provided, no trackline should
jpayne@68 1399 # be supplied, and don't iterate -- copy the file wholesale
jpayne@68 1400 if isinstance(iterable, BedTool) and iterable._isbam:
jpayne@68 1401 if trackline:
jpayne@68 1402 raise ValueError(
jpayne@68 1403 "trackline provided, but input is a BAM "
jpayne@68 1404 "file, which takes no track line"
jpayne@68 1405 )
jpayne@68 1406 with open(fn, "wb") as out_:
jpayne@68 1407 out_.write(open(self.fn, "rb").read())
jpayne@68 1408 return fn
jpayne@68 1409
jpayne@68 1410 # If we're just working with filename-based BedTool objects, just copy
jpayne@68 1411 # the files directly
jpayne@68 1412 if isinstance(iterable, BedTool) and isinstance(iterable.fn, str):
jpayne@68 1413 with out_open_func(fn, "wt") as out_:
jpayne@68 1414 if sys.version_info > (3,0):
jpayne@68 1415 in_ = in_open_func(iterable.fn, "rt", errors="ignore")
jpayne@68 1416 else:
jpayne@68 1417 in_ = in_open_func(iterable.fn, "rt")
jpayne@68 1418 if trackline:
jpayne@68 1419 out_.write(trackline.strip() + "\n")
jpayne@68 1420 out_.writelines(in_)
jpayne@68 1421 in_.close()
jpayne@68 1422 else:
jpayne@68 1423 with out_open_func(fn, "wt") as out_:
jpayne@68 1424 for i in iterable:
jpayne@68 1425 if isinstance(i, (list, tuple)):
jpayne@68 1426 i = create_interval_from_list(list(i))
jpayne@68 1427 out_.write(str(i))
jpayne@68 1428 return fn
jpayne@68 1429
jpayne@68 1430 def handle_kwargs(self, prog:str, arg_order: Optional[list[str]] = None, **kwargs):
jpayne@68 1431 """
jpayne@68 1432 Handle most cases of BEDTool program calls, but leave the specifics
jpayne@68 1433 up to individual methods.
jpayne@68 1434
jpayne@68 1435 *prog* is a BEDTools program name, e.g., 'intersectBed'.
jpayne@68 1436
jpayne@68 1437 *arg_order* lists any arguments that are sensitive to order. Everything
jpayne@68 1438 else will be reverse-sorted.
jpayne@68 1439
jpayne@68 1440 *kwargs* are passed directly from the calling method (like
jpayne@68 1441 self.intersect).
jpayne@68 1442
jpayne@68 1443 This method figures out, given how this BedTool was constructed, what
jpayne@68 1444 to send to BEDTools programs -- for example, an open file to stdin with
jpayne@68 1445 the `-` argument, or a filename with the `-a` argument.
jpayne@68 1446 """
jpayne@68 1447 pybedtools.logger.debug(
jpayne@68 1448 "BedTool.handle_kwargs() got these kwargs:\n%s", pprint.pformat(kwargs)
jpayne@68 1449 )
jpayne@68 1450
jpayne@68 1451 # If you pass in a list, how should it be converted to a BedTools arg?
jpayne@68 1452 default_list_delimiter = " "
jpayne@68 1453 list_delimiters = {
jpayne@68 1454 "annotateBed": " ",
jpayne@68 1455 "getOverlap": ",",
jpayne@68 1456 "groupBy": ",",
jpayne@68 1457 "multiIntersectBed": " ",
jpayne@68 1458 "mergeBed": ",",
jpayne@68 1459 "intersectBed": " ",
jpayne@68 1460 "mapBed": ",",
jpayne@68 1461 }
jpayne@68 1462 stdin = None
jpayne@68 1463
jpayne@68 1464 # -----------------------------------------------------------------
jpayne@68 1465 # Decide how to send instream1 to BEDTools. If there's no implicit
jpayne@68 1466 # instream1 arg, then do nothing.
jpayne@68 1467 #
jpayne@68 1468 try:
jpayne@68 1469 # e.g., 'a' for intersectBed
jpayne@68 1470 if self._isbam:
jpayne@68 1471 inarg1 = _bam_registry[prog]
jpayne@68 1472 else:
jpayne@68 1473 inarg1 = _implicit_registry[prog]
jpayne@68 1474
jpayne@68 1475 # e.g., self.fn or 'a.bed' or an iterator...
jpayne@68 1476 instream1 = kwargs[inarg1]
jpayne@68 1477
jpayne@68 1478 # If it's a BedTool, then get underlying stream
jpayne@68 1479 if isinstance(instream1, BedTool):
jpayne@68 1480 instream1 = instream1.fn
jpayne@68 1481
jpayne@68 1482 # Filename? No pipe, just provide the file
jpayne@68 1483 if isinstance(instream1, str):
jpayne@68 1484 kwargs[inarg1] = instream1
jpayne@68 1485 stdin = None
jpayne@68 1486
jpayne@68 1487 # Open file? Pipe it
jpayne@68 1488 # elif isinstance(instream1, file):
jpayne@68 1489 # kwargs[inarg1] = 'stdin'
jpayne@68 1490 # stdin = instream1
jpayne@68 1491
jpayne@68 1492 # A generator or iterator: pipe it as a generator of lines
jpayne@68 1493 else:
jpayne@68 1494 kwargs[inarg1] = "stdin"
jpayne@68 1495 stdin = (str(i) for i in instream1)
jpayne@68 1496 except KeyError:
jpayne@68 1497 pass
jpayne@68 1498
jpayne@68 1499 # -----------------------------------------------------------------
jpayne@68 1500 # Decide how to send instream2 to BEDTools.
jpayne@68 1501 try:
jpayne@68 1502 # e.g., 'b' for intersectBed
jpayne@68 1503 inarg2 = _other_registry[prog]
jpayne@68 1504
jpayne@68 1505 # e.g., another BedTool
jpayne@68 1506 instream2 = kwargs[inarg2]
jpayne@68 1507
jpayne@68 1508 # Get stream if BedTool
jpayne@68 1509 if isinstance(instream2, BedTool):
jpayne@68 1510 instream2 = instream2.fn
jpayne@68 1511
jpayne@68 1512 # Filename
jpayne@68 1513 if isinstance(instream2, str):
jpayne@68 1514 kwargs[inarg2] = instream2
jpayne@68 1515
jpayne@68 1516 # If it's a list of strings, then we need to figure out if it's
jpayne@68 1517 # a list of filenames or a list of intervals (see issue #156)
jpayne@68 1518 #
jpayne@68 1519 # Several options:
jpayne@68 1520 #
jpayne@68 1521 # - assume intervals have tabs but filenames don't
jpayne@68 1522 # - assume that, upon being split on tabs, an interval is >=3 fields
jpayne@68 1523 # - try creating an interval out of the first thing, success means interval
jpayne@68 1524 #
jpayne@68 1525 # The last seems the most robust. It does allow filenames with
jpayne@68 1526 # tabs; deciding whether or not such filenames are a good idea is
jpayne@68 1527 # left to the user.
jpayne@68 1528 #
jpayne@68 1529 elif isinstance(instream2, (list, tuple)) and isinstance(
jpayne@68 1530 instream2[0], str
jpayne@68 1531 ):
jpayne@68 1532 try:
jpayne@68 1533 _ = create_interval_from_list(instream2[0].split("\t"))
jpayne@68 1534 kwargs[inarg2] = self._collapse(instream2)
jpayne@68 1535 except IndexError:
jpayne@68 1536 kwargs[inarg2] = instream2
jpayne@68 1537
jpayne@68 1538 # Otherwise we need to collapse it in order to send to BEDTools
jpayne@68 1539 # programs
jpayne@68 1540 else:
jpayne@68 1541 kwargs[inarg2] = self._collapse(instream2)
jpayne@68 1542
jpayne@68 1543 except KeyError:
jpayne@68 1544 pass
jpayne@68 1545
jpayne@68 1546 # If stream not specified, then a tempfile will be created
jpayne@68 1547 if kwargs.pop("stream", None):
jpayne@68 1548 tmp = None
jpayne@68 1549 else:
jpayne@68 1550 output = kwargs.pop("output", None)
jpayne@68 1551 if output:
jpayne@68 1552 tmp = output
jpayne@68 1553 else:
jpayne@68 1554 tmp = self._tmp()
jpayne@68 1555
jpayne@68 1556 additional_args = kwargs.pop("additional_args", None)
jpayne@68 1557
jpayne@68 1558 # Parse the kwargs into BEDTools-ready args
jpayne@68 1559 cmds = [prog]
jpayne@68 1560
jpayne@68 1561 # arg_order mechanism added to fix #345
jpayne@68 1562 if arg_order is None:
jpayne@68 1563 arg_order = []
jpayne@68 1564
jpayne@68 1565 for arg in arg_order:
jpayne@68 1566 if arg in kwargs:
jpayne@68 1567 val = kwargs.pop(arg)
jpayne@68 1568 cmds.append("-" + arg)
jpayne@68 1569 cmds.append(val)
jpayne@68 1570
jpayne@68 1571 # The reverse-sort is a temp fix for issue #81
jpayne@68 1572 for key, value in sorted(list(kwargs.items()), reverse=True):
jpayne@68 1573 if isinstance(value, bool):
jpayne@68 1574 if value:
jpayne@68 1575 cmds.append("-" + key)
jpayne@68 1576 else:
jpayne@68 1577 continue
jpayne@68 1578 elif isinstance(value, list) or isinstance(value, tuple):
jpayne@68 1579 value = list(map(str, value))
jpayne@68 1580 try:
jpayne@68 1581 delim = list_delimiters[prog]
jpayne@68 1582 except KeyError:
jpayne@68 1583 delim = default_list_delimiter
jpayne@68 1584
jpayne@68 1585 if delim == " ":
jpayne@68 1586 cmds.append("-" + key)
jpayne@68 1587 cmds.extend(value)
jpayne@68 1588
jpayne@68 1589 # make comma-separated list if that's what's needed
jpayne@68 1590 else:
jpayne@68 1591 cmds.append("-" + key)
jpayne@68 1592 cmds.append(delim.join(value))
jpayne@68 1593
jpayne@68 1594 else:
jpayne@68 1595 cmds.append("-" + key)
jpayne@68 1596 cmds.append(str(value))
jpayne@68 1597
jpayne@68 1598 if additional_args:
jpayne@68 1599 cmds.append(additional_args)
jpayne@68 1600
jpayne@68 1601 return cmds, tmp, stdin
jpayne@68 1602
jpayne@68 1603 def check_genome(self, **kwargs):
jpayne@68 1604 """
jpayne@68 1605 Handles the different ways of specifying a genome in kwargs:
jpayne@68 1606
jpayne@68 1607 g='genome.file' specifies a file directly
jpayne@68 1608 genome='dm3' gets the file from genome registry
jpayne@68 1609 self.chromsizes could be a dict.\
jpayne@68 1610 """
jpayne@68 1611
jpayne@68 1612 # If both g and genome are missing, assume self.chromsizes
jpayne@68 1613 if ("g" not in kwargs) and ("genome" not in kwargs):
jpayne@68 1614 if hasattr(self, "chromsizes"):
jpayne@68 1615 kwargs["g"] = self.chromsizes
jpayne@68 1616 else:
jpayne@68 1617 raise ValueError(
jpayne@68 1618 'No genome specified. Use the "g" or '
jpayne@68 1619 '"genome" kwargs, or use the '
jpayne@68 1620 ".set_chromsizes() method"
jpayne@68 1621 )
jpayne@68 1622
jpayne@68 1623 # If both specified, rather than make an implicit decision, raise an
jpayne@68 1624 # exception
jpayne@68 1625 if "g" in kwargs and "genome" in kwargs:
jpayne@68 1626 raise ValueError('Cannot specify both "g" and "genome"')
jpayne@68 1627
jpayne@68 1628 # Something like genome='dm3' was specified
jpayne@68 1629 if "g" not in kwargs and "genome" in kwargs:
jpayne@68 1630 if isinstance(kwargs["genome"], dict):
jpayne@68 1631 genome_dict = kwargs["genome"]
jpayne@68 1632 else:
jpayne@68 1633 genome_dict = pybedtools.chromsizes(kwargs["genome"])
jpayne@68 1634 genome_file = pybedtools.chromsizes_to_file(genome_dict)
jpayne@68 1635 kwargs["g"] = genome_file
jpayne@68 1636 del kwargs["genome"]
jpayne@68 1637
jpayne@68 1638 # By the time we get here, 'g' is specified.
jpayne@68 1639
jpayne@68 1640 # If a dict was provided, convert to tempfile here
jpayne@68 1641 if isinstance(kwargs["g"], dict):
jpayne@68 1642 kwargs["g"] = pybedtools.chromsizes_to_file(kwargs["g"])
jpayne@68 1643
jpayne@68 1644 if not os.path.exists(kwargs["g"]):
jpayne@68 1645 msg = 'Genome file "%s" does not exist' % (kwargs["g"])
jpayne@68 1646 raise FileNotFoundError(msg)
jpayne@68 1647
jpayne@68 1648 return kwargs
jpayne@68 1649
jpayne@68 1650 @_log_to_history
jpayne@68 1651 def remove_invalid(self):
jpayne@68 1652 """
jpayne@68 1653 Remove invalid features that may break BEDTools programs.
jpayne@68 1654
jpayne@68 1655 >>> a = pybedtools.BedTool("chr1 10 100\\nchr1 10 1",
jpayne@68 1656 ... from_string=True)
jpayne@68 1657 >>> print(a.remove_invalid()) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 1658 chr1 10 100
jpayne@68 1659 <BLANKLINE>
jpayne@68 1660
jpayne@68 1661 """
jpayne@68 1662 tmp = self._tmp()
jpayne@68 1663
jpayne@68 1664 # If it's a file-based BedTool -- which is likely, if we're trying to
jpayne@68 1665 # remove invalid features -- then we need to parse it line by line.
jpayne@68 1666 if isinstance(self.fn, str):
jpayne@68 1667 i = IntervalIterator(open(self.fn, "r"))
jpayne@68 1668 else:
jpayne@68 1669 tmp = self.saveas()
jpayne@68 1670 i = IntervalIterator(open(tmp.fn, "r"))
jpayne@68 1671
jpayne@68 1672 def _generator():
jpayne@68 1673 while True:
jpayne@68 1674 try:
jpayne@68 1675 feature = next(i)
jpayne@68 1676 if feature.start <= feature.stop:
jpayne@68 1677 yield feature
jpayne@68 1678 else:
jpayne@68 1679 continue
jpayne@68 1680 except pybedtools.MalformedBedLineError:
jpayne@68 1681 continue
jpayne@68 1682 except OverflowError:
jpayne@68 1683 # This can happen if coords are negative
jpayne@68 1684 continue
jpayne@68 1685 except IndexError:
jpayne@68 1686 continue
jpayne@68 1687 except StopIteration:
jpayne@68 1688 break
jpayne@68 1689
jpayne@68 1690 return BedTool(_generator())
jpayne@68 1691
jpayne@68 1692 def all_hits(self, interval: Interval, same_strand: bool = False, overlap: float = 0.0):
jpayne@68 1693 """
jpayne@68 1694 Return all intervals that overlap `interval`.
jpayne@68 1695
jpayne@68 1696 Calls the `all_hits` method of an IntervalFile to return all intervals
jpayne@68 1697 in this current BedTool that overlap `interval`.
jpayne@68 1698
jpayne@68 1699 Require that overlaps have the same strand with same_strand=True.
jpayne@68 1700
jpayne@68 1701 Notes:
jpayne@68 1702 If this current BedTool is generator-based, it will be
jpayne@68 1703 converted into a file first.
jpayne@68 1704
jpayne@68 1705 If this current BedTool refers to a BAM file, it will be
jpayne@68 1706 converted to a BED file first using default arguments. If you
jpayne@68 1707 don't want this to happen, please convert to BED first before
jpayne@68 1708 using this method.
jpayne@68 1709 """
jpayne@68 1710 if not isinstance(interval, Interval):
jpayne@68 1711 raise ValueError("Need an Interval instance")
jpayne@68 1712 fn = self.fn
jpayne@68 1713 if not isinstance(fn, str):
jpayne@68 1714 fn = self.saveas().fn
jpayne@68 1715 if self._isbam:
jpayne@68 1716 fn = self.bam_to_bed().fn
jpayne@68 1717 interval_file = pybedtools.IntervalFile(fn)
jpayne@68 1718 return interval_file.all_hits(interval, same_strand, overlap)
jpayne@68 1719
jpayne@68 1720 def any_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0):
jpayne@68 1721 """
jpayne@68 1722 Return whether or not any intervals overlap `interval`.
jpayne@68 1723
jpayne@68 1724 Calls the `any_hits` method of an IntervalFile. If there were any hits
jpayne@68 1725 within `interval` in this BedTool, then return 1; otherwise 0.
jpayne@68 1726
jpayne@68 1727 Require that overlaps have the same strand with same_strand=True.
jpayne@68 1728
jpayne@68 1729 Notes:
jpayne@68 1730 If this current BedTool is generator-based, it will be
jpayne@68 1731 converted into a file first.
jpayne@68 1732
jpayne@68 1733 If this current BedTool refers to a BAM file, it will be
jpayne@68 1734 converted to a BED file first using default arguments. If you
jpayne@68 1735 don't want this to happen, please convert to BED first before
jpayne@68 1736 using this method.
jpayne@68 1737 """
jpayne@68 1738 if not isinstance(interval, Interval):
jpayne@68 1739 raise ValueError("Need an Interval instance")
jpayne@68 1740 fn = self.fn
jpayne@68 1741 if not isinstance(fn, str):
jpayne@68 1742 fn = self.saveas().fn
jpayne@68 1743 if self._isbam:
jpayne@68 1744 fn = self.bam_to_bed().fn
jpayne@68 1745 interval_file = pybedtools.IntervalFile(fn)
jpayne@68 1746 return interval_file.any_hits(interval, same_strand, overlap)
jpayne@68 1747
jpayne@68 1748 def count_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0) -> int:
jpayne@68 1749 """
jpayne@68 1750 Return the number of intervals that overlap `interval`.
jpayne@68 1751
jpayne@68 1752 Calls the `count_hits` method of an IntervalFile. Returns the number
jpayne@68 1753 of valid hits in this BedTool that overlap `interval`.
jpayne@68 1754
jpayne@68 1755 Require that overlaps have the same strand with same_strand=True.
jpayne@68 1756
jpayne@68 1757 Notes:
jpayne@68 1758 If this current BedTool is generator-based, it will be
jpayne@68 1759 converted into a file first.
jpayne@68 1760
jpayne@68 1761 If this current BedTool refers to a BAM file, it will be
jpayne@68 1762 converted to a BED file first using default arguments. If you
jpayne@68 1763 don't want this to happen, please convert to BED first before
jpayne@68 1764 using this method.
jpayne@68 1765 """
jpayne@68 1766 if not isinstance(interval, Interval):
jpayne@68 1767 raise ValueError("Need an Interval instance")
jpayne@68 1768 fn = self.fn
jpayne@68 1769 if not isinstance(fn, str):
jpayne@68 1770 fn = self.saveas().fn
jpayne@68 1771 if self._isbam:
jpayne@68 1772 fn = self.bam_to_bed().fn
jpayne@68 1773 interval_file = pybedtools.IntervalFile(fn)
jpayne@68 1774 return interval_file.count_hits(interval, same_strand, overlap)
jpayne@68 1775
jpayne@68 1776 @_log_to_history
jpayne@68 1777 @_wraps(prog="bed12ToBed6", implicit="i", bam=None, other=None)
jpayne@68 1778 def bed6(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 1779 """
jpayne@68 1780 Wraps `bedtools bed12tobed6`.
jpayne@68 1781 """
jpayne@68 1782
jpayne@68 1783 # Alias for backward compatibility
jpayne@68 1784 bed12tobed6 = bed6
jpayne@68 1785
jpayne@68 1786 @_log_to_history
jpayne@68 1787 @_wraps(prog="bamToBed", implicit="i", other=None, nonbam="ALL", bam="i")
jpayne@68 1788 def bam_to_bed(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 1789 """
jpayne@68 1790 Wraps `bedtools bamtobed`.
jpayne@68 1791 """
jpayne@68 1792
jpayne@68 1793 # Alias for backward compatibility
jpayne@68 1794 bamtobed = bam_to_bed
jpayne@68 1795
jpayne@68 1796 @_wraps(prog="bedToBam", implicit="i", uses_genome=True, force_bam=True)
jpayne@68 1797 def _bed_to_bam(self, *args, **kwargs):
jpayne@68 1798 """
jpayne@68 1799 Wraps bedToBam and is called internally for BED/GFF/VCF files by
jpayne@68 1800 self.to_bam (which needs to do something different for SAM files...)
jpayne@68 1801 """
jpayne@68 1802
jpayne@68 1803 @_log_to_history
jpayne@68 1804 def to_bam(self, **kwargs):
jpayne@68 1805 """
jpayne@68 1806 Wraps `bedtools bedtobam`
jpayne@68 1807
jpayne@68 1808 If self.fn is in BED/VCF/GFF format, call BEDTools' bedToBam. If
jpayne@68 1809 self.fn is in SAM format, then create a header out of the genome file
jpayne@68 1810 and then convert using `samtools`.
jpayne@68 1811 """
jpayne@68 1812 if self.file_type == "bam":
jpayne@68 1813 return self
jpayne@68 1814 if self.file_type in ("bed", "gff", "vcf"):
jpayne@68 1815 return self._bed_to_bam(**kwargs)
jpayne@68 1816
jpayne@68 1817 # TODO: to maintain backwards compatibility we go from Interval to
jpayne@68 1818 # AlignedSegment.
jpayne@68 1819 if self.file_type == "sam":
jpayne@68 1820
jpayne@68 1821 # Use pysam, but construct the header out of a provided genome
jpayne@68 1822 # file.
jpayne@68 1823
jpayne@68 1824 # construct a genome out of whatever kwargs were passed in
jpayne@68 1825 kwargs = self.check_genome(**kwargs)
jpayne@68 1826
jpayne@68 1827 # Build a header that we can use for the output BAM file.
jpayne@68 1828 genome = dict(i.split() for i in open(kwargs["g"]))
jpayne@68 1829 SQ = []
jpayne@68 1830 ref_ids = {}
jpayne@68 1831 text_header = ["@HD\tVN:1.0"]
jpayne@68 1832
jpayne@68 1833 for i, (k, v) in enumerate(genome.items()):
jpayne@68 1834 SQ.append(dict(SN=k, LN=int(v)))
jpayne@68 1835 ref_ids[k] = i
jpayne@68 1836 text_header.append("@SQ\tSN:{0}\tLN:{1}".format(k, v))
jpayne@68 1837
jpayne@68 1838 # And the text-format header
jpayne@68 1839 text_header = "\n".join(text_header) + "\n"
jpayne@68 1840
jpayne@68 1841 # The strategy is to write an actual SAM file to disk, along with
jpayne@68 1842 # a header, and then read that back in.
jpayne@68 1843 #
jpayne@68 1844 # Painfully inefficient, but this will change once all py2 tests
jpayne@68 1845 # pass.
jpayne@68 1846 sam_tmp = self._tmp()
jpayne@68 1847 bam_tmp = self._tmp()
jpayne@68 1848 with open(sam_tmp, "w") as fout:
jpayne@68 1849 fout.write(text_header)
jpayne@68 1850 for interval in self:
jpayne@68 1851 fout.write("\t".join(map(str, interval.fields)) + "\n")
jpayne@68 1852
jpayne@68 1853 samfile = pysam.AlignmentFile(sam_tmp, "r")
jpayne@68 1854 bamfile = pysam.AlignmentFile(bam_tmp, "wb", template=samfile)
jpayne@68 1855 for alignment in samfile:
jpayne@68 1856 bamfile.write(alignment)
jpayne@68 1857
jpayne@68 1858 samfile.close()
jpayne@68 1859 bamfile.close()
jpayne@68 1860 new_bedtool = BedTool(bam_tmp)
jpayne@68 1861 new_bedtool._isbam = True
jpayne@68 1862 return new_bedtool
jpayne@68 1863
jpayne@68 1864 # Alias for backward compatibility
jpayne@68 1865 bedtobam = to_bam
jpayne@68 1866
jpayne@68 1867 @_log_to_history
jpayne@68 1868 @_wraps(prog="intersectBed", implicit="a", other="b", bam="abam",
jpayne@68 1869 nonbam="bed", arg_order=["a", "abam"])
jpayne@68 1870 def intersect(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 1871 """
jpayne@68 1872 Wraps `bedtools intersect`.
jpayne@68 1873 """
jpayne@68 1874
jpayne@68 1875 @_log_to_history
jpayne@68 1876 @_wraps(
jpayne@68 1877 prog="fastaFromBed",
jpayne@68 1878 implicit="bed",
jpayne@68 1879 bam=None,
jpayne@68 1880 other="fi",
jpayne@68 1881 make_tempfile_for="fo",
jpayne@68 1882 check_stderr=_check_sequence_stderr,
jpayne@68 1883 add_to_bedtool={"fo": "seqfn"},
jpayne@68 1884 )
jpayne@68 1885 def sequence(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 1886 '''
jpayne@68 1887 Wraps `bedtools getfasta`.
jpayne@68 1888
jpayne@68 1889 *fi* is passed in by the user; *bed* is automatically passed in as the
jpayne@68 1890 bedfile of this object; *fo* by default is a temp file. Use
jpayne@68 1891 save_seqs() to save as a file.
jpayne@68 1892
jpayne@68 1893 The end result is that this BedTool will assign a value to the attribute , self.seqfn,
jpayne@68 1894 that points to the new fasta file.
jpayne@68 1895
jpayne@68 1896 Example usage:
jpayne@68 1897
jpayne@68 1898 >>> a = pybedtools.BedTool("""
jpayne@68 1899 ... chr1 1 10
jpayne@68 1900 ... chr1 50 55""", from_string=True)
jpayne@68 1901 >>> fasta = pybedtools.example_filename('test.fa')
jpayne@68 1902 >>> a = a.sequence(fi=fasta)
jpayne@68 1903 >>> print(open(a.seqfn).read())
jpayne@68 1904 >chr1:1-10
jpayne@68 1905 GATGAGTCT
jpayne@68 1906 >chr1:50-55
jpayne@68 1907 CCATC
jpayne@68 1908 <BLANKLINE>
jpayne@68 1909
jpayne@68 1910 '''
jpayne@68 1911
jpayne@68 1912 # Alias for backwards compatibility
jpayne@68 1913 getfasta = sequence
jpayne@68 1914
jpayne@68 1915 @staticmethod
jpayne@68 1916 def seq(loc, fasta) -> str:
jpayne@68 1917 """
jpayne@68 1918 Return just the sequence from a region string or a single location
jpayne@68 1919 >>> fn = pybedtools.example_filename('test.fa')
jpayne@68 1920 >>> BedTool.seq('chr1:2-10', fn)
jpayne@68 1921 'GATGAGTCT'
jpayne@68 1922 >>> BedTool.seq(('chr1', 1, 10), fn)
jpayne@68 1923 'GATGAGTCT'
jpayne@68 1924 """
jpayne@68 1925 if isinstance(loc, str):
jpayne@68 1926 chrom, start_end = loc.split(":")
jpayne@68 1927 start, end = list(map(int, start_end.split("-")))
jpayne@68 1928 start -= 1
jpayne@68 1929 else:
jpayne@68 1930 chrom, start, end = loc[0], loc[1], loc[2]
jpayne@68 1931
jpayne@68 1932 loc = BedTool("%s\t%i\t%i" % (chrom, start, end), from_string=True)
jpayne@68 1933 lseq = loc.sequence(fi=fasta)
jpayne@68 1934 return "".join([l.rstrip() for l in open(lseq.seqfn, "r") if l[0] != ">"])
jpayne@68 1935
jpayne@68 1936 @_log_to_history
jpayne@68 1937 @_wraps(
jpayne@68 1938 prog="nucBed", implicit="bed", other="fi", check_stderr=_check_sequence_stderr
jpayne@68 1939 )
jpayne@68 1940 def nucleotide_content(self) -> BedTool: # type: ignore
jpayne@68 1941 """
jpayne@68 1942 Wraps `bedtools nuc`.
jpayne@68 1943
jpayne@68 1944 Profiles nucleotide content. The returned BED file contains extra
jpayne@68 1945 information about the nucleotide content
jpayne@68 1946 """
jpayne@68 1947
jpayne@68 1948 # Alias for backwards compatibility
jpayne@68 1949 nuc = nucleotide_content
jpayne@68 1950
jpayne@68 1951 @_log_to_history
jpayne@68 1952 @_wraps(prog="multiBamCov", implicit="bed")
jpayne@68 1953 def multi_bam_coverage(self) -> BedTool: # type: ignore
jpayne@68 1954 """
jpayne@68 1955 Wraps `bedtools multicov`.
jpayne@68 1956
jpayne@68 1957 Pass a list of sorted and indexed BAM files as `bams`
jpayne@68 1958 """
jpayne@68 1959
jpayne@68 1960 # Alias for backwards compatibility
jpayne@68 1961 multicov = multi_bam_coverage
jpayne@68 1962
jpayne@68 1963 @_log_to_history
jpayne@68 1964 @_wraps(prog="subtractBed", implicit="a", other="b", bam=None)
jpayne@68 1965 def subtract(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 1966 """
jpayne@68 1967 Wraps `bedtools subtract`.
jpayne@68 1968
jpayne@68 1969 Subtracts from another BED file and returns a new BedTool object.
jpayne@68 1970
jpayne@68 1971 Example usage:
jpayne@68 1972
jpayne@68 1973 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 1974 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68 1975
jpayne@68 1976 Do a "stranded" subtraction:
jpayne@68 1977
jpayne@68 1978 >>> c = a.subtract(b, s=True)
jpayne@68 1979
jpayne@68 1980 Require 50% of features in `a` to overlap:
jpayne@68 1981
jpayne@68 1982 >>> c = a.subtract(b, f=0.5)
jpayne@68 1983 """
jpayne@68 1984 if "a" not in kwargs:
jpayne@68 1985 kwargs["a"] = self.fn
jpayne@68 1986
jpayne@68 1987 if "b" not in kwargs:
jpayne@68 1988 if len(args) > 0:
jpayne@68 1989 kwargs["b"] = args[0]
jpayne@68 1990 else:
jpayne@68 1991 raise ValueError("Must specify a BED file to subtract, either as a positional argument or as the 'b' keyword argument.")
jpayne@68 1992
jpayne@68 1993 cmds, tmp, stdin = self.handle_kwargs(prog="subtractBed", **kwargs)
jpayne@68 1994 stream = call_bedtools(cmds, tmp, stdin=stdin)
jpayne@68 1995 return BedTool(stream)
jpayne@68 1996
jpayne@68 1997 @_log_to_history
jpayne@68 1998 @_wraps(prog="slopBed", implicit="i", other=None, bam=None, uses_genome=True)
jpayne@68 1999 def slop(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2000 """
jpayne@68 2001 Wraps `bedtools slop`.
jpayne@68 2002 """
jpayne@68 2003
jpayne@68 2004 @_log_to_history
jpayne@68 2005 @_wraps(prog="shiftBed", implicit="i", other=None, bam=None, uses_genome=True)
jpayne@68 2006 def shift(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2007 """
jpayne@68 2008 Wraps `bedtools shift`.
jpayne@68 2009
jpayne@68 2010 Shift each feature by user-defined number of bases. Returns a new BedTool object.
jpayne@68 2011
jpayne@68 2012 Example usage:
jpayne@68 2013
jpayne@68 2014 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2015
jpayne@68 2016 Shift every feature by 5bp:
jpayne@68 2017
jpayne@68 2018 >>> b = a.shift(genome='hg19', s=5)
jpayne@68 2019 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2020 chr1 6 105 feature1 0 +
jpayne@68 2021 chr1 105 205 feature2 0 +
jpayne@68 2022 chr1 155 505 feature3 0 -
jpayne@68 2023 chr1 905 955 feature4 0 +
jpayne@68 2024 <BLANKLINE>
jpayne@68 2025
jpayne@68 2026 Shift features on the '+' strand by -1bp and on '-' strand by +3bp:
jpayne@68 2027
jpayne@68 2028 >>> b = a.shift(genome='hg19', p=-1, m=3)
jpayne@68 2029 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2030 chr1 0 99 feature1 0 +
jpayne@68 2031 chr1 99 199 feature2 0 +
jpayne@68 2032 chr1 153 503 feature3 0 -
jpayne@68 2033 chr1 899 949 feature4 0 +
jpayne@68 2034 <BLANKLINE>
jpayne@68 2035
jpayne@68 2036 # Disabling, see https://github.com/arq5x/bedtools2/issues/807
jpayne@68 2037 Shift features by a fraction of their length (0.50):
jpayne@68 2038
jpayne@68 2039 #>>> b = a.shift(genome='hg19', pct=True, s=0.50)
jpayne@68 2040 #>>> print(b) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2041 #chr1 50 149 feature1 0 +
jpayne@68 2042 #chr1 150 250 feature2 0 +
jpayne@68 2043 #chr1 325 675 feature3 0 -
jpayne@68 2044 #chr1 925 975 feature4 0 +
jpayne@68 2045 #<BLANKLINE>
jpayne@68 2046
jpayne@68 2047 """
jpayne@68 2048
jpayne@68 2049 @_log_to_history
jpayne@68 2050 @_wraps(prog="mergeBed", implicit="i", other=None, bam=None)
jpayne@68 2051 def merge(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2052 """
jpayne@68 2053 Wraps `bedtools merge`.
jpayne@68 2054
jpayne@68 2055 Merge overlapping features together. Returns a new BedTool object.
jpayne@68 2056
jpayne@68 2057 Example usage:
jpayne@68 2058
jpayne@68 2059 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2060
jpayne@68 2061 Merge:
jpayne@68 2062
jpayne@68 2063 >>> c = a.merge()
jpayne@68 2064
jpayne@68 2065 Allow merging of features 500 bp apart:
jpayne@68 2066
jpayne@68 2067 >>> c = a.merge(d=500)
jpayne@68 2068
jpayne@68 2069 """
jpayne@68 2070
jpayne@68 2071 @_log_to_history
jpayne@68 2072 @_wraps(prog="closestBed", implicit="a", other="b", bam=None)
jpayne@68 2073 def closest(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2074 """
jpayne@68 2075 Wraps `bedtools closest`.
jpayne@68 2076
jpayne@68 2077 Return a new BedTool object containing closest features in *b*. Note
jpayne@68 2078 that the resulting file is no longer a valid BED format; use the
jpayne@68 2079 special "_closest" methods to work with the resulting file.
jpayne@68 2080
jpayne@68 2081 Example usage::
jpayne@68 2082
jpayne@68 2083 a = BedTool('in.bed')
jpayne@68 2084
jpayne@68 2085 # get the closest feature in 'other.bed' on the same strand
jpayne@68 2086 b = a.closest('other.bed', s=True)
jpayne@68 2087
jpayne@68 2088 """
jpayne@68 2089
jpayne@68 2090 @_log_to_history
jpayne@68 2091 @_wraps(prog="windowBed", implicit="a", other="b", bam="abam", nonbam="bed")
jpayne@68 2092 def window(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2093 """
jpayne@68 2094 Wraps `bedtools window`.
jpayne@68 2095
jpayne@68 2096 Example usage::
jpayne@68 2097
jpayne@68 2098 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2099 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68 2100 >>> print(a.window(b, w=1000)) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2101 chr1 1 100 feature1 0 + chr1 155 200 feature5 0 -
jpayne@68 2102 chr1 1 100 feature1 0 + chr1 800 901 feature6 0 +
jpayne@68 2103 chr1 100 200 feature2 0 + chr1 155 200 feature5 0 -
jpayne@68 2104 chr1 100 200 feature2 0 + chr1 800 901 feature6 0 +
jpayne@68 2105 chr1 150 500 feature3 0 - chr1 155 200 feature5 0 -
jpayne@68 2106 chr1 150 500 feature3 0 - chr1 800 901 feature6 0 +
jpayne@68 2107 chr1 900 950 feature4 0 + chr1 155 200 feature5 0 -
jpayne@68 2108 chr1 900 950 feature4 0 + chr1 800 901 feature6 0 +
jpayne@68 2109 <BLANKLINE>
jpayne@68 2110 """
jpayne@68 2111
jpayne@68 2112 @_log_to_history
jpayne@68 2113 @_wraps(prog="shuffleBed", implicit="i", other=None, bam=None, uses_genome=True)
jpayne@68 2114 def shuffle(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2115 """
jpayne@68 2116 Wraps `bedtools shuffle`.
jpayne@68 2117
jpayne@68 2118 Example usage:
jpayne@68 2119
jpayne@68 2120 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2121 >>> seed = 1 # so this test always returns the same results
jpayne@68 2122 >>> b = a.shuffle(genome='hg19', chrom=True, seed=seed)
jpayne@68 2123 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2124 chr1 123081365 123081464 feature1 0 +
jpayne@68 2125 chr1 243444570 243444670 feature2 0 +
jpayne@68 2126 chr1 194620241 194620591 feature3 0 -
jpayne@68 2127 chr1 172792873 172792923 feature4 0 +
jpayne@68 2128 <BLANKLINE>
jpayne@68 2129 """
jpayne@68 2130
jpayne@68 2131 @_log_to_history
jpayne@68 2132 @_wraps(prog="sortBed", implicit="i", uses_genome=True, genome_if=["g", "genome"])
jpayne@68 2133 def sort(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2134 """
jpayne@68 2135 Wraps `bedtools sort`.
jpayne@68 2136
jpayne@68 2137 Note that chromosomes are sorted lexicographically, so chr12 will come
jpayne@68 2138 before chr9.
jpayne@68 2139
jpayne@68 2140 Example usage:
jpayne@68 2141
jpayne@68 2142 >>> a = pybedtools.BedTool('''
jpayne@68 2143 ... chr9 300 400
jpayne@68 2144 ... chr1 100 200
jpayne@68 2145 ... chr1 1 50
jpayne@68 2146 ... chr12 1 100
jpayne@68 2147 ... chr9 500 600
jpayne@68 2148 ... ''', from_string=True)
jpayne@68 2149 >>> print(a.sort()) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2150 chr1 1 50
jpayne@68 2151 chr1 100 200
jpayne@68 2152 chr12 1 100
jpayne@68 2153 chr9 300 400
jpayne@68 2154 chr9 500 600
jpayne@68 2155 <BLANKLINE>
jpayne@68 2156 """
jpayne@68 2157
jpayne@68 2158 @_log_to_history
jpayne@68 2159 @_wraps(prog="annotateBed", implicit="i")
jpayne@68 2160 def annotate(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2161 """
jpayne@68 2162 Wraps `bedtools annotate`.
jpayne@68 2163
jpayne@68 2164 Annotate this BedTool with a list of other files.
jpayne@68 2165 Example usage:
jpayne@68 2166
jpayne@68 2167 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2168 >>> b_fn = pybedtools.example_filename('b.bed')
jpayne@68 2169 >>> print(a.annotate(files=b_fn)) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2170 chr1 1 100 feature1 0 + 0.000000
jpayne@68 2171 chr1 100 200 feature2 0 + 0.450000
jpayne@68 2172 chr1 150 500 feature3 0 - 0.128571
jpayne@68 2173 chr1 900 950 feature4 0 + 0.020000
jpayne@68 2174 <BLANKLINE>
jpayne@68 2175 """
jpayne@68 2176
jpayne@68 2177 @_log_to_history
jpayne@68 2178 @_wraps(prog="flankBed", implicit="i", uses_genome=True)
jpayne@68 2179 def flank(self, *args, **kwargs) -> BedTool:
jpayne@68 2180 """
jpayne@68 2181 Wraps `bedtools flank`.
jpayne@68 2182
jpayne@68 2183 Example usage:
jpayne@68 2184
jpayne@68 2185 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2186 >>> print(a.flank(genome='hg19', b=100)) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2187 chr1 0 1 feature1 0 +
jpayne@68 2188 chr1 100 200 feature1 0 +
jpayne@68 2189 chr1 0 100 feature2 0 +
jpayne@68 2190 chr1 200 300 feature2 0 +
jpayne@68 2191 chr1 50 150 feature3 0 -
jpayne@68 2192 chr1 500 600 feature3 0 -
jpayne@68 2193 chr1 800 900 feature4 0 +
jpayne@68 2194 chr1 950 1050 feature4 0 +
jpayne@68 2195 <BLANKLINE>
jpayne@68 2196
jpayne@68 2197 """
jpayne@68 2198 kwargs = self.check_genome(**kwargs)
jpayne@68 2199
jpayne@68 2200 if "i" not in kwargs:
jpayne@68 2201 kwargs["i"] = self.fn
jpayne@68 2202
jpayne@68 2203 cmds, tmp, stdin = self.handle_kwargs(prog="flankBed", **kwargs)
jpayne@68 2204 stream = call_bedtools(cmds, tmp, stdin=stdin)
jpayne@68 2205 return BedTool(stream)
jpayne@68 2206
jpayne@68 2207 @_log_to_history
jpayne@68 2208 @_wraps(
jpayne@68 2209 prog="genomeCoverageBed",
jpayne@68 2210 implicit="i",
jpayne@68 2211 bam="ibam",
jpayne@68 2212 genome_none_if=["ibam"],
jpayne@68 2213 genome_ok_if=["ibam"],
jpayne@68 2214 uses_genome=True,
jpayne@68 2215 nonbam="ALL",
jpayne@68 2216 )
jpayne@68 2217 def genome_coverage(self, *args, **kwargs):
jpayne@68 2218 """
jpayne@68 2219 Wraps `bedtools genomecov`.
jpayne@68 2220
jpayne@68 2221 Note that some invocations of `bedtools genomecov` do not result in
jpayne@68 2222 a properly-formatted BED file. For example, the default behavior is to
jpayne@68 2223 report a histogram of coverage. Iterating over the resulting,
jpayne@68 2224 non-BED-format file will raise exceptions in pybedtools' parser.
jpayne@68 2225
jpayne@68 2226 Consider using the `BedTool.to_dataframe` method to convert these
jpayne@68 2227 non-BED files into a pandas DataFrame for further use.
jpayne@68 2228
jpayne@68 2229 Example usage:
jpayne@68 2230
jpayne@68 2231 BAM file input does not require a genome:
jpayne@68 2232
jpayne@68 2233 >>> a = pybedtools.example_bedtool('x.bam')
jpayne@68 2234 >>> b = a.genome_coverage(bg=True)
jpayne@68 2235 >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2236 chr2L 9329 9365 1
jpayne@68 2237 chr2L 10212 10248 1
jpayne@68 2238 chr2L 10255 10291 1
jpayne@68 2239
jpayne@68 2240 Other input does require a genome:
jpayne@68 2241
jpayne@68 2242 >>> a = pybedtools.example_bedtool('x.bed')
jpayne@68 2243 >>> b = a.genome_coverage(bg=True, genome='dm3')
jpayne@68 2244 >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2245 chr2L 9329 9365 1
jpayne@68 2246 chr2L 10212 10248 1
jpayne@68 2247 chr2L 10255 10291 1
jpayne@68 2248
jpayne@68 2249 Non-BED format results:
jpayne@68 2250 >>> a = pybedtools.example_bedtool('x.bed')
jpayne@68 2251 >>> b = a.genome_coverage(genome='dm3')
jpayne@68 2252 >>> df = b.to_dataframe(names=['chrom', 'depth', 'n', 'chromsize', 'fraction'])
jpayne@68 2253 """
jpayne@68 2254
jpayne@68 2255 # Alias for backwards compatibility
jpayne@68 2256 genomecov = genome_coverage
jpayne@68 2257
jpayne@68 2258 @_log_to_history
jpayne@68 2259 @_wraps(prog="coverageBed", implicit="a", other="b", bam="abam", nonbam="ALL")
jpayne@68 2260 def coverage(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2261 """
jpayne@68 2262 Wraps `bedtools coverage`.
jpayne@68 2263
jpayne@68 2264 Note that starting in version 2.24.0, BEDTools swapped the semantics of
jpayne@68 2265 the "a" and "b" files.
jpayne@68 2266
jpayne@68 2267 Example usage:
jpayne@68 2268
jpayne@68 2269 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2270 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68 2271 >>> c = b.coverage(a)
jpayne@68 2272 >>> c.head(3) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2273 chr1 155 200 feature5 0 - 2 45 45 1.0000000
jpayne@68 2274 chr1 800 901 feature6 0 + 1 1 101 0.0099010
jpayne@68 2275 """
jpayne@68 2276
jpayne@68 2277 @_log_to_history
jpayne@68 2278 @_wraps(
jpayne@68 2279 prog="maskFastaFromBed",
jpayne@68 2280 implicit="bed",
jpayne@68 2281 other="fi",
jpayne@68 2282 make_tempfile_for="fo",
jpayne@68 2283 add_to_bedtool={"fo": "seqfn"},
jpayne@68 2284 check_stderr=_check_sequence_stderr,
jpayne@68 2285 )
jpayne@68 2286 def mask_fasta(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2287 """
jpayne@68 2288 Wraps `bedtools maskfasta`.
jpayne@68 2289
jpayne@68 2290 Masks a fasta file at the positions in a BED file and saves result as
jpayne@68 2291 'out' and stores the filename in seqfn.
jpayne@68 2292
jpayne@68 2293 >>> a = pybedtools.BedTool('chr1 100 110', from_string=True)
jpayne@68 2294 >>> fasta_fn = pybedtools.example_filename('test.fa')
jpayne@68 2295 >>> a = a.mask_fasta(fi=fasta_fn, fo='masked.fa.example')
jpayne@68 2296 >>> b = a.slop(b=2, genome='hg19')
jpayne@68 2297 >>> b = b.sequence(fi=a.seqfn)
jpayne@68 2298 >>> print(open(b.seqfn).read())
jpayne@68 2299 >chr1:98-112
jpayne@68 2300 TTNNNNNNNNNNAT
jpayne@68 2301 <BLANKLINE>
jpayne@68 2302 >>> os.unlink('masked.fa.example')
jpayne@68 2303 >>> if os.path.exists('masked.fa.example.fai'):
jpayne@68 2304 ... os.unlink('masked.fa.example.fai')
jpayne@68 2305 """
jpayne@68 2306
jpayne@68 2307 # Alias for backwards compatibility
jpayne@68 2308 maskfasta = mask_fasta
jpayne@68 2309
jpayne@68 2310 @_log_to_history
jpayne@68 2311 @_wraps(prog="complementBed", implicit="i", uses_genome=True)
jpayne@68 2312 def complement(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2313 """
jpayne@68 2314 Wraps `bedtools complement`.
jpayne@68 2315 Example usage:
jpayne@68 2316
jpayne@68 2317 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2318 >>> a.complement(genome='hg19').head(5) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2319 chr1 0 1
jpayne@68 2320 chr1 500 900
jpayne@68 2321 chr1 950 249250621
jpayne@68 2322 chr2 0 243199373
jpayne@68 2323 chr3 0 198022430
jpayne@68 2324 """
jpayne@68 2325
jpayne@68 2326 @_log_to_history
jpayne@68 2327 @_wraps(prog="getOverlap", implicit="i")
jpayne@68 2328 def overlap(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2329 """
jpayne@68 2330 Wraps `bedtools overlap`.
jpayne@68 2331
jpayne@68 2332 Example usage:
jpayne@68 2333
jpayne@68 2334 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2335 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68 2336 >>> c = a.window(b, w=10).overlap(cols=[2,3,8,9])
jpayne@68 2337 >>> print(c) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2338 chr1 100 200 feature2 0 + chr1 155 200 feature5 0 - 45
jpayne@68 2339 chr1 150 500 feature3 0 - chr1 155 200 feature5 0 - 45
jpayne@68 2340 chr1 900 950 feature4 0 + chr1 800 901 feature6 0 + 1
jpayne@68 2341 <BLANKLINE>
jpayne@68 2342 """
jpayne@68 2343
jpayne@68 2344 # TODO: needs test files and doctests written
jpayne@68 2345 @_log_to_history
jpayne@68 2346 @_wraps(prog="pairToBed", implicit="a", other="b", bam="abam", nonbam="bedpe")
jpayne@68 2347 def pair_to_bed(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2348 """
jpayne@68 2349 Wraps `bedtools pairtobed`.
jpayne@68 2350 """
jpayne@68 2351
jpayne@68 2352 # Alias for backwards compatibility
jpayne@68 2353 pairtobed = pair_to_bed
jpayne@68 2354
jpayne@68 2355 @_log_to_history
jpayne@68 2356 @_wraps(prog="pairToPair", implicit="a", other="b")
jpayne@68 2357 def pair_to_pair(self) -> BedTool: # type: ignore
jpayne@68 2358 """
jpayne@68 2359 Wraps `bedtools pairtopair`.
jpayne@68 2360 """
jpayne@68 2361
jpayne@68 2362 # Alias for backwards compatibility
jpayne@68 2363 pairtopair = pair_to_pair
jpayne@68 2364
jpayne@68 2365 @_log_to_history
jpayne@68 2366 @_wraps(prog="groupBy", implicit="i")
jpayne@68 2367 def groupby(self, *args, **kwargs) -> BedTool:
jpayne@68 2368 """
jpayne@68 2369 Wraps `bedtools groupby`.
jpayne@68 2370
jpayne@68 2371 Example usage:
jpayne@68 2372
jpayne@68 2373 >>> a = pybedtools.example_bedtool('gdc.gff')
jpayne@68 2374 >>> b = pybedtools.example_bedtool('gdc.bed')
jpayne@68 2375 >>> c = a.intersect(b, c=True)
jpayne@68 2376 >>> d = c.groupby(g=[1, 4, 5], c=10, o=['sum'])
jpayne@68 2377 >>> print(d) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2378 chr2L 41 70 0
jpayne@68 2379 chr2L 71 130 2
jpayne@68 2380 chr2L 131 170 4
jpayne@68 2381 chr2L 171 200 0
jpayne@68 2382 chr2L 201 220 1
jpayne@68 2383 chr2L 41 130 2
jpayne@68 2384 chr2L 171 220 1
jpayne@68 2385 chr2L 41 220 7
jpayne@68 2386 chr2L 161 230 6
jpayne@68 2387 chr2L 41 220 7
jpayne@68 2388 <BLANKLINE>
jpayne@68 2389
jpayne@68 2390 """
jpayne@68 2391
jpayne@68 2392 @_log_to_history
jpayne@68 2393 @_wraps(prog="tagBam", implicit="i", bam="i")
jpayne@68 2394 def tag_bam(self, *args, **kwargs) -> pysam.AlignmentFile: # type: ignore
jpayne@68 2395 """
jpayne@68 2396 Wraps `bedtools tag`.
jpayne@68 2397
jpayne@68 2398 `files` and `labels` should lists of equal length.
jpayne@68 2399
jpayne@68 2400 """
jpayne@68 2401 if "i" in kwargs:
jpayne@68 2402 if "labels" in kwargs:
jpayne@68 2403 if len(kwargs["i"]) != len(kwargs["labels"]):
jpayne@68 2404 raise ValueError("files and labels must be lists of equal length")
jpayne@68 2405
jpayne@68 2406
jpayne@68 2407 # Alias for backwards compatibility
jpayne@68 2408 tag = tag_bam
jpayne@68 2409
jpayne@68 2410 @_log_to_history
jpayne@68 2411 @_wraps(prog="mapBed", implicit="a", other="b")
jpayne@68 2412 def map(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2413 """
jpayne@68 2414 Wraps `bedtools map`; See also :meth:`BedTool.each`.
jpayne@68 2415 """
jpayne@68 2416
jpayne@68 2417 @_log_to_history
jpayne@68 2418 @_wraps(prog="multiIntersectBed", uses_genome=True, genome_if=["empty"])
jpayne@68 2419 def multi_intersect(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2420 """
jpayne@68 2421 Wraps `bedtools multiintersect`.
jpayne@68 2422
jpayne@68 2423 Provide a list of filenames as the "i" argument. e.g. if you already
jpayne@68 2424 have BedTool objects then use their `.fn` attribute, like this::
jpayne@68 2425
jpayne@68 2426 >>> x = pybedtools.BedTool()
jpayne@68 2427 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2428 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68 2429 >>> result = x.multi_intersect(i=[a.fn, b.fn])
jpayne@68 2430 >>> print(result) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 2431 chr1 1 155 1 1 1 0
jpayne@68 2432 chr1 155 200 2 1,2 1 1
jpayne@68 2433 chr1 200 500 1 1 1 0
jpayne@68 2434 chr1 800 900 1 2 0 1
jpayne@68 2435 chr1 900 901 2 1,2 1 1
jpayne@68 2436 chr1 901 950 1 1 1 0
jpayne@68 2437 <BLANKLINE>
jpayne@68 2438
jpayne@68 2439 """
jpayne@68 2440
jpayne@68 2441 # Alias for backwards compatibility
jpayne@68 2442 multiinter = multi_intersect
jpayne@68 2443
jpayne@68 2444 @_log_to_history
jpayne@68 2445 @_wraps(prog="randomBed", uses_genome=True)
jpayne@68 2446 def random(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2447 """
jpayne@68 2448 Wraps `bedtools random`.
jpayne@68 2449
jpayne@68 2450 Since this method does not operate on an existing file, create
jpayne@68 2451 a BedTool with no arguments and then call this method, e.g.,
jpayne@68 2452
jpayne@68 2453 >>> x = BedTool()
jpayne@68 2454 >>> y = x.random(l=100, n=10, genome='hg19')
jpayne@68 2455 """
jpayne@68 2456
jpayne@68 2457 @_log_to_history
jpayne@68 2458 @_wraps("bedpeToBam", implicit="i", uses_genome=True, force_bam=True)
jpayne@68 2459 def bedpe_to_bam(self, *args, **kwargs) -> pysam.AlignmentFile: # type: ignore
jpayne@68 2460 """
jpayne@68 2461 Wraps `bedtools bedpetobam`.
jpayne@68 2462 """
jpayne@68 2463
jpayne@68 2464 # Alias for backwards compatibility
jpayne@68 2465 bedpetobam = bedpe_to_bam
jpayne@68 2466
jpayne@68 2467 @_log_to_history
jpayne@68 2468 @_wraps(prog="clusterBed", implicit="i")
jpayne@68 2469 def cluster(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2470 """
jpayne@68 2471 Wraps `bedtools cluster`.
jpayne@68 2472 """
jpayne@68 2473
jpayne@68 2474 @_log_to_history
jpayne@68 2475 @_wraps(prog="unionBedGraphs")
jpayne@68 2476 def union_bedgraphs(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2477 """
jpayne@68 2478 Wraps `bedtools unionbedg`.
jpayne@68 2479
jpayne@68 2480 Warning: using the `header=True` kwarg will result in a file that is
jpayne@68 2481 not in true BED format, which may break downstream analysis.
jpayne@68 2482 """
jpayne@68 2483 if "header" in kwargs:
jpayne@68 2484 if kwargs["header"] is True:
jpayne@68 2485 warn("Using header=True with unionbedg will result in a file that is not in true BED format, which may break downstream analysis.")
jpayne@68 2486
jpayne@68 2487
jpayne@68 2488 # Alias for backwards compatibility
jpayne@68 2489 unionbedg = union_bedgraphs
jpayne@68 2490
jpayne@68 2491 @_log_to_history
jpayne@68 2492 @_wraps(prog="windowMaker", uses_genome=True, genome_none_if=["b"], other="b", arg_order=["w"])
jpayne@68 2493 def window_maker(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2494 """
jpayne@68 2495 Wraps `bedtools makewindows`.
jpayne@68 2496 """
jpayne@68 2497
jpayne@68 2498 # Alias for backwards compatibility
jpayne@68 2499 makewindows = window_maker
jpayne@68 2500
jpayne@68 2501 @_log_to_history
jpayne@68 2502 @_wraps(prog="expandCols", implicit="i")
jpayne@68 2503 def expand(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2504 """
jpayne@68 2505 Wraps `bedtools expand`
jpayne@68 2506 """
jpayne@68 2507
jpayne@68 2508 @_log_to_history
jpayne@68 2509 @_wraps(prog="linksBed", implicit="i", add_to_bedtool={"stdout": "links_html"})
jpayne@68 2510 def links(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2511 """
jpayne@68 2512 Wraps `linksBed`.
jpayne@68 2513
jpayne@68 2514 The resulting BedTool will assign a value to the attribute `links_html`. This
jpayne@68 2515 attribute is a temp filename containing the HTML links.
jpayne@68 2516 """
jpayne@68 2517
jpayne@68 2518 @_log_to_history
jpayne@68 2519 @_wraps(prog="bedToIgv", implicit="i", add_to_bedtool={"stdout": "igv_script"})
jpayne@68 2520 def igv(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2521 """
jpayne@68 2522 Wraps `bedtools igv`.
jpayne@68 2523
jpayne@68 2524 The resulting BedTool will assign a value to the attribute `igv_script`. This
jpayne@68 2525 attribute is a temp filename containing the IGV script.
jpayne@68 2526 """
jpayne@68 2527
jpayne@68 2528 @_log_to_history
jpayne@68 2529 @_wraps(
jpayne@68 2530 prog="bamToFastq",
jpayne@68 2531 implicit="i",
jpayne@68 2532 bam="i",
jpayne@68 2533 make_tempfile_for="fq",
jpayne@68 2534 add_to_bedtool={"fq": "fastq"},
jpayne@68 2535 )
jpayne@68 2536 def bam_to_fastq(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2537 """
jpayne@68 2538 Wraps `bedtools bamtofastq`.
jpayne@68 2539
jpayne@68 2540 The `fq` argument is required.
jpayne@68 2541
jpayne@68 2542 The resulting BedTool will assign a value to the attribute `fastq`.
jpayne@68 2543 """
jpayne@68 2544
jpayne@68 2545 # Alias for backwards compatibility
jpayne@68 2546 bamtofastq = bam_to_fastq
jpayne@68 2547
jpayne@68 2548 @_wraps(
jpayne@68 2549 prog="jaccard",
jpayne@68 2550 implicit="a",
jpayne@68 2551 other="b",
jpayne@68 2552 does_not_return_bedtool=_jaccard_output_to_dict,
jpayne@68 2553 )
jpayne@68 2554 def jaccard(self, *args, **kwargs) -> dict[str, Any]: # type: ignore
jpayne@68 2555 """
jpayne@68 2556 Returns a dictionary with keys (intersection, union, jaccard).
jpayne@68 2557 """
jpayne@68 2558
jpayne@68 2559 @_wraps(
jpayne@68 2560 prog="reldist",
jpayne@68 2561 implicit="a",
jpayne@68 2562 other="b",
jpayne@68 2563 does_not_return_bedtool=_reldist_output_handler,
jpayne@68 2564 )
jpayne@68 2565 def reldist(self, *args, **kwargs) -> BedTool | dict[str, Any]: # type: ignore
jpayne@68 2566 """
jpayne@68 2567 If detail=False, then return a dictionary with keys (reldist, count,
jpayne@68 2568 total, fraction), which is the summary of the bedtools reldist.
jpayne@68 2569
jpayne@68 2570 Otherwise return a BedTool, with the relative distance for each
jpayne@68 2571 interval in A in the last column.
jpayne@68 2572 """
jpayne@68 2573 if "detail" in kwargs:
jpayne@68 2574 if kwargs["detail"] is False:
jpayne@68 2575 warn("Using detail=False with reldist will return a dictionary with keys (reldist, count, total, fraction), which is the summary of the bedtools reldist."
jpayne@68 2576 "Not a BedTool object.")
jpayne@68 2577
jpayne@68 2578
jpayne@68 2579 @_wraps(prog="sample", implicit="i", bam="i")
jpayne@68 2580 def sample(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2581 """
jpayne@68 2582 Wraps 'sample'.
jpayne@68 2583 """
jpayne@68 2584
jpayne@68 2585 @_wraps(
jpayne@68 2586 prog="fisher",
jpayne@68 2587 implicit="a",
jpayne@68 2588 other="b",
jpayne@68 2589 uses_genome=True,
jpayne@68 2590 does_not_return_bedtool=FisherOutput,
jpayne@68 2591 )
jpayne@68 2592 def fisher(self, *args, **kwargs) -> FisherOutput: # type: ignore
jpayne@68 2593 """
jpayne@68 2594 Wraps 'fisher'. Returns an object representing the output.
jpayne@68 2595
jpayne@68 2596 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2597 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68 2598 >>> f = a.fisher(b, genome='hg19')
jpayne@68 2599 >>> print(f) # doctest: +NORMALIZE_WHITESPACE
jpayne@68 2600 # Number of query intervals: 4
jpayne@68 2601 # Number of db intervals: 2
jpayne@68 2602 # Number of overlaps: 3
jpayne@68 2603 # Number of possible intervals (estimated): 13958448
jpayne@68 2604 # phyper(3 - 1, 4, 13958448 - 4, 2, lower.tail=F)
jpayne@68 2605 # Contingency Table Of Counts
jpayne@68 2606 #_________________________________________
jpayne@68 2607 # | in -b | not in -b |
jpayne@68 2608 # in -a | 3 | 1 |
jpayne@68 2609 # not in -a | 0 | 13958444 |
jpayne@68 2610 #_________________________________________
jpayne@68 2611 # p-values for fisher's exact test
jpayne@68 2612 left right two-tail ratio
jpayne@68 2613 1 8.8247e-21 8.8247e-21 inf
jpayne@68 2614 <BLANKLINE>
jpayne@68 2615
jpayne@68 2616
jpayne@68 2617 >>> f.table['not in -a']['in -b']
jpayne@68 2618 0
jpayne@68 2619
jpayne@68 2620 >>> f.table['not in -a']['not in -b']
jpayne@68 2621 13958444
jpayne@68 2622
jpayne@68 2623 >>> f.table['in -a']['in -b']
jpayne@68 2624 3
jpayne@68 2625
jpayne@68 2626 >>> f.table['in -a']['not in -b']
jpayne@68 2627 1
jpayne@68 2628
jpayne@68 2629 >>> f.two_tail
jpayne@68 2630 8.8247e-21
jpayne@68 2631 """
jpayne@68 2632
jpayne@68 2633 @_wraps(prog="split", implicit="i", does_not_return_bedtool=SplitOutput)
jpayne@68 2634 def splitbed(self, *args, **kwargs) -> SplitOutput: # type: ignore
jpayne@68 2635 """
jpayne@68 2636 Wraps 'bedtools split'.
jpayne@68 2637
jpayne@68 2638 BedTool objects have long had a `split` method which splits intervals
jpayne@68 2639 according to a custom function. Now that BEDTools has a `split` tool,
jpayne@68 2640 the method name conflicts. To maintain backwards compatibility, the
jpayne@68 2641 method wrapping the BEDTools command is called `splitbed`.
jpayne@68 2642
jpayne@68 2643 Since this tool does not return a single BED file, the method parses
jpayne@68 2644 the output and returns a SplitOutput object, which includes an
jpayne@68 2645 attribute, `bedtools`, that is a list of BedTool objects created from
jpayne@68 2646 the split files.
jpayne@68 2647
jpayne@68 2648 To keep the working directory clean, you may want to consider using
jpayne@68 2649 `prefix=BedTool._tmp()` to get a temp file that will be deleted when
jpayne@68 2650 Python exits cleanly.
jpayne@68 2651
jpayne@68 2652 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2653 >>> s = a.splitbed(n=2, p="split")
jpayne@68 2654 >>> assert len(a) == 4, len(a)
jpayne@68 2655 >>> assert len(s.bedtools) == 2
jpayne@68 2656 >>> print(s.bedtools[0]) # doctest: +NORMALIZE_WHITESPACE
jpayne@68 2657 chr1 150 500 feature3 0 -
jpayne@68 2658 <BLANKLINE>
jpayne@68 2659 >>> print(s.bedtools[1]) # doctest: +NORMALIZE_WHITESPACE
jpayne@68 2660 chr1 100 200 feature2 0 +
jpayne@68 2661 chr1 1 100 feature1 0 +
jpayne@68 2662 chr1 900 950 feature4 0 +
jpayne@68 2663 <BLANKLINE>
jpayne@68 2664 """
jpayne@68 2665
jpayne@68 2666 @_wraps(prog="spacing", implicit="i")
jpayne@68 2667 def spacing(self, *args, **kwargs) -> BedTool: # type: ignore
jpayne@68 2668 """
jpayne@68 2669 Wraps `bedtools spacing`
jpayne@68 2670
jpayne@68 2671 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2672 >>> print(a.spacing()) # doctest: +NORMALIZE_WHITESPACE
jpayne@68 2673 chr1 1 100 feature1 0 + .
jpayne@68 2674 chr1 100 200 feature2 0 + 0
jpayne@68 2675 chr1 150 500 feature3 0 - -1
jpayne@68 2676 chr1 900 950 feature4 0 + 400
jpayne@68 2677 """
jpayne@68 2678
jpayne@68 2679 def count(self) -> int:
jpayne@68 2680 """
jpayne@68 2681 Count the number features in this BedTool.
jpayne@68 2682
jpayne@68 2683 Number of features in BED file. Does the same thing as len(self), which
jpayne@68 2684 actually just calls this method.
jpayne@68 2685
jpayne@68 2686 Only counts the actual features. Ignores any track lines, browser
jpayne@68 2687 lines, lines starting with a "#", or blank lines.
jpayne@68 2688
jpayne@68 2689 Example usage:
jpayne@68 2690
jpayne@68 2691 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 2692 >>> a.count()
jpayne@68 2693 4
jpayne@68 2694 """
jpayne@68 2695 if hasattr(self, "next") or hasattr(self, "__next__"):
jpayne@68 2696 return sum(1 for _ in self)
jpayne@68 2697 return sum(1 for _ in iter(self))
jpayne@68 2698
jpayne@68 2699 def print_sequence(self) -> str:
jpayne@68 2700 """
jpayne@68 2701 Print the sequence that was retrieved by BedTool.sequence.
jpayne@68 2702 See usage example in :meth:`BedTool.sequence`.
jpayne@68 2703
jpayne@68 2704 Returns:
jpayne@68 2705 str: The sequence as a string.
jpayne@68 2706
jpayne@68 2707 Raises:
jpayne@68 2708 ValueError: If the sequence has not been generated using .sequence(fasta).
jpayne@68 2709 """
jpayne@68 2710 if self.seqfn is None:
jpayne@68 2711 raise ValueError("Use .sequence(fasta) to get the sequence first")
jpayne@68 2712 f = open(self.seqfn)
jpayne@68 2713 s = f.read()
jpayne@68 2714 f.close()
jpayne@68 2715 return s
jpayne@68 2716
jpayne@68 2717 def save_seqs(self, fn:str) -> BedTool:
jpayne@68 2718 """
jpayne@68 2719 Save sequences, after calling BedTool.sequence.
jpayne@68 2720
jpayne@68 2721 In order to use this function, you need to have called
jpayne@68 2722 the :meth:`BedTool.sequence()` method.
jpayne@68 2723
jpayne@68 2724 A new BedTool object is returned which references the newly saved file.
jpayne@68 2725
jpayne@68 2726 Example usage:
jpayne@68 2727
jpayne@68 2728 >>> a = pybedtools.BedTool('''
jpayne@68 2729 ... chr1 1 10
jpayne@68 2730 ... chr1 50 55''', from_string=True)
jpayne@68 2731 >>> fasta = pybedtools.example_filename('test.fa')
jpayne@68 2732 >>> a = a.sequence(fi=fasta)
jpayne@68 2733 >>> print(open(a.seqfn).read())
jpayne@68 2734 >chr1:1-10
jpayne@68 2735 GATGAGTCT
jpayne@68 2736 >chr1:50-55
jpayne@68 2737 CCATC
jpayne@68 2738 <BLANKLINE>
jpayne@68 2739 >>> b = a.save_seqs('example.fa')
jpayne@68 2740 >>> assert open(b.fn).read() == open(a.fn).read()
jpayne@68 2741 >>> if os.path.exists('example.fa'):
jpayne@68 2742 ... os.unlink('example.fa')
jpayne@68 2743 """
jpayne@68 2744
jpayne@68 2745 if self.seqfn is None:
jpayne@68 2746 raise ValueError("Use .sequence(fasta) to get the sequence first")
jpayne@68 2747
jpayne@68 2748 with open(fn, "w") as fout:
jpayne@68 2749 if self.seqfn is None:
jpayne@68 2750 raise ValueError("Use .sequence(fasta) to get the sequence first")
jpayne@68 2751 with open(self.seqfn) as seqfile:
jpayne@68 2752 fout.write(seqfile.read())
jpayne@68 2753
jpayne@68 2754 new_bedtool = BedTool(self.fn)
jpayne@68 2755 new_bedtool.seqfn = fn
jpayne@68 2756 return new_bedtool
jpayne@68 2757
jpayne@68 2758 def randomstats(
jpayne@68 2759 self,
jpayne@68 2760 other: BedTool,
jpayne@68 2761 iterations: int,
jpayne@68 2762 new: bool = False,
jpayne@68 2763 genome_fn: Optional[str] = None,
jpayne@68 2764 include_distribution: bool = False,
jpayne@68 2765 **kwargs
jpayne@68 2766 ) -> dict[str, Any]:
jpayne@68 2767 """
jpayne@68 2768 Dictionary of results from many randomly shuffled intersections.
jpayne@68 2769
jpayne@68 2770 Sends args and kwargs to :meth:`BedTool.randomintersection` and
jpayne@68 2771 compiles results into a dictionary with useful stats. Requires
jpayne@68 2772 numpy.
jpayne@68 2773
jpayne@68 2774 If `include_distribution` is True, then the dictionary will include the
jpayne@68 2775 full distribution; otherwise, the distribution is deleted and cleaned
jpayne@68 2776 up to save on memory usage.
jpayne@68 2777
jpayne@68 2778 This is one possible way of assigning significance to overlaps between
jpayne@68 2779 two files. See, for example:
jpayne@68 2780
jpayne@68 2781 Negre N, Brown CD, Shah PK, Kheradpour P, Morrison CA, et al. 2010
jpayne@68 2782 A Comprehensive Map of Insulator Elements for the Drosophila
jpayne@68 2783 Genome. PLoS Genet 6(1): e1000814. doi:10.1371/journal.pgen.1000814
jpayne@68 2784
jpayne@68 2785 Example usage:
jpayne@68 2786
jpayne@68 2787 Make chromsizes a very small genome for this example:
jpayne@68 2788
jpayne@68 2789 >>> chromsizes = {'chr1':(1,1000)}
jpayne@68 2790 >>> a = pybedtools.example_bedtool('a.bed').set_chromsizes(chromsizes)
jpayne@68 2791 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68 2792 >>> try:
jpayne@68 2793 ... results = a.randomstats(b, 100, debug=True)
jpayne@68 2794 ... except ImportError:
jpayne@68 2795 ... pass
jpayne@68 2796
jpayne@68 2797 *results* is a dictionary that you can inspect.
jpayne@68 2798
jpayne@68 2799 (Note that the following examples are not run as part of the doctests
jpayne@68 2800 to avoid forcing users to install NumPy just to pass tests)
jpayne@68 2801
jpayne@68 2802 The actual overlap::
jpayne@68 2803
jpayne@68 2804 print(results['actual'])
jpayne@68 2805 3
jpayne@68 2806
jpayne@68 2807 The median of all randomized overlaps::
jpayne@68 2808
jpayne@68 2809 print(results['median randomized'])
jpayne@68 2810 2.0
jpayne@68 2811
jpayne@68 2812 The percentile of the actual overlap in the distribution of randomized
jpayne@68 2813 overlaps, which can be used to get an empirical p-value::
jpayne@68 2814
jpayne@68 2815 print(results['percentile'])
jpayne@68 2816 90.0
jpayne@68 2817 """
jpayne@68 2818 if ("intersect_kwargs" not in kwargs) or (kwargs["intersect_kwargs"] is None):
jpayne@68 2819 kwargs["intersect_kwargs"] = {"u": True}
jpayne@68 2820 try:
jpayne@68 2821 import numpy as np
jpayne@68 2822 except ImportError:
jpayne@68 2823 raise ImportError("Need to install NumPy for stats...")
jpayne@68 2824
jpayne@68 2825 def percentileofscore(a, score):
jpayne@68 2826 """
jpayne@68 2827 copied from scipy.stats.percentileofscore, to avoid dependency on
jpayne@68 2828 scipy.
jpayne@68 2829 """
jpayne@68 2830 a = np.array(a)
jpayne@68 2831 n = len(a)
jpayne@68 2832
jpayne@68 2833 if not (np.any(a == score)):
jpayne@68 2834 a = np.append(a, score)
jpayne@68 2835 a_len = np.array(list(range(len(a))))
jpayne@68 2836 else:
jpayne@68 2837 a_len = np.array(list(range(len(a)))) + 1.0
jpayne@68 2838
jpayne@68 2839 a = np.sort(a)
jpayne@68 2840 idx = tuple([a == score])
jpayne@68 2841 pct = (np.mean(a_len[idx]) / n) * 100.0
jpayne@68 2842 return pct
jpayne@68 2843
jpayne@68 2844 if isinstance(other, str):
jpayne@68 2845 other = BedTool(other)
jpayne@68 2846 else:
jpayne@68 2847 assert isinstance(
jpayne@68 2848 other, BedTool
jpayne@68 2849 ), "Either filename or another BedTool instance required"
jpayne@68 2850
jpayne@68 2851 # Actual (unshuffled) counts.
jpayne@68 2852 i_kwargs = kwargs["intersect_kwargs"]
jpayne@68 2853 actual = len(self.intersect(other, **i_kwargs))
jpayne@68 2854
jpayne@68 2855 # List of counts from randomly shuffled versions.
jpayne@68 2856 # Length of counts == *iterations*.
jpayne@68 2857
jpayne@68 2858 if not new:
jpayne@68 2859 distribution = self.randomintersection(
jpayne@68 2860 other, iterations=iterations, **kwargs
jpayne@68 2861 )
jpayne@68 2862 else:
jpayne@68 2863 # use new mechanism
jpayne@68 2864 if genome_fn is None:
jpayne@68 2865 raise ValueError(
jpayne@68 2866 "`genome_fn` must be provided if using the "
jpayne@68 2867 "new _randomintersection mechanism"
jpayne@68 2868 )
jpayne@68 2869 distribution = self._randomintersection(
jpayne@68 2870 other, iterations=iterations, genome_fn=genome_fn, **kwargs
jpayne@68 2871 )
jpayne@68 2872
jpayne@68 2873 distribution = np.array(list(distribution))
jpayne@68 2874
jpayne@68 2875 # Median of distribution
jpayne@68 2876 med_count = np.median(distribution)
jpayne@68 2877
jpayne@68 2878 n = float(len(distribution))
jpayne@68 2879
jpayne@68 2880 frac_above = sum(distribution > actual) / n
jpayne@68 2881 frac_below = sum(distribution < actual) / n
jpayne@68 2882
jpayne@68 2883 normalized = actual / med_count
jpayne@68 2884
jpayne@68 2885 lower_thresh = 2.5
jpayne@68 2886 upper_thresh = 97.5
jpayne@68 2887 lower, upper = np.percentile(distribution, [lower_thresh, upper_thresh])
jpayne@68 2888
jpayne@68 2889 actual_percentile = percentileofscore(distribution, actual)
jpayne@68 2890 d = {
jpayne@68 2891 "iterations": iterations,
jpayne@68 2892 "actual": actual,
jpayne@68 2893 "file_a": self.fn,
jpayne@68 2894 "file_b": other.fn,
jpayne@68 2895 self.fn: len(self),
jpayne@68 2896 other.fn: len(other),
jpayne@68 2897 "self": len(self),
jpayne@68 2898 "other": len(other),
jpayne@68 2899 "frac randomized above actual": frac_above,
jpayne@68 2900 "frac randomized below actual": frac_below,
jpayne@68 2901 "median randomized": med_count,
jpayne@68 2902 "normalized": normalized,
jpayne@68 2903 "percentile": actual_percentile,
jpayne@68 2904 "lower_%sth" % lower_thresh: lower,
jpayne@68 2905 "upper_%sth" % upper_thresh: upper,
jpayne@68 2906 }
jpayne@68 2907 if include_distribution:
jpayne@68 2908 d["distribution"] = distribution
jpayne@68 2909 else:
jpayne@68 2910 del distribution
jpayne@68 2911 return d
jpayne@68 2912
jpayne@68 2913 def random_op(self, *args, **kwargs):
jpayne@68 2914 """
jpayne@68 2915 For backwards compatibility; see BedTool.parallel_apply instead.
jpayne@68 2916 """
jpayne@68 2917 return self.parallel_apply(*args, **kwargs)
jpayne@68 2918
jpayne@68 2919 def parallel_apply(
jpayne@68 2920 self,
jpayne@68 2921 iterations: int,
jpayne@68 2922 func: Callable,
jpayne@68 2923 func_args: Iterable,
jpayne@68 2924 func_kwargs: dict,
jpayne@68 2925 processes: int=1,
jpayne@68 2926 _orig_pool: Optional[Pool] = None # type: ignore
jpayne@68 2927 ):
jpayne@68 2928 """
jpayne@68 2929 Generalized method for applying a function in parallel.
jpayne@68 2930
jpayne@68 2931 Typically used when having to do many random shufflings.
jpayne@68 2932
jpayne@68 2933 `func_args` and `func_kwargs` will be passed to `func` each time in
jpayne@68 2934 `iterations`, and these iterations will be split across `processes`
jpayne@68 2935 processes.
jpayne@68 2936
jpayne@68 2937 Notes on the function, `func`:
jpayne@68 2938
jpayne@68 2939 * the function should manually remove any tempfiles created. This
jpayne@68 2940 is because the BedTool.TEMPFILES list of auto-created tempfiles
jpayne@68 2941 does not share state across processes, so things will not get
jpayne@68 2942 cleaned up automatically as they do in a single-process
jpayne@68 2943 pybedtools session.
jpayne@68 2944
jpayne@68 2945 * this includes deleting any "chromsizes" or genome files --
jpayne@68 2946 generally it will be best to require a genome filename in
jpayne@68 2947 `func_kwargs` if you'll be using any BedTool methods that accept
jpayne@68 2948 the `g` kwarg.
jpayne@68 2949
jpayne@68 2950 * the function should be a module-level function (rather than a
jpayne@68 2951 class method) because class methods can't be pickled across
jpayne@68 2952 process boundaries
jpayne@68 2953
jpayne@68 2954 * the function can have any signature and have any return value
jpayne@68 2955
jpayne@68 2956 `_orig_pool` can be a previously-created multiprocessing.Pool instance;
jpayne@68 2957 otherwise, a new Pool will be created with `processes`
jpayne@68 2958 """
jpayne@68 2959 if processes == 1:
jpayne@68 2960 for _ in range(iterations):
jpayne@68 2961 yield func(*func_args, **func_kwargs)
jpayne@68 2962 raise StopIteration
jpayne@68 2963
jpayne@68 2964 if _orig_pool:
jpayne@68 2965 p = _orig_pool
jpayne@68 2966 else:
jpayne@68 2967 p = Pool(processes)
jpayne@68 2968 iterations_each = [iterations / processes] * processes
jpayne@68 2969 iterations_each[-1] += iterations % processes
jpayne@68 2970
jpayne@68 2971 # FYI some useful info on apply_async:
jpayne@68 2972 # http://stackoverflow.com/questions/8533318/
jpayne@68 2973 # python-multiprocessing-pool-when-to-use-apply-apply-async-or-map
jpayne@68 2974 #
jpayne@68 2975 # Here, we don't care about the order, and don't want the subprocesses
jpayne@68 2976 # to block.
jpayne@68 2977 results = [
jpayne@68 2978 p.apply_async(func, func_args, func_kwargs) for _ in range(iterations)
jpayne@68 2979 ]
jpayne@68 2980 for r in results:
jpayne@68 2981 yield r.get()
jpayne@68 2982 raise StopIteration
jpayne@68 2983
jpayne@68 2984 def random_jaccard(
jpayne@68 2985 self,
jpayne@68 2986 other: BedTool,
jpayne@68 2987 genome_fn: Optional[str] = None,
jpayne@68 2988 iterations: Optional[int] = None,
jpayne@68 2989 processes: int = 1,
jpayne@68 2990 _orig_pool: Optional[Pool] = None,
jpayne@68 2991 shuffle_kwargs: Optional[dict[str, Any]] = None,
jpayne@68 2992 jaccard_kwargs: Optional[dict[str, Any]] = None,
jpayne@68 2993 ) -> list:
jpayne@68 2994 """
jpayne@68 2995 Computes the naive Jaccard statistic (intersection divided by union).
jpayne@68 2996
jpayne@68 2997 .. note::
jpayne@68 2998
jpayne@68 2999 If you don't need the randomization functionality of this method,
jpayne@68 3000 you can use the simpler BedTool.jaccard method instead.
jpayne@68 3001
jpayne@68 3002 See Favorov et al. (2012) PLoS Comput Biol 8(5): e1002529 for more
jpayne@68 3003 info on the Jaccard statistic for intersections.
jpayne@68 3004
jpayne@68 3005 If `iterations` is None, then do not perform random shufflings.
jpayne@68 3006
jpayne@68 3007 If `iterations` is an integer, perform `iterations` random shufflings,
jpayne@68 3008 each time computing the Jaccard statistic to build an empirical
jpayne@68 3009 distribution. `genome_fn` will also be needed; optional `processes`
jpayne@68 3010 will split the iterations across multiple CPUs.
jpayne@68 3011
jpayne@68 3012 Returns a tuple of the observed Jaccard statistic and a list of the
jpayne@68 3013 randomized statistics (which will be an empty list if `iterations` was
jpayne@68 3014 None).
jpayne@68 3015 """
jpayne@68 3016 if shuffle_kwargs is None:
jpayne@68 3017 shuffle_kwargs = {}
jpayne@68 3018 if jaccard_kwargs is None:
jpayne@68 3019 jaccard_kwargs = {}
jpayne@68 3020 if not genome_fn:
jpayne@68 3021 raise ValueError("Need a genome filename in order to perform randomization")
jpayne@68 3022 return list(
jpayne@68 3023 self.parallel_apply(
jpayne@68 3024 iterations=iterations if iterations else 1,
jpayne@68 3025 func=pybedtools.stats.random_jaccard,
jpayne@68 3026 func_args=(self, other),
jpayne@68 3027 func_kwargs=dict(
jpayne@68 3028 genome_fn=genome_fn,
jpayne@68 3029 shuffle_kwargs=shuffle_kwargs,
jpayne@68 3030 jaccard_kwargs=jaccard_kwargs,
jpayne@68 3031 ),
jpayne@68 3032 processes=processes,
jpayne@68 3033 _orig_pool=_orig_pool,
jpayne@68 3034 )
jpayne@68 3035 )
jpayne@68 3036
jpayne@68 3037 def _randomintersection(
jpayne@68 3038 self,
jpayne@68 3039 other: BedTool,
jpayne@68 3040 iterations: int,
jpayne@68 3041 genome_fn: str,
jpayne@68 3042 intersect_kwargs: Optional[dict[str, Any]] = None,
jpayne@68 3043 _orig_pool:Optional[Pool] = None,
jpayne@68 3044 shuffle_kwargs: Optional[dict[str, Any]] = None,
jpayne@68 3045 processes: int = 1,
jpayne@68 3046 ):
jpayne@68 3047 """
jpayne@68 3048 Re-implementation of BedTool.randomintersection using the new
jpayne@68 3049 `random_op` method
jpayne@68 3050 """
jpayne@68 3051 if shuffle_kwargs is None:
jpayne@68 3052 shuffle_kwargs = {}
jpayne@68 3053 if intersect_kwargs is None:
jpayne@68 3054 intersect_kwargs = dict(u=True)
jpayne@68 3055 if not genome_fn:
jpayne@68 3056 raise ValueError("Need a genome filename in order to perform randomization")
jpayne@68 3057 return list(
jpayne@68 3058 self.parallel_apply(
jpayne@68 3059 iterations=iterations,
jpayne@68 3060 func=pybedtools.stats.random_intersection,
jpayne@68 3061 func_args=(self, other),
jpayne@68 3062 func_kwargs=dict(
jpayne@68 3063 genome_fn=genome_fn,
jpayne@68 3064 shuffle_kwargs=shuffle_kwargs,
jpayne@68 3065 intersect_kwargs=intersect_kwargs,
jpayne@68 3066 ),
jpayne@68 3067 processes=processes,
jpayne@68 3068 _orig_pool=_orig_pool,
jpayne@68 3069 )
jpayne@68 3070 )
jpayne@68 3071
jpayne@68 3072 def randomintersection_bp(
jpayne@68 3073 self,
jpayne@68 3074 other: BedTool,
jpayne@68 3075 iterations: int,
jpayne@68 3076 genome_fn: str,
jpayne@68 3077 intersect_kwargs: Optional[dict[str, Any]] = None,
jpayne@68 3078 shuffle_kwargs: Optional[dict[str, Any]] = None,
jpayne@68 3079 processes: int = 1,
jpayne@68 3080 _orig_pool:Optional[Pool] = None,
jpayne@68 3081 ) -> list[int]:
jpayne@68 3082 """
jpayne@68 3083 Like randomintersection, but return the bp overlap instead of the
jpayne@68 3084 number of intersecting intervals.
jpayne@68 3085 """
jpayne@68 3086 if shuffle_kwargs is None:
jpayne@68 3087 shuffle_kwargs = {}
jpayne@68 3088 if intersect_kwargs is None:
jpayne@68 3089 intersect_kwargs = {}
jpayne@68 3090 if not genome_fn:
jpayne@68 3091 raise ValueError("Need a genome filename in order to perform randomization")
jpayne@68 3092 return list(
jpayne@68 3093 self.parallel_apply(
jpayne@68 3094 iterations=iterations,
jpayne@68 3095 func=pybedtools.stats.random_intersection_bp,
jpayne@68 3096 func_args=(self, other),
jpayne@68 3097 func_kwargs=dict(
jpayne@68 3098 genome_fn=genome_fn,
jpayne@68 3099 shuffle_kwargs=shuffle_kwargs,
jpayne@68 3100 intersect_kwargs=intersect_kwargs,
jpayne@68 3101 ),
jpayne@68 3102 processes=processes,
jpayne@68 3103 _orig_pool=_orig_pool,
jpayne@68 3104 )
jpayne@68 3105 )
jpayne@68 3106
jpayne@68 3107 def randomintersection(
jpayne@68 3108 self,
jpayne@68 3109 other: BedTool,
jpayne@68 3110 iterations: int,
jpayne@68 3111 intersect_kwargs: Optional[dict[str, Any]] = None,
jpayne@68 3112 shuffle_kwargs: Optional[dict[str, Any]] = None,
jpayne@68 3113 debug: bool = False,
jpayne@68 3114 report_iterations: bool = False,
jpayne@68 3115 processes: Optional[int] = None,
jpayne@68 3116 _orig_processes: Optional[int] = None,
jpayne@68 3117 ) -> Iterator[int]:
jpayne@68 3118 """
jpayne@68 3119 Perform `iterations` shufflings, each time intersecting with `other`.
jpayne@68 3120
jpayne@68 3121 Returns a generator of integers where each integer is the number of
jpayne@68 3122 intersections of a shuffled file with *other*. This distribution can
jpayne@68 3123 be used in downstream analysis for things like empirical p-values.
jpayne@68 3124
jpayne@68 3125 *intersect_kwargs* and *shuffle_kwargs* are passed to self.intersect()
jpayne@68 3126 and self.shuffle() respectively. By default for intersect, u=True is
jpayne@68 3127 specified -- but s=True might be a useful option for strand-specific
jpayne@68 3128 work.
jpayne@68 3129
jpayne@68 3130 Useful kwargs for *shuffle_kwargs* are chrom, excl, or incl. If you
jpayne@68 3131 use the "seed" kwarg, that seed will be used *each* time shuffleBed is
jpayne@68 3132 called -- so all your randomization results will be identical for each
jpayne@68 3133 iteration. To get around this and to allow for tests, debug=True will
jpayne@68 3134 set the seed to the iteration number. You may also break up the
jpayne@68 3135 intersections across multiple processes with *processes* > 1.
jpayne@68 3136
jpayne@68 3137 Example usage:
jpayne@68 3138
jpayne@68 3139 >>> chromsizes = {'chr1':(0, 1000)}
jpayne@68 3140 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 3141 >>> a = a.set_chromsizes(chromsizes)
jpayne@68 3142 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68 3143 >>> results = a.randomintersection(b, 10, debug=True)
jpayne@68 3144 >>> print(list(results))
jpayne@68 3145 [1, 0, 1, 2, 4, 2, 2, 1, 2, 4]
jpayne@68 3146
jpayne@68 3147 """
jpayne@68 3148 if processes is not None:
jpayne@68 3149 p = Pool(processes)
jpayne@68 3150 iterations_each = [iterations // processes] * processes
jpayne@68 3151 iterations_each[-1] += iterations % processes
jpayne@68 3152 results = [
jpayne@68 3153 p.apply_async(
jpayne@68 3154 _call_randomintersect,
jpayne@68 3155 (self, other, it),
jpayne@68 3156 dict(
jpayne@68 3157 intersect_kwargs=intersect_kwargs,
jpayne@68 3158 shuffle_kwargs=shuffle_kwargs,
jpayne@68 3159 debug=debug,
jpayne@68 3160 report_iterations=report_iterations,
jpayne@68 3161 _orig_processes=processes,
jpayne@68 3162 ),
jpayne@68 3163 )
jpayne@68 3164 for it in iterations_each
jpayne@68 3165 ]
jpayne@68 3166 for r in results:
jpayne@68 3167 for value in r.get():
jpayne@68 3168 yield value
jpayne@68 3169 raise StopIteration
jpayne@68 3170
jpayne@68 3171 if shuffle_kwargs is None:
jpayne@68 3172 shuffle_kwargs = {}
jpayne@68 3173 if intersect_kwargs is None:
jpayne@68 3174 intersect_kwargs = {"u": True}
jpayne@68 3175
jpayne@68 3176 if "u" not in intersect_kwargs:
jpayne@68 3177 intersect_kwargs["u"] = True
jpayne@68 3178
jpayne@68 3179 resort = intersect_kwargs.get("sorted", False)
jpayne@68 3180
jpayne@68 3181 for i in range(iterations):
jpayne@68 3182 if debug:
jpayne@68 3183 shuffle_kwargs["seed"] = i
jpayne@68 3184 if report_iterations:
jpayne@68 3185 if _orig_processes > 1:
jpayne@68 3186 msg = "\rapprox (total across %s processes): %s" % (
jpayne@68 3187 _orig_processes,
jpayne@68 3188 i * _orig_processes,
jpayne@68 3189 )
jpayne@68 3190 else:
jpayne@68 3191 msg = "\r%s" % i
jpayne@68 3192 sys.stderr.write(msg)
jpayne@68 3193 sys.stderr.flush()
jpayne@68 3194
jpayne@68 3195 # Re-sort if sorted=True in kwargs
jpayne@68 3196 if resort:
jpayne@68 3197 tmp0 = self.shuffle(**shuffle_kwargs)
jpayne@68 3198 tmp = tmp0.sort()
jpayne@68 3199 else:
jpayne@68 3200 tmp = self.shuffle(**shuffle_kwargs)
jpayne@68 3201
jpayne@68 3202 tmp2 = tmp.intersect(other, stream=True, **intersect_kwargs)
jpayne@68 3203
jpayne@68 3204 yield len(tmp2)
jpayne@68 3205
jpayne@68 3206 # Close the open stdouts from subprocess.Popen calls. Note: doing
jpayne@68 3207 # this in self.__del__ doesn't fix the open file limit bug; it
jpayne@68 3208 # needs to be done here.
jpayne@68 3209 # if resort:
jpayne@68 3210 # tmp0.fn.close()
jpayne@68 3211 # tmp.fn.close()
jpayne@68 3212 tmp2.fn.close()
jpayne@68 3213 del tmp
jpayne@68 3214 del tmp2
jpayne@68 3215
jpayne@68 3216 @_log_to_history
jpayne@68 3217 def cat(self, *others: Iterable[str|Path|BedTool], **kwargs) -> BedTool:
jpayne@68 3218 """
jpayne@68 3219 Concatenate interval files together.
jpayne@68 3220
jpayne@68 3221 Concatenates two BedTool objects (or an object and a file) and does an
jpayne@68 3222 optional post-merge of the features.
jpayne@68 3223
jpayne@68 3224 *postmerge=True* by default; use *postmerge=False* if you want to keep
jpayne@68 3225 features separate.
jpayne@68 3226
jpayne@68 3227 *force_truncate=False* by default; *force_truncate=True* to truncate
jpayne@68 3228 all files to chrom, start, stop.
jpayne@68 3229
jpayne@68 3230 When *force_truncate=False* and *postmerge=False*, the output will
jpayne@68 3231 contain the smallest number of fields observed across all inputs. This
jpayne@68 3232 maintains compatibility with BEDTools programs, which assume constant
jpayne@68 3233 number of fields in all lines of a file.
jpayne@68 3234
jpayne@68 3235 Other kwargs are sent to :meth:`BedTool.merge` (and assuming that
jpayne@68 3236 *postmerge=True*).
jpayne@68 3237
jpayne@68 3238 Example usage:
jpayne@68 3239
jpayne@68 3240 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 3241 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68 3242 >>> print(a.cat(b)) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 3243 chr1 1 500
jpayne@68 3244 chr1 800 950
jpayne@68 3245 <BLANKLINE>
jpayne@68 3246 >>> print(a.cat(*[b,b],
jpayne@68 3247 ... postmerge=False)) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 3248 chr1 1 100 feature1 0 +
jpayne@68 3249 chr1 100 200 feature2 0 +
jpayne@68 3250 chr1 150 500 feature3 0 -
jpayne@68 3251 chr1 900 950 feature4 0 +
jpayne@68 3252 chr1 155 200 feature5 0 -
jpayne@68 3253 chr1 800 901 feature6 0 +
jpayne@68 3254 chr1 155 200 feature5 0 -
jpayne@68 3255 chr1 800 901 feature6 0 +
jpayne@68 3256 <BLANKLINE>
jpayne@68 3257 """
jpayne@68 3258 same_type = None
jpayne@68 3259 same_field_num = None
jpayne@68 3260 field_nums = set()
jpayne@68 3261
jpayne@68 3262 assert len(others) > 0, "You must specify at least one other bedfile!"
jpayne@68 3263 other_beds = []
jpayne@68 3264 for other in others:
jpayne@68 3265 if isinstance(other, (str, Path)):
jpayne@68 3266 other = BedTool(other)
jpayne@68 3267 else:
jpayne@68 3268 assert isinstance(
jpayne@68 3269 other, BedTool
jpayne@68 3270 ), "Either filename or another BedTool instance required"
jpayne@68 3271 other_beds.append(other)
jpayne@68 3272
jpayne@68 3273 # postmerge and force_truncate don't get passed on to merge
jpayne@68 3274 postmerge = kwargs.pop("postmerge", True)
jpayne@68 3275 force_truncate = kwargs.pop("force_truncate", False)
jpayne@68 3276 stream_merge = kwargs.get("stream", False)
jpayne@68 3277 if stream_merge and postmerge:
jpayne@68 3278 raise ValueError(
jpayne@68 3279 "The post-merge step in the `cat()` method "
jpayne@68 3280 "performs a sort, which uses stream=True. Using "
jpayne@68 3281 "stream=True for the merge as well will result in a "
jpayne@68 3282 "deadlock!"
jpayne@68 3283 )
jpayne@68 3284
jpayne@68 3285 # if filetypes and field counts are the same, don't truncate
jpayne@68 3286 if not force_truncate:
jpayne@68 3287 try:
jpayne@68 3288 filetypes = set(
jpayne@68 3289 [self.file_type] + [i.file_type for i in other_beds]
jpayne@68 3290 ).difference(["empty"])
jpayne@68 3291 field_nums = (
jpayne@68 3292 set([self.field_count()] + [i.field_count() for i in other_beds])
jpayne@68 3293 .difference([None])
jpayne@68 3294 .difference([0])
jpayne@68 3295 )
jpayne@68 3296 same_field_num = len(field_nums) == 1
jpayne@68 3297 same_type = len(set(filetypes)) == 1
jpayne@68 3298 except ValueError:
jpayne@68 3299 raise ValueError(
jpayne@68 3300 "Can't check filetype or field count -- "
jpayne@68 3301 "is one of the files you're merging a 'streaming' "
jpayne@68 3302 "BedTool? If so, use .saveas() to save to file first"
jpayne@68 3303 )
jpayne@68 3304
jpayne@68 3305 tmp = self._tmp()
jpayne@68 3306
jpayne@68 3307 if not force_truncate and same_type and same_field_num:
jpayne@68 3308 with open(tmp, "w") as TMP:
jpayne@68 3309 for f in self:
jpayne@68 3310 TMP.write(str(f))
jpayne@68 3311 for other in other_beds:
jpayne@68 3312 for f in other:
jpayne@68 3313 TMP.write(str(f))
jpayne@68 3314
jpayne@68 3315 # Types match, so we can use the min number of fields observed across
jpayne@68 3316 # all inputs
jpayne@68 3317 elif not force_truncate and same_type:
jpayne@68 3318 minfields = min(field_nums)
jpayne@68 3319 with open(tmp, "w") as TMP:
jpayne@68 3320 for f in self:
jpayne@68 3321 TMP.write("\t".join(f.fields[:minfields]) + "\n")
jpayne@68 3322 for other in other_beds:
jpayne@68 3323 for f in other:
jpayne@68 3324 TMP.write("\t".join(f.fields[:minfields]) + "\n")
jpayne@68 3325
jpayne@68 3326 # Otherwise, use the zero-based chrom/start/stop to create a BED3,
jpayne@68 3327 # which will work when catting a GFF and a BED together.
jpayne@68 3328 else:
jpayne@68 3329 with open(tmp, "w") as TMP:
jpayne@68 3330 for f in self:
jpayne@68 3331 TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end))
jpayne@68 3332 for other in other_beds:
jpayne@68 3333 for f in other:
jpayne@68 3334 TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end))
jpayne@68 3335
jpayne@68 3336 c = BedTool(tmp)
jpayne@68 3337 if postmerge:
jpayne@68 3338 d = c.sort(stream=True).merge(**kwargs)
jpayne@68 3339
jpayne@68 3340 # Explicitly delete -- needed when using multiprocessing
jpayne@68 3341 os.unlink(tmp)
jpayne@68 3342 return d
jpayne@68 3343 else:
jpayne@68 3344 return c
jpayne@68 3345
jpayne@68 3346 @_log_to_history
jpayne@68 3347 def saveas(self, fn: Optional[str] = None, trackline: Optional[str] = None, compressed: Optional[bool] = None) -> BedTool:
jpayne@68 3348 """
jpayne@68 3349 Make a copy of the BedTool.
jpayne@68 3350
jpayne@68 3351 Optionally adds `trackline` to the beginning of the file.
jpayne@68 3352
jpayne@68 3353 Optionally compresses output using gzip.
jpayne@68 3354
jpayne@68 3355 if the filename extension is .gz, or compressed=True,
jpayne@68 3356 the output is compressed using gzip
jpayne@68 3357
jpayne@68 3358 Returns a new BedTool for the newly saved file.
jpayne@68 3359
jpayne@68 3360 A newline is automatically added to the trackline if it does not
jpayne@68 3361 already have one.
jpayne@68 3362
jpayne@68 3363 Example usage:
jpayne@68 3364
jpayne@68 3365 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 3366 >>> b = a.saveas('other.bed')
jpayne@68 3367 >>> b.fn
jpayne@68 3368 'other.bed'
jpayne@68 3369 >>> print(b == a)
jpayne@68 3370 True
jpayne@68 3371
jpayne@68 3372 >>> b = a.saveas('other.bed', trackline="name='test run' color=0,55,0")
jpayne@68 3373 >>> open(b.fn).readline()
jpayne@68 3374 "name='test run' color=0,55,0\\n"
jpayne@68 3375 >>> if os.path.exists('other.bed'):
jpayne@68 3376 ... os.unlink('other.bed')
jpayne@68 3377 """
jpayne@68 3378 if fn is None:
jpayne@68 3379 fn = self._tmp()
jpayne@68 3380
jpayne@68 3381 # Default to compressed if extension is .gz
jpayne@68 3382 if compressed is None:
jpayne@68 3383 __, extension = os.path.splitext(fn)
jpayne@68 3384 if extension == ".gz":
jpayne@68 3385 compressed = True
jpayne@68 3386 else:
jpayne@68 3387 compressed = False
jpayne@68 3388
jpayne@68 3389 in_compressed = isinstance(self.fn, str) and isGZIP(self.fn)
jpayne@68 3390
jpayne@68 3391 fn = self._collapse(
jpayne@68 3392 self,
jpayne@68 3393 fn=fn,
jpayne@68 3394 trackline=trackline,
jpayne@68 3395 in_compressed=in_compressed,
jpayne@68 3396 out_compressed=compressed,
jpayne@68 3397 )
jpayne@68 3398 return BedTool(fn)
jpayne@68 3399
jpayne@68 3400 @_log_to_history
jpayne@68 3401 def moveto(self, fn: Optional[str]=None) -> BedTool:
jpayne@68 3402 """
jpayne@68 3403 Move to a new filename (can be much quicker than BedTool.saveas())
jpayne@68 3404
jpayne@68 3405 Move BED file to new filename, `fn`.
jpayne@68 3406
jpayne@68 3407 Returns a new BedTool for the new file.
jpayne@68 3408
jpayne@68 3409 Example usage:
jpayne@68 3410
jpayne@68 3411 >>> # make a copy so we don't mess up the example file
jpayne@68 3412 >>> a = pybedtools.example_bedtool('a.bed').saveas()
jpayne@68 3413 >>> a_contents = str(a)
jpayne@68 3414 >>> b = a.moveto('other.bed')
jpayne@68 3415 >>> b.fn
jpayne@68 3416 'other.bed'
jpayne@68 3417 >>> b == a_contents
jpayne@68 3418 True
jpayne@68 3419 """
jpayne@68 3420 if not isinstance(self.fn, str):
jpayne@68 3421 fn = self._collapse(self, fn=fn)
jpayne@68 3422 else:
jpayne@68 3423 shutil.move(self.fn, fn)
jpayne@68 3424 return BedTool(fn)
jpayne@68 3425
jpayne@68 3426 @_log_to_history
jpayne@68 3427 def random_subset(self, n: Optional[int] = None, f: Optional[float] = None, seed: Optional[float|int]=None) -> BedTool:
jpayne@68 3428 """
jpayne@68 3429 Return a BedTool containing a random subset.
jpayne@68 3430
jpayne@68 3431 NOTE: using `n` will be slower and use more memory than using `f`.
jpayne@68 3432
jpayne@68 3433 Parameters
jpayne@68 3434 ----------
jpayne@68 3435
jpayne@68 3436 n : int
jpayne@68 3437 Number of features to return. Only one of `n` or `f` can be provided.
jpayne@68 3438
jpayne@68 3439 f : float, 0 <= f <= 1
jpayne@68 3440 Fraction of features to return. Cannot be provided with `n`.
jpayne@68 3441
jpayne@68 3442 seed : float or int
jpayne@68 3443 Set random.seed
jpayne@68 3444
jpayne@68 3445 Example
jpayne@68 3446 -------
jpayne@68 3447
jpayne@68 3448 >>> seed = 0 # only for test, otherwise use None
jpayne@68 3449
jpayne@68 3450 `n` will always give the same number of returned features, but will be
jpayne@68 3451 slower since it is creating an index and then shuffling it.
jpayne@68 3452
jpayne@68 3453 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 3454 >>> b = a.random_subset(n=2)
jpayne@68 3455 >>> len(b)
jpayne@68 3456 2
jpayne@68 3457
jpayne@68 3458 Using a fraction `f` will be faster but depending on seed will result
jpayne@68 3459 in slightly different total numbers.
jpayne@68 3460
jpayne@68 3461 >>> a = pybedtools.example_bedtool('x.bam')
jpayne@68 3462 >>> len(a)
jpayne@68 3463 45593
jpayne@68 3464 >>> b = a.random_subset(f=0.4, seed=seed)
jpayne@68 3465 >>> len(b)
jpayne@68 3466 18316
jpayne@68 3467
jpayne@68 3468 Check that we have approximately the right fraction
jpayne@68 3469 >>> print('{0:.2f}'.format(len(b) / len(a)))
jpayne@68 3470 0.40
jpayne@68 3471
jpayne@68 3472 """
jpayne@68 3473 if ((n is None) and (f is None)) or ((n is not None) and (f is not None)):
jpayne@68 3474 raise ValueError("Exactly one of `n` or `f` must be provided")
jpayne@68 3475
jpayne@68 3476 tmpfn = self._tmp()
jpayne@68 3477 if seed is not None:
jpayne@68 3478 random.seed(seed)
jpayne@68 3479
jpayne@68 3480 if n:
jpayne@68 3481 idxs = list(range(len(self)))
jpayne@68 3482 random.shuffle(idxs)
jpayne@68 3483 idxs = idxs[:n]
jpayne@68 3484 with open(tmpfn, "w") as tmp:
jpayne@68 3485 for i, feature in enumerate(self):
jpayne@68 3486 if i in idxs:
jpayne@68 3487 tmp.write(str(feature))
jpayne@68 3488
jpayne@68 3489 elif f:
jpayne@68 3490 with open(tmpfn, "w") as tmp:
jpayne@68 3491 for i in self:
jpayne@68 3492 if random.random() <= f:
jpayne@68 3493 tmp.write(str(i))
jpayne@68 3494
jpayne@68 3495 return BedTool(tmpfn)
jpayne@68 3496
jpayne@68 3497 def total_coverage(self) -> int:
jpayne@68 3498 """
jpayne@68 3499 Return the total number of bases covered by this interval file.
jpayne@68 3500
jpayne@68 3501 Does a self.merge() first to remove potentially multiple-counting
jpayne@68 3502 bases.
jpayne@68 3503
jpayne@68 3504 Example usage:
jpayne@68 3505
jpayne@68 3506 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68 3507
jpayne@68 3508 This does a merge() first, so this is what the total coverage is
jpayne@68 3509 counting:
jpayne@68 3510
jpayne@68 3511 >>> print(a.merge()) #doctest: +NORMALIZE_WHITESPACE
jpayne@68 3512 chr1 1 500
jpayne@68 3513 chr1 900 950
jpayne@68 3514 <BLANKLINE>
jpayne@68 3515
jpayne@68 3516 >>> print(a.total_coverage())
jpayne@68 3517 549
jpayne@68 3518 """
jpayne@68 3519 b = self.merge()
jpayne@68 3520 total_bp = 0
jpayne@68 3521 for feature in b.features():
jpayne@68 3522 total_bp += len(feature)
jpayne@68 3523 return total_bp
jpayne@68 3524
jpayne@68 3525 @_log_to_history
jpayne@68 3526 def with_attrs(self, **kwargs) -> BedTool:
jpayne@68 3527 """
jpayne@68 3528 Helper method for adding attributes in the middle of a pipeline.
jpayne@68 3529
jpayne@68 3530 Given arbitrary keyword arguments, turns the keys and values into
jpayne@68 3531 attributes. Useful for labeling BedTools at creation time.
jpayne@68 3532
jpayne@68 3533 Example usage:
jpayne@68 3534
jpayne@68 3535 >>> # add a "label" attribute to each BedTool
jpayne@68 3536 >>> a = pybedtools.example_bedtool('a.bed')\
jpayne@68 3537 .with_attrs(label='transcription factor 1')
jpayne@68 3538 >>> b = pybedtools.example_bedtool('b.bed')\
jpayne@68 3539 .with_attrs(label='transcription factor 2')
jpayne@68 3540 >>> for i in [a, b]:
jpayne@68 3541 ... print('{0} features for {1}'.format(i.count(), i.label))
jpayne@68 3542 4 features for transcription factor 1
jpayne@68 3543 2 features for transcription factor 2
jpayne@68 3544
jpayne@68 3545 """
jpayne@68 3546 for key, value in list(kwargs.items()):
jpayne@68 3547 setattr(self, key, value)
jpayne@68 3548 return self
jpayne@68 3549
jpayne@68 3550 def as_intervalfile(self) -> IntervalFile:
jpayne@68 3551 """
jpayne@68 3552 Returns an IntervalFile of this BedTool for low-level interface.
jpayne@68 3553 """
jpayne@68 3554 if not isinstance(self.fn, str):
jpayne@68 3555 fn = self._collapse(self.fn)
jpayne@68 3556 else:
jpayne@68 3557 fn = self.fn
jpayne@68 3558 return IntervalFile(fn)
jpayne@68 3559
jpayne@68 3560 def liftover(self, chainfile: str, unmapped: Optional[str] = None, liftover_args: str = "") -> BedTool:
jpayne@68 3561 """
jpayne@68 3562 Returns a new BedTool of the liftedOver features, saving the unmapped
jpayne@68 3563 ones as `unmapped`. If `unmapped` is None, then discards the unmapped
jpayne@68 3564 features.
jpayne@68 3565
jpayne@68 3566 `liftover_args` is a string of additional args that is passed,
jpayne@68 3567 verbatim, to liftOver.
jpayne@68 3568
jpayne@68 3569 Needs `liftOver` from UCSC to be on the path and a `chainfile`
jpayne@68 3570 downloaded from UCSC.
jpayne@68 3571 """
jpayne@68 3572 result = BedTool._tmp()
jpayne@68 3573 if unmapped is None:
jpayne@68 3574 unmapped = BedTool._tmp()
jpayne@68 3575 cmds = ["liftOver", liftover_args, self.fn, chainfile, result, unmapped]
jpayne@68 3576 os.system(" ".join(cmds))
jpayne@68 3577 return BedTool(result)
jpayne@68 3578
jpayne@68 3579 def absolute_distance(self, other: BedTool, closest_kwargs: Optional[dict[str, Any]]=None, use_midpoints: bool=False) -> Iterator[int]:
jpayne@68 3580 """
jpayne@68 3581 Returns an iterator of the *absolute* distances between features in
jpayne@68 3582 self and other.
jpayne@68 3583
jpayne@68 3584 If `use_midpoints` is True, then only use the midpoints of features
jpayne@68 3585 (which will return values where features are overlapping). Otherwise,
jpayne@68 3586 when features overlap the value will always be zero.
jpayne@68 3587
jpayne@68 3588 `closest_kwargs` are passed to self.closest(); either `d` or
jpayne@68 3589 'D` are required in order to get back distance values (`d=True` is
jpayne@68 3590 default)
jpayne@68 3591 """
jpayne@68 3592 from .featurefuncs import midpoint
jpayne@68 3593
jpayne@68 3594 if closest_kwargs is None:
jpayne@68 3595 closest_kwargs = {"d": True}
jpayne@68 3596
jpayne@68 3597 if "D" not in closest_kwargs:
jpayne@68 3598 closest_kwargs.update(dict(d=True))
jpayne@68 3599
jpayne@68 3600 if use_midpoints:
jpayne@68 3601 mid_self = self.each(midpoint).saveas()
jpayne@68 3602 mid_other = other.each(midpoint).saveas()
jpayne@68 3603 c = mid_self.closest(mid_other, stream=True, **closest_kwargs)
jpayne@68 3604 else:
jpayne@68 3605 c = self.closest(other, stream=True, **closest_kwargs)
jpayne@68 3606 for i in c:
jpayne@68 3607 yield int(i[-1])
jpayne@68 3608
jpayne@68 3609 def relative_distance(self, other: BedTool, genome:Optional[dict|str] =None, g: Optional[str]=None) -> Iterator[float]:
jpayne@68 3610 """
jpayne@68 3611 Returns an iterator of relative distances between features in self and
jpayne@68 3612 other.
jpayne@68 3613
jpayne@68 3614 First computes the midpoints of self and other, then returns distances
jpayne@68 3615 of each feature in `other` relative to the distance between `self`
jpayne@68 3616 features.
jpayne@68 3617
jpayne@68 3618 Requires either `genome` (dictionary of chromsizes or assembly name) or
jpayne@68 3619 `g` (filename of chromsizes file).
jpayne@68 3620 """
jpayne@68 3621 g_dict = {}
jpayne@68 3622 if (genome is None) and (g is None):
jpayne@68 3623 raise ValueError("Need either `genome` or `g` arg for relative distance")
jpayne@68 3624 elif genome and g:
jpayne@68 3625 raise ValueError("Please specify only one of `genome` or `g`")
jpayne@68 3626 elif genome:
jpayne@68 3627 g_dict = dict(genome=genome)
jpayne@68 3628 elif g:
jpayne@68 3629 g_dict = dict(g=g)
jpayne@68 3630
jpayne@68 3631 from .featurefuncs import midpoint
jpayne@68 3632
jpayne@68 3633 # This gets the space between features in self.
jpayne@68 3634 c = self.each(midpoint).complement(**g_dict)
jpayne@68 3635
jpayne@68 3636 hits = c.intersect(other, wao=True, stream=True) # TODO: should this be other or mid_other?
jpayne@68 3637 for i in hits:
jpayne@68 3638 yield float(i[-1]) / len(i)
jpayne@68 3639
jpayne@68 3640 def colormap_normalize(self,
jpayne@68 3641 vmin: Optional[float|int]=None,
jpayne@68 3642 vmax: Optional[float|int]=None,
jpayne@68 3643 percentile: bool=False,
jpayne@68 3644 log: bool=False) -> mcolors.LogNorm|mcolors.Normalize:
jpayne@68 3645 """
jpayne@68 3646 Returns a normalization instance for use by featurefuncs.add_color().
jpayne@68 3647
jpayne@68 3648 Parameters
jpayne@68 3649 ----------
jpayne@68 3650 vmin, vmax : float, int, or None
jpayne@68 3651 `vmin` and `vmax` set the colormap bounds; if None then
jpayne@68 3652 these will be determined from the scores in the BED file.
jpayne@68 3653
jpayne@68 3654 log : bool
jpayne@68 3655 If True, put the scores on a log scale; of course be careful
jpayne@68 3656 if you have negative scores
jpayne@68 3657
jpayne@68 3658 percentile : bool
jpayne@68 3659 If True, interpret vmin and vmax as a percentile in the range
jpayne@68 3660 [0,100] rather than absolute values.
jpayne@68 3661 """
jpayne@68 3662 field_count = self.field_count()
jpayne@68 3663 if (self.file_type != "bed") or (field_count < 5):
jpayne@68 3664 raise ValueError("colorizing only works for BED files with score " "fields")
jpayne@68 3665 try:
jpayne@68 3666 import matplotlib.colors as mcolors
jpayne@68 3667 except ImportError:
jpayne@68 3668 raise ImportError("matplotlib.colors must be installed to use colormap_normalize")
jpayne@68 3669
jpayne@68 3670 try:
jpayne@68 3671 import numpy as np
jpayne@68 3672 except ImportError:
jpayne@68 3673 raise ImportError("numpy must be installed to use colormap_normalize")
jpayne@68 3674
jpayne@68 3675 if log:
jpayne@68 3676 norm = mcolors.LogNorm()
jpayne@68 3677 else:
jpayne@68 3678 norm = mcolors.Normalize()
jpayne@68 3679
jpayne@68 3680 scores = np.array([i.score for i in self], dtype=float)
jpayne@68 3681 scores = scores[np.isfinite(scores)]
jpayne@68 3682 norm.autoscale(scores)
jpayne@68 3683
jpayne@68 3684 if vmin is not None:
jpayne@68 3685 if percentile:
jpayne@68 3686 vmin = float(np.percentile(scores, vmin))
jpayne@68 3687 norm.vmin = vmin
jpayne@68 3688 if vmax is not None:
jpayne@68 3689 if percentile:
jpayne@68 3690 vmax = float(np.percentile(scores, vmax))
jpayne@68 3691 norm.vmax = vmax
jpayne@68 3692
jpayne@68 3693 return norm
jpayne@68 3694
jpayne@68 3695 def at(self, inds: list[int]) -> BedTool:
jpayne@68 3696 """
jpayne@68 3697 Returns a new BedTool with only intervals at lines `inds`
jpayne@68 3698
jpayne@68 3699 Parameters
jpayne@68 3700 ----------
jpayne@68 3701 inds : List[int]
jpayne@68 3702 List of line numbers
jpayne@68 3703
jpayne@68 3704 Returns
jpayne@68 3705 -------
jpayne@68 3706 BedTool
jpayne@68 3707 New BedTool with only intervals at `inds`
jpayne@68 3708 """
jpayne@68 3709 length = len(inds)
jpayne@68 3710
jpayne@68 3711 def _gen():
jpayne@68 3712 k = 0
jpayne@68 3713 for i, feature in enumerate(self):
jpayne@68 3714 if i == inds[k]:
jpayne@68 3715 yield feature
jpayne@68 3716 k += 1
jpayne@68 3717 if k == length:
jpayne@68 3718 break
jpayne@68 3719
jpayne@68 3720 return BedTool(_gen()).saveas()
jpayne@68 3721
jpayne@68 3722 def to_dataframe(self, disable_auto_names: bool = False, *args, **kwargs) -> pd.DataFrame:
jpayne@68 3723 """
jpayne@68 3724 Create a pandas.DataFrame, passing args and kwargs to pandas.read_csv
jpayne@68 3725 The separator kwarg `sep` is given a tab `\\t` as value by default.
jpayne@68 3726
jpayne@68 3727 Parameters
jpayne@68 3728 ----------
jpayne@68 3729 disable_auto_names : bool
jpayne@68 3730 By default, the created dataframe fills in column names
jpayne@68 3731 automatically according to the detected filetype (e.g., "chrom",
jpayne@68 3732 "start", "end" for a BED3 file). Set this argument to True to
jpayne@68 3733 disable this behavior.
jpayne@68 3734 """
jpayne@68 3735 # Complain if BAM or if not a file
jpayne@68 3736 if self._isbam:
jpayne@68 3737 raise ValueError("BAM not supported for converting to DataFrame")
jpayne@68 3738 if not isinstance(self.fn, str):
jpayne@68 3739 raise ValueError("use .saveas() to make sure self.fn is a file")
jpayne@68 3740
jpayne@68 3741 try:
jpayne@68 3742 import pandas
jpayne@68 3743 except ImportError:
jpayne@68 3744 raise ImportError("pandas must be installed to convert to pandas.DataFrame")
jpayne@68 3745 # Otherwise we're good:
jpayne@68 3746 names = kwargs.get("names", None)
jpayne@68 3747 if names is None and not disable_auto_names:
jpayne@68 3748 try:
jpayne@68 3749 _names = settings._column_names[self.file_type][: self.field_count()]
jpayne@68 3750 if len(_names) < self.field_count():
jpayne@68 3751 warn(
jpayne@68 3752 "Default names for filetype %s are:\n%s\nbut file has "
jpayne@68 3753 "%s fields; you can supply custom names with the "
jpayne@68 3754 "`names` kwarg" % (self.file_type, _names, self.field_count())
jpayne@68 3755 )
jpayne@68 3756 _names = None
jpayne@68 3757 except KeyError:
jpayne@68 3758 _names = None
jpayne@68 3759 kwargs["names"] = _names
jpayne@68 3760
jpayne@68 3761 if os.path.isfile(self.fn) and os.path.getsize(self.fn) > 0:
jpayne@68 3762 return pandas.read_csv(self.fn, *args, sep="\t", **kwargs) # type: ignore
jpayne@68 3763 else:
jpayne@68 3764 return pandas.DataFrame()
jpayne@68 3765
jpayne@68 3766 def tail(self, lines:int = 10, as_string: bool = False) -> Optional[str]:
jpayne@68 3767 """
jpayne@68 3768 Like `head`, but prints last 10 lines of the file by default.
jpayne@68 3769
jpayne@68 3770 To avoid consuming iterables, this only works with file-based, non-BAM
jpayne@68 3771 BedTool objects.
jpayne@68 3772
jpayne@68 3773 Use `as_string=True` to return a string.
jpayne@68 3774 """
jpayne@68 3775 if self._isbam:
jpayne@68 3776 raise ValueError("tail() not yet implemented for BAM files")
jpayne@68 3777 if not isinstance(self.fn, str):
jpayne@68 3778 raise ValueError(
jpayne@68 3779 "tail() not implemented for non-file-based "
jpayne@68 3780 "BedTool objects. Please use saveas() first."
jpayne@68 3781 )
jpayne@68 3782 bufsize = 8192
jpayne@68 3783 offset = bufsize
jpayne@68 3784 f = open(self.fn, "rb")
jpayne@68 3785
jpayne@68 3786 # whence=2 arg means relative to end (i.e., go to the end)
jpayne@68 3787 f.seek(0, 2)
jpayne@68 3788 file_size = f.tell()
jpayne@68 3789 data = []
jpayne@68 3790 while True:
jpayne@68 3791 if file_size < bufsize:
jpayne@68 3792 offset = file_size
jpayne@68 3793 f.seek(-offset, 2)
jpayne@68 3794 chunk = f.read(offset)
jpayne@68 3795 data.extend(chunk.splitlines(True))
jpayne@68 3796 if len(data) >= lines or offset == file_size:
jpayne@68 3797 break
jpayne@68 3798 offset += bufsize
jpayne@68 3799
jpayne@68 3800 result = "".join([i.decode() for i in data[-lines:]])
jpayne@68 3801 if as_string:
jpayne@68 3802 return result
jpayne@68 3803 else:
jpayne@68 3804 print(result)
jpayne@68 3805
jpayne@68 3806
jpayne@68 3807 class BAM(object):
jpayne@68 3808 def __init__(self, stream):
jpayne@68 3809 """
jpayne@68 3810 Wraps pysam.Samfile so that it yields pybedtools.Interval objects when
jpayne@68 3811 iterated over.
jpayne@68 3812
jpayne@68 3813 The pysam.Samfile can be accessed via the .pysam_bamfile attribute.
jpayne@68 3814 """
jpayne@68 3815 self.stream = stream
jpayne@68 3816 if not isinstance(self.stream, str):
jpayne@68 3817 raise ValueError("Only files are supported, not streams")
jpayne@68 3818 self.pysam_bamfile = pysam.Samfile(self.stream)
jpayne@68 3819
jpayne@68 3820 def _aligned_segment_to_interval(self, r):
jpayne@68 3821
jpayne@68 3822 if r.rname >= 0:
jpayne@68 3823 rname = self.pysam_bamfile.get_reference_name(r.rname)
jpayne@68 3824 else:
jpayne@68 3825 rname = "*"
jpayne@68 3826
jpayne@68 3827 if r.rnext >= 0:
jpayne@68 3828 if r.rnext == r.rname:
jpayne@68 3829 rnext = "="
jpayne@68 3830 else:
jpayne@68 3831 rnext = self.pysam_bamfile.get_reference_name(r.rnext)
jpayne@68 3832 else:
jpayne@68 3833 rnext = "*"
jpayne@68 3834
jpayne@68 3835 # SAM spec says if unavailable should be set to 0. Pysam sets to -1.
jpayne@68 3836
jpayne@68 3837 if r.pnext <= 0:
jpayne@68 3838 pnext = "0"
jpayne@68 3839 else:
jpayne@68 3840 # +1 here because cbedtools.pyx expects SAM -- which is 1-based --
jpayne@68 3841 # but pysam uses 0-based.
jpayne@68 3842 pnext = str(r.pnext + 1)
jpayne@68 3843
jpayne@68 3844 if r.cigarstring:
jpayne@68 3845 cigarstring = r.cigarstring
jpayne@68 3846 else:
jpayne@68 3847 cigarstring = "*"
jpayne@68 3848
jpayne@68 3849 # Rudimentary support.
jpayne@68 3850 # TODO: remove when refactoring to new BAM iterating
jpayne@68 3851 tags = []
jpayne@68 3852 for k, v in r.tags:
jpayne@68 3853 if isinstance(v, int):
jpayne@68 3854 t = "i"
jpayne@68 3855 elif isinstance(v, float):
jpayne@68 3856 t = "f"
jpayne@68 3857 else:
jpayne@68 3858 t = "Z"
jpayne@68 3859 tags.append("{0}:{1}:{2}".format(k, t, v))
jpayne@68 3860
jpayne@68 3861 tags = "\t".join(tags)
jpayne@68 3862
jpayne@68 3863 if r.seq:
jpayne@68 3864 seq = r.seq
jpayne@68 3865 else:
jpayne@68 3866 seq = "*"
jpayne@68 3867
jpayne@68 3868 if r.qual:
jpayne@68 3869 qual = r.qual
jpayne@68 3870 else:
jpayne@68 3871 qual = "*"
jpayne@68 3872
jpayne@68 3873 fields = [
jpayne@68 3874 r.qname,
jpayne@68 3875 str(r.flag),
jpayne@68 3876 rname,
jpayne@68 3877 # +1 here because cbedtools.pyx expects SAM -- which is 1-based --
jpayne@68 3878 # but pysam uses 0-based.
jpayne@68 3879 str(r.pos + 1),
jpayne@68 3880 str(r.mapq),
jpayne@68 3881 cigarstring,
jpayne@68 3882 rnext,
jpayne@68 3883 pnext,
jpayne@68 3884 str(r.tlen),
jpayne@68 3885 seq,
jpayne@68 3886 qual,
jpayne@68 3887 ]
jpayne@68 3888 if tags:
jpayne@68 3889 fields.append(tags)
jpayne@68 3890
jpayne@68 3891 if None in fields:
jpayne@68 3892 raise ValueError("Found 'None' in fields: %s" % fields)
jpayne@68 3893 return create_interval_from_list(fields)
jpayne@68 3894
jpayne@68 3895 def __iter__(self):
jpayne@68 3896 return self
jpayne@68 3897
jpayne@68 3898 # TODO: this is PAINFUL but it ensures that existing tests work. Once all
jpayne@68 3899 # tests work, the new behavior will be to yield pysam AlignedSegment
jpayne@68 3900 # objects directly.
jpayne@68 3901 def __next__(self):
jpayne@68 3902 return self._aligned_segment_to_interval(next(self.pysam_bamfile))
jpayne@68 3903
jpayne@68 3904 def next(self):
jpayne@68 3905 return self.__next__()
jpayne@68 3906
jpayne@68 3907
jpayne@68 3908 class History(list):
jpayne@68 3909 def __init__(self):
jpayne@68 3910 """
jpayne@68 3911 Represents one or many HistorySteps. Mostly used for nicely formatting
jpayne@68 3912 a series of HistorySteps.
jpayne@68 3913 """
jpayne@68 3914 list.__init__(self)
jpayne@68 3915
jpayne@68 3916
jpayne@68 3917 class HistoryStep(object):
jpayne@68 3918 def __init__(self, method, args, kwargs, bedtool_instance, parent_tag, result_tag):
jpayne@68 3919 """
jpayne@68 3920 Class to represent one step in the history.
jpayne@68 3921
jpayne@68 3922 Mostly used for its __repr__ method, to try and exactly replicate code
jpayne@68 3923 that can be pasted to re-do history steps
jpayne@68 3924 """
jpayne@68 3925 try:
jpayne@68 3926 self.method = method._name
jpayne@68 3927 except AttributeError:
jpayne@68 3928 self.method = method.__name__
jpayne@68 3929
jpayne@68 3930 self.args = args
jpayne@68 3931 self.kwargs = kwargs
jpayne@68 3932 self.fn = bedtool_instance.fn
jpayne@68 3933 self.parent_tag = parent_tag
jpayne@68 3934 self.result_tag = result_tag
jpayne@68 3935
jpayne@68 3936 def _clean_arg(self, arg: str|BedTool) -> str:
jpayne@68 3937 """
jpayne@68 3938 Wrap strings in quotes and convert bedtool instances to filenames.
jpayne@68 3939 """
jpayne@68 3940 if isinstance(arg, BedTool):
jpayne@68 3941 arg = arg.fn
jpayne@68 3942 if isinstance(arg, str):
jpayne@68 3943 arg = '"%s"' % arg
jpayne@68 3944 return arg
jpayne@68 3945
jpayne@68 3946 def __repr__(self):
jpayne@68 3947 # Still not sure whether to use pybedtools.bedtool() or bedtool()
jpayne@68 3948 s = ""
jpayne@68 3949 s += "<HistoryStep> "
jpayne@68 3950 if os.path.exists(self.fn):
jpayne@68 3951 s += 'BedTool("%(fn)s").%(method)s(%%s%%s)' % self.__dict__
jpayne@68 3952 else:
jpayne@68 3953 s += 'BedTool("MISSING FILE: %(fn)s")' % self.__dict__
jpayne@68 3954 s += ".%(method)s(%%s%%s)" % self.__dict__
jpayne@68 3955
jpayne@68 3956 # Format args and kwargs
jpayne@68 3957 args_string = ",".join(map(self._clean_arg, self.args))
jpayne@68 3958 kwargs_string = ",".join(
jpayne@68 3959 ["%s=%s" % (i[0], self._clean_arg(i[1])) for i in list(self.kwargs.items())]
jpayne@68 3960 )
jpayne@68 3961 # stick a comma on the end if there's something here
jpayne@68 3962 if len(args_string) > 0:
jpayne@68 3963 args_string += ", "
jpayne@68 3964
jpayne@68 3965 s = s % (args_string, kwargs_string)
jpayne@68 3966 s += ", parent tag: %s" % self.parent_tag
jpayne@68 3967 s += ", result tag: %s" % self.result_tag
jpayne@68 3968 return s
jpayne@68 3969
jpayne@68 3970
jpayne@68 3971 def example_bedtool(fn):
jpayne@68 3972 """
jpayne@68 3973 Return a bedtool using a bed file from the pybedtools examples directory.
jpayne@68 3974 Use :func:`list_example_files` to see a list of files that are included.
jpayne@68 3975 """
jpayne@68 3976 fn = os.path.join(filenames.data_dir(), fn)
jpayne@68 3977 if not os.path.exists(fn):
jpayne@68 3978 msg = "%s does not exist" % fn
jpayne@68 3979 raise FileNotFoundError(msg)
jpayne@68 3980 return BedTool(fn)
jpayne@68 3981
jpayne@68 3982
jpayne@68 3983 if __name__ == "__main__":
jpayne@68 3984 import doctest
jpayne@68 3985
jpayne@68 3986 doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)