csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/bedtool.py annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/bedtool.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children

rev	line source
jpayne@68	1 from __future__ import annotations
jpayne@68	2 import tempfile
jpayne@68	3 from textwrap import dedent
jpayne@68	4 import shutil
jpayne@68	5 import subprocess
jpayne@68	6 import operator
jpayne@68	7 import os
jpayne@68	8 import sys
jpayne@68	9 import random
jpayne@68	10 import string
jpayne@68	11 import pprint
jpayne@68	12 from itertools import islice
jpayne@68	13 from multiprocessing import Pool
jpayne@68	14 import gzip
jpayne@68	15 from typing import Any, Callable, Iterable, Iterator, Literal, Optional, TYPE_CHECKING, cast
jpayne@68	16 import pysam
jpayne@68	17 from warnings import warn
jpayne@68	18 import pathlib
jpayne@68	19 from pathlib import Path
jpayne@68	20
jpayne@68	21 from .helpers import (
jpayne@68	22 get_tempdir,
jpayne@68	23 _tags,
jpayne@68	24 call_bedtools,
jpayne@68	25 _flatten_list,
jpayne@68	26 _check_sequence_stderr,
jpayne@68	27 isBAM,
jpayne@68	28 isBGZIP,
jpayne@68	29 isGZIP,
jpayne@68	30 BEDToolsError,
jpayne@68	31 pybedtoolsError,
jpayne@68	32 _call_randomintersect,
jpayne@68	33 SplitOutput,
jpayne@68	34 FisherOutput,
jpayne@68	35 )
jpayne@68	36 from . import helpers
jpayne@68	37 from .cbedtools import (
jpayne@68	38 IntervalFile,
jpayne@68	39 IntervalIterator,
jpayne@68	40 Interval,
jpayne@68	41 create_interval_from_list,
jpayne@68	42 BedToolsFileError,
jpayne@68	43 )
jpayne@68	44 import pybedtools
jpayne@68	45 from . import settings
jpayne@68	46 from . import filenames
jpayne@68	47
jpayne@68	48 if TYPE_CHECKING:
jpayne@68	49 import pandas as pd
jpayne@68	50 import matplotlib.colors as mcolors
jpayne@68	51
jpayne@68	52 _implicit_registry = {}
jpayne@68	53 _other_registry = {}
jpayne@68	54 _bam_registry = {}
jpayne@68	55
jpayne@68	56
jpayne@68	57 def _jaccard_output_to_dict(s, **kwargs) -> dict:
jpayne@68	58 """
jpayne@68	59 jaccard method doesn't return an interval file, rather, it returns a short
jpayne@68	60 summary of results. Here, we simply parse it into a dict for convenience.
jpayne@68	61 """
jpayne@68	62 if isinstance(s, str):
jpayne@68	63 _s = open(s).read()
jpayne@68	64 elif hasattr(s, "next") or hasattr(s, "__next__"):
jpayne@68	65 _s = "".join([i for i in s])
jpayne@68	66 else:
jpayne@68	67 raise ValueError("Unexpected object %r" % s)
jpayne@68	68 header, data = _s.splitlines()
jpayne@68	69 header = header.split()
jpayne@68	70 data = data.split()
jpayne@68	71 data[0] = int(data[0])
jpayne@68	72 data[1] = int(data[1])
jpayne@68	73 data[2] = float(data[2])
jpayne@68	74 data[3] = int(data[3])
jpayne@68	75 return dict(list(zip(header, data)))
jpayne@68	76
jpayne@68	77
jpayne@68	78 def _reldist_output_handler(s, **kwargs):
jpayne@68	79 """
jpayne@68	80 reldist, if called with -detail, returns a valid BED file with the relative
jpayne@68	81 distance as the last field. In that case, return the BedTool immediately.
jpayne@68	82 If not -detail, then the results are a table, in which case here we parse
jpayne@68	83 into a dict for convenience.
jpayne@68	84 """
jpayne@68	85 if "detail" in kwargs:
jpayne@68	86 return BedTool(s)
jpayne@68	87 if isinstance(s, str):
jpayne@68	88 iterable = open(s)
jpayne@68	89 if hasattr(s, "next"):
jpayne@68	90 iterable = s
jpayne@68	91 header = next(iterable).split()
jpayne@68	92 results = {}
jpayne@68	93 for h in header:
jpayne@68	94 results[h] = []
jpayne@68	95 for i in iterable:
jpayne@68	96 reldist, count, total, fraction = i.split()
jpayne@68	97 data = [float(reldist), int(count), int(total), float(fraction)]
jpayne@68	98 for h, d in zip(header, data):
jpayne@68	99 results[h].append(d)
jpayne@68	100 return results
jpayne@68	101
jpayne@68	102
jpayne@68	103 def _wraps(
jpayne@68	104 prog: Optional[str] = None,
jpayne@68	105 implicit: Optional[str] = None,
jpayne@68	106 bam: Optional[str] = None,
jpayne@68	107 other: Optional[str] = None,
jpayne@68	108 uses_genome: bool = False,
jpayne@68	109 make_tempfile_for: Optional[str] = None,
jpayne@68	110 check_stderr:Optional[Callable]=None,
jpayne@68	111 add_to_bedtool:Optional[dict] = None,
jpayne@68	112 nonbam: Optional[Literal["ALL"] \| str \| list[str]] = None,
jpayne@68	113 force_bam: bool = False,
jpayne@68	114 genome_none_if: Optional[list[str]] =None,
jpayne@68	115 genome_if: Optional[list[str]] = None,
jpayne@68	116 genome_ok_if: Optional[list[str]] = None,
jpayne@68	117 does_not_return_bedtool: Optional[Callable] =None,
jpayne@68	118 arg_order: Optional[list[str]] = None,
jpayne@68	119 ):
jpayne@68	120 """
jpayne@68	121 Do-it-all wrapper, to be used as a decorator.
jpayne@68	122
jpayne@68	123 prog is the name of the BEDTools program that will be called. The help
jpayne@68	124 for this program will also be added to the decorated method's docstring.
jpayne@68	125
jpayne@68	126 implicit is the BEDTools program arg that should be filled in
jpayne@68	127 automatically.
jpayne@68	128
jpayne@68	129 bam will disable the implicit substitution if bam is in the kwargs.
jpayne@68	130 This is typically 'abam' or 'ibam' if the program accepts BAM input.
jpayne@68	131
jpayne@68	132 other is the BEDTools program arg that is passed in as the second input,
jpayne@68	133 if supported. Within the semantics of BEDTools, the typical case will be
jpayne@68	134 that if implicit='a' then other='b'; if implicit='i' then other=None.
jpayne@68	135
jpayne@68	136 uses_genome, if True, will check for 'g' and/or 'genome' args and
jpayne@68	137 retrieve the corresponding genome files as needed.
jpayne@68	138
jpayne@68	139 make_tempfile_for is used for the sequence methods and indicates which
jpayne@68	140 kwarg should have a tempfile made for it if it's not provided ('fo' for the
jpayne@68	141 sequence methods)
jpayne@68	142
jpayne@68	143 check_stderr, if not None, is a function that accepts a string (which
jpayne@68	144 will be anything written to stdout when calling the wrapped program). This
jpayne@68	145 function should return True if the string is OK, and False if it should
jpayne@68	146 truly be considered an error. This is needed for wrapping fastaFromBed,
jpayne@68	147 which will report to stderr that it's creating an index file.
jpayne@68	148
jpayne@68	149 add_to_bedtool is used for sequence methods. It is a dictionary mapping
jpayne@68	150 kwargs to attributes to be created in the resulting BedTool. Typically it
jpayne@68	151 is {'fo':'seqfn'} which will add the resulting sequence name to the
jpayne@68	152 BedTool's .seqfn attribute. If add_to_bedtool is not None, then the
jpayne@68	153 returned BedTool will be self with the added attribute. If a key is
jpayne@68	154 "stdout" (e.g., {"stdout": attr_name}), then save the stdout of the command
jpayne@68	155 as a tempfile and store the tempfile's name in the attribute. This is
jpayne@68	156 required for linksBed and bedToIgv.
jpayne@68	157
jpayne@68	158 nonbam is a kwarg that even if the input file was a BAM, the output will
jpayne@68	159 not be BAM format. For example, the `-bed` arg for intersectBed will
jpayne@68	160 cause the output to be in BED format, not BAM. If not None, this can be a
jpayne@68	161 string, a list of strings, or the special string "ALL", which means that
jpayne@68	162 the wrapped program will never return BAM output.
jpayne@68	163
jpayne@68	164 force_bam, if True, will force the output to be BAM. This is used for
jpayne@68	165 bedToBam.
jpayne@68	166
jpayne@68	167 genome_none_if is a list of arguments that will ignore the requirement
jpayne@68	168 for a genome. This is needed for window_maker, where -b and -g are
jpayne@68	169 mutually exclusive.
jpayne@68	170
jpayne@68	171 genome_ok_if is a list of arguments that, if they are in
jpayne@68	172 genome_none_if, are still OK to pass in. This is needed for bedtool
jpayne@68	173 genomecov, where -g is not needed if -ibam is specified...but it's still OK
jpayne@68	174 if the user passes a genome arg.
jpayne@68	175
jpayne@68	176 genome_if is a list of arguments that will trigger the requirement for
jpayne@68	177 a genome; otherwise no genome needs to be specified.
jpayne@68	178
jpayne@68	179 does_not_return_bedtool, if not None, should be a function that handles
jpayne@68	180 the returned output. Its signature should be ``func(output, kwargs)``,
jpayne@68	181 where `output` is the output from the [possibly streaming] call to BEDTools
jpayne@68	182 and `kwargs` are passed verbatim from the wrapped method call. Some
jpayne@68	183 examples of methods that use this are jaccard, reldist, fisher, and split
jpayne@68	184 methods.
jpayne@68	185
jpayne@68	186 arg_order, if not None, is a sorted list of arguments. This is used by
jpayne@68	187 handle_kwargs() to deal with things like issues 81 and 345, where some
jpayne@68	188 BEDTools programs are sensitive to argument order.
jpayne@68	189 """
jpayne@68	190
jpayne@68	191 # NOTE: We are calling each BEDTools program to get its help and adding
jpayne@68	192 # that to the docstring of each method. This is run at import time. However
jpayne@68	193 # if BEDTools is not on the path at import time, `not_implemented` is set
jpayne@68	194 # to True and isn't reset later until the module is reloaded.
jpayne@68	195 #
jpayne@68	196 # helpers.set_bedtools_path therefore will trigger a module reload.
jpayne@68	197 not_implemented = False
jpayne@68	198
jpayne@68	199 # Call the program with -h to get help, which prints to stderr.
jpayne@68	200 try:
jpayne@68	201 p = subprocess.Popen(
jpayne@68	202 helpers._version_2_15_plus_names(prog) + ["-h"],
jpayne@68	203 stdout=subprocess.PIPE,
jpayne@68	204 stderr=subprocess.PIPE,
jpayne@68	205 )
jpayne@68	206 help_str = p.communicate()[1].decode()
jpayne@68	207
jpayne@68	208 # underscores throw off ReStructuredText syntax of docstrings, so
jpayne@68	209 # replace 'em
jpayne@68	210 help_str = help_str.replace("_", "**")
jpayne@68	211
jpayne@68	212 # indent
jpayne@68	213 help_str = help_str.split("\n")
jpayne@68	214 help_str = ["\n\nOriginal BEDTools help:::"] + ["\t" + i for i in help_str]
jpayne@68	215 help_str = "\n".join(help_str) + "\n"
jpayne@68	216
jpayne@68	217 # If the program can't be found, then we'll eventually replace the method
jpayne@68	218 # with a version that does nothing but raise a NotImplementedError (plus
jpayne@68	219 # a helpful message).
jpayne@68	220 except OSError:
jpayne@68	221 help_str = (
jpayne@68	222 '"%s" does not appear to be installed '
jpayne@68	223 "or on the path, so this method is "
jpayne@68	224 "disabled. Please install a more recent "
jpayne@68	225 "version of BEDTools and re-import to "
jpayne@68	226 "use this method." % prog
jpayne@68	227 )
jpayne@68	228 not_implemented = True
jpayne@68	229
jpayne@68	230 def decorator(func):
jpayne@68	231 """
jpayne@68	232 Accepts a function to be wrapped; discards the original and returns a
jpayne@68	233 new, rebuilt-from-scratch function based on the kwargs passed to
jpayne@68	234 _wraps().
jpayne@68	235 """
jpayne@68	236 # Register the implicit (as well as bam and other) args in the global
jpayne@68	237 # registry. BedTool.handle_kwargs() will access these at runtime. The
jpayne@68	238 # registry is keyed by program name (like intersectBed).
jpayne@68	239 _implicit_registry[prog] = implicit
jpayne@68	240 if other is not None:
jpayne@68	241 _other_registry[prog] = other
jpayne@68	242 if bam is not None:
jpayne@68	243 _bam_registry[prog] = bam
jpayne@68	244
jpayne@68	245 # Here's where we replace an unable-to-be-found program's method with
jpayne@68	246 # one that only returns a NotImplementedError
jpayne@68	247 if not_implemented:
jpayne@68	248
jpayne@68	249 def not_implemented_func(args, *kwargs):
jpayne@68	250 raise NotImplementedError(help_str)
jpayne@68	251
jpayne@68	252 return not_implemented_func
jpayne@68	253
jpayne@68	254 _add_doc = []
jpayne@68	255 if implicit:
jpayne@68	256 _add_doc.append(
jpayne@68	257 dedent(
jpayne@68	258 """
jpayne@68	259 For convenience, the file or stream this BedTool points to
jpayne@68	260 is implicitly passed as the `-%s` argument to `%s`
jpayne@68	261 """
jpayne@68	262 % (implicit, prog)
jpayne@68	263 )
jpayne@68	264 )
jpayne@68	265
jpayne@68	266 if uses_genome:
jpayne@68	267 _add_doc.append(
jpayne@68	268 dedent(
jpayne@68	269 """
jpayne@68	270 There are two alternatives for supplying a genome. Use
jpayne@68	271 `g="genome.filename"` if you have a genome's chrom sizes
jpayne@68	272 saved as a file. This is the what BEDTools expects when
jpayne@68	273 using it from the command line. Alternatively, use the
jpayne@68	274 `genome="assembly.name"` (for example, `genome="hg19"`) to
jpayne@68	275 use chrom sizes for that assembly without having to manage
jpayne@68	276 a separate file. The `genome` argument triggers a call
jpayne@68	277 `pybedtools.chromsizes`, so see that method for more
jpayne@68	278 details.
jpayne@68	279 """
jpayne@68	280 )
jpayne@68	281 )
jpayne@68	282
jpayne@68	283 def wrapped(self, args, *kwargs):
jpayne@68	284 """
jpayne@68	285 A newly created function that will be returned by the _wraps()
jpayne@68	286 decorator
jpayne@68	287 """
jpayne@68	288
jpayne@68	289 # Only one non-keyword argument is supported; this is then assumed
jpayne@68	290 # to be "other" (e.g., `-b` for intersectBed)
jpayne@68	291 if len(args) > 0:
jpayne@68	292 assert len(args) == 1
jpayne@68	293 kwargs[other] = args[0]
jpayne@68	294
jpayne@68	295 # Add the implicit values to kwargs. If the current BedTool is
jpayne@68	296 # BAM, it will automatically be passed to the appropriate
jpayne@68	297 # BAM-support arg (like `-abam`). But this also allows the user to
jpayne@68	298 # explicitly specify the abam kwarg, which will override the
jpayne@68	299 # auto-substitution.
jpayne@68	300 # Note: here, `implicit` is something like "a"; `bam` is something
jpayne@68	301 # like "abam"
jpayne@68	302 if (
jpayne@68	303 (implicit not in kwargs)
jpayne@68	304 and (bam not in kwargs)
jpayne@68	305 and (implicit is not None)
jpayne@68	306 ):
jpayne@68	307 if not self._isbam:
jpayne@68	308 kwargs[implicit] = self.fn
jpayne@68	309 else:
jpayne@68	310 # It is a bam file. If this program supports BAM as the
jpayne@68	311 # first input, then we set it here
jpayne@68	312 if bam is not None:
jpayne@68	313 kwargs[bam] = self.fn
jpayne@68	314
jpayne@68	315 # Otherwise, BEDTools can't currently handle it, so raise
jpayne@68	316 # an exception.
jpayne@68	317 else:
jpayne@68	318 raise pybedtoolsError(
jpayne@68	319 '"%s" currently can\'t handle BAM '
jpayne@68	320 "input, please use bam_to_bed() first." % prog
jpayne@68	321 )
jpayne@68	322
jpayne@68	323 # Should this function handle genome files?
jpayne@68	324 check_for_genome = uses_genome
jpayne@68	325 if uses_genome:
jpayne@68	326 if genome_none_if:
jpayne@68	327 for i in genome_none_if:
jpayne@68	328 if i in kwargs or i == implicit:
jpayne@68	329 check_for_genome = False
jpayne@68	330
jpayne@68	331 # for genomecov, if -ibam then -g is optional. So it's OK
jpayne@68	332 # for the user to provide genome or g kwargs, even if
jpayne@68	333 # -ibam.
jpayne@68	334 if genome_ok_if:
jpayne@68	335 for i in genome_ok_if:
jpayne@68	336 if i in kwargs or i == implicit:
jpayne@68	337 if ("g" in kwargs) or ("genome" in kwargs):
jpayne@68	338 check_for_genome = True
jpayne@68	339 if genome_if:
jpayne@68	340 check_for_genome = False
jpayne@68	341 for i in genome_if:
jpayne@68	342 if (i in kwargs) or (i == implicit):
jpayne@68	343 check_for_genome = True
jpayne@68	344 if check_for_genome:
jpayne@68	345 kwargs = self.check_genome(**kwargs)
jpayne@68	346
jpayne@68	347 # For sequence methods, we may need to make a tempfile that will
jpayne@68	348 # hold the resulting sequence. For example, fastaFromBed needs to
jpayne@68	349 # make a tempfile for 'fo' if no 'fo' was explicitly specified by
jpayne@68	350 # the user.
jpayne@68	351 if make_tempfile_for is not None:
jpayne@68	352 if make_tempfile_for not in kwargs:
jpayne@68	353 kwargs[make_tempfile_for] = self._tmp()
jpayne@68	354
jpayne@68	355 # At runtime, this will parse the kwargs, convert streams to
jpayne@68	356 # tempfiles if needed, and return all the goodies
jpayne@68	357 cmds, tmp, stdin = self.handle_kwargs(prog=prog,
jpayne@68	358 arg_order=arg_order,
jpayne@68	359 **kwargs)
jpayne@68	360
jpayne@68	361 # Decide whether the output is BAM format or not.
jpayne@68	362 result_is_bam = False
jpayne@68	363
jpayne@68	364 # By default, if the current BedTool is BAM, then the result should
jpayne@68	365 # be, too.
jpayne@68	366 if self._isbam:
jpayne@68	367 result_is_bam = True
jpayne@68	368
jpayne@68	369 # If nonbam is "ALL", then this method will never return BAM
jpayne@68	370 # output.
jpayne@68	371 if nonbam == "ALL":
jpayne@68	372 result_is_bam = False
jpayne@68	373
jpayne@68	374 # If any of the `nonbam` args are found in kwargs, then result is
jpayne@68	375 # not a BAM. Side note: the _nonbam name mangling is necessary to
jpayne@68	376 # keep the nonbam arg passed into the original _wraps() decorator
jpayne@68	377 # in scope.
jpayne@68	378 if nonbam is not None and nonbam != "ALL":
jpayne@68	379 if isinstance(nonbam, str):
jpayne@68	380 _nonbam = [nonbam]
jpayne@68	381 else:
jpayne@68	382 _nonbam = nonbam
jpayne@68	383 for i in _nonbam:
jpayne@68	384 if i in kwargs:
jpayne@68	385 result_is_bam = False
jpayne@68	386 break
jpayne@68	387
jpayne@68	388 if force_bam:
jpayne@68	389 result_is_bam = True
jpayne@68	390
jpayne@68	391 decode_output = not result_is_bam
jpayne@68	392
jpayne@68	393 # Do the actual call
jpayne@68	394 stream = call_bedtools(
jpayne@68	395 cmds,
jpayne@68	396 tmp,
jpayne@68	397 stdin=stdin,
jpayne@68	398 check_stderr=check_stderr,
jpayne@68	399 decode_output=decode_output,
jpayne@68	400 )
jpayne@68	401
jpayne@68	402 if does_not_return_bedtool:
jpayne@68	403 return does_not_return_bedtool(stream, **kwargs)
jpayne@68	404
jpayne@68	405 # Post-hoc editing of the BedTool -- for example, this is used for
jpayne@68	406 # the sequence methods to add a `seqfn` attribute to the resulting
jpayne@68	407 # BedTool.
jpayne@68	408 if add_to_bedtool is not None:
jpayne@68	409 for kw, attr in list(add_to_bedtool.items()):
jpayne@68	410 if kw == "stdout":
jpayne@68	411 value = stream
jpayne@68	412 else:
jpayne@68	413 value = kwargs[kw]
jpayne@68	414 setattr(self, attr, value)
jpayne@68	415 result = self
jpayne@68	416 else:
jpayne@68	417 result = BedTool(stream)
jpayne@68	418
jpayne@68	419 result._isbam = result_is_bam
jpayne@68	420 result._cmds = cmds
jpayne@68	421 del kwargs
jpayne@68	422 return result
jpayne@68	423
jpayne@68	424 # Now add the edited docstring (original Python docstring plus BEDTools
jpayne@68	425 # help) to the newly created method above
jpayne@68	426 if func.__doc__ is None:
jpayne@68	427 orig = ""
jpayne@68	428 else:
jpayne@68	429 orig = func.__doc__
jpayne@68	430
jpayne@68	431 wrapped.__doc__ = orig + "\n".join(_add_doc) + help_str
jpayne@68	432
jpayne@68	433 # Add the original method's name to a new attribute so we can access it
jpayne@68	434 # when logging history
jpayne@68	435 wrapped._name = func.__name__ # type: ignore
jpayne@68	436
jpayne@68	437 return wrapped
jpayne@68	438
jpayne@68	439 return decorator
jpayne@68	440
jpayne@68	441
jpayne@68	442 def _log_to_history(method: Callable):
jpayne@68	443 """
jpayne@68	444 Decorator to add a method and its kwargs to the history.
jpayne@68	445
jpayne@68	446 Assumes that you only add this decorator to bedtool instances that
jpayne@68	447 return other bedtool instances
jpayne@68	448 """
jpayne@68	449
jpayne@68	450 def decorated(self, args, *kwargs):
jpayne@68	451
jpayne@68	452 # this calls the actual method in the first place; result is
jpayne@68	453 # whatever you get back
jpayne@68	454 result = method(self, args, *kwargs)
jpayne@68	455
jpayne@68	456 # add appropriate tags
jpayne@68	457 parent_tag = self._tag
jpayne@68	458 result_tag = result._tag
jpayne@68	459
jpayne@68	460 history_step = HistoryStep(
jpayne@68	461 method, args, kwargs, self, parent_tag, result_tag
jpayne@68	462 )
jpayne@68	463
jpayne@68	464 # only add the current history to the new bedtool if there's
jpayne@68	465 # something to add
jpayne@68	466 if len(self.history) > 0:
jpayne@68	467 result.history.append(self.history)
jpayne@68	468
jpayne@68	469 # but either way, add this history step to the result.
jpayne@68	470 result.history.append(history_step)
jpayne@68	471
jpayne@68	472 return result
jpayne@68	473
jpayne@68	474 decorated.__doc__ = method.__doc__
jpayne@68	475 return decorated
jpayne@68	476
jpayne@68	477
jpayne@68	478 class BedTool(object):
jpayne@68	479 TEMPFILES = filenames.TEMPFILES
jpayne@68	480
jpayne@68	481 def __init__(self, fn: Optional[Any] = None,
jpayne@68	482 from_string: bool = False,
jpayne@68	483 remote: bool = False):
jpayne@68	484 """
jpayne@68	485 Wrapper around Aaron Quinlan's ``BEDtools`` suite of programs
jpayne@68	486 (https://github.com/arq5x/bedtools); also contains many useful
jpayne@68	487 methods for more detailed work with BED files.
jpayne@68	488
jpayne@68	489 fn is typically the name of a BED-like file, but can also be
jpayne@68	490 one of the following:
jpayne@68	491
jpayne@68	492 * a string filename
jpayne@68	493 * another BedTool object
jpayne@68	494 * an iterable of Interval objects
jpayne@68	495 * an open file object
jpayne@68	496 * a "file contents" string (see below)
jpayne@68	497
jpayne@68	498 If from_string is True, then you can pass a string that contains
jpayne@68	499 the contents of the BedTool you want to create. This will treat all
jpayne@68	500 spaces as TABs and write to tempfile, treating whatever you pass as
jpayne@68	501 fn as the contents of the bed file. This also strips empty lines.
jpayne@68	502
jpayne@68	503 Typical usage is to point to an existing file::
jpayne@68	504
jpayne@68	505 a = BedTool('a.bed')
jpayne@68	506
jpayne@68	507 But you can also create one from scratch from a string::
jpayne@68	508
jpayne@68	509 >>> s = '''
jpayne@68	510 ... chrX 1 100
jpayne@68	511 ... chrX 25 800
jpayne@68	512 ... '''
jpayne@68	513 >>> a = BedTool(s, from_string=True)
jpayne@68	514
jpayne@68	515 Or use examples that come with pybedtools::
jpayne@68	516
jpayne@68	517 >>> example_files = pybedtools.list_example_files()
jpayne@68	518 >>> assert 'a.bed' in example_files
jpayne@68	519 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	520
jpayne@68	521 """
jpayne@68	522 if remote:
jpayne@68	523 raise ValueError(
jpayne@68	524 "Remote BAM no longer supported (since BEDTools does not " "support it)"
jpayne@68	525 )
jpayne@68	526 self.remote = remote
jpayne@68	527 self._isbam = False
jpayne@68	528 self._bam_header = ""
jpayne@68	529 self._cmds = []
jpayne@68	530 if from_string:
jpayne@68	531 if fn is None or not isinstance(fn, str):
jpayne@68	532 raise ValueError("from_string=True requires a string to parse")
jpayne@68	533 bed_contents = fn
jpayne@68	534 fn = self._tmp()
jpayne@68	535 fout = open(fn, "w")
jpayne@68	536 for line in bed_contents.splitlines():
jpayne@68	537 if len(line.strip()) == 0:
jpayne@68	538 continue
jpayne@68	539 line = "\t".join(line.split()) + "\n"
jpayne@68	540 fout.write(line)
jpayne@68	541 fout.close()
jpayne@68	542
jpayne@68	543 else:
jpayne@68	544 # if fn is a Path object, we have to use its string representation
jpayne@68	545 if isinstance(fn, pathlib.PurePath):
jpayne@68	546 fn = str(fn)
jpayne@68	547
jpayne@68	548 # our work is already done
jpayne@68	549 if isinstance(fn, BedTool):
jpayne@68	550 fn = fn.fn
jpayne@68	551
jpayne@68	552 # from_string=False, so assume it's a filename
jpayne@68	553 elif isinstance(fn, str):
jpayne@68	554 if remote:
jpayne@68	555 self._isbam = True
jpayne@68	556 else:
jpayne@68	557 if not os.path.exists(fn):
jpayne@68	558 msg = 'File "%s" does not exist' % fn
jpayne@68	559 raise FileNotFoundError(msg)
jpayne@68	560 self._isbam = isBAM(fn)
jpayne@68	561
jpayne@68	562 # TODO: we dont' really need this, but it's added here for
jpayne@68	563 # compatibility with existing tests
jpayne@68	564 if self._isbam:
jpayne@68	565 header = pysam.Samfile(fn).header.to_dict()
jpayne@68	566 # For example:
jpayne@68	567 # {
jpayne@68	568 # 'HD': {'VN': '1.0', 'SO': 'coordinate'},
jpayne@68	569 # 'SQ': [
jpayne@68	570 # {'LN': 23011544,
jpayne@68	571 # 'SN': 'chr2L'},
jpayne@68	572 # {'LN': 21146708,
jpayne@68	573 # 'SN': 'chr2R'},
jpayne@68	574 # {'LN': 24543557,
jpayne@68	575 # 'SN': 'chr3L'},
jpayne@68	576 # {'LN': 27905053,
jpayne@68	577 # 'SN': 'chr3R'},
jpayne@68	578 # {'LN': 1351857,
jpayne@68	579 # 'SN': 'chr4'},
jpayne@68	580 # {'LN': 22422827,
jpayne@68	581 # 'SN': 'chrX'}
jpayne@68	582 # ]
jpayne@68	583 # }
jpayne@68	584
jpayne@68	585 txt_header = []
jpayne@68	586 for k, v in header.items():
jpayne@68	587 if isinstance(v, list):
jpayne@68	588 for i in v:
jpayne@68	589 if isinstance(i, dict):
jpayne@68	590 txt_header.append(
jpayne@68	591 "\t".join(
jpayne@68	592 ["@" + k]
jpayne@68	593 + [
jpayne@68	594 ":".join(map(str, j))
jpayne@68	595 for j in sorted(i.items(), reverse=True)
jpayne@68	596 ]
jpayne@68	597 )
jpayne@68	598 )
jpayne@68	599 elif isinstance(i, str):
jpayne@68	600 txt_header.append(i)
jpayne@68	601
jpayne@68	602 elif isinstance(v, dict):
jpayne@68	603 txt_header.append(
jpayne@68	604 "\t".join(
jpayne@68	605 ["@" + k]
jpayne@68	606 + [
jpayne@68	607 ":".join(map(str, j))
jpayne@68	608 for j in sorted(v.items(), reverse=True)
jpayne@68	609 ]
jpayne@68	610 )
jpayne@68	611 )
jpayne@68	612 else:
jpayne@68	613 raise ValueError("unhandled type in BAM header")
jpayne@68	614 self._bam_header = "\n".join(txt_header) + "\n"
jpayne@68	615
jpayne@68	616 # If tuple or list, then save as file first
jpayne@68	617 # (fixes #73)
jpayne@68	618 elif isinstance(fn, (list, tuple)):
jpayne@68	619 fn = BedTool(iter(fn)).saveas().fn
jpayne@68	620
jpayne@68	621 # Otherwise assume iterator, say an open file as from
jpayne@68	622 # subprocess.PIPE
jpayne@68	623 else:
jpayne@68	624 fn = fn
jpayne@68	625
jpayne@68	626 self.fn = fn
jpayne@68	627 tag = "".join([random.choice(string.ascii_lowercase) for _ in range(8)])
jpayne@68	628 self._tag = tag
jpayne@68	629 _tags[tag] = self
jpayne@68	630 self._hascounts = False
jpayne@68	631 self._file_type = None
jpayne@68	632 self.seqfn = None
jpayne@68	633 self.fastq = None
jpayne@68	634 self.igv_script = None
jpayne@68	635 self.links_html = None
jpayne@68	636 self.history = History()
jpayne@68	637
jpayne@68	638 @classmethod
jpayne@68	639 def from_dataframe(
jpayne@68	640 cls,
jpayne@68	641 df: pd.DataFrame,
jpayne@68	642 outfile: Optional[str] =None,
jpayne@68	643 sep: str ="\t",
jpayne@68	644 header: bool =False,
jpayne@68	645 na_rep:str =".",
jpayne@68	646 index:bool =False,
jpayne@68	647 **kwargs
jpayne@68	648 ) -> BedTool:
jpayne@68	649 """
jpayne@68	650 Creates a BedTool from a pandas.DataFrame.
jpayne@68	651
jpayne@68	652 If `outfile` is None, a temporary file will be used. Otherwise it can
jpayne@68	653 be a specific filename or an open file handle. Additional kwargs will
jpayne@68	654 be passed to `pandas.DataFrame.to_csv`.
jpayne@68	655
jpayne@68	656 The fields of the resulting BedTool will match the order of columns in
jpayne@68	657 the dataframe.
jpayne@68	658 """
jpayne@68	659 try:
jpayne@68	660 import pandas
jpayne@68	661 except ImportError:
jpayne@68	662 raise ImportError("pandas must be installed to use dataframes")
jpayne@68	663 if outfile is None:
jpayne@68	664 outfile = cls._tmp()
jpayne@68	665 default_kwargs = dict(sep=sep, header=header, na_rep=na_rep, index=index)
jpayne@68	666 default_kwargs.update(kwargs)
jpayne@68	667 df.to_csv(outfile, **default_kwargs)
jpayne@68	668
jpayne@68	669 if isinstance(outfile, str):
jpayne@68	670 fn = outfile
jpayne@68	671 else:
jpayne@68	672 try:
jpayne@68	673 fn = outfile.name
jpayne@68	674 except AttributeError:
jpayne@68	675 raise ValueError(
jpayne@68	676 "`outfile` is not a string and doesn't have a `name` attribute. "
jpayne@68	677 "Unable to determine filename."
jpayne@68	678 )
jpayne@68	679 return BedTool(fn)
jpayne@68	680
jpayne@68	681 def split(self, func: Callable, args, *kwargs) -> BedTool:
jpayne@68	682 """
jpayne@68	683 Split each feature using a user-defined function.
jpayne@68	684
jpayne@68	685 Calls the provided function `func` with each interval. In contrast to
jpayne@68	686 `each` (which does something similar), this method expects `func` to
jpayne@68	687 return an iterable of Interval objects.
jpayne@68	688
jpayne@68	689 args and kwargs are passed directly to `func`.
jpayne@68	690
jpayne@68	691 Returns a new BedTool.
jpayne@68	692 """
jpayne@68	693
jpayne@68	694 def generator():
jpayne@68	695 for orig_interval in self:
jpayne@68	696 for interval in func(orig_interval, args, *kwargs):
jpayne@68	697 yield interval
jpayne@68	698
jpayne@68	699 return BedTool(generator())
jpayne@68	700
jpayne@68	701 def truncate_to_chrom(self, genome: str \| dict) -> BedTool:
jpayne@68	702 """
jpayne@68	703 Ensure all features fall within chromosome limits.
jpayne@68	704
jpayne@68	705 Some peak-callers extend peaks such that the boundaries overstep
jpayne@68	706 chromosome coordinates. Upon uploading such a file to a genome browser
jpayne@68	707 like UCSC, this results in an error like::
jpayne@68	708
jpayne@68	709 Error line 101 of custom track: chromEnd larger than chrom chr2
jpayne@68	710 size
jpayne@68	711
jpayne@68	712 Use this method to clean your file, truncating any out-of-bounds
jpayne@68	713 features to fit within the chromosome coordinates of `genome`.
jpayne@68	714
jpayne@68	715 `genome` can be either an assembly name ('dm3') or a dictionary where
jpayne@68	716 keys are chrom and values are (start, stop) tuples.
jpayne@68	717 """
jpayne@68	718 if isinstance(genome, dict):
jpayne@68	719 chromdict = genome
jpayne@68	720 else:
jpayne@68	721 assert isinstance(genome, str)
jpayne@68	722 chromdict = helpers.chromsizes(genome)
jpayne@68	723
jpayne@68	724 tmp = self._tmp()
jpayne@68	725 with open(tmp, "w") as fout:
jpayne@68	726 for chrom, coords in list(chromdict.items()):
jpayne@68	727 start, stop = coords
jpayne@68	728 start = str(start)
jpayne@68	729 stop = str(stop)
jpayne@68	730 fout.write("\t".join([chrom, start, stop]) + "\n")
jpayne@68	731 return self.intersect(tmp)
jpayne@68	732
jpayne@68	733 def tabix_intervals(self, interval_or_string: Interval \| str, check_coordinates: bool=False) -> BedTool:
jpayne@68	734 """
jpayne@68	735 Retrieve all intervals within coordinates from a "tabixed" BedTool.
jpayne@68	736
jpayne@68	737 Given either a string in "chrom:start-stop" format, or an interval-like
jpayne@68	738 object with chrom, start, stop attributes, return a streaming BedTool
jpayne@68	739 of the features in this BedTool that overlap the provided interval.
jpayne@68	740
jpayne@68	741 If the coordinates are invalid, an empty generator is returned unless
jpayne@68	742 `check_coordinates=True` in which case a ValueError will be raised.
jpayne@68	743 """
jpayne@68	744 if not self._tabixed():
jpayne@68	745 raise ValueError(
jpayne@68	746 "This BedTool has not been indexed for tabix "
jpayne@68	747 "-- please use the .tabix() method"
jpayne@68	748 )
jpayne@68	749
jpayne@68	750 # tabix expects 1-based coords, but BEDTools works with
jpayne@68	751 # zero-based. pybedtools and pysam also work with zero-based. So we can
jpayne@68	752 # pass zero-based directly to the pysam tabix interface.
jpayne@68	753 tbx = pysam.TabixFile(self.fn)
jpayne@68	754
jpayne@68	755 # If an interval is passed, use its coordinates directly
jpayne@68	756 if isinstance(interval_or_string, Interval):
jpayne@68	757 interval: Interval = interval_or_string
jpayne@68	758 chrom, start, end = interval.chrom, interval.start, interval.stop
jpayne@68	759 # Parse string directly instead of relying on Interval, in order to
jpayne@68	760 # permit full chromosome fetching
jpayne@68	761 else:
jpayne@68	762 match = helpers.coord_re.search(interval_or_string)
jpayne@68	763 # Assume string is contig if it doesn't fit chrom:start-end format
jpayne@68	764 if match is None:
jpayne@68	765 chrom = interval_or_string
jpayne@68	766 start, end = None, None
jpayne@68	767 # Otherwise parse the coordinates
jpayne@68	768 else:
jpayne@68	769 chrom, start, end = match.group(1, 2, 3)
jpayne@68	770 start, end = int(start), int(end)
jpayne@68	771
jpayne@68	772 # Fetch results.
jpayne@68	773 try:
jpayne@68	774 results = tbx.fetch(str(chrom), start, end)
jpayne@68	775 except ValueError:
jpayne@68	776 if check_coordinates:
jpayne@68	777 raise
jpayne@68	778 else:
jpayne@68	779 results = []
jpayne@68	780
jpayne@68	781 # pysam.ctabix.TabixIterator does not include newlines when yielding so
jpayne@68	782 # we need to add them.
jpayne@68	783 def gen():
jpayne@68	784 for i in results:
jpayne@68	785 yield i + "\n"
jpayne@68	786
jpayne@68	787 # xref #190
jpayne@68	788 x = BedTool(gen()).saveas()
jpayne@68	789 tbx.close()
jpayne@68	790 return x
jpayne@68	791
jpayne@68	792 def tabix_contigs(self):
jpayne@68	793 """
jpayne@68	794 Returns a list of contigs from the tabix index.
jpayne@68	795 """
jpayne@68	796 if not self._tabixed():
jpayne@68	797 raise ValueError(
jpayne@68	798 "This BedTool has not been indexed for tabix "
jpayne@68	799 "-- please use the .tabix() method"
jpayne@68	800 )
jpayne@68	801
jpayne@68	802 tbx = pysam.TabixFile(self.fn)
jpayne@68	803 return tbx.contigs
jpayne@68	804
jpayne@68	805 def tabix(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> BedTool:
jpayne@68	806 """
jpayne@68	807 Prepare a BedTool for use with Tabix.
jpayne@68	808
jpayne@68	809 Returns a new BedTool that has been BGZIP compressed
jpayne@68	810 and indexed by tabix.
jpayne@68	811
jpayne@68	812 Parameters
jpayne@68	813 ----------
jpayne@68	814
jpayne@68	815 in_place : bool
jpayne@68	816 If True (default), then assume the file is already sorted and
jpayne@68	817 replace the existing file with the BGZIPed version.
jpayne@68	818
jpayne@68	819 force : bool
jpayne@68	820 If True (default is False), then overwrite both the index and the
jpayne@68	821 BGZIP file.
jpayne@68	822
jpayne@68	823 is_sorted : bool
jpayne@68	824 If True (default is False), then assume the file is already sorted
jpayne@68	825 so that BedTool.bgzip() doesn't have to do that work.
jpayne@68	826 """
jpayne@68	827 # Return quickly if nothing to do
jpayne@68	828 if self._tabixed() and not force:
jpayne@68	829 return self
jpayne@68	830
jpayne@68	831 # Make sure it's BGZIPed
jpayne@68	832 fn = self.bgzip(in_place=in_place, force=force, is_sorted=is_sorted)
jpayne@68	833 if self.file_type is not None and self.file_type not in ["bam", "empty"]:
jpayne@68	834 pysam.tabix_index(fn, force=force, preset=self.file_type) # type: ignore
jpayne@68	835 return BedTool(fn)
jpayne@68	836
jpayne@68	837 def _tabixed(self):
jpayne@68	838 """
jpayne@68	839 Verifies that we're working with a tabixed file: a string filename
jpayne@68	840 pointing to a BGZIPed file with a .tbi file in the same dir.
jpayne@68	841 """
jpayne@68	842 if (
jpayne@68	843 isinstance(self.fn, str)
jpayne@68	844 and isBGZIP(self.fn)
jpayne@68	845 and os.path.exists(self.fn + ".tbi")
jpayne@68	846 ):
jpayne@68	847 return True
jpayne@68	848
jpayne@68	849 def bgzip(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> str:
jpayne@68	850 """
jpayne@68	851 Helper function for more control over "tabixed" BedTools.
jpayne@68	852
jpayne@68	853 Checks to see if we already have a BGZIP file; if not then prepare one.
jpayne@68	854 Always leaves the original file alone. You can always just make a
jpayne@68	855 BedTool out of an already sorted and BGZIPed file to avoid this step.
jpayne@68	856
jpayne@68	857 `in_place` will put the BGZIPed file in the same dir (possibly after
jpayne@68	858 sorting to tempfile).
jpayne@68	859
jpayne@68	860 If `is_sorted`, then assume the file is already sorted. Otherwise call
jpayne@68	861 bedtools sort with the `-header` option.
jpayne@68	862
jpayne@68	863 `force` will overwrite without asking.
jpayne@68	864 """
jpayne@68	865 # It may already be BGZIPed...
jpayne@68	866 if isinstance(self.fn, str) and not force:
jpayne@68	867 if isBGZIP(self.fn):
jpayne@68	868 return self.fn
jpayne@68	869
jpayne@68	870 # If not in_place, then make a tempfile for the BGZIPed version
jpayne@68	871 if not in_place:
jpayne@68	872 # Get tempfile name, sorted or not
jpayne@68	873 if not is_sorted:
jpayne@68	874 fn = self.sort(header=True).fn
jpayne@68	875 else:
jpayne@68	876 fn = self._tmp()
jpayne@68	877
jpayne@68	878 # Register for later deletion
jpayne@68	879 outfn = fn + ".gz"
jpayne@68	880 BedTool.TEMPFILES.append(outfn)
jpayne@68	881
jpayne@68	882 # Creates tempfile.gz
jpayne@68	883 pysam.tabix_compress(fn, outfn, force=force)
jpayne@68	884 return outfn
jpayne@68	885
jpayne@68	886 # Otherwise, make sure the BGZIPed version has a similar name to the
jpayne@68	887 # current BedTool's file
jpayne@68	888 if in_place:
jpayne@68	889 if not is_sorted:
jpayne@68	890 fn = self.sort(header=True).saveas().fn
jpayne@68	891 else:
jpayne@68	892 fn = self.fn
jpayne@68	893 outfn = self.fn + ".gz"
jpayne@68	894 pysam.tabix_compress(fn, outfn, force=force)
jpayne@68	895 return outfn
jpayne@68	896
jpayne@68	897 def delete_temporary_history(self, ask: bool = True, raw_input_func=None):
jpayne@68	898 """
jpayne@68	899 Use at your own risk! This method will delete temp files. You will be
jpayne@68	900 prompted for deletion of files unless you specify ask=False.
jpayne@68	901
jpayne@68	902 Deletes all temporary files created during the history of this BedTool
jpayne@68	903 up to but not including the file this current BedTool points to.
jpayne@68	904
jpayne@68	905 Any filenames that are in the history and have the following pattern
jpayne@68	906 will be deleted::
jpayne@68	907
jpayne@68	908 <TEMP_DIR>/pybedtools.*.tmp
jpayne@68	909
jpayne@68	910 (where <TEMP_DIR> is the result from get_tempdir() and is by default
jpayne@68	911 "/tmp")
jpayne@68	912
jpayne@68	913 Any files that don't have this format will be left alone.
jpayne@68	914
jpayne@68	915 (raw_input_func is used for testing)
jpayne@68	916 """
jpayne@68	917 flattened_history = _flatten_list(self.history)
jpayne@68	918 to_delete = []
jpayne@68	919 tempdir = get_tempdir()
jpayne@68	920 for i in flattened_history:
jpayne@68	921 fn = i.fn
jpayne@68	922 if fn.startswith(os.path.join(os.path.abspath(tempdir), "pybedtools")):
jpayne@68	923 if fn.endswith(".tmp"):
jpayne@68	924 to_delete.append(fn)
jpayne@68	925
jpayne@68	926 if raw_input_func is None:
jpayne@68	927 raw_input_func = input
jpayne@68	928
jpayne@68	929 str_fns = "\n\t".join(to_delete)
jpayne@68	930 if ask:
jpayne@68	931 answer = raw_input_func("Delete these files?\n\t%s\n(y/N) " % str_fns)
jpayne@68	932
jpayne@68	933 if not answer.lower()[0] == "y":
jpayne@68	934 print("OK, not deleting.")
jpayne@68	935 return
jpayne@68	936 for fn in to_delete:
jpayne@68	937 os.unlink(fn)
jpayne@68	938 return
jpayne@68	939
jpayne@68	940 def filter(self, func: Callable, args, *kwargs) -> BedTool:
jpayne@68	941 """
jpayne@68	942 Filter features by user-defined function.
jpayne@68	943
jpayne@68	944 Takes a function func that is called for each feature in the
jpayne@68	945 `BedTool` object and returns only those for which the function returns
jpayne@68	946 True.
jpayne@68	947
jpayne@68	948 args and kwargs are passed directly to func*.
jpayne@68	949
jpayne@68	950 Returns a streaming BedTool; if you want the filename then use the
jpayne@68	951 .saveas() method.
jpayne@68	952
jpayne@68	953 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	954 >>> subset = a.filter(lambda b: b.chrom == 'chr1' and b.start < 150)
jpayne@68	955 >>> len(a), len(subset)
jpayne@68	956 (4, 2)
jpayne@68	957
jpayne@68	958 so it has extracted 2 records from the original 4.
jpayne@68	959
jpayne@68	960 """
jpayne@68	961 return BedTool((f for f in self if func(f, args, *kwargs)))
jpayne@68	962
jpayne@68	963 def field_count(self, n:int=10) -> int:
jpayne@68	964 """
jpayne@68	965 Number of fields in each line of this BedTool (checks `n` lines)
jpayne@68	966
jpayne@68	967 Return the number of fields in the features this file contains. Checks
jpayne@68	968 the first n features.
jpayne@68	969
jpayne@68	970 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	971 >>> a.field_count()
jpayne@68	972 6
jpayne@68	973 """
jpayne@68	974 if self.file_type == "empty":
jpayne@68	975 return 0
jpayne@68	976 i = 0
jpayne@68	977 fields = set([])
jpayne@68	978 for feat in self:
jpayne@68	979 if i > n:
jpayne@68	980 break
jpayne@68	981 i += 1
jpayne@68	982 # TODO: make this more efficient.
jpayne@68	983 fields.update([len(feat.fields)])
jpayne@68	984 assert len(fields) == 1, fields
jpayne@68	985 return list(fields)[0]
jpayne@68	986
jpayne@68	987 def each(self, func: Callable, args, *kwargs) -> BedTool:
jpayne@68	988 """
jpayne@68	989 Modify each feature with a user-defined function.
jpayne@68	990
jpayne@68	991 Applies user-defined function func to each feature. func must
jpayne@68	992 accept an Interval as its first argument; args and *kwargs will be
jpayne@68	993 passed to func.
jpayne@68	994
jpayne@68	995 func must return an Interval object OR a value that evaluates to
jpayne@68	996 False, in which case the original feature will be removed from the
jpayne@68	997 output. This way, an additional "filter" call is not necessary.
jpayne@68	998
jpayne@68	999 >>> def truncate_feature(feature, limit=0):
jpayne@68	1000 ... feature.score = str(len(feature))
jpayne@68	1001 ... if len(feature) > limit:
jpayne@68	1002 ... feature.stop = feature.start + limit
jpayne@68	1003 ... feature.name = feature.name + '.short'
jpayne@68	1004 ... return feature
jpayne@68	1005
jpayne@68	1006 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	1007 >>> b = a.each(truncate_feature, limit=100)
jpayne@68	1008 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	1009 chr1 1 100 feature1 99 +
jpayne@68	1010 chr1 100 200 feature2 100 +
jpayne@68	1011 chr1 150 250 feature3.short 350 -
jpayne@68	1012 chr1 900 950 feature4 50 +
jpayne@68	1013 <BLANKLINE>
jpayne@68	1014
jpayne@68	1015 """
jpayne@68	1016
jpayne@68	1017 def _generator():
jpayne@68	1018 for f in self:
jpayne@68	1019 result = func(f, args, *kwargs)
jpayne@68	1020 if result:
jpayne@68	1021 yield result
jpayne@68	1022
jpayne@68	1023 return BedTool(_generator())
jpayne@68	1024
jpayne@68	1025 def introns(self, gene: str = "gene", exon: str = "exon") -> BedTool:
jpayne@68	1026 """
jpayne@68	1027 Create intron features (requires specific input format).
jpayne@68	1028
jpayne@68	1029 NOTE: this method assumes a simple file with non-overlapping exons. For
jpayne@68	1030 more sophisticated features, consider the gffutils package instead.
jpayne@68	1031
jpayne@68	1032 Given a BED12 or a GFF with exons, create a new `BedTool` with just
jpayne@68	1033 introns. The output is a bed6 file with the score column (5) being one
jpayne@68	1034 of 'intron'/'utr5'/'utr3'
jpayne@68	1035 """
jpayne@68	1036 # iterate over all the features in the gene.
jpayne@68	1037 s = self.sort()
jpayne@68	1038 if self.file_type == "gff":
jpayne@68	1039 exon_iter = BedTool((f for f in s if f[2] == exon)).saveas()
jpayne@68	1040 gene_iter = BedTool((f for f in s if f[2] == gene)).saveas()
jpayne@68	1041
jpayne@68	1042 elif self.file_type == "bed":
jpayne@68	1043 if s.field_count() == 12:
jpayne@68	1044 exon_iter = s.bed6().saveas()
jpayne@68	1045 gene_iter = s.saveas()
jpayne@68	1046 else:
jpayne@68	1047 # TODO: bed6. groupby on name and find smallest start,
jpayne@68	1048 # largest stop.
jpayne@68	1049 exon_iter = s
jpayne@68	1050 gene_iter = None
jpayne@68	1051 raise NotImplementedError(
jpayne@68	1052 ".introns() only supported for bed12" "and GFF"
jpayne@68	1053 )
jpayne@68	1054
jpayne@68	1055 else:
jpayne@68	1056 raise NotImplementedError(".introns() only supported for BED and GFF")
jpayne@68	1057
jpayne@68	1058 with open(BedTool._tmp(), "w") as fh:
jpayne@68	1059 # group on the name.
jpayne@68	1060 exon_intervals = IntervalFile(exon_iter.fn)
jpayne@68	1061 for g in gene_iter:
jpayne@68	1062 # search finds all, but we just want the ones that completely
jpayne@68	1063 # overlap this gene.
jpayne@68	1064 exons = [
jpayne@68	1065 e
jpayne@68	1066 for e in exon_intervals.search(g, same_strand=True)
jpayne@68	1067 if e.start >= g.start and e.end <= g.end
jpayne@68	1068 ]
jpayne@68	1069
jpayne@68	1070 for i, exon_instance in enumerate(exons):
jpayne@68	1071 exon_instance: pybedtools.Interval
jpayne@68	1072 # 5' utr between gene start and first intron
jpayne@68	1073 if i == 0 and exon_instance.start > g.start:
jpayne@68	1074 utr = {"+": "utr5", "-": "utr3"}[g.strand]
jpayne@68	1075 print(
jpayne@68	1076 "%s\t%i\t%i\t%s\t%s\t%s"
jpayne@68	1077 % (g.chrom, g.start, exon_instance.start, g.name, utr, g.strand),
jpayne@68	1078 file=fh,
jpayne@68	1079 )
jpayne@68	1080 elif i == len(exons) - 1 and exon_instance.end < g.end:
jpayne@68	1081 utr = {"+": "utr3", "-": "utr5"}[g.strand]
jpayne@68	1082 print(
jpayne@68	1083 "%s\t%i\t%i\t%s\t%s\t%s"
jpayne@68	1084 % (g.chrom, exon_instance.end, g.end, g.name, utr, g.strand),
jpayne@68	1085 file=fh,
jpayne@68	1086 )
jpayne@68	1087 elif i != len(exons) - 1:
jpayne@68	1088 istart = exon_instance.end
jpayne@68	1089 iend = exons[i + 1].start
jpayne@68	1090 print(
jpayne@68	1091 "%s\t%i\t%i\t%s\tintron\t%s"
jpayne@68	1092 % (g.chrom, istart, iend, g.name, g.strand),
jpayne@68	1093 file=fh,
jpayne@68	1094 )
jpayne@68	1095 return BedTool(fh.name)
jpayne@68	1096
jpayne@68	1097 def features(self):
jpayne@68	1098 """
jpayne@68	1099 Returns an iterable of features
jpayne@68	1100 """
jpayne@68	1101 if hasattr(self, "next") or hasattr(self, "__next__"):
jpayne@68	1102 return self
jpayne@68	1103 return iter(self)
jpayne@68	1104
jpayne@68	1105 FileType = Literal['bed', 'vcf', 'gff', 'bam', 'sam', 'empty']
jpayne@68	1106
jpayne@68	1107 @property
jpayne@68	1108 def file_type(self) -> Optional[FileType]:
jpayne@68	1109 """
jpayne@68	1110 Return the type of the current file. One of ('bed','vcf','gff', 'bam',
jpayne@68	1111 'sam', 'empty').
jpayne@68	1112
jpayne@68	1113 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	1114 >>> print(a.file_type)
jpayne@68	1115 bed
jpayne@68	1116 """
jpayne@68	1117 if not isinstance(self.fn, str):
jpayne@68	1118 raise ValueError(
jpayne@68	1119 "Checking file_type not supported for "
jpayne@68	1120 "non-file BedTools. Use .saveas() to "
jpayne@68	1121 "save as a temp file first."
jpayne@68	1122 )
jpayne@68	1123 if self._isbam:
jpayne@68	1124 self._file_type = "bam"
jpayne@68	1125 else:
jpayne@68	1126 try:
jpayne@68	1127 self._file_type = next(iter(self)).file_type
jpayne@68	1128 except StopIteration:
jpayne@68	1129 self._file_type = "empty"
jpayne@68	1130
jpayne@68	1131 return self._file_type
jpayne@68	1132
jpayne@68	1133 def cut(self, indexes: list[int], stream: bool = False) -> BedTool:
jpayne@68	1134 """
jpayne@68	1135 Analogous to unix `cut`.
jpayne@68	1136
jpayne@68	1137 Similar to unix `cut` except indexes are 0-based, must be a list
jpayne@68	1138 and the columns are returned in the order requested.
jpayne@68	1139
jpayne@68	1140 This method returns a BedTool of results, which means that the indexes
jpayne@68	1141 returned must be valid GFF/GTF/BED/SAM features.
jpayne@68	1142
jpayne@68	1143 If you would like arbitrary columns -- say, just chrom and featuretype
jpayne@68	1144 of a GFF, which would not comprise a valid feature -- then instead of
jpayne@68	1145 this method, simply use indexes on each feature, e.g,
jpayne@68	1146
jpayne@68	1147 >>> gff = pybedtools.example_bedtool('d.gff')
jpayne@68	1148 >>> results = [(f[0], f[2]) for f in gff]
jpayne@68	1149
jpayne@68	1150 In addition, `indexes` can contain keys of the GFF/GTF attributes, in
jpayne@68	1151 which case the values are returned. e.g. 'gene_name' will return the
jpayne@68	1152 corresponding name from a GTF, or 'start' will return the start
jpayne@68	1153 attribute of a BED Interval.
jpayne@68	1154 """
jpayne@68	1155 if stream:
jpayne@68	1156 return BedTool(([f[attr] for attr in indexes] for f in self))
jpayne@68	1157 else:
jpayne@68	1158 with open(self._tmp(), "w") as fh:
jpayne@68	1159 for f in self:
jpayne@68	1160 print("\t".join(map(str, [f[attr] for attr in indexes])), file=fh)
jpayne@68	1161 return BedTool(fh.name)
jpayne@68	1162
jpayne@68	1163 @classmethod
jpayne@68	1164 def _tmp(cls) -> str:
jpayne@68	1165 """
jpayne@68	1166 Makes a tempfile and registers it in the BedTool.TEMPFILES class
jpayne@68	1167 variable. Adds a "pybedtools." prefix and ".tmp" extension for easy
jpayne@68	1168 deletion if you forget to call pybedtools.cleanup().
jpayne@68	1169 """
jpayne@68	1170 tmpfn = tempfile.NamedTemporaryFile(
jpayne@68	1171 prefix=settings.tempfile_prefix,
jpayne@68	1172 suffix=settings.tempfile_suffix,
jpayne@68	1173 delete=False,
jpayne@68	1174 )
jpayne@68	1175 tmpfn = tmpfn.name
jpayne@68	1176 cls.TEMPFILES.append(tmpfn)
jpayne@68	1177 return tmpfn
jpayne@68	1178
jpayne@68	1179 def __iter__(self):
jpayne@68	1180 """
jpayne@68	1181 Dispatches the right iterator depending on how this BedTool was
jpayne@68	1182 created
jpayne@68	1183 """
jpayne@68	1184 if self._isbam:
jpayne@68	1185 # Note: BAM class takes filename or stream, so self.fn is OK
jpayne@68	1186 # here
jpayne@68	1187 return BAM(self.fn)
jpayne@68	1188
jpayne@68	1189 # Plain ol' filename
jpayne@68	1190 if isinstance(self.fn, str):
jpayne@68	1191 if not os.path.exists(self.fn):
jpayne@68	1192 raise BedToolsFileError("{0} does not exist".format(self.fn))
jpayne@68	1193 if isGZIP(self.fn):
jpayne@68	1194 return IntervalIterator(gzip.open(self.fn, "rt"))
jpayne@68	1195 else:
jpayne@68	1196 return IntervalIterator(open(self.fn, "r"))
jpayne@68	1197 # Any other kind of input (streaming string from stdout; iterable of
jpayne@68	1198 # Intervals, iterable of (chrom, start, stop) tuples, etc are handled
jpayne@68	1199 # appropriately by IntervalIterator.
jpayne@68	1200 else:
jpayne@68	1201 return IntervalIterator(self.fn)
jpayne@68	1202
jpayne@68	1203 @property
jpayne@68	1204 def intervals(self):
jpayne@68	1205 if isinstance(self.fn, str):
jpayne@68	1206 return IntervalFile(self.fn)
jpayne@68	1207 else:
jpayne@68	1208 raise ValueError("Please convert to a file-based BedTool using saveas")
jpayne@68	1209
jpayne@68	1210 def __repr__(self):
jpayne@68	1211 if isinstance(self.fn, str):
jpayne@68	1212 if os.path.exists(self.fn) or self.remote:
jpayne@68	1213 return "<BedTool(%s)>" % self.fn
jpayne@68	1214 else:
jpayne@68	1215 return "<BedTool(MISSING FILE: %s)>" % self.fn
jpayne@68	1216 elif isinstance(self.fn, BedTool):
jpayne@68	1217 return repr(self.fn)
jpayne@68	1218 else:
jpayne@68	1219 return "<BedTool(%s)>" % repr(self.fn)
jpayne@68	1220
jpayne@68	1221 def __str__(self):
jpayne@68	1222 """
jpayne@68	1223 Returns the string representation of the whole `BedTool`
jpayne@68	1224 """
jpayne@68	1225 items = []
jpayne@68	1226 for i in iter(self):
jpayne@68	1227 i = str(i)
jpayne@68	1228 if isinstance(i, bytes):
jpayne@68	1229 i = i.decode("UTF-8")
jpayne@68	1230 items.append(i)
jpayne@68	1231 return "".join(items)
jpayne@68	1232
jpayne@68	1233 def __len__(self):
jpayne@68	1234 return self.count()
jpayne@68	1235
jpayne@68	1236 def __eq__(self, other: object) -> bool:
jpayne@68	1237 if isinstance(other, BedTool):
jpayne@68	1238 if not isinstance(self.fn, str) or not isinstance(
jpayne@68	1239 other.fn, str
jpayne@68	1240 ):
jpayne@68	1241 raise NotImplementedError(
jpayne@68	1242 "Testing equality only supported for"
jpayne@68	1243 " BedTools that point to files"
jpayne@68	1244 )
jpayne@68	1245 elif not isinstance(other, str):
jpayne@68	1246 raise NotImplementedError(
jpayne@68	1247 "Testing equality only supported for"
jpayne@68	1248 " BedTools that point to files or str of content"
jpayne@68	1249 )
jpayne@68	1250 return str(self) == str(other)
jpayne@68	1251
jpayne@68	1252 def __ne__(self, other:object):
jpayne@68	1253 return not self.__eq__(other)
jpayne@68	1254
jpayne@68	1255 def __getitem__(self, key: slice\|int):
jpayne@68	1256 if isinstance(key, slice):
jpayne@68	1257 return islice(self, key.start, key.stop, key.step)
jpayne@68	1258 elif isinstance(key, int):
jpayne@68	1259 return list(islice(self, key, key + 1))[0]
jpayne@68	1260 else:
jpayne@68	1261 raise ValueError(
jpayne@68	1262 "Only slices or integers allowed for indexing " "into a BedTool"
jpayne@68	1263 )
jpayne@68	1264
jpayne@68	1265 def __add__(self, other: BedTool) -> BedTool:
jpayne@68	1266 try:
jpayne@68	1267 result = self.intersect(other, u=True)
jpayne@68	1268 except BEDToolsError as e:
jpayne@68	1269 # BEDTools versions <2.20 would raise BEDToolsError
jpayne@68	1270 if (self.file_type == "empty") or (other.file_type == "empty"):
jpayne@68	1271 result = pybedtools.BedTool("", from_string=True)
jpayne@68	1272 else:
jpayne@68	1273 raise e
jpayne@68	1274 return result
jpayne@68	1275
jpayne@68	1276 def __sub__(self, other: BedTool) -> BedTool:
jpayne@68	1277 result = None
jpayne@68	1278
jpayne@68	1279 try:
jpayne@68	1280 result = self.intersect(other, v=True)
jpayne@68	1281 except BEDToolsError:
jpayne@68	1282 # BEDTools versions <2.20 would raise BEDToolsError
jpayne@68	1283
jpayne@68	1284 if (self.file_type == "empty") and (other.file_type == "empty"):
jpayne@68	1285 result = pybedtools.BedTool("", from_string=True)
jpayne@68	1286 elif other.file_type == "empty":
jpayne@68	1287 result = self.saveas()
jpayne@68	1288 elif self.file_type == "empty":
jpayne@68	1289 result = pybedtools.BedTool("", from_string=True)
jpayne@68	1290 if result is None:
jpayne@68	1291 raise ValueError("Subtraction operation failed.")
jpayne@68	1292
jpayne@68	1293 return result
jpayne@68	1294
jpayne@68	1295 def head(self, n: int = 10, as_string: bool = False):
jpayne@68	1296 """
jpayne@68	1297 Prints the first n lines or returns them if as_string is True
jpayne@68	1298
jpayne@68	1299 Note that this only opens the underlying file (gzipped or not), so it
jpayne@68	1300 does not check to see if the file is a valid BED file.
jpayne@68	1301
jpayne@68	1302 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	1303 >>> a.head(2) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	1304 chr1 1 100 feature1 0 +
jpayne@68	1305 chr1 100 200 feature2 0 +
jpayne@68	1306 <BLANKLINE>
jpayne@68	1307
jpayne@68	1308 """
jpayne@68	1309 if not isinstance(self.fn, str):
jpayne@68	1310 raise NotImplementedError(
jpayne@68	1311 "head() not supported for non file-based BedTools"
jpayne@68	1312 )
jpayne@68	1313 if as_string:
jpayne@68	1314 return "".join(str(line) for line in self[:n])
jpayne@68	1315 if self._isbam:
jpayne@68	1316 raise NotImplementedError("head() not supported for BAM")
jpayne@68	1317 else:
jpayne@68	1318 if isGZIP(self.fn):
jpayne@68	1319 openfunc = gzip.open
jpayne@68	1320 openmode = "rt"
jpayne@68	1321 else:
jpayne@68	1322 openfunc = open
jpayne@68	1323 openmode = "r"
jpayne@68	1324 with openfunc(self.fn, openmode) as fin:
jpayne@68	1325 for i, line in enumerate(fin):
jpayne@68	1326 if i == (n):
jpayne@68	1327 break
jpayne@68	1328 print(line, end=" ")
jpayne@68	1329
jpayne@68	1330 def set_chromsizes(self, chromsizes: str \| dict):
jpayne@68	1331 """
jpayne@68	1332 Prepare BedTool for operations that require chromosome coords.
jpayne@68	1333
jpayne@68	1334 Set the chromsizes for this genome. If chromsizes is a string, it
jpayne@68	1335 will be considered a genome assembly name. If that assembly name is
jpayne@68	1336 not available in pybedtools.genome_registry, then it will be searched
jpayne@68	1337 for on the UCSC Genome Browser.
jpayne@68	1338
jpayne@68	1339 Example usage:
jpayne@68	1340
jpayne@68	1341 >>> hg19 = pybedtools.chromsizes('hg19')
jpayne@68	1342 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	1343 >>> a = a.set_chromsizes(hg19)
jpayne@68	1344 >>> print(a.chromsizes['chr1'])
jpayne@68	1345 (0, 249250621)
jpayne@68	1346
jpayne@68	1347 """
jpayne@68	1348 if isinstance(chromsizes, str):
jpayne@68	1349 self.chromsizes = pybedtools.chromsizes(chromsizes)
jpayne@68	1350 elif isinstance(chromsizes, dict):
jpayne@68	1351 self.chromsizes = chromsizes
jpayne@68	1352 else:
jpayne@68	1353 raise ValueError(
jpayne@68	1354 "Need to specify chromsizes either as a string"
jpayne@68	1355 " (assembly name) or a dictionary"
jpayne@68	1356 )
jpayne@68	1357 return self
jpayne@68	1358
jpayne@68	1359 def _collapse(
jpayne@68	1360 self,
jpayne@68	1361 iterable: Iterable,
jpayne@68	1362 fn: Optional[str] = None,
jpayne@68	1363 trackline: Optional[str] = None,
jpayne@68	1364 in_compressed: bool = False,
jpayne@68	1365 out_compressed: bool = False,
jpayne@68	1366 ) -> str:
jpayne@68	1367 """
jpayne@68	1368 Collapses an iterable into file fn (or a new tempfile if fn is
jpayne@68	1369 None).
jpayne@68	1370
jpayne@68	1371 Returns the newly created filename.
jpayne@68	1372
jpayne@68	1373 Parameters
jpayne@68	1374 ----------
jpayne@68	1375
jpayne@68	1376 iterable : iter
jpayne@68	1377 Any iterable object whose items can be converted to an Interval.
jpayne@68	1378
jpayne@68	1379 fn : str
jpayne@68	1380 Output filename, if None then creates a temp file for output
jpayne@68	1381
jpayne@68	1382 trackline : str
jpayne@68	1383 If not None, string to be added to the top of the output. Newline
jpayne@68	1384 will be added.
jpayne@68	1385
jpayne@68	1386 in_compressed : bool
jpayne@68	1387 Indicates whether the input is compressed
jpayne@68	1388
jpayne@68	1389 out_compressed : bool
jpayne@68	1390 Indicates whether the output should be compressed
jpayne@68	1391 """
jpayne@68	1392 if fn is None:
jpayne@68	1393 fn = self._tmp()
jpayne@68	1394
jpayne@68	1395 in_open_func = gzip.open if in_compressed else open
jpayne@68	1396 out_open_func = gzip.open if out_compressed else open
jpayne@68	1397
jpayne@68	1398 # special case: if BAM-format BedTool is provided, no trackline should
jpayne@68	1399 # be supplied, and don't iterate -- copy the file wholesale
jpayne@68	1400 if isinstance(iterable, BedTool) and iterable._isbam:
jpayne@68	1401 if trackline:
jpayne@68	1402 raise ValueError(
jpayne@68	1403 "trackline provided, but input is a BAM "
jpayne@68	1404 "file, which takes no track line"
jpayne@68	1405 )
jpayne@68	1406 with open(fn, "wb") as out_:
jpayne@68	1407 out_.write(open(self.fn, "rb").read())
jpayne@68	1408 return fn
jpayne@68	1409
jpayne@68	1410 # If we're just working with filename-based BedTool objects, just copy
jpayne@68	1411 # the files directly
jpayne@68	1412 if isinstance(iterable, BedTool) and isinstance(iterable.fn, str):
jpayne@68	1413 with out_open_func(fn, "wt") as out_:
jpayne@68	1414 if sys.version_info > (3,0):
jpayne@68	1415 in_ = in_open_func(iterable.fn, "rt", errors="ignore")
jpayne@68	1416 else:
jpayne@68	1417 in_ = in_open_func(iterable.fn, "rt")
jpayne@68	1418 if trackline:
jpayne@68	1419 out_.write(trackline.strip() + "\n")
jpayne@68	1420 out_.writelines(in_)
jpayne@68	1421 in_.close()
jpayne@68	1422 else:
jpayne@68	1423 with out_open_func(fn, "wt") as out_:
jpayne@68	1424 for i in iterable:
jpayne@68	1425 if isinstance(i, (list, tuple)):
jpayne@68	1426 i = create_interval_from_list(list(i))
jpayne@68	1427 out_.write(str(i))
jpayne@68	1428 return fn
jpayne@68	1429
jpayne@68	1430 def handle_kwargs(self, prog:str, arg_order: Optional[list[str]] = None, **kwargs):
jpayne@68	1431 """
jpayne@68	1432 Handle most cases of BEDTool program calls, but leave the specifics
jpayne@68	1433 up to individual methods.
jpayne@68	1434
jpayne@68	1435 prog is a BEDTools program name, e.g., 'intersectBed'.
jpayne@68	1436
jpayne@68	1437 arg_order lists any arguments that are sensitive to order. Everything
jpayne@68	1438 else will be reverse-sorted.
jpayne@68	1439
jpayne@68	1440 kwargs are passed directly from the calling method (like
jpayne@68	1441 self.intersect).
jpayne@68	1442
jpayne@68	1443 This method figures out, given how this BedTool was constructed, what
jpayne@68	1444 to send to BEDTools programs -- for example, an open file to stdin with
jpayne@68	1445 the `-` argument, or a filename with the `-a` argument.
jpayne@68	1446 """
jpayne@68	1447 pybedtools.logger.debug(
jpayne@68	1448 "BedTool.handle_kwargs() got these kwargs:\n%s", pprint.pformat(kwargs)
jpayne@68	1449 )
jpayne@68	1450
jpayne@68	1451 # If you pass in a list, how should it be converted to a BedTools arg?
jpayne@68	1452 default_list_delimiter = " "
jpayne@68	1453 list_delimiters = {
jpayne@68	1454 "annotateBed": " ",
jpayne@68	1455 "getOverlap": ",",
jpayne@68	1456 "groupBy": ",",
jpayne@68	1457 "multiIntersectBed": " ",
jpayne@68	1458 "mergeBed": ",",
jpayne@68	1459 "intersectBed": " ",
jpayne@68	1460 "mapBed": ",",
jpayne@68	1461 }
jpayne@68	1462 stdin = None
jpayne@68	1463
jpayne@68	1464 # -----------------------------------------------------------------
jpayne@68	1465 # Decide how to send instream1 to BEDTools. If there's no implicit
jpayne@68	1466 # instream1 arg, then do nothing.
jpayne@68	1467 #
jpayne@68	1468 try:
jpayne@68	1469 # e.g., 'a' for intersectBed
jpayne@68	1470 if self._isbam:
jpayne@68	1471 inarg1 = _bam_registry[prog]
jpayne@68	1472 else:
jpayne@68	1473 inarg1 = _implicit_registry[prog]
jpayne@68	1474
jpayne@68	1475 # e.g., self.fn or 'a.bed' or an iterator...
jpayne@68	1476 instream1 = kwargs[inarg1]
jpayne@68	1477
jpayne@68	1478 # If it's a BedTool, then get underlying stream
jpayne@68	1479 if isinstance(instream1, BedTool):
jpayne@68	1480 instream1 = instream1.fn
jpayne@68	1481
jpayne@68	1482 # Filename? No pipe, just provide the file
jpayne@68	1483 if isinstance(instream1, str):
jpayne@68	1484 kwargs[inarg1] = instream1
jpayne@68	1485 stdin = None
jpayne@68	1486
jpayne@68	1487 # Open file? Pipe it
jpayne@68	1488 # elif isinstance(instream1, file):
jpayne@68	1489 # kwargs[inarg1] = 'stdin'
jpayne@68	1490 # stdin = instream1
jpayne@68	1491
jpayne@68	1492 # A generator or iterator: pipe it as a generator of lines
jpayne@68	1493 else:
jpayne@68	1494 kwargs[inarg1] = "stdin"
jpayne@68	1495 stdin = (str(i) for i in instream1)
jpayne@68	1496 except KeyError:
jpayne@68	1497 pass
jpayne@68	1498
jpayne@68	1499 # -----------------------------------------------------------------
jpayne@68	1500 # Decide how to send instream2 to BEDTools.
jpayne@68	1501 try:
jpayne@68	1502 # e.g., 'b' for intersectBed
jpayne@68	1503 inarg2 = _other_registry[prog]
jpayne@68	1504
jpayne@68	1505 # e.g., another BedTool
jpayne@68	1506 instream2 = kwargs[inarg2]
jpayne@68	1507
jpayne@68	1508 # Get stream if BedTool
jpayne@68	1509 if isinstance(instream2, BedTool):
jpayne@68	1510 instream2 = instream2.fn
jpayne@68	1511
jpayne@68	1512 # Filename
jpayne@68	1513 if isinstance(instream2, str):
jpayne@68	1514 kwargs[inarg2] = instream2
jpayne@68	1515
jpayne@68	1516 # If it's a list of strings, then we need to figure out if it's
jpayne@68	1517 # a list of filenames or a list of intervals (see issue #156)
jpayne@68	1518 #
jpayne@68	1519 # Several options:
jpayne@68	1520 #
jpayne@68	1521 # - assume intervals have tabs but filenames don't
jpayne@68	1522 # - assume that, upon being split on tabs, an interval is >=3 fields
jpayne@68	1523 # - try creating an interval out of the first thing, success means interval
jpayne@68	1524 #
jpayne@68	1525 # The last seems the most robust. It does allow filenames with
jpayne@68	1526 # tabs; deciding whether or not such filenames are a good idea is
jpayne@68	1527 # left to the user.
jpayne@68	1528 #
jpayne@68	1529 elif isinstance(instream2, (list, tuple)) and isinstance(
jpayne@68	1530 instream2[0], str
jpayne@68	1531 ):
jpayne@68	1532 try:
jpayne@68	1533 _ = create_interval_from_list(instream2[0].split("\t"))
jpayne@68	1534 kwargs[inarg2] = self._collapse(instream2)
jpayne@68	1535 except IndexError:
jpayne@68	1536 kwargs[inarg2] = instream2
jpayne@68	1537
jpayne@68	1538 # Otherwise we need to collapse it in order to send to BEDTools
jpayne@68	1539 # programs
jpayne@68	1540 else:
jpayne@68	1541 kwargs[inarg2] = self._collapse(instream2)
jpayne@68	1542
jpayne@68	1543 except KeyError:
jpayne@68	1544 pass
jpayne@68	1545
jpayne@68	1546 # If stream not specified, then a tempfile will be created
jpayne@68	1547 if kwargs.pop("stream", None):
jpayne@68	1548 tmp = None
jpayne@68	1549 else:
jpayne@68	1550 output = kwargs.pop("output", None)
jpayne@68	1551 if output:
jpayne@68	1552 tmp = output
jpayne@68	1553 else:
jpayne@68	1554 tmp = self._tmp()
jpayne@68	1555
jpayne@68	1556 additional_args = kwargs.pop("additional_args", None)
jpayne@68	1557
jpayne@68	1558 # Parse the kwargs into BEDTools-ready args
jpayne@68	1559 cmds = [prog]
jpayne@68	1560
jpayne@68	1561 # arg_order mechanism added to fix #345
jpayne@68	1562 if arg_order is None:
jpayne@68	1563 arg_order = []
jpayne@68	1564
jpayne@68	1565 for arg in arg_order:
jpayne@68	1566 if arg in kwargs:
jpayne@68	1567 val = kwargs.pop(arg)
jpayne@68	1568 cmds.append("-" + arg)
jpayne@68	1569 cmds.append(val)
jpayne@68	1570
jpayne@68	1571 # The reverse-sort is a temp fix for issue #81
jpayne@68	1572 for key, value in sorted(list(kwargs.items()), reverse=True):
jpayne@68	1573 if isinstance(value, bool):
jpayne@68	1574 if value:
jpayne@68	1575 cmds.append("-" + key)
jpayne@68	1576 else:
jpayne@68	1577 continue
jpayne@68	1578 elif isinstance(value, list) or isinstance(value, tuple):
jpayne@68	1579 value = list(map(str, value))
jpayne@68	1580 try:
jpayne@68	1581 delim = list_delimiters[prog]
jpayne@68	1582 except KeyError:
jpayne@68	1583 delim = default_list_delimiter
jpayne@68	1584
jpayne@68	1585 if delim == " ":
jpayne@68	1586 cmds.append("-" + key)
jpayne@68	1587 cmds.extend(value)
jpayne@68	1588
jpayne@68	1589 # make comma-separated list if that's what's needed
jpayne@68	1590 else:
jpayne@68	1591 cmds.append("-" + key)
jpayne@68	1592 cmds.append(delim.join(value))
jpayne@68	1593
jpayne@68	1594 else:
jpayne@68	1595 cmds.append("-" + key)
jpayne@68	1596 cmds.append(str(value))
jpayne@68	1597
jpayne@68	1598 if additional_args:
jpayne@68	1599 cmds.append(additional_args)
jpayne@68	1600
jpayne@68	1601 return cmds, tmp, stdin
jpayne@68	1602
jpayne@68	1603 def check_genome(self, **kwargs):
jpayne@68	1604 """
jpayne@68	1605 Handles the different ways of specifying a genome in kwargs:
jpayne@68	1606
jpayne@68	1607 g='genome.file' specifies a file directly
jpayne@68	1608 genome='dm3' gets the file from genome registry
jpayne@68	1609 self.chromsizes could be a dict.\
jpayne@68	1610 """
jpayne@68	1611
jpayne@68	1612 # If both g and genome are missing, assume self.chromsizes
jpayne@68	1613 if ("g" not in kwargs) and ("genome" not in kwargs):
jpayne@68	1614 if hasattr(self, "chromsizes"):
jpayne@68	1615 kwargs["g"] = self.chromsizes
jpayne@68	1616 else:
jpayne@68	1617 raise ValueError(
jpayne@68	1618 'No genome specified. Use the "g" or '
jpayne@68	1619 '"genome" kwargs, or use the '
jpayne@68	1620 ".set_chromsizes() method"
jpayne@68	1621 )
jpayne@68	1622
jpayne@68	1623 # If both specified, rather than make an implicit decision, raise an
jpayne@68	1624 # exception
jpayne@68	1625 if "g" in kwargs and "genome" in kwargs:
jpayne@68	1626 raise ValueError('Cannot specify both "g" and "genome"')
jpayne@68	1627
jpayne@68	1628 # Something like genome='dm3' was specified
jpayne@68	1629 if "g" not in kwargs and "genome" in kwargs:
jpayne@68	1630 if isinstance(kwargs["genome"], dict):
jpayne@68	1631 genome_dict = kwargs["genome"]
jpayne@68	1632 else:
jpayne@68	1633 genome_dict = pybedtools.chromsizes(kwargs["genome"])
jpayne@68	1634 genome_file = pybedtools.chromsizes_to_file(genome_dict)
jpayne@68	1635 kwargs["g"] = genome_file
jpayne@68	1636 del kwargs["genome"]
jpayne@68	1637
jpayne@68	1638 # By the time we get here, 'g' is specified.
jpayne@68	1639
jpayne@68	1640 # If a dict was provided, convert to tempfile here
jpayne@68	1641 if isinstance(kwargs["g"], dict):
jpayne@68	1642 kwargs["g"] = pybedtools.chromsizes_to_file(kwargs["g"])
jpayne@68	1643
jpayne@68	1644 if not os.path.exists(kwargs["g"]):
jpayne@68	1645 msg = 'Genome file "%s" does not exist' % (kwargs["g"])
jpayne@68	1646 raise FileNotFoundError(msg)
jpayne@68	1647
jpayne@68	1648 return kwargs
jpayne@68	1649
jpayne@68	1650 @_log_to_history
jpayne@68	1651 def remove_invalid(self):
jpayne@68	1652 """
jpayne@68	1653 Remove invalid features that may break BEDTools programs.
jpayne@68	1654
jpayne@68	1655 >>> a = pybedtools.BedTool("chr1 10 100\\nchr1 10 1",
jpayne@68	1656 ... from_string=True)
jpayne@68	1657 >>> print(a.remove_invalid()) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	1658 chr1 10 100
jpayne@68	1659 <BLANKLINE>
jpayne@68	1660
jpayne@68	1661 """
jpayne@68	1662 tmp = self._tmp()
jpayne@68	1663
jpayne@68	1664 # If it's a file-based BedTool -- which is likely, if we're trying to
jpayne@68	1665 # remove invalid features -- then we need to parse it line by line.
jpayne@68	1666 if isinstance(self.fn, str):
jpayne@68	1667 i = IntervalIterator(open(self.fn, "r"))
jpayne@68	1668 else:
jpayne@68	1669 tmp = self.saveas()
jpayne@68	1670 i = IntervalIterator(open(tmp.fn, "r"))
jpayne@68	1671
jpayne@68	1672 def _generator():
jpayne@68	1673 while True:
jpayne@68	1674 try:
jpayne@68	1675 feature = next(i)
jpayne@68	1676 if feature.start <= feature.stop:
jpayne@68	1677 yield feature
jpayne@68	1678 else:
jpayne@68	1679 continue
jpayne@68	1680 except pybedtools.MalformedBedLineError:
jpayne@68	1681 continue
jpayne@68	1682 except OverflowError:
jpayne@68	1683 # This can happen if coords are negative
jpayne@68	1684 continue
jpayne@68	1685 except IndexError:
jpayne@68	1686 continue
jpayne@68	1687 except StopIteration:
jpayne@68	1688 break
jpayne@68	1689
jpayne@68	1690 return BedTool(_generator())
jpayne@68	1691
jpayne@68	1692 def all_hits(self, interval: Interval, same_strand: bool = False, overlap: float = 0.0):
jpayne@68	1693 """
jpayne@68	1694 Return all intervals that overlap `interval`.
jpayne@68	1695
jpayne@68	1696 Calls the `all_hits` method of an IntervalFile to return all intervals
jpayne@68	1697 in this current BedTool that overlap `interval`.
jpayne@68	1698
jpayne@68	1699 Require that overlaps have the same strand with same_strand=True.
jpayne@68	1700
jpayne@68	1701 Notes:
jpayne@68	1702 If this current BedTool is generator-based, it will be
jpayne@68	1703 converted into a file first.
jpayne@68	1704
jpayne@68	1705 If this current BedTool refers to a BAM file, it will be
jpayne@68	1706 converted to a BED file first using default arguments. If you
jpayne@68	1707 don't want this to happen, please convert to BED first before
jpayne@68	1708 using this method.
jpayne@68	1709 """
jpayne@68	1710 if not isinstance(interval, Interval):
jpayne@68	1711 raise ValueError("Need an Interval instance")
jpayne@68	1712 fn = self.fn
jpayne@68	1713 if not isinstance(fn, str):
jpayne@68	1714 fn = self.saveas().fn
jpayne@68	1715 if self._isbam:
jpayne@68	1716 fn = self.bam_to_bed().fn
jpayne@68	1717 interval_file = pybedtools.IntervalFile(fn)
jpayne@68	1718 return interval_file.all_hits(interval, same_strand, overlap)
jpayne@68	1719
jpayne@68	1720 def any_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0):
jpayne@68	1721 """
jpayne@68	1722 Return whether or not any intervals overlap `interval`.
jpayne@68	1723
jpayne@68	1724 Calls the `any_hits` method of an IntervalFile. If there were any hits
jpayne@68	1725 within `interval` in this BedTool, then return 1; otherwise 0.
jpayne@68	1726
jpayne@68	1727 Require that overlaps have the same strand with same_strand=True.
jpayne@68	1728
jpayne@68	1729 Notes:
jpayne@68	1730 If this current BedTool is generator-based, it will be
jpayne@68	1731 converted into a file first.
jpayne@68	1732
jpayne@68	1733 If this current BedTool refers to a BAM file, it will be
jpayne@68	1734 converted to a BED file first using default arguments. If you
jpayne@68	1735 don't want this to happen, please convert to BED first before
jpayne@68	1736 using this method.
jpayne@68	1737 """
jpayne@68	1738 if not isinstance(interval, Interval):
jpayne@68	1739 raise ValueError("Need an Interval instance")
jpayne@68	1740 fn = self.fn
jpayne@68	1741 if not isinstance(fn, str):
jpayne@68	1742 fn = self.saveas().fn
jpayne@68	1743 if self._isbam:
jpayne@68	1744 fn = self.bam_to_bed().fn
jpayne@68	1745 interval_file = pybedtools.IntervalFile(fn)
jpayne@68	1746 return interval_file.any_hits(interval, same_strand, overlap)
jpayne@68	1747
jpayne@68	1748 def count_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0) -> int:
jpayne@68	1749 """
jpayne@68	1750 Return the number of intervals that overlap `interval`.
jpayne@68	1751
jpayne@68	1752 Calls the `count_hits` method of an IntervalFile. Returns the number
jpayne@68	1753 of valid hits in this BedTool that overlap `interval`.
jpayne@68	1754
jpayne@68	1755 Require that overlaps have the same strand with same_strand=True.
jpayne@68	1756
jpayne@68	1757 Notes:
jpayne@68	1758 If this current BedTool is generator-based, it will be
jpayne@68	1759 converted into a file first.
jpayne@68	1760
jpayne@68	1761 If this current BedTool refers to a BAM file, it will be
jpayne@68	1762 converted to a BED file first using default arguments. If you
jpayne@68	1763 don't want this to happen, please convert to BED first before
jpayne@68	1764 using this method.
jpayne@68	1765 """
jpayne@68	1766 if not isinstance(interval, Interval):
jpayne@68	1767 raise ValueError("Need an Interval instance")
jpayne@68	1768 fn = self.fn
jpayne@68	1769 if not isinstance(fn, str):
jpayne@68	1770 fn = self.saveas().fn
jpayne@68	1771 if self._isbam:
jpayne@68	1772 fn = self.bam_to_bed().fn
jpayne@68	1773 interval_file = pybedtools.IntervalFile(fn)
jpayne@68	1774 return interval_file.count_hits(interval, same_strand, overlap)
jpayne@68	1775
jpayne@68	1776 @_log_to_history
jpayne@68	1777 @_wraps(prog="bed12ToBed6", implicit="i", bam=None, other=None)
jpayne@68	1778 def bed6(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	1779 """
jpayne@68	1780 Wraps `bedtools bed12tobed6`.
jpayne@68	1781 """
jpayne@68	1782
jpayne@68	1783 # Alias for backward compatibility
jpayne@68	1784 bed12tobed6 = bed6
jpayne@68	1785
jpayne@68	1786 @_log_to_history
jpayne@68	1787 @_wraps(prog="bamToBed", implicit="i", other=None, nonbam="ALL", bam="i")
jpayne@68	1788 def bam_to_bed(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	1789 """
jpayne@68	1790 Wraps `bedtools bamtobed`.
jpayne@68	1791 """
jpayne@68	1792
jpayne@68	1793 # Alias for backward compatibility
jpayne@68	1794 bamtobed = bam_to_bed
jpayne@68	1795
jpayne@68	1796 @_wraps(prog="bedToBam", implicit="i", uses_genome=True, force_bam=True)
jpayne@68	1797 def _bed_to_bam(self, args, *kwargs):
jpayne@68	1798 """
jpayne@68	1799 Wraps bedToBam and is called internally for BED/GFF/VCF files by
jpayne@68	1800 self.to_bam (which needs to do something different for SAM files...)
jpayne@68	1801 """
jpayne@68	1802
jpayne@68	1803 @_log_to_history
jpayne@68	1804 def to_bam(self, **kwargs):
jpayne@68	1805 """
jpayne@68	1806 Wraps `bedtools bedtobam`
jpayne@68	1807
jpayne@68	1808 If self.fn is in BED/VCF/GFF format, call BEDTools' bedToBam. If
jpayne@68	1809 self.fn is in SAM format, then create a header out of the genome file
jpayne@68	1810 and then convert using `samtools`.
jpayne@68	1811 """
jpayne@68	1812 if self.file_type == "bam":
jpayne@68	1813 return self
jpayne@68	1814 if self.file_type in ("bed", "gff", "vcf"):
jpayne@68	1815 return self._bed_to_bam(**kwargs)
jpayne@68	1816
jpayne@68	1817 # TODO: to maintain backwards compatibility we go from Interval to
jpayne@68	1818 # AlignedSegment.
jpayne@68	1819 if self.file_type == "sam":
jpayne@68	1820
jpayne@68	1821 # Use pysam, but construct the header out of a provided genome
jpayne@68	1822 # file.
jpayne@68	1823
jpayne@68	1824 # construct a genome out of whatever kwargs were passed in
jpayne@68	1825 kwargs = self.check_genome(**kwargs)
jpayne@68	1826
jpayne@68	1827 # Build a header that we can use for the output BAM file.
jpayne@68	1828 genome = dict(i.split() for i in open(kwargs["g"]))
jpayne@68	1829 SQ = []
jpayne@68	1830 ref_ids = {}
jpayne@68	1831 text_header = ["@HD\tVN:1.0"]
jpayne@68	1832
jpayne@68	1833 for i, (k, v) in enumerate(genome.items()):
jpayne@68	1834 SQ.append(dict(SN=k, LN=int(v)))
jpayne@68	1835 ref_ids[k] = i
jpayne@68	1836 text_header.append("@SQ\tSN:{0}\tLN:{1}".format(k, v))
jpayne@68	1837
jpayne@68	1838 # And the text-format header
jpayne@68	1839 text_header = "\n".join(text_header) + "\n"
jpayne@68	1840
jpayne@68	1841 # The strategy is to write an actual SAM file to disk, along with
jpayne@68	1842 # a header, and then read that back in.
jpayne@68	1843 #
jpayne@68	1844 # Painfully inefficient, but this will change once all py2 tests
jpayne@68	1845 # pass.
jpayne@68	1846 sam_tmp = self._tmp()
jpayne@68	1847 bam_tmp = self._tmp()
jpayne@68	1848 with open(sam_tmp, "w") as fout:
jpayne@68	1849 fout.write(text_header)
jpayne@68	1850 for interval in self:
jpayne@68	1851 fout.write("\t".join(map(str, interval.fields)) + "\n")
jpayne@68	1852
jpayne@68	1853 samfile = pysam.AlignmentFile(sam_tmp, "r")
jpayne@68	1854 bamfile = pysam.AlignmentFile(bam_tmp, "wb", template=samfile)
jpayne@68	1855 for alignment in samfile:
jpayne@68	1856 bamfile.write(alignment)
jpayne@68	1857
jpayne@68	1858 samfile.close()
jpayne@68	1859 bamfile.close()
jpayne@68	1860 new_bedtool = BedTool(bam_tmp)
jpayne@68	1861 new_bedtool._isbam = True
jpayne@68	1862 return new_bedtool
jpayne@68	1863
jpayne@68	1864 # Alias for backward compatibility
jpayne@68	1865 bedtobam = to_bam
jpayne@68	1866
jpayne@68	1867 @_log_to_history
jpayne@68	1868 @_wraps(prog="intersectBed", implicit="a", other="b", bam="abam",
jpayne@68	1869 nonbam="bed", arg_order=["a", "abam"])
jpayne@68	1870 def intersect(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	1871 """
jpayne@68	1872 Wraps `bedtools intersect`.
jpayne@68	1873 """
jpayne@68	1874
jpayne@68	1875 @_log_to_history
jpayne@68	1876 @_wraps(
jpayne@68	1877 prog="fastaFromBed",
jpayne@68	1878 implicit="bed",
jpayne@68	1879 bam=None,
jpayne@68	1880 other="fi",
jpayne@68	1881 make_tempfile_for="fo",
jpayne@68	1882 check_stderr=_check_sequence_stderr,
jpayne@68	1883 add_to_bedtool={"fo": "seqfn"},
jpayne@68	1884 )
jpayne@68	1885 def sequence(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	1886 '''
jpayne@68	1887 Wraps `bedtools getfasta`.
jpayne@68	1888
jpayne@68	1889 fi is passed in by the user; bed is automatically passed in as the
jpayne@68	1890 bedfile of this object; fo by default is a temp file. Use
jpayne@68	1891 save_seqs() to save as a file.
jpayne@68	1892
jpayne@68	1893 The end result is that this BedTool will assign a value to the attribute , self.seqfn,
jpayne@68	1894 that points to the new fasta file.
jpayne@68	1895
jpayne@68	1896 Example usage:
jpayne@68	1897
jpayne@68	1898 >>> a = pybedtools.BedTool("""
jpayne@68	1899 ... chr1 1 10
jpayne@68	1900 ... chr1 50 55""", from_string=True)
jpayne@68	1901 >>> fasta = pybedtools.example_filename('test.fa')
jpayne@68	1902 >>> a = a.sequence(fi=fasta)
jpayne@68	1903 >>> print(open(a.seqfn).read())
jpayne@68	1904 >chr1:1-10
jpayne@68	1905 GATGAGTCT
jpayne@68	1906 >chr1:50-55
jpayne@68	1907 CCATC
jpayne@68	1908 <BLANKLINE>
jpayne@68	1909
jpayne@68	1910 '''
jpayne@68	1911
jpayne@68	1912 # Alias for backwards compatibility
jpayne@68	1913 getfasta = sequence
jpayne@68	1914
jpayne@68	1915 @staticmethod
jpayne@68	1916 def seq(loc, fasta) -> str:
jpayne@68	1917 """
jpayne@68	1918 Return just the sequence from a region string or a single location
jpayne@68	1919 >>> fn = pybedtools.example_filename('test.fa')
jpayne@68	1920 >>> BedTool.seq('chr1:2-10', fn)
jpayne@68	1921 'GATGAGTCT'
jpayne@68	1922 >>> BedTool.seq(('chr1', 1, 10), fn)
jpayne@68	1923 'GATGAGTCT'
jpayne@68	1924 """
jpayne@68	1925 if isinstance(loc, str):
jpayne@68	1926 chrom, start_end = loc.split(":")
jpayne@68	1927 start, end = list(map(int, start_end.split("-")))
jpayne@68	1928 start -= 1
jpayne@68	1929 else:
jpayne@68	1930 chrom, start, end = loc[0], loc[1], loc[2]
jpayne@68	1931
jpayne@68	1932 loc = BedTool("%s\t%i\t%i" % (chrom, start, end), from_string=True)
jpayne@68	1933 lseq = loc.sequence(fi=fasta)
jpayne@68	1934 return "".join([l.rstrip() for l in open(lseq.seqfn, "r") if l[0] != ">"])
jpayne@68	1935
jpayne@68	1936 @_log_to_history
jpayne@68	1937 @_wraps(
jpayne@68	1938 prog="nucBed", implicit="bed", other="fi", check_stderr=_check_sequence_stderr
jpayne@68	1939 )
jpayne@68	1940 def nucleotide_content(self) -> BedTool: # type: ignore
jpayne@68	1941 """
jpayne@68	1942 Wraps `bedtools nuc`.
jpayne@68	1943
jpayne@68	1944 Profiles nucleotide content. The returned BED file contains extra
jpayne@68	1945 information about the nucleotide content
jpayne@68	1946 """
jpayne@68	1947
jpayne@68	1948 # Alias for backwards compatibility
jpayne@68	1949 nuc = nucleotide_content
jpayne@68	1950
jpayne@68	1951 @_log_to_history
jpayne@68	1952 @_wraps(prog="multiBamCov", implicit="bed")
jpayne@68	1953 def multi_bam_coverage(self) -> BedTool: # type: ignore
jpayne@68	1954 """
jpayne@68	1955 Wraps `bedtools multicov`.
jpayne@68	1956
jpayne@68	1957 Pass a list of sorted and indexed BAM files as `bams`
jpayne@68	1958 """
jpayne@68	1959
jpayne@68	1960 # Alias for backwards compatibility
jpayne@68	1961 multicov = multi_bam_coverage
jpayne@68	1962
jpayne@68	1963 @_log_to_history
jpayne@68	1964 @_wraps(prog="subtractBed", implicit="a", other="b", bam=None)
jpayne@68	1965 def subtract(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	1966 """
jpayne@68	1967 Wraps `bedtools subtract`.
jpayne@68	1968
jpayne@68	1969 Subtracts from another BED file and returns a new BedTool object.
jpayne@68	1970
jpayne@68	1971 Example usage:
jpayne@68	1972
jpayne@68	1973 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	1974 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68	1975
jpayne@68	1976 Do a "stranded" subtraction:
jpayne@68	1977
jpayne@68	1978 >>> c = a.subtract(b, s=True)
jpayne@68	1979
jpayne@68	1980 Require 50% of features in `a` to overlap:
jpayne@68	1981
jpayne@68	1982 >>> c = a.subtract(b, f=0.5)
jpayne@68	1983 """
jpayne@68	1984 if "a" not in kwargs:
jpayne@68	1985 kwargs["a"] = self.fn
jpayne@68	1986
jpayne@68	1987 if "b" not in kwargs:
jpayne@68	1988 if len(args) > 0:
jpayne@68	1989 kwargs["b"] = args[0]
jpayne@68	1990 else:
jpayne@68	1991 raise ValueError("Must specify a BED file to subtract, either as a positional argument or as the 'b' keyword argument.")
jpayne@68	1992
jpayne@68	1993 cmds, tmp, stdin = self.handle_kwargs(prog="subtractBed", **kwargs)
jpayne@68	1994 stream = call_bedtools(cmds, tmp, stdin=stdin)
jpayne@68	1995 return BedTool(stream)
jpayne@68	1996
jpayne@68	1997 @_log_to_history
jpayne@68	1998 @_wraps(prog="slopBed", implicit="i", other=None, bam=None, uses_genome=True)
jpayne@68	1999 def slop(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2000 """
jpayne@68	2001 Wraps `bedtools slop`.
jpayne@68	2002 """
jpayne@68	2003
jpayne@68	2004 @_log_to_history
jpayne@68	2005 @_wraps(prog="shiftBed", implicit="i", other=None, bam=None, uses_genome=True)
jpayne@68	2006 def shift(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2007 """
jpayne@68	2008 Wraps `bedtools shift`.
jpayne@68	2009
jpayne@68	2010 Shift each feature by user-defined number of bases. Returns a new BedTool object.
jpayne@68	2011
jpayne@68	2012 Example usage:
jpayne@68	2013
jpayne@68	2014 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2015
jpayne@68	2016 Shift every feature by 5bp:
jpayne@68	2017
jpayne@68	2018 >>> b = a.shift(genome='hg19', s=5)
jpayne@68	2019 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2020 chr1 6 105 feature1 0 +
jpayne@68	2021 chr1 105 205 feature2 0 +
jpayne@68	2022 chr1 155 505 feature3 0 -
jpayne@68	2023 chr1 905 955 feature4 0 +
jpayne@68	2024 <BLANKLINE>
jpayne@68	2025
jpayne@68	2026 Shift features on the '+' strand by -1bp and on '-' strand by +3bp:
jpayne@68	2027
jpayne@68	2028 >>> b = a.shift(genome='hg19', p=-1, m=3)
jpayne@68	2029 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2030 chr1 0 99 feature1 0 +
jpayne@68	2031 chr1 99 199 feature2 0 +
jpayne@68	2032 chr1 153 503 feature3 0 -
jpayne@68	2033 chr1 899 949 feature4 0 +
jpayne@68	2034 <BLANKLINE>
jpayne@68	2035
jpayne@68	2036 # Disabling, see https://github.com/arq5x/bedtools2/issues/807
jpayne@68	2037 Shift features by a fraction of their length (0.50):
jpayne@68	2038
jpayne@68	2039 #>>> b = a.shift(genome='hg19', pct=True, s=0.50)
jpayne@68	2040 #>>> print(b) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2041 #chr1 50 149 feature1 0 +
jpayne@68	2042 #chr1 150 250 feature2 0 +
jpayne@68	2043 #chr1 325 675 feature3 0 -
jpayne@68	2044 #chr1 925 975 feature4 0 +
jpayne@68	2045 #<BLANKLINE>
jpayne@68	2046
jpayne@68	2047 """
jpayne@68	2048
jpayne@68	2049 @_log_to_history
jpayne@68	2050 @_wraps(prog="mergeBed", implicit="i", other=None, bam=None)
jpayne@68	2051 def merge(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2052 """
jpayne@68	2053 Wraps `bedtools merge`.
jpayne@68	2054
jpayne@68	2055 Merge overlapping features together. Returns a new BedTool object.
jpayne@68	2056
jpayne@68	2057 Example usage:
jpayne@68	2058
jpayne@68	2059 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2060
jpayne@68	2061 Merge:
jpayne@68	2062
jpayne@68	2063 >>> c = a.merge()
jpayne@68	2064
jpayne@68	2065 Allow merging of features 500 bp apart:
jpayne@68	2066
jpayne@68	2067 >>> c = a.merge(d=500)
jpayne@68	2068
jpayne@68	2069 """
jpayne@68	2070
jpayne@68	2071 @_log_to_history
jpayne@68	2072 @_wraps(prog="closestBed", implicit="a", other="b", bam=None)
jpayne@68	2073 def closest(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2074 """
jpayne@68	2075 Wraps `bedtools closest`.
jpayne@68	2076
jpayne@68	2077 Return a new BedTool object containing closest features in b. Note
jpayne@68	2078 that the resulting file is no longer a valid BED format; use the
jpayne@68	2079 special "_closest" methods to work with the resulting file.
jpayne@68	2080
jpayne@68	2081 Example usage::
jpayne@68	2082
jpayne@68	2083 a = BedTool('in.bed')
jpayne@68	2084
jpayne@68	2085 # get the closest feature in 'other.bed' on the same strand
jpayne@68	2086 b = a.closest('other.bed', s=True)
jpayne@68	2087
jpayne@68	2088 """
jpayne@68	2089
jpayne@68	2090 @_log_to_history
jpayne@68	2091 @_wraps(prog="windowBed", implicit="a", other="b", bam="abam", nonbam="bed")
jpayne@68	2092 def window(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2093 """
jpayne@68	2094 Wraps `bedtools window`.
jpayne@68	2095
jpayne@68	2096 Example usage::
jpayne@68	2097
jpayne@68	2098 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2099 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68	2100 >>> print(a.window(b, w=1000)) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2101 chr1 1 100 feature1 0 + chr1 155 200 feature5 0 -
jpayne@68	2102 chr1 1 100 feature1 0 + chr1 800 901 feature6 0 +
jpayne@68	2103 chr1 100 200 feature2 0 + chr1 155 200 feature5 0 -
jpayne@68	2104 chr1 100 200 feature2 0 + chr1 800 901 feature6 0 +
jpayne@68	2105 chr1 150 500 feature3 0 - chr1 155 200 feature5 0 -
jpayne@68	2106 chr1 150 500 feature3 0 - chr1 800 901 feature6 0 +
jpayne@68	2107 chr1 900 950 feature4 0 + chr1 155 200 feature5 0 -
jpayne@68	2108 chr1 900 950 feature4 0 + chr1 800 901 feature6 0 +
jpayne@68	2109 <BLANKLINE>
jpayne@68	2110 """
jpayne@68	2111
jpayne@68	2112 @_log_to_history
jpayne@68	2113 @_wraps(prog="shuffleBed", implicit="i", other=None, bam=None, uses_genome=True)
jpayne@68	2114 def shuffle(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2115 """
jpayne@68	2116 Wraps `bedtools shuffle`.
jpayne@68	2117
jpayne@68	2118 Example usage:
jpayne@68	2119
jpayne@68	2120 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2121 >>> seed = 1 # so this test always returns the same results
jpayne@68	2122 >>> b = a.shuffle(genome='hg19', chrom=True, seed=seed)
jpayne@68	2123 >>> print(b) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2124 chr1 123081365 123081464 feature1 0 +
jpayne@68	2125 chr1 243444570 243444670 feature2 0 +
jpayne@68	2126 chr1 194620241 194620591 feature3 0 -
jpayne@68	2127 chr1 172792873 172792923 feature4 0 +
jpayne@68	2128 <BLANKLINE>
jpayne@68	2129 """
jpayne@68	2130
jpayne@68	2131 @_log_to_history
jpayne@68	2132 @_wraps(prog="sortBed", implicit="i", uses_genome=True, genome_if=["g", "genome"])
jpayne@68	2133 def sort(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2134 """
jpayne@68	2135 Wraps `bedtools sort`.
jpayne@68	2136
jpayne@68	2137 Note that chromosomes are sorted lexicographically, so chr12 will come
jpayne@68	2138 before chr9.
jpayne@68	2139
jpayne@68	2140 Example usage:
jpayne@68	2141
jpayne@68	2142 >>> a = pybedtools.BedTool('''
jpayne@68	2143 ... chr9 300 400
jpayne@68	2144 ... chr1 100 200
jpayne@68	2145 ... chr1 1 50
jpayne@68	2146 ... chr12 1 100
jpayne@68	2147 ... chr9 500 600
jpayne@68	2148 ... ''', from_string=True)
jpayne@68	2149 >>> print(a.sort()) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2150 chr1 1 50
jpayne@68	2151 chr1 100 200
jpayne@68	2152 chr12 1 100
jpayne@68	2153 chr9 300 400
jpayne@68	2154 chr9 500 600
jpayne@68	2155 <BLANKLINE>
jpayne@68	2156 """
jpayne@68	2157
jpayne@68	2158 @_log_to_history
jpayne@68	2159 @_wraps(prog="annotateBed", implicit="i")
jpayne@68	2160 def annotate(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2161 """
jpayne@68	2162 Wraps `bedtools annotate`.
jpayne@68	2163
jpayne@68	2164 Annotate this BedTool with a list of other files.
jpayne@68	2165 Example usage:
jpayne@68	2166
jpayne@68	2167 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2168 >>> b_fn = pybedtools.example_filename('b.bed')
jpayne@68	2169 >>> print(a.annotate(files=b_fn)) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2170 chr1 1 100 feature1 0 + 0.000000
jpayne@68	2171 chr1 100 200 feature2 0 + 0.450000
jpayne@68	2172 chr1 150 500 feature3 0 - 0.128571
jpayne@68	2173 chr1 900 950 feature4 0 + 0.020000
jpayne@68	2174 <BLANKLINE>
jpayne@68	2175 """
jpayne@68	2176
jpayne@68	2177 @_log_to_history
jpayne@68	2178 @_wraps(prog="flankBed", implicit="i", uses_genome=True)
jpayne@68	2179 def flank(self, args, *kwargs) -> BedTool:
jpayne@68	2180 """
jpayne@68	2181 Wraps `bedtools flank`.
jpayne@68	2182
jpayne@68	2183 Example usage:
jpayne@68	2184
jpayne@68	2185 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2186 >>> print(a.flank(genome='hg19', b=100)) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2187 chr1 0 1 feature1 0 +
jpayne@68	2188 chr1 100 200 feature1 0 +
jpayne@68	2189 chr1 0 100 feature2 0 +
jpayne@68	2190 chr1 200 300 feature2 0 +
jpayne@68	2191 chr1 50 150 feature3 0 -
jpayne@68	2192 chr1 500 600 feature3 0 -
jpayne@68	2193 chr1 800 900 feature4 0 +
jpayne@68	2194 chr1 950 1050 feature4 0 +
jpayne@68	2195 <BLANKLINE>
jpayne@68	2196
jpayne@68	2197 """
jpayne@68	2198 kwargs = self.check_genome(**kwargs)
jpayne@68	2199
jpayne@68	2200 if "i" not in kwargs:
jpayne@68	2201 kwargs["i"] = self.fn
jpayne@68	2202
jpayne@68	2203 cmds, tmp, stdin = self.handle_kwargs(prog="flankBed", **kwargs)
jpayne@68	2204 stream = call_bedtools(cmds, tmp, stdin=stdin)
jpayne@68	2205 return BedTool(stream)
jpayne@68	2206
jpayne@68	2207 @_log_to_history
jpayne@68	2208 @_wraps(
jpayne@68	2209 prog="genomeCoverageBed",
jpayne@68	2210 implicit="i",
jpayne@68	2211 bam="ibam",
jpayne@68	2212 genome_none_if=["ibam"],
jpayne@68	2213 genome_ok_if=["ibam"],
jpayne@68	2214 uses_genome=True,
jpayne@68	2215 nonbam="ALL",
jpayne@68	2216 )
jpayne@68	2217 def genome_coverage(self, args, *kwargs):
jpayne@68	2218 """
jpayne@68	2219 Wraps `bedtools genomecov`.
jpayne@68	2220
jpayne@68	2221 Note that some invocations of `bedtools genomecov` do not result in
jpayne@68	2222 a properly-formatted BED file. For example, the default behavior is to
jpayne@68	2223 report a histogram of coverage. Iterating over the resulting,
jpayne@68	2224 non-BED-format file will raise exceptions in pybedtools' parser.
jpayne@68	2225
jpayne@68	2226 Consider using the `BedTool.to_dataframe` method to convert these
jpayne@68	2227 non-BED files into a pandas DataFrame for further use.
jpayne@68	2228
jpayne@68	2229 Example usage:
jpayne@68	2230
jpayne@68	2231 BAM file input does not require a genome:
jpayne@68	2232
jpayne@68	2233 >>> a = pybedtools.example_bedtool('x.bam')
jpayne@68	2234 >>> b = a.genome_coverage(bg=True)
jpayne@68	2235 >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2236 chr2L 9329 9365 1
jpayne@68	2237 chr2L 10212 10248 1
jpayne@68	2238 chr2L 10255 10291 1
jpayne@68	2239
jpayne@68	2240 Other input does require a genome:
jpayne@68	2241
jpayne@68	2242 >>> a = pybedtools.example_bedtool('x.bed')
jpayne@68	2243 >>> b = a.genome_coverage(bg=True, genome='dm3')
jpayne@68	2244 >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2245 chr2L 9329 9365 1
jpayne@68	2246 chr2L 10212 10248 1
jpayne@68	2247 chr2L 10255 10291 1
jpayne@68	2248
jpayne@68	2249 Non-BED format results:
jpayne@68	2250 >>> a = pybedtools.example_bedtool('x.bed')
jpayne@68	2251 >>> b = a.genome_coverage(genome='dm3')
jpayne@68	2252 >>> df = b.to_dataframe(names=['chrom', 'depth', 'n', 'chromsize', 'fraction'])
jpayne@68	2253 """
jpayne@68	2254
jpayne@68	2255 # Alias for backwards compatibility
jpayne@68	2256 genomecov = genome_coverage
jpayne@68	2257
jpayne@68	2258 @_log_to_history
jpayne@68	2259 @_wraps(prog="coverageBed", implicit="a", other="b", bam="abam", nonbam="ALL")
jpayne@68	2260 def coverage(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2261 """
jpayne@68	2262 Wraps `bedtools coverage`.
jpayne@68	2263
jpayne@68	2264 Note that starting in version 2.24.0, BEDTools swapped the semantics of
jpayne@68	2265 the "a" and "b" files.
jpayne@68	2266
jpayne@68	2267 Example usage:
jpayne@68	2268
jpayne@68	2269 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2270 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68	2271 >>> c = b.coverage(a)
jpayne@68	2272 >>> c.head(3) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2273 chr1 155 200 feature5 0 - 2 45 45 1.0000000
jpayne@68	2274 chr1 800 901 feature6 0 + 1 1 101 0.0099010
jpayne@68	2275 """
jpayne@68	2276
jpayne@68	2277 @_log_to_history
jpayne@68	2278 @_wraps(
jpayne@68	2279 prog="maskFastaFromBed",
jpayne@68	2280 implicit="bed",
jpayne@68	2281 other="fi",
jpayne@68	2282 make_tempfile_for="fo",
jpayne@68	2283 add_to_bedtool={"fo": "seqfn"},
jpayne@68	2284 check_stderr=_check_sequence_stderr,
jpayne@68	2285 )
jpayne@68	2286 def mask_fasta(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2287 """
jpayne@68	2288 Wraps `bedtools maskfasta`.
jpayne@68	2289
jpayne@68	2290 Masks a fasta file at the positions in a BED file and saves result as
jpayne@68	2291 'out' and stores the filename in seqfn.
jpayne@68	2292
jpayne@68	2293 >>> a = pybedtools.BedTool('chr1 100 110', from_string=True)
jpayne@68	2294 >>> fasta_fn = pybedtools.example_filename('test.fa')
jpayne@68	2295 >>> a = a.mask_fasta(fi=fasta_fn, fo='masked.fa.example')
jpayne@68	2296 >>> b = a.slop(b=2, genome='hg19')
jpayne@68	2297 >>> b = b.sequence(fi=a.seqfn)
jpayne@68	2298 >>> print(open(b.seqfn).read())
jpayne@68	2299 >chr1:98-112
jpayne@68	2300 TTNNNNNNNNNNAT
jpayne@68	2301 <BLANKLINE>
jpayne@68	2302 >>> os.unlink('masked.fa.example')
jpayne@68	2303 >>> if os.path.exists('masked.fa.example.fai'):
jpayne@68	2304 ... os.unlink('masked.fa.example.fai')
jpayne@68	2305 """
jpayne@68	2306
jpayne@68	2307 # Alias for backwards compatibility
jpayne@68	2308 maskfasta = mask_fasta
jpayne@68	2309
jpayne@68	2310 @_log_to_history
jpayne@68	2311 @_wraps(prog="complementBed", implicit="i", uses_genome=True)
jpayne@68	2312 def complement(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2313 """
jpayne@68	2314 Wraps `bedtools complement`.
jpayne@68	2315 Example usage:
jpayne@68	2316
jpayne@68	2317 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2318 >>> a.complement(genome='hg19').head(5) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2319 chr1 0 1
jpayne@68	2320 chr1 500 900
jpayne@68	2321 chr1 950 249250621
jpayne@68	2322 chr2 0 243199373
jpayne@68	2323 chr3 0 198022430
jpayne@68	2324 """
jpayne@68	2325
jpayne@68	2326 @_log_to_history
jpayne@68	2327 @_wraps(prog="getOverlap", implicit="i")
jpayne@68	2328 def overlap(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2329 """
jpayne@68	2330 Wraps `bedtools overlap`.
jpayne@68	2331
jpayne@68	2332 Example usage:
jpayne@68	2333
jpayne@68	2334 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2335 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68	2336 >>> c = a.window(b, w=10).overlap(cols=[2,3,8,9])
jpayne@68	2337 >>> print(c) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2338 chr1 100 200 feature2 0 + chr1 155 200 feature5 0 - 45
jpayne@68	2339 chr1 150 500 feature3 0 - chr1 155 200 feature5 0 - 45
jpayne@68	2340 chr1 900 950 feature4 0 + chr1 800 901 feature6 0 + 1
jpayne@68	2341 <BLANKLINE>
jpayne@68	2342 """
jpayne@68	2343
jpayne@68	2344 # TODO: needs test files and doctests written
jpayne@68	2345 @_log_to_history
jpayne@68	2346 @_wraps(prog="pairToBed", implicit="a", other="b", bam="abam", nonbam="bedpe")
jpayne@68	2347 def pair_to_bed(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2348 """
jpayne@68	2349 Wraps `bedtools pairtobed`.
jpayne@68	2350 """
jpayne@68	2351
jpayne@68	2352 # Alias for backwards compatibility
jpayne@68	2353 pairtobed = pair_to_bed
jpayne@68	2354
jpayne@68	2355 @_log_to_history
jpayne@68	2356 @_wraps(prog="pairToPair", implicit="a", other="b")
jpayne@68	2357 def pair_to_pair(self) -> BedTool: # type: ignore
jpayne@68	2358 """
jpayne@68	2359 Wraps `bedtools pairtopair`.
jpayne@68	2360 """
jpayne@68	2361
jpayne@68	2362 # Alias for backwards compatibility
jpayne@68	2363 pairtopair = pair_to_pair
jpayne@68	2364
jpayne@68	2365 @_log_to_history
jpayne@68	2366 @_wraps(prog="groupBy", implicit="i")
jpayne@68	2367 def groupby(self, args, *kwargs) -> BedTool:
jpayne@68	2368 """
jpayne@68	2369 Wraps `bedtools groupby`.
jpayne@68	2370
jpayne@68	2371 Example usage:
jpayne@68	2372
jpayne@68	2373 >>> a = pybedtools.example_bedtool('gdc.gff')
jpayne@68	2374 >>> b = pybedtools.example_bedtool('gdc.bed')
jpayne@68	2375 >>> c = a.intersect(b, c=True)
jpayne@68	2376 >>> d = c.groupby(g=[1, 4, 5], c=10, o=['sum'])
jpayne@68	2377 >>> print(d) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2378 chr2L 41 70 0
jpayne@68	2379 chr2L 71 130 2
jpayne@68	2380 chr2L 131 170 4
jpayne@68	2381 chr2L 171 200 0
jpayne@68	2382 chr2L 201 220 1
jpayne@68	2383 chr2L 41 130 2
jpayne@68	2384 chr2L 171 220 1
jpayne@68	2385 chr2L 41 220 7
jpayne@68	2386 chr2L 161 230 6
jpayne@68	2387 chr2L 41 220 7
jpayne@68	2388 <BLANKLINE>
jpayne@68	2389
jpayne@68	2390 """
jpayne@68	2391
jpayne@68	2392 @_log_to_history
jpayne@68	2393 @_wraps(prog="tagBam", implicit="i", bam="i")
jpayne@68	2394 def tag_bam(self, args, *kwargs) -> pysam.AlignmentFile: # type: ignore
jpayne@68	2395 """
jpayne@68	2396 Wraps `bedtools tag`.
jpayne@68	2397
jpayne@68	2398 `files` and `labels` should lists of equal length.
jpayne@68	2399
jpayne@68	2400 """
jpayne@68	2401 if "i" in kwargs:
jpayne@68	2402 if "labels" in kwargs:
jpayne@68	2403 if len(kwargs["i"]) != len(kwargs["labels"]):
jpayne@68	2404 raise ValueError("files and labels must be lists of equal length")
jpayne@68	2405
jpayne@68	2406
jpayne@68	2407 # Alias for backwards compatibility
jpayne@68	2408 tag = tag_bam
jpayne@68	2409
jpayne@68	2410 @_log_to_history
jpayne@68	2411 @_wraps(prog="mapBed", implicit="a", other="b")
jpayne@68	2412 def map(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2413 """
jpayne@68	2414 Wraps `bedtools map`; See also :meth:`BedTool.each`.
jpayne@68	2415 """
jpayne@68	2416
jpayne@68	2417 @_log_to_history
jpayne@68	2418 @_wraps(prog="multiIntersectBed", uses_genome=True, genome_if=["empty"])
jpayne@68	2419 def multi_intersect(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2420 """
jpayne@68	2421 Wraps `bedtools multiintersect`.
jpayne@68	2422
jpayne@68	2423 Provide a list of filenames as the "i" argument. e.g. if you already
jpayne@68	2424 have BedTool objects then use their `.fn` attribute, like this::
jpayne@68	2425
jpayne@68	2426 >>> x = pybedtools.BedTool()
jpayne@68	2427 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2428 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68	2429 >>> result = x.multi_intersect(i=[a.fn, b.fn])
jpayne@68	2430 >>> print(result) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	2431 chr1 1 155 1 1 1 0
jpayne@68	2432 chr1 155 200 2 1,2 1 1
jpayne@68	2433 chr1 200 500 1 1 1 0
jpayne@68	2434 chr1 800 900 1 2 0 1
jpayne@68	2435 chr1 900 901 2 1,2 1 1
jpayne@68	2436 chr1 901 950 1 1 1 0
jpayne@68	2437 <BLANKLINE>
jpayne@68	2438
jpayne@68	2439 """
jpayne@68	2440
jpayne@68	2441 # Alias for backwards compatibility
jpayne@68	2442 multiinter = multi_intersect
jpayne@68	2443
jpayne@68	2444 @_log_to_history
jpayne@68	2445 @_wraps(prog="randomBed", uses_genome=True)
jpayne@68	2446 def random(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2447 """
jpayne@68	2448 Wraps `bedtools random`.
jpayne@68	2449
jpayne@68	2450 Since this method does not operate on an existing file, create
jpayne@68	2451 a BedTool with no arguments and then call this method, e.g.,
jpayne@68	2452
jpayne@68	2453 >>> x = BedTool()
jpayne@68	2454 >>> y = x.random(l=100, n=10, genome='hg19')
jpayne@68	2455 """
jpayne@68	2456
jpayne@68	2457 @_log_to_history
jpayne@68	2458 @_wraps("bedpeToBam", implicit="i", uses_genome=True, force_bam=True)
jpayne@68	2459 def bedpe_to_bam(self, args, *kwargs) -> pysam.AlignmentFile: # type: ignore
jpayne@68	2460 """
jpayne@68	2461 Wraps `bedtools bedpetobam`.
jpayne@68	2462 """
jpayne@68	2463
jpayne@68	2464 # Alias for backwards compatibility
jpayne@68	2465 bedpetobam = bedpe_to_bam
jpayne@68	2466
jpayne@68	2467 @_log_to_history
jpayne@68	2468 @_wraps(prog="clusterBed", implicit="i")
jpayne@68	2469 def cluster(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2470 """
jpayne@68	2471 Wraps `bedtools cluster`.
jpayne@68	2472 """
jpayne@68	2473
jpayne@68	2474 @_log_to_history
jpayne@68	2475 @_wraps(prog="unionBedGraphs")
jpayne@68	2476 def union_bedgraphs(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2477 """
jpayne@68	2478 Wraps `bedtools unionbedg`.
jpayne@68	2479
jpayne@68	2480 Warning: using the `header=True` kwarg will result in a file that is
jpayne@68	2481 not in true BED format, which may break downstream analysis.
jpayne@68	2482 """
jpayne@68	2483 if "header" in kwargs:
jpayne@68	2484 if kwargs["header"] is True:
jpayne@68	2485 warn("Using header=True with unionbedg will result in a file that is not in true BED format, which may break downstream analysis.")
jpayne@68	2486
jpayne@68	2487
jpayne@68	2488 # Alias for backwards compatibility
jpayne@68	2489 unionbedg = union_bedgraphs
jpayne@68	2490
jpayne@68	2491 @_log_to_history
jpayne@68	2492 @_wraps(prog="windowMaker", uses_genome=True, genome_none_if=["b"], other="b", arg_order=["w"])
jpayne@68	2493 def window_maker(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2494 """
jpayne@68	2495 Wraps `bedtools makewindows`.
jpayne@68	2496 """
jpayne@68	2497
jpayne@68	2498 # Alias for backwards compatibility
jpayne@68	2499 makewindows = window_maker
jpayne@68	2500
jpayne@68	2501 @_log_to_history
jpayne@68	2502 @_wraps(prog="expandCols", implicit="i")
jpayne@68	2503 def expand(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2504 """
jpayne@68	2505 Wraps `bedtools expand`
jpayne@68	2506 """
jpayne@68	2507
jpayne@68	2508 @_log_to_history
jpayne@68	2509 @_wraps(prog="linksBed", implicit="i", add_to_bedtool={"stdout": "links_html"})
jpayne@68	2510 def links(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2511 """
jpayne@68	2512 Wraps `linksBed`.
jpayne@68	2513
jpayne@68	2514 The resulting BedTool will assign a value to the attribute `links_html`. This
jpayne@68	2515 attribute is a temp filename containing the HTML links.
jpayne@68	2516 """
jpayne@68	2517
jpayne@68	2518 @_log_to_history
jpayne@68	2519 @_wraps(prog="bedToIgv", implicit="i", add_to_bedtool={"stdout": "igv_script"})
jpayne@68	2520 def igv(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2521 """
jpayne@68	2522 Wraps `bedtools igv`.
jpayne@68	2523
jpayne@68	2524 The resulting BedTool will assign a value to the attribute `igv_script`. This
jpayne@68	2525 attribute is a temp filename containing the IGV script.
jpayne@68	2526 """
jpayne@68	2527
jpayne@68	2528 @_log_to_history
jpayne@68	2529 @_wraps(
jpayne@68	2530 prog="bamToFastq",
jpayne@68	2531 implicit="i",
jpayne@68	2532 bam="i",
jpayne@68	2533 make_tempfile_for="fq",
jpayne@68	2534 add_to_bedtool={"fq": "fastq"},
jpayne@68	2535 )
jpayne@68	2536 def bam_to_fastq(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2537 """
jpayne@68	2538 Wraps `bedtools bamtofastq`.
jpayne@68	2539
jpayne@68	2540 The `fq` argument is required.
jpayne@68	2541
jpayne@68	2542 The resulting BedTool will assign a value to the attribute `fastq`.
jpayne@68	2543 """
jpayne@68	2544
jpayne@68	2545 # Alias for backwards compatibility
jpayne@68	2546 bamtofastq = bam_to_fastq
jpayne@68	2547
jpayne@68	2548 @_wraps(
jpayne@68	2549 prog="jaccard",
jpayne@68	2550 implicit="a",
jpayne@68	2551 other="b",
jpayne@68	2552 does_not_return_bedtool=_jaccard_output_to_dict,
jpayne@68	2553 )
jpayne@68	2554 def jaccard(self, args, *kwargs) -> dict[str, Any]: # type: ignore
jpayne@68	2555 """
jpayne@68	2556 Returns a dictionary with keys (intersection, union, jaccard).
jpayne@68	2557 """
jpayne@68	2558
jpayne@68	2559 @_wraps(
jpayne@68	2560 prog="reldist",
jpayne@68	2561 implicit="a",
jpayne@68	2562 other="b",
jpayne@68	2563 does_not_return_bedtool=_reldist_output_handler,
jpayne@68	2564 )
jpayne@68	2565 def reldist(self, args, *kwargs) -> BedTool \| dict[str, Any]: # type: ignore
jpayne@68	2566 """
jpayne@68	2567 If detail=False, then return a dictionary with keys (reldist, count,
jpayne@68	2568 total, fraction), which is the summary of the bedtools reldist.
jpayne@68	2569
jpayne@68	2570 Otherwise return a BedTool, with the relative distance for each
jpayne@68	2571 interval in A in the last column.
jpayne@68	2572 """
jpayne@68	2573 if "detail" in kwargs:
jpayne@68	2574 if kwargs["detail"] is False:
jpayne@68	2575 warn("Using detail=False with reldist will return a dictionary with keys (reldist, count, total, fraction), which is the summary of the bedtools reldist."
jpayne@68	2576 "Not a BedTool object.")
jpayne@68	2577
jpayne@68	2578
jpayne@68	2579 @_wraps(prog="sample", implicit="i", bam="i")
jpayne@68	2580 def sample(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2581 """
jpayne@68	2582 Wraps 'sample'.
jpayne@68	2583 """
jpayne@68	2584
jpayne@68	2585 @_wraps(
jpayne@68	2586 prog="fisher",
jpayne@68	2587 implicit="a",
jpayne@68	2588 other="b",
jpayne@68	2589 uses_genome=True,
jpayne@68	2590 does_not_return_bedtool=FisherOutput,
jpayne@68	2591 )
jpayne@68	2592 def fisher(self, args, *kwargs) -> FisherOutput: # type: ignore
jpayne@68	2593 """
jpayne@68	2594 Wraps 'fisher'. Returns an object representing the output.
jpayne@68	2595
jpayne@68	2596 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2597 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68	2598 >>> f = a.fisher(b, genome='hg19')
jpayne@68	2599 >>> print(f) # doctest: +NORMALIZE_WHITESPACE
jpayne@68	2600 # Number of query intervals: 4
jpayne@68	2601 # Number of db intervals: 2
jpayne@68	2602 # Number of overlaps: 3
jpayne@68	2603 # Number of possible intervals (estimated): 13958448
jpayne@68	2604 # phyper(3 - 1, 4, 13958448 - 4, 2, lower.tail=F)
jpayne@68	2605 # Contingency Table Of Counts
jpayne@68	2606 #_________________________________________
jpayne@68	2607 # \| in -b \| not in -b \|
jpayne@68	2608 # in -a \| 3 \| 1 \|
jpayne@68	2609 # not in -a \| 0 \| 13958444 \|
jpayne@68	2610 #_________________________________________
jpayne@68	2611 # p-values for fisher's exact test
jpayne@68	2612 left right two-tail ratio
jpayne@68	2613 1 8.8247e-21 8.8247e-21 inf
jpayne@68	2614 <BLANKLINE>
jpayne@68	2615
jpayne@68	2616
jpayne@68	2617 >>> f.table['not in -a']['in -b']
jpayne@68	2618 0
jpayne@68	2619
jpayne@68	2620 >>> f.table['not in -a']['not in -b']
jpayne@68	2621 13958444
jpayne@68	2622
jpayne@68	2623 >>> f.table['in -a']['in -b']
jpayne@68	2624 3
jpayne@68	2625
jpayne@68	2626 >>> f.table['in -a']['not in -b']
jpayne@68	2627 1
jpayne@68	2628
jpayne@68	2629 >>> f.two_tail
jpayne@68	2630 8.8247e-21
jpayne@68	2631 """
jpayne@68	2632
jpayne@68	2633 @_wraps(prog="split", implicit="i", does_not_return_bedtool=SplitOutput)
jpayne@68	2634 def splitbed(self, args, *kwargs) -> SplitOutput: # type: ignore
jpayne@68	2635 """
jpayne@68	2636 Wraps 'bedtools split'.
jpayne@68	2637
jpayne@68	2638 BedTool objects have long had a `split` method which splits intervals
jpayne@68	2639 according to a custom function. Now that BEDTools has a `split` tool,
jpayne@68	2640 the method name conflicts. To maintain backwards compatibility, the
jpayne@68	2641 method wrapping the BEDTools command is called `splitbed`.
jpayne@68	2642
jpayne@68	2643 Since this tool does not return a single BED file, the method parses
jpayne@68	2644 the output and returns a SplitOutput object, which includes an
jpayne@68	2645 attribute, `bedtools`, that is a list of BedTool objects created from
jpayne@68	2646 the split files.
jpayne@68	2647
jpayne@68	2648 To keep the working directory clean, you may want to consider using
jpayne@68	2649 `prefix=BedTool._tmp()` to get a temp file that will be deleted when
jpayne@68	2650 Python exits cleanly.
jpayne@68	2651
jpayne@68	2652 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2653 >>> s = a.splitbed(n=2, p="split")
jpayne@68	2654 >>> assert len(a) == 4, len(a)
jpayne@68	2655 >>> assert len(s.bedtools) == 2
jpayne@68	2656 >>> print(s.bedtools[0]) # doctest: +NORMALIZE_WHITESPACE
jpayne@68	2657 chr1 150 500 feature3 0 -
jpayne@68	2658 <BLANKLINE>
jpayne@68	2659 >>> print(s.bedtools[1]) # doctest: +NORMALIZE_WHITESPACE
jpayne@68	2660 chr1 100 200 feature2 0 +
jpayne@68	2661 chr1 1 100 feature1 0 +
jpayne@68	2662 chr1 900 950 feature4 0 +
jpayne@68	2663 <BLANKLINE>
jpayne@68	2664 """
jpayne@68	2665
jpayne@68	2666 @_wraps(prog="spacing", implicit="i")
jpayne@68	2667 def spacing(self, args, *kwargs) -> BedTool: # type: ignore
jpayne@68	2668 """
jpayne@68	2669 Wraps `bedtools spacing`
jpayne@68	2670
jpayne@68	2671 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2672 >>> print(a.spacing()) # doctest: +NORMALIZE_WHITESPACE
jpayne@68	2673 chr1 1 100 feature1 0 + .
jpayne@68	2674 chr1 100 200 feature2 0 + 0
jpayne@68	2675 chr1 150 500 feature3 0 - -1
jpayne@68	2676 chr1 900 950 feature4 0 + 400
jpayne@68	2677 """
jpayne@68	2678
jpayne@68	2679 def count(self) -> int:
jpayne@68	2680 """
jpayne@68	2681 Count the number features in this BedTool.
jpayne@68	2682
jpayne@68	2683 Number of features in BED file. Does the same thing as len(self), which
jpayne@68	2684 actually just calls this method.
jpayne@68	2685
jpayne@68	2686 Only counts the actual features. Ignores any track lines, browser
jpayne@68	2687 lines, lines starting with a "#", or blank lines.
jpayne@68	2688
jpayne@68	2689 Example usage:
jpayne@68	2690
jpayne@68	2691 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	2692 >>> a.count()
jpayne@68	2693 4
jpayne@68	2694 """
jpayne@68	2695 if hasattr(self, "next") or hasattr(self, "__next__"):
jpayne@68	2696 return sum(1 for _ in self)
jpayne@68	2697 return sum(1 for _ in iter(self))
jpayne@68	2698
jpayne@68	2699 def print_sequence(self) -> str:
jpayne@68	2700 """
jpayne@68	2701 Print the sequence that was retrieved by BedTool.sequence.
jpayne@68	2702 See usage example in :meth:`BedTool.sequence`.
jpayne@68	2703
jpayne@68	2704 Returns:
jpayne@68	2705 str: The sequence as a string.
jpayne@68	2706
jpayne@68	2707 Raises:
jpayne@68	2708 ValueError: If the sequence has not been generated using .sequence(fasta).
jpayne@68	2709 """
jpayne@68	2710 if self.seqfn is None:
jpayne@68	2711 raise ValueError("Use .sequence(fasta) to get the sequence first")
jpayne@68	2712 f = open(self.seqfn)
jpayne@68	2713 s = f.read()
jpayne@68	2714 f.close()
jpayne@68	2715 return s
jpayne@68	2716
jpayne@68	2717 def save_seqs(self, fn:str) -> BedTool:
jpayne@68	2718 """
jpayne@68	2719 Save sequences, after calling BedTool.sequence.
jpayne@68	2720
jpayne@68	2721 In order to use this function, you need to have called
jpayne@68	2722 the :meth:`BedTool.sequence()` method.
jpayne@68	2723
jpayne@68	2724 A new BedTool object is returned which references the newly saved file.
jpayne@68	2725
jpayne@68	2726 Example usage:
jpayne@68	2727
jpayne@68	2728 >>> a = pybedtools.BedTool('''
jpayne@68	2729 ... chr1 1 10
jpayne@68	2730 ... chr1 50 55''', from_string=True)
jpayne@68	2731 >>> fasta = pybedtools.example_filename('test.fa')
jpayne@68	2732 >>> a = a.sequence(fi=fasta)
jpayne@68	2733 >>> print(open(a.seqfn).read())
jpayne@68	2734 >chr1:1-10
jpayne@68	2735 GATGAGTCT
jpayne@68	2736 >chr1:50-55
jpayne@68	2737 CCATC
jpayne@68	2738 <BLANKLINE>
jpayne@68	2739 >>> b = a.save_seqs('example.fa')
jpayne@68	2740 >>> assert open(b.fn).read() == open(a.fn).read()
jpayne@68	2741 >>> if os.path.exists('example.fa'):
jpayne@68	2742 ... os.unlink('example.fa')
jpayne@68	2743 """
jpayne@68	2744
jpayne@68	2745 if self.seqfn is None:
jpayne@68	2746 raise ValueError("Use .sequence(fasta) to get the sequence first")
jpayne@68	2747
jpayne@68	2748 with open(fn, "w") as fout:
jpayne@68	2749 if self.seqfn is None:
jpayne@68	2750 raise ValueError("Use .sequence(fasta) to get the sequence first")
jpayne@68	2751 with open(self.seqfn) as seqfile:
jpayne@68	2752 fout.write(seqfile.read())
jpayne@68	2753
jpayne@68	2754 new_bedtool = BedTool(self.fn)
jpayne@68	2755 new_bedtool.seqfn = fn
jpayne@68	2756 return new_bedtool
jpayne@68	2757
jpayne@68	2758 def randomstats(
jpayne@68	2759 self,
jpayne@68	2760 other: BedTool,
jpayne@68	2761 iterations: int,
jpayne@68	2762 new: bool = False,
jpayne@68	2763 genome_fn: Optional[str] = None,
jpayne@68	2764 include_distribution: bool = False,
jpayne@68	2765 **kwargs
jpayne@68	2766 ) -> dict[str, Any]:
jpayne@68	2767 """
jpayne@68	2768 Dictionary of results from many randomly shuffled intersections.
jpayne@68	2769
jpayne@68	2770 Sends args and kwargs to :meth:`BedTool.randomintersection` and
jpayne@68	2771 compiles results into a dictionary with useful stats. Requires
jpayne@68	2772 numpy.
jpayne@68	2773
jpayne@68	2774 If `include_distribution` is True, then the dictionary will include the
jpayne@68	2775 full distribution; otherwise, the distribution is deleted and cleaned
jpayne@68	2776 up to save on memory usage.
jpayne@68	2777
jpayne@68	2778 This is one possible way of assigning significance to overlaps between
jpayne@68	2779 two files. See, for example:
jpayne@68	2780
jpayne@68	2781 Negre N, Brown CD, Shah PK, Kheradpour P, Morrison CA, et al. 2010
jpayne@68	2782 A Comprehensive Map of Insulator Elements for the Drosophila
jpayne@68	2783 Genome. PLoS Genet 6(1): e1000814. doi:10.1371/journal.pgen.1000814
jpayne@68	2784
jpayne@68	2785 Example usage:
jpayne@68	2786
jpayne@68	2787 Make chromsizes a very small genome for this example:
jpayne@68	2788
jpayne@68	2789 >>> chromsizes = {'chr1':(1,1000)}
jpayne@68	2790 >>> a = pybedtools.example_bedtool('a.bed').set_chromsizes(chromsizes)
jpayne@68	2791 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68	2792 >>> try:
jpayne@68	2793 ... results = a.randomstats(b, 100, debug=True)
jpayne@68	2794 ... except ImportError:
jpayne@68	2795 ... pass
jpayne@68	2796
jpayne@68	2797 results is a dictionary that you can inspect.
jpayne@68	2798
jpayne@68	2799 (Note that the following examples are not run as part of the doctests
jpayne@68	2800 to avoid forcing users to install NumPy just to pass tests)
jpayne@68	2801
jpayne@68	2802 The actual overlap::
jpayne@68	2803
jpayne@68	2804 print(results['actual'])
jpayne@68	2805 3
jpayne@68	2806
jpayne@68	2807 The median of all randomized overlaps::
jpayne@68	2808
jpayne@68	2809 print(results['median randomized'])
jpayne@68	2810 2.0
jpayne@68	2811
jpayne@68	2812 The percentile of the actual overlap in the distribution of randomized
jpayne@68	2813 overlaps, which can be used to get an empirical p-value::
jpayne@68	2814
jpayne@68	2815 print(results['percentile'])
jpayne@68	2816 90.0
jpayne@68	2817 """
jpayne@68	2818 if ("intersect_kwargs" not in kwargs) or (kwargs["intersect_kwargs"] is None):
jpayne@68	2819 kwargs["intersect_kwargs"] = {"u": True}
jpayne@68	2820 try:
jpayne@68	2821 import numpy as np
jpayne@68	2822 except ImportError:
jpayne@68	2823 raise ImportError("Need to install NumPy for stats...")
jpayne@68	2824
jpayne@68	2825 def percentileofscore(a, score):
jpayne@68	2826 """
jpayne@68	2827 copied from scipy.stats.percentileofscore, to avoid dependency on
jpayne@68	2828 scipy.
jpayne@68	2829 """
jpayne@68	2830 a = np.array(a)
jpayne@68	2831 n = len(a)
jpayne@68	2832
jpayne@68	2833 if not (np.any(a == score)):
jpayne@68	2834 a = np.append(a, score)
jpayne@68	2835 a_len = np.array(list(range(len(a))))
jpayne@68	2836 else:
jpayne@68	2837 a_len = np.array(list(range(len(a)))) + 1.0
jpayne@68	2838
jpayne@68	2839 a = np.sort(a)
jpayne@68	2840 idx = tuple([a == score])
jpayne@68	2841 pct = (np.mean(a_len[idx]) / n) * 100.0
jpayne@68	2842 return pct
jpayne@68	2843
jpayne@68	2844 if isinstance(other, str):
jpayne@68	2845 other = BedTool(other)
jpayne@68	2846 else:
jpayne@68	2847 assert isinstance(
jpayne@68	2848 other, BedTool
jpayne@68	2849 ), "Either filename or another BedTool instance required"
jpayne@68	2850
jpayne@68	2851 # Actual (unshuffled) counts.
jpayne@68	2852 i_kwargs = kwargs["intersect_kwargs"]
jpayne@68	2853 actual = len(self.intersect(other, **i_kwargs))
jpayne@68	2854
jpayne@68	2855 # List of counts from randomly shuffled versions.
jpayne@68	2856 # Length of counts == iterations.
jpayne@68	2857
jpayne@68	2858 if not new:
jpayne@68	2859 distribution = self.randomintersection(
jpayne@68	2860 other, iterations=iterations, **kwargs
jpayne@68	2861 )
jpayne@68	2862 else:
jpayne@68	2863 # use new mechanism
jpayne@68	2864 if genome_fn is None:
jpayne@68	2865 raise ValueError(
jpayne@68	2866 "`genome_fn` must be provided if using the "
jpayne@68	2867 "new _randomintersection mechanism"
jpayne@68	2868 )
jpayne@68	2869 distribution = self._randomintersection(
jpayne@68	2870 other, iterations=iterations, genome_fn=genome_fn, **kwargs
jpayne@68	2871 )
jpayne@68	2872
jpayne@68	2873 distribution = np.array(list(distribution))
jpayne@68	2874
jpayne@68	2875 # Median of distribution
jpayne@68	2876 med_count = np.median(distribution)
jpayne@68	2877
jpayne@68	2878 n = float(len(distribution))
jpayne@68	2879
jpayne@68	2880 frac_above = sum(distribution > actual) / n
jpayne@68	2881 frac_below = sum(distribution < actual) / n
jpayne@68	2882
jpayne@68	2883 normalized = actual / med_count
jpayne@68	2884
jpayne@68	2885 lower_thresh = 2.5
jpayne@68	2886 upper_thresh = 97.5
jpayne@68	2887 lower, upper = np.percentile(distribution, [lower_thresh, upper_thresh])
jpayne@68	2888
jpayne@68	2889 actual_percentile = percentileofscore(distribution, actual)
jpayne@68	2890 d = {
jpayne@68	2891 "iterations": iterations,
jpayne@68	2892 "actual": actual,
jpayne@68	2893 "file_a": self.fn,
jpayne@68	2894 "file_b": other.fn,
jpayne@68	2895 self.fn: len(self),
jpayne@68	2896 other.fn: len(other),
jpayne@68	2897 "self": len(self),
jpayne@68	2898 "other": len(other),
jpayne@68	2899 "frac randomized above actual": frac_above,
jpayne@68	2900 "frac randomized below actual": frac_below,
jpayne@68	2901 "median randomized": med_count,
jpayne@68	2902 "normalized": normalized,
jpayne@68	2903 "percentile": actual_percentile,
jpayne@68	2904 "lower_%sth" % lower_thresh: lower,
jpayne@68	2905 "upper_%sth" % upper_thresh: upper,
jpayne@68	2906 }
jpayne@68	2907 if include_distribution:
jpayne@68	2908 d["distribution"] = distribution
jpayne@68	2909 else:
jpayne@68	2910 del distribution
jpayne@68	2911 return d
jpayne@68	2912
jpayne@68	2913 def random_op(self, args, *kwargs):
jpayne@68	2914 """
jpayne@68	2915 For backwards compatibility; see BedTool.parallel_apply instead.
jpayne@68	2916 """
jpayne@68	2917 return self.parallel_apply(args, *kwargs)
jpayne@68	2918
jpayne@68	2919 def parallel_apply(
jpayne@68	2920 self,
jpayne@68	2921 iterations: int,
jpayne@68	2922 func: Callable,
jpayne@68	2923 func_args: Iterable,
jpayne@68	2924 func_kwargs: dict,
jpayne@68	2925 processes: int=1,
jpayne@68	2926 _orig_pool: Optional[Pool] = None # type: ignore
jpayne@68	2927 ):
jpayne@68	2928 """
jpayne@68	2929 Generalized method for applying a function in parallel.
jpayne@68	2930
jpayne@68	2931 Typically used when having to do many random shufflings.
jpayne@68	2932
jpayne@68	2933 `func_args` and `func_kwargs` will be passed to `func` each time in
jpayne@68	2934 `iterations`, and these iterations will be split across `processes`
jpayne@68	2935 processes.
jpayne@68	2936
jpayne@68	2937 Notes on the function, `func`:
jpayne@68	2938
jpayne@68	2939 * the function should manually remove any tempfiles created. This
jpayne@68	2940 is because the BedTool.TEMPFILES list of auto-created tempfiles
jpayne@68	2941 does not share state across processes, so things will not get
jpayne@68	2942 cleaned up automatically as they do in a single-process
jpayne@68	2943 pybedtools session.
jpayne@68	2944
jpayne@68	2945 * this includes deleting any "chromsizes" or genome files --
jpayne@68	2946 generally it will be best to require a genome filename in
jpayne@68	2947 `func_kwargs` if you'll be using any BedTool methods that accept
jpayne@68	2948 the `g` kwarg.
jpayne@68	2949
jpayne@68	2950 * the function should be a module-level function (rather than a
jpayne@68	2951 class method) because class methods can't be pickled across
jpayne@68	2952 process boundaries
jpayne@68	2953
jpayne@68	2954 * the function can have any signature and have any return value
jpayne@68	2955
jpayne@68	2956 `_orig_pool` can be a previously-created multiprocessing.Pool instance;
jpayne@68	2957 otherwise, a new Pool will be created with `processes`
jpayne@68	2958 """
jpayne@68	2959 if processes == 1:
jpayne@68	2960 for _ in range(iterations):
jpayne@68	2961 yield func(func_args, *func_kwargs)
jpayne@68	2962 raise StopIteration
jpayne@68	2963
jpayne@68	2964 if _orig_pool:
jpayne@68	2965 p = _orig_pool
jpayne@68	2966 else:
jpayne@68	2967 p = Pool(processes)
jpayne@68	2968 iterations_each = [iterations / processes] * processes
jpayne@68	2969 iterations_each[-1] += iterations % processes
jpayne@68	2970
jpayne@68	2971 # FYI some useful info on apply_async:
jpayne@68	2972 # http://stackoverflow.com/questions/8533318/
jpayne@68	2973 # python-multiprocessing-pool-when-to-use-apply-apply-async-or-map
jpayne@68	2974 #
jpayne@68	2975 # Here, we don't care about the order, and don't want the subprocesses
jpayne@68	2976 # to block.
jpayne@68	2977 results = [
jpayne@68	2978 p.apply_async(func, func_args, func_kwargs) for _ in range(iterations)
jpayne@68	2979 ]
jpayne@68	2980 for r in results:
jpayne@68	2981 yield r.get()
jpayne@68	2982 raise StopIteration
jpayne@68	2983
jpayne@68	2984 def random_jaccard(
jpayne@68	2985 self,
jpayne@68	2986 other: BedTool,
jpayne@68	2987 genome_fn: Optional[str] = None,
jpayne@68	2988 iterations: Optional[int] = None,
jpayne@68	2989 processes: int = 1,
jpayne@68	2990 _orig_pool: Optional[Pool] = None,
jpayne@68	2991 shuffle_kwargs: Optional[dict[str, Any]] = None,
jpayne@68	2992 jaccard_kwargs: Optional[dict[str, Any]] = None,
jpayne@68	2993 ) -> list:
jpayne@68	2994 """
jpayne@68	2995 Computes the naive Jaccard statistic (intersection divided by union).
jpayne@68	2996
jpayne@68	2997 .. note::
jpayne@68	2998
jpayne@68	2999 If you don't need the randomization functionality of this method,
jpayne@68	3000 you can use the simpler BedTool.jaccard method instead.
jpayne@68	3001
jpayne@68	3002 See Favorov et al. (2012) PLoS Comput Biol 8(5): e1002529 for more
jpayne@68	3003 info on the Jaccard statistic for intersections.
jpayne@68	3004
jpayne@68	3005 If `iterations` is None, then do not perform random shufflings.
jpayne@68	3006
jpayne@68	3007 If `iterations` is an integer, perform `iterations` random shufflings,
jpayne@68	3008 each time computing the Jaccard statistic to build an empirical
jpayne@68	3009 distribution. `genome_fn` will also be needed; optional `processes`
jpayne@68	3010 will split the iterations across multiple CPUs.
jpayne@68	3011
jpayne@68	3012 Returns a tuple of the observed Jaccard statistic and a list of the
jpayne@68	3013 randomized statistics (which will be an empty list if `iterations` was
jpayne@68	3014 None).
jpayne@68	3015 """
jpayne@68	3016 if shuffle_kwargs is None:
jpayne@68	3017 shuffle_kwargs = {}
jpayne@68	3018 if jaccard_kwargs is None:
jpayne@68	3019 jaccard_kwargs = {}
jpayne@68	3020 if not genome_fn:
jpayne@68	3021 raise ValueError("Need a genome filename in order to perform randomization")
jpayne@68	3022 return list(
jpayne@68	3023 self.parallel_apply(
jpayne@68	3024 iterations=iterations if iterations else 1,
jpayne@68	3025 func=pybedtools.stats.random_jaccard,
jpayne@68	3026 func_args=(self, other),
jpayne@68	3027 func_kwargs=dict(
jpayne@68	3028 genome_fn=genome_fn,
jpayne@68	3029 shuffle_kwargs=shuffle_kwargs,
jpayne@68	3030 jaccard_kwargs=jaccard_kwargs,
jpayne@68	3031 ),
jpayne@68	3032 processes=processes,
jpayne@68	3033 _orig_pool=_orig_pool,
jpayne@68	3034 )
jpayne@68	3035 )
jpayne@68	3036
jpayne@68	3037 def _randomintersection(
jpayne@68	3038 self,
jpayne@68	3039 other: BedTool,
jpayne@68	3040 iterations: int,
jpayne@68	3041 genome_fn: str,
jpayne@68	3042 intersect_kwargs: Optional[dict[str, Any]] = None,
jpayne@68	3043 _orig_pool:Optional[Pool] = None,
jpayne@68	3044 shuffle_kwargs: Optional[dict[str, Any]] = None,
jpayne@68	3045 processes: int = 1,
jpayne@68	3046 ):
jpayne@68	3047 """
jpayne@68	3048 Re-implementation of BedTool.randomintersection using the new
jpayne@68	3049 `random_op` method
jpayne@68	3050 """
jpayne@68	3051 if shuffle_kwargs is None:
jpayne@68	3052 shuffle_kwargs = {}
jpayne@68	3053 if intersect_kwargs is None:
jpayne@68	3054 intersect_kwargs = dict(u=True)
jpayne@68	3055 if not genome_fn:
jpayne@68	3056 raise ValueError("Need a genome filename in order to perform randomization")
jpayne@68	3057 return list(
jpayne@68	3058 self.parallel_apply(
jpayne@68	3059 iterations=iterations,
jpayne@68	3060 func=pybedtools.stats.random_intersection,
jpayne@68	3061 func_args=(self, other),
jpayne@68	3062 func_kwargs=dict(
jpayne@68	3063 genome_fn=genome_fn,
jpayne@68	3064 shuffle_kwargs=shuffle_kwargs,
jpayne@68	3065 intersect_kwargs=intersect_kwargs,
jpayne@68	3066 ),
jpayne@68	3067 processes=processes,
jpayne@68	3068 _orig_pool=_orig_pool,
jpayne@68	3069 )
jpayne@68	3070 )
jpayne@68	3071
jpayne@68	3072 def randomintersection_bp(
jpayne@68	3073 self,
jpayne@68	3074 other: BedTool,
jpayne@68	3075 iterations: int,
jpayne@68	3076 genome_fn: str,
jpayne@68	3077 intersect_kwargs: Optional[dict[str, Any]] = None,
jpayne@68	3078 shuffle_kwargs: Optional[dict[str, Any]] = None,
jpayne@68	3079 processes: int = 1,
jpayne@68	3080 _orig_pool:Optional[Pool] = None,
jpayne@68	3081 ) -> list[int]:
jpayne@68	3082 """
jpayne@68	3083 Like randomintersection, but return the bp overlap instead of the
jpayne@68	3084 number of intersecting intervals.
jpayne@68	3085 """
jpayne@68	3086 if shuffle_kwargs is None:
jpayne@68	3087 shuffle_kwargs = {}
jpayne@68	3088 if intersect_kwargs is None:
jpayne@68	3089 intersect_kwargs = {}
jpayne@68	3090 if not genome_fn:
jpayne@68	3091 raise ValueError("Need a genome filename in order to perform randomization")
jpayne@68	3092 return list(
jpayne@68	3093 self.parallel_apply(
jpayne@68	3094 iterations=iterations,
jpayne@68	3095 func=pybedtools.stats.random_intersection_bp,
jpayne@68	3096 func_args=(self, other),
jpayne@68	3097 func_kwargs=dict(
jpayne@68	3098 genome_fn=genome_fn,
jpayne@68	3099 shuffle_kwargs=shuffle_kwargs,
jpayne@68	3100 intersect_kwargs=intersect_kwargs,
jpayne@68	3101 ),
jpayne@68	3102 processes=processes,
jpayne@68	3103 _orig_pool=_orig_pool,
jpayne@68	3104 )
jpayne@68	3105 )
jpayne@68	3106
jpayne@68	3107 def randomintersection(
jpayne@68	3108 self,
jpayne@68	3109 other: BedTool,
jpayne@68	3110 iterations: int,
jpayne@68	3111 intersect_kwargs: Optional[dict[str, Any]] = None,
jpayne@68	3112 shuffle_kwargs: Optional[dict[str, Any]] = None,
jpayne@68	3113 debug: bool = False,
jpayne@68	3114 report_iterations: bool = False,
jpayne@68	3115 processes: Optional[int] = None,
jpayne@68	3116 _orig_processes: Optional[int] = None,
jpayne@68	3117 ) -> Iterator[int]:
jpayne@68	3118 """
jpayne@68	3119 Perform `iterations` shufflings, each time intersecting with `other`.
jpayne@68	3120
jpayne@68	3121 Returns a generator of integers where each integer is the number of
jpayne@68	3122 intersections of a shuffled file with other. This distribution can
jpayne@68	3123 be used in downstream analysis for things like empirical p-values.
jpayne@68	3124
jpayne@68	3125 intersect_kwargs and shuffle_kwargs are passed to self.intersect()
jpayne@68	3126 and self.shuffle() respectively. By default for intersect, u=True is
jpayne@68	3127 specified -- but s=True might be a useful option for strand-specific
jpayne@68	3128 work.
jpayne@68	3129
jpayne@68	3130 Useful kwargs for shuffle_kwargs are chrom, excl, or incl. If you
jpayne@68	3131 use the "seed" kwarg, that seed will be used each time shuffleBed is
jpayne@68	3132 called -- so all your randomization results will be identical for each
jpayne@68	3133 iteration. To get around this and to allow for tests, debug=True will
jpayne@68	3134 set the seed to the iteration number. You may also break up the
jpayne@68	3135 intersections across multiple processes with processes > 1.
jpayne@68	3136
jpayne@68	3137 Example usage:
jpayne@68	3138
jpayne@68	3139 >>> chromsizes = {'chr1':(0, 1000)}
jpayne@68	3140 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	3141 >>> a = a.set_chromsizes(chromsizes)
jpayne@68	3142 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68	3143 >>> results = a.randomintersection(b, 10, debug=True)
jpayne@68	3144 >>> print(list(results))
jpayne@68	3145 [1, 0, 1, 2, 4, 2, 2, 1, 2, 4]
jpayne@68	3146
jpayne@68	3147 """
jpayne@68	3148 if processes is not None:
jpayne@68	3149 p = Pool(processes)
jpayne@68	3150 iterations_each = [iterations // processes] * processes
jpayne@68	3151 iterations_each[-1] += iterations % processes
jpayne@68	3152 results = [
jpayne@68	3153 p.apply_async(
jpayne@68	3154 _call_randomintersect,
jpayne@68	3155 (self, other, it),
jpayne@68	3156 dict(
jpayne@68	3157 intersect_kwargs=intersect_kwargs,
jpayne@68	3158 shuffle_kwargs=shuffle_kwargs,
jpayne@68	3159 debug=debug,
jpayne@68	3160 report_iterations=report_iterations,
jpayne@68	3161 _orig_processes=processes,
jpayne@68	3162 ),
jpayne@68	3163 )
jpayne@68	3164 for it in iterations_each
jpayne@68	3165 ]
jpayne@68	3166 for r in results:
jpayne@68	3167 for value in r.get():
jpayne@68	3168 yield value
jpayne@68	3169 raise StopIteration
jpayne@68	3170
jpayne@68	3171 if shuffle_kwargs is None:
jpayne@68	3172 shuffle_kwargs = {}
jpayne@68	3173 if intersect_kwargs is None:
jpayne@68	3174 intersect_kwargs = {"u": True}
jpayne@68	3175
jpayne@68	3176 if "u" not in intersect_kwargs:
jpayne@68	3177 intersect_kwargs["u"] = True
jpayne@68	3178
jpayne@68	3179 resort = intersect_kwargs.get("sorted", False)
jpayne@68	3180
jpayne@68	3181 for i in range(iterations):
jpayne@68	3182 if debug:
jpayne@68	3183 shuffle_kwargs["seed"] = i
jpayne@68	3184 if report_iterations:
jpayne@68	3185 if _orig_processes > 1:
jpayne@68	3186 msg = "\rapprox (total across %s processes): %s" % (
jpayne@68	3187 _orig_processes,
jpayne@68	3188 i * _orig_processes,
jpayne@68	3189 )
jpayne@68	3190 else:
jpayne@68	3191 msg = "\r%s" % i
jpayne@68	3192 sys.stderr.write(msg)
jpayne@68	3193 sys.stderr.flush()
jpayne@68	3194
jpayne@68	3195 # Re-sort if sorted=True in kwargs
jpayne@68	3196 if resort:
jpayne@68	3197 tmp0 = self.shuffle(**shuffle_kwargs)
jpayne@68	3198 tmp = tmp0.sort()
jpayne@68	3199 else:
jpayne@68	3200 tmp = self.shuffle(**shuffle_kwargs)
jpayne@68	3201
jpayne@68	3202 tmp2 = tmp.intersect(other, stream=True, **intersect_kwargs)
jpayne@68	3203
jpayne@68	3204 yield len(tmp2)
jpayne@68	3205
jpayne@68	3206 # Close the open stdouts from subprocess.Popen calls. Note: doing
jpayne@68	3207 # this in self.__del__ doesn't fix the open file limit bug; it
jpayne@68	3208 # needs to be done here.
jpayne@68	3209 # if resort:
jpayne@68	3210 # tmp0.fn.close()
jpayne@68	3211 # tmp.fn.close()
jpayne@68	3212 tmp2.fn.close()
jpayne@68	3213 del tmp
jpayne@68	3214 del tmp2
jpayne@68	3215
jpayne@68	3216 @_log_to_history
jpayne@68	3217 def cat(self, others: Iterable[str\|Path\|BedTool], *kwargs) -> BedTool:
jpayne@68	3218 """
jpayne@68	3219 Concatenate interval files together.
jpayne@68	3220
jpayne@68	3221 Concatenates two BedTool objects (or an object and a file) and does an
jpayne@68	3222 optional post-merge of the features.
jpayne@68	3223
jpayne@68	3224 postmerge=True by default; use postmerge=False if you want to keep
jpayne@68	3225 features separate.
jpayne@68	3226
jpayne@68	3227 force_truncate=False by default; force_truncate=True to truncate
jpayne@68	3228 all files to chrom, start, stop.
jpayne@68	3229
jpayne@68	3230 When force_truncate=False and postmerge=False, the output will
jpayne@68	3231 contain the smallest number of fields observed across all inputs. This
jpayne@68	3232 maintains compatibility with BEDTools programs, which assume constant
jpayne@68	3233 number of fields in all lines of a file.
jpayne@68	3234
jpayne@68	3235 Other kwargs are sent to :meth:`BedTool.merge` (and assuming that
jpayne@68	3236 postmerge=True).
jpayne@68	3237
jpayne@68	3238 Example usage:
jpayne@68	3239
jpayne@68	3240 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	3241 >>> b = pybedtools.example_bedtool('b.bed')
jpayne@68	3242 >>> print(a.cat(b)) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	3243 chr1 1 500
jpayne@68	3244 chr1 800 950
jpayne@68	3245 <BLANKLINE>
jpayne@68	3246 >>> print(a.cat(*[b,b],
jpayne@68	3247 ... postmerge=False)) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	3248 chr1 1 100 feature1 0 +
jpayne@68	3249 chr1 100 200 feature2 0 +
jpayne@68	3250 chr1 150 500 feature3 0 -
jpayne@68	3251 chr1 900 950 feature4 0 +
jpayne@68	3252 chr1 155 200 feature5 0 -
jpayne@68	3253 chr1 800 901 feature6 0 +
jpayne@68	3254 chr1 155 200 feature5 0 -
jpayne@68	3255 chr1 800 901 feature6 0 +
jpayne@68	3256 <BLANKLINE>
jpayne@68	3257 """
jpayne@68	3258 same_type = None
jpayne@68	3259 same_field_num = None
jpayne@68	3260 field_nums = set()
jpayne@68	3261
jpayne@68	3262 assert len(others) > 0, "You must specify at least one other bedfile!"
jpayne@68	3263 other_beds = []
jpayne@68	3264 for other in others:
jpayne@68	3265 if isinstance(other, (str, Path)):
jpayne@68	3266 other = BedTool(other)
jpayne@68	3267 else:
jpayne@68	3268 assert isinstance(
jpayne@68	3269 other, BedTool
jpayne@68	3270 ), "Either filename or another BedTool instance required"
jpayne@68	3271 other_beds.append(other)
jpayne@68	3272
jpayne@68	3273 # postmerge and force_truncate don't get passed on to merge
jpayne@68	3274 postmerge = kwargs.pop("postmerge", True)
jpayne@68	3275 force_truncate = kwargs.pop("force_truncate", False)
jpayne@68	3276 stream_merge = kwargs.get("stream", False)
jpayne@68	3277 if stream_merge and postmerge:
jpayne@68	3278 raise ValueError(
jpayne@68	3279 "The post-merge step in the `cat()` method "
jpayne@68	3280 "performs a sort, which uses stream=True. Using "
jpayne@68	3281 "stream=True for the merge as well will result in a "
jpayne@68	3282 "deadlock!"
jpayne@68	3283 )
jpayne@68	3284
jpayne@68	3285 # if filetypes and field counts are the same, don't truncate
jpayne@68	3286 if not force_truncate:
jpayne@68	3287 try:
jpayne@68	3288 filetypes = set(
jpayne@68	3289 [self.file_type] + [i.file_type for i in other_beds]
jpayne@68	3290 ).difference(["empty"])
jpayne@68	3291 field_nums = (
jpayne@68	3292 set([self.field_count()] + [i.field_count() for i in other_beds])
jpayne@68	3293 .difference([None])
jpayne@68	3294 .difference([0])
jpayne@68	3295 )
jpayne@68	3296 same_field_num = len(field_nums) == 1
jpayne@68	3297 same_type = len(set(filetypes)) == 1
jpayne@68	3298 except ValueError:
jpayne@68	3299 raise ValueError(
jpayne@68	3300 "Can't check filetype or field count -- "
jpayne@68	3301 "is one of the files you're merging a 'streaming' "
jpayne@68	3302 "BedTool? If so, use .saveas() to save to file first"
jpayne@68	3303 )
jpayne@68	3304
jpayne@68	3305 tmp = self._tmp()
jpayne@68	3306
jpayne@68	3307 if not force_truncate and same_type and same_field_num:
jpayne@68	3308 with open(tmp, "w") as TMP:
jpayne@68	3309 for f in self:
jpayne@68	3310 TMP.write(str(f))
jpayne@68	3311 for other in other_beds:
jpayne@68	3312 for f in other:
jpayne@68	3313 TMP.write(str(f))
jpayne@68	3314
jpayne@68	3315 # Types match, so we can use the min number of fields observed across
jpayne@68	3316 # all inputs
jpayne@68	3317 elif not force_truncate and same_type:
jpayne@68	3318 minfields = min(field_nums)
jpayne@68	3319 with open(tmp, "w") as TMP:
jpayne@68	3320 for f in self:
jpayne@68	3321 TMP.write("\t".join(f.fields[:minfields]) + "\n")
jpayne@68	3322 for other in other_beds:
jpayne@68	3323 for f in other:
jpayne@68	3324 TMP.write("\t".join(f.fields[:minfields]) + "\n")
jpayne@68	3325
jpayne@68	3326 # Otherwise, use the zero-based chrom/start/stop to create a BED3,
jpayne@68	3327 # which will work when catting a GFF and a BED together.
jpayne@68	3328 else:
jpayne@68	3329 with open(tmp, "w") as TMP:
jpayne@68	3330 for f in self:
jpayne@68	3331 TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end))
jpayne@68	3332 for other in other_beds:
jpayne@68	3333 for f in other:
jpayne@68	3334 TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end))
jpayne@68	3335
jpayne@68	3336 c = BedTool(tmp)
jpayne@68	3337 if postmerge:
jpayne@68	3338 d = c.sort(stream=True).merge(**kwargs)
jpayne@68	3339
jpayne@68	3340 # Explicitly delete -- needed when using multiprocessing
jpayne@68	3341 os.unlink(tmp)
jpayne@68	3342 return d
jpayne@68	3343 else:
jpayne@68	3344 return c
jpayne@68	3345
jpayne@68	3346 @_log_to_history
jpayne@68	3347 def saveas(self, fn: Optional[str] = None, trackline: Optional[str] = None, compressed: Optional[bool] = None) -> BedTool:
jpayne@68	3348 """
jpayne@68	3349 Make a copy of the BedTool.
jpayne@68	3350
jpayne@68	3351 Optionally adds `trackline` to the beginning of the file.
jpayne@68	3352
jpayne@68	3353 Optionally compresses output using gzip.
jpayne@68	3354
jpayne@68	3355 if the filename extension is .gz, or compressed=True,
jpayne@68	3356 the output is compressed using gzip
jpayne@68	3357
jpayne@68	3358 Returns a new BedTool for the newly saved file.
jpayne@68	3359
jpayne@68	3360 A newline is automatically added to the trackline if it does not
jpayne@68	3361 already have one.
jpayne@68	3362
jpayne@68	3363 Example usage:
jpayne@68	3364
jpayne@68	3365 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	3366 >>> b = a.saveas('other.bed')
jpayne@68	3367 >>> b.fn
jpayne@68	3368 'other.bed'
jpayne@68	3369 >>> print(b == a)
jpayne@68	3370 True
jpayne@68	3371
jpayne@68	3372 >>> b = a.saveas('other.bed', trackline="name='test run' color=0,55,0")
jpayne@68	3373 >>> open(b.fn).readline()
jpayne@68	3374 "name='test run' color=0,55,0\\n"
jpayne@68	3375 >>> if os.path.exists('other.bed'):
jpayne@68	3376 ... os.unlink('other.bed')
jpayne@68	3377 """
jpayne@68	3378 if fn is None:
jpayne@68	3379 fn = self._tmp()
jpayne@68	3380
jpayne@68	3381 # Default to compressed if extension is .gz
jpayne@68	3382 if compressed is None:
jpayne@68	3383 __, extension = os.path.splitext(fn)
jpayne@68	3384 if extension == ".gz":
jpayne@68	3385 compressed = True
jpayne@68	3386 else:
jpayne@68	3387 compressed = False
jpayne@68	3388
jpayne@68	3389 in_compressed = isinstance(self.fn, str) and isGZIP(self.fn)
jpayne@68	3390
jpayne@68	3391 fn = self._collapse(
jpayne@68	3392 self,
jpayne@68	3393 fn=fn,
jpayne@68	3394 trackline=trackline,
jpayne@68	3395 in_compressed=in_compressed,
jpayne@68	3396 out_compressed=compressed,
jpayne@68	3397 )
jpayne@68	3398 return BedTool(fn)
jpayne@68	3399
jpayne@68	3400 @_log_to_history
jpayne@68	3401 def moveto(self, fn: Optional[str]=None) -> BedTool:
jpayne@68	3402 """
jpayne@68	3403 Move to a new filename (can be much quicker than BedTool.saveas())
jpayne@68	3404
jpayne@68	3405 Move BED file to new filename, `fn`.
jpayne@68	3406
jpayne@68	3407 Returns a new BedTool for the new file.
jpayne@68	3408
jpayne@68	3409 Example usage:
jpayne@68	3410
jpayne@68	3411 >>> # make a copy so we don't mess up the example file
jpayne@68	3412 >>> a = pybedtools.example_bedtool('a.bed').saveas()
jpayne@68	3413 >>> a_contents = str(a)
jpayne@68	3414 >>> b = a.moveto('other.bed')
jpayne@68	3415 >>> b.fn
jpayne@68	3416 'other.bed'
jpayne@68	3417 >>> b == a_contents
jpayne@68	3418 True
jpayne@68	3419 """
jpayne@68	3420 if not isinstance(self.fn, str):
jpayne@68	3421 fn = self._collapse(self, fn=fn)
jpayne@68	3422 else:
jpayne@68	3423 shutil.move(self.fn, fn)
jpayne@68	3424 return BedTool(fn)
jpayne@68	3425
jpayne@68	3426 @_log_to_history
jpayne@68	3427 def random_subset(self, n: Optional[int] = None, f: Optional[float] = None, seed: Optional[float\|int]=None) -> BedTool:
jpayne@68	3428 """
jpayne@68	3429 Return a BedTool containing a random subset.
jpayne@68	3430
jpayne@68	3431 NOTE: using `n` will be slower and use more memory than using `f`.
jpayne@68	3432
jpayne@68	3433 Parameters
jpayne@68	3434 ----------
jpayne@68	3435
jpayne@68	3436 n : int
jpayne@68	3437 Number of features to return. Only one of `n` or `f` can be provided.
jpayne@68	3438
jpayne@68	3439 f : float, 0 <= f <= 1
jpayne@68	3440 Fraction of features to return. Cannot be provided with `n`.
jpayne@68	3441
jpayne@68	3442 seed : float or int
jpayne@68	3443 Set random.seed
jpayne@68	3444
jpayne@68	3445 Example
jpayne@68	3446 -------
jpayne@68	3447
jpayne@68	3448 >>> seed = 0 # only for test, otherwise use None
jpayne@68	3449
jpayne@68	3450 `n` will always give the same number of returned features, but will be
jpayne@68	3451 slower since it is creating an index and then shuffling it.
jpayne@68	3452
jpayne@68	3453 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	3454 >>> b = a.random_subset(n=2)
jpayne@68	3455 >>> len(b)
jpayne@68	3456 2
jpayne@68	3457
jpayne@68	3458 Using a fraction `f` will be faster but depending on seed will result
jpayne@68	3459 in slightly different total numbers.
jpayne@68	3460
jpayne@68	3461 >>> a = pybedtools.example_bedtool('x.bam')
jpayne@68	3462 >>> len(a)
jpayne@68	3463 45593
jpayne@68	3464 >>> b = a.random_subset(f=0.4, seed=seed)
jpayne@68	3465 >>> len(b)
jpayne@68	3466 18316
jpayne@68	3467
jpayne@68	3468 Check that we have approximately the right fraction
jpayne@68	3469 >>> print('{0:.2f}'.format(len(b) / len(a)))
jpayne@68	3470 0.40
jpayne@68	3471
jpayne@68	3472 """
jpayne@68	3473 if ((n is None) and (f is None)) or ((n is not None) and (f is not None)):
jpayne@68	3474 raise ValueError("Exactly one of `n` or `f` must be provided")
jpayne@68	3475
jpayne@68	3476 tmpfn = self._tmp()
jpayne@68	3477 if seed is not None:
jpayne@68	3478 random.seed(seed)
jpayne@68	3479
jpayne@68	3480 if n:
jpayne@68	3481 idxs = list(range(len(self)))
jpayne@68	3482 random.shuffle(idxs)
jpayne@68	3483 idxs = idxs[:n]
jpayne@68	3484 with open(tmpfn, "w") as tmp:
jpayne@68	3485 for i, feature in enumerate(self):
jpayne@68	3486 if i in idxs:
jpayne@68	3487 tmp.write(str(feature))
jpayne@68	3488
jpayne@68	3489 elif f:
jpayne@68	3490 with open(tmpfn, "w") as tmp:
jpayne@68	3491 for i in self:
jpayne@68	3492 if random.random() <= f:
jpayne@68	3493 tmp.write(str(i))
jpayne@68	3494
jpayne@68	3495 return BedTool(tmpfn)
jpayne@68	3496
jpayne@68	3497 def total_coverage(self) -> int:
jpayne@68	3498 """
jpayne@68	3499 Return the total number of bases covered by this interval file.
jpayne@68	3500
jpayne@68	3501 Does a self.merge() first to remove potentially multiple-counting
jpayne@68	3502 bases.
jpayne@68	3503
jpayne@68	3504 Example usage:
jpayne@68	3505
jpayne@68	3506 >>> a = pybedtools.example_bedtool('a.bed')
jpayne@68	3507
jpayne@68	3508 This does a merge() first, so this is what the total coverage is
jpayne@68	3509 counting:
jpayne@68	3510
jpayne@68	3511 >>> print(a.merge()) #doctest: +NORMALIZE_WHITESPACE
jpayne@68	3512 chr1 1 500
jpayne@68	3513 chr1 900 950
jpayne@68	3514 <BLANKLINE>
jpayne@68	3515
jpayne@68	3516 >>> print(a.total_coverage())
jpayne@68	3517 549
jpayne@68	3518 """
jpayne@68	3519 b = self.merge()
jpayne@68	3520 total_bp = 0
jpayne@68	3521 for feature in b.features():
jpayne@68	3522 total_bp += len(feature)
jpayne@68	3523 return total_bp
jpayne@68	3524
jpayne@68	3525 @_log_to_history
jpayne@68	3526 def with_attrs(self, **kwargs) -> BedTool:
jpayne@68	3527 """
jpayne@68	3528 Helper method for adding attributes in the middle of a pipeline.
jpayne@68	3529
jpayne@68	3530 Given arbitrary keyword arguments, turns the keys and values into
jpayne@68	3531 attributes. Useful for labeling BedTools at creation time.
jpayne@68	3532
jpayne@68	3533 Example usage:
jpayne@68	3534
jpayne@68	3535 >>> # add a "label" attribute to each BedTool
jpayne@68	3536 >>> a = pybedtools.example_bedtool('a.bed')\
jpayne@68	3537 .with_attrs(label='transcription factor 1')
jpayne@68	3538 >>> b = pybedtools.example_bedtool('b.bed')\
jpayne@68	3539 .with_attrs(label='transcription factor 2')
jpayne@68	3540 >>> for i in [a, b]:
jpayne@68	3541 ... print('{0} features for {1}'.format(i.count(), i.label))
jpayne@68	3542 4 features for transcription factor 1
jpayne@68	3543 2 features for transcription factor 2
jpayne@68	3544
jpayne@68	3545 """
jpayne@68	3546 for key, value in list(kwargs.items()):
jpayne@68	3547 setattr(self, key, value)
jpayne@68	3548 return self
jpayne@68	3549
jpayne@68	3550 def as_intervalfile(self) -> IntervalFile:
jpayne@68	3551 """
jpayne@68	3552 Returns an IntervalFile of this BedTool for low-level interface.
jpayne@68	3553 """
jpayne@68	3554 if not isinstance(self.fn, str):
jpayne@68	3555 fn = self._collapse(self.fn)
jpayne@68	3556 else:
jpayne@68	3557 fn = self.fn
jpayne@68	3558 return IntervalFile(fn)
jpayne@68	3559
jpayne@68	3560 def liftover(self, chainfile: str, unmapped: Optional[str] = None, liftover_args: str = "") -> BedTool:
jpayne@68	3561 """
jpayne@68	3562 Returns a new BedTool of the liftedOver features, saving the unmapped
jpayne@68	3563 ones as `unmapped`. If `unmapped` is None, then discards the unmapped
jpayne@68	3564 features.
jpayne@68	3565
jpayne@68	3566 `liftover_args` is a string of additional args that is passed,
jpayne@68	3567 verbatim, to liftOver.
jpayne@68	3568
jpayne@68	3569 Needs `liftOver` from UCSC to be on the path and a `chainfile`
jpayne@68	3570 downloaded from UCSC.
jpayne@68	3571 """
jpayne@68	3572 result = BedTool._tmp()
jpayne@68	3573 if unmapped is None:
jpayne@68	3574 unmapped = BedTool._tmp()
jpayne@68	3575 cmds = ["liftOver", liftover_args, self.fn, chainfile, result, unmapped]
jpayne@68	3576 os.system(" ".join(cmds))
jpayne@68	3577 return BedTool(result)
jpayne@68	3578
jpayne@68	3579 def absolute_distance(self, other: BedTool, closest_kwargs: Optional[dict[str, Any]]=None, use_midpoints: bool=False) -> Iterator[int]:
jpayne@68	3580 """
jpayne@68	3581 Returns an iterator of the absolute distances between features in
jpayne@68	3582 self and other.
jpayne@68	3583
jpayne@68	3584 If `use_midpoints` is True, then only use the midpoints of features
jpayne@68	3585 (which will return values where features are overlapping). Otherwise,
jpayne@68	3586 when features overlap the value will always be zero.
jpayne@68	3587
jpayne@68	3588 `closest_kwargs` are passed to self.closest(); either `d` or
jpayne@68	3589 'D` are required in order to get back distance values (`d=True` is
jpayne@68	3590 default)
jpayne@68	3591 """
jpayne@68	3592 from .featurefuncs import midpoint
jpayne@68	3593
jpayne@68	3594 if closest_kwargs is None:
jpayne@68	3595 closest_kwargs = {"d": True}
jpayne@68	3596
jpayne@68	3597 if "D" not in closest_kwargs:
jpayne@68	3598 closest_kwargs.update(dict(d=True))
jpayne@68	3599
jpayne@68	3600 if use_midpoints:
jpayne@68	3601 mid_self = self.each(midpoint).saveas()
jpayne@68	3602 mid_other = other.each(midpoint).saveas()
jpayne@68	3603 c = mid_self.closest(mid_other, stream=True, **closest_kwargs)
jpayne@68	3604 else:
jpayne@68	3605 c = self.closest(other, stream=True, **closest_kwargs)
jpayne@68	3606 for i in c:
jpayne@68	3607 yield int(i[-1])
jpayne@68	3608
jpayne@68	3609 def relative_distance(self, other: BedTool, genome:Optional[dict\|str] =None, g: Optional[str]=None) -> Iterator[float]:
jpayne@68	3610 """
jpayne@68	3611 Returns an iterator of relative distances between features in self and
jpayne@68	3612 other.
jpayne@68	3613
jpayne@68	3614 First computes the midpoints of self and other, then returns distances
jpayne@68	3615 of each feature in `other` relative to the distance between `self`
jpayne@68	3616 features.
jpayne@68	3617
jpayne@68	3618 Requires either `genome` (dictionary of chromsizes or assembly name) or
jpayne@68	3619 `g` (filename of chromsizes file).
jpayne@68	3620 """
jpayne@68	3621 g_dict = {}
jpayne@68	3622 if (genome is None) and (g is None):
jpayne@68	3623 raise ValueError("Need either `genome` or `g` arg for relative distance")
jpayne@68	3624 elif genome and g:
jpayne@68	3625 raise ValueError("Please specify only one of `genome` or `g`")
jpayne@68	3626 elif genome:
jpayne@68	3627 g_dict = dict(genome=genome)
jpayne@68	3628 elif g:
jpayne@68	3629 g_dict = dict(g=g)
jpayne@68	3630
jpayne@68	3631 from .featurefuncs import midpoint
jpayne@68	3632
jpayne@68	3633 # This gets the space between features in self.
jpayne@68	3634 c = self.each(midpoint).complement(**g_dict)
jpayne@68	3635
jpayne@68	3636 hits = c.intersect(other, wao=True, stream=True) # TODO: should this be other or mid_other?
jpayne@68	3637 for i in hits:
jpayne@68	3638 yield float(i[-1]) / len(i)
jpayne@68	3639
jpayne@68	3640 def colormap_normalize(self,
jpayne@68	3641 vmin: Optional[float\|int]=None,
jpayne@68	3642 vmax: Optional[float\|int]=None,
jpayne@68	3643 percentile: bool=False,
jpayne@68	3644 log: bool=False) -> mcolors.LogNorm\|mcolors.Normalize:
jpayne@68	3645 """
jpayne@68	3646 Returns a normalization instance for use by featurefuncs.add_color().
jpayne@68	3647
jpayne@68	3648 Parameters
jpayne@68	3649 ----------
jpayne@68	3650 vmin, vmax : float, int, or None
jpayne@68	3651 `vmin` and `vmax` set the colormap bounds; if None then
jpayne@68	3652 these will be determined from the scores in the BED file.
jpayne@68	3653
jpayne@68	3654 log : bool
jpayne@68	3655 If True, put the scores on a log scale; of course be careful
jpayne@68	3656 if you have negative scores
jpayne@68	3657
jpayne@68	3658 percentile : bool
jpayne@68	3659 If True, interpret vmin and vmax as a percentile in the range
jpayne@68	3660 [0,100] rather than absolute values.
jpayne@68	3661 """
jpayne@68	3662 field_count = self.field_count()
jpayne@68	3663 if (self.file_type != "bed") or (field_count < 5):
jpayne@68	3664 raise ValueError("colorizing only works for BED files with score " "fields")
jpayne@68	3665 try:
jpayne@68	3666 import matplotlib.colors as mcolors
jpayne@68	3667 except ImportError:
jpayne@68	3668 raise ImportError("matplotlib.colors must be installed to use colormap_normalize")
jpayne@68	3669
jpayne@68	3670 try:
jpayne@68	3671 import numpy as np
jpayne@68	3672 except ImportError:
jpayne@68	3673 raise ImportError("numpy must be installed to use colormap_normalize")
jpayne@68	3674
jpayne@68	3675 if log:
jpayne@68	3676 norm = mcolors.LogNorm()
jpayne@68	3677 else:
jpayne@68	3678 norm = mcolors.Normalize()
jpayne@68	3679
jpayne@68	3680 scores = np.array([i.score for i in self], dtype=float)
jpayne@68	3681 scores = scores[np.isfinite(scores)]
jpayne@68	3682 norm.autoscale(scores)
jpayne@68	3683
jpayne@68	3684 if vmin is not None:
jpayne@68	3685 if percentile:
jpayne@68	3686 vmin = float(np.percentile(scores, vmin))
jpayne@68	3687 norm.vmin = vmin
jpayne@68	3688 if vmax is not None:
jpayne@68	3689 if percentile:
jpayne@68	3690 vmax = float(np.percentile(scores, vmax))
jpayne@68	3691 norm.vmax = vmax
jpayne@68	3692
jpayne@68	3693 return norm
jpayne@68	3694
jpayne@68	3695 def at(self, inds: list[int]) -> BedTool:
jpayne@68	3696 """
jpayne@68	3697 Returns a new BedTool with only intervals at lines `inds`
jpayne@68	3698
jpayne@68	3699 Parameters
jpayne@68	3700 ----------
jpayne@68	3701 inds : List[int]
jpayne@68	3702 List of line numbers
jpayne@68	3703
jpayne@68	3704 Returns
jpayne@68	3705 -------
jpayne@68	3706 BedTool
jpayne@68	3707 New BedTool with only intervals at `inds`
jpayne@68	3708 """
jpayne@68	3709 length = len(inds)
jpayne@68	3710
jpayne@68	3711 def _gen():
jpayne@68	3712 k = 0
jpayne@68	3713 for i, feature in enumerate(self):
jpayne@68	3714 if i == inds[k]:
jpayne@68	3715 yield feature
jpayne@68	3716 k += 1
jpayne@68	3717 if k == length:
jpayne@68	3718 break
jpayne@68	3719
jpayne@68	3720 return BedTool(_gen()).saveas()
jpayne@68	3721
jpayne@68	3722 def to_dataframe(self, disable_auto_names: bool = False, args, *kwargs) -> pd.DataFrame:
jpayne@68	3723 """
jpayne@68	3724 Create a pandas.DataFrame, passing args and kwargs to pandas.read_csv
jpayne@68	3725 The separator kwarg `sep` is given a tab `\\t` as value by default.
jpayne@68	3726
jpayne@68	3727 Parameters
jpayne@68	3728 ----------
jpayne@68	3729 disable_auto_names : bool
jpayne@68	3730 By default, the created dataframe fills in column names
jpayne@68	3731 automatically according to the detected filetype (e.g., "chrom",
jpayne@68	3732 "start", "end" for a BED3 file). Set this argument to True to
jpayne@68	3733 disable this behavior.
jpayne@68	3734 """
jpayne@68	3735 # Complain if BAM or if not a file
jpayne@68	3736 if self._isbam:
jpayne@68	3737 raise ValueError("BAM not supported for converting to DataFrame")
jpayne@68	3738 if not isinstance(self.fn, str):
jpayne@68	3739 raise ValueError("use .saveas() to make sure self.fn is a file")
jpayne@68	3740
jpayne@68	3741 try:
jpayne@68	3742 import pandas
jpayne@68	3743 except ImportError:
jpayne@68	3744 raise ImportError("pandas must be installed to convert to pandas.DataFrame")
jpayne@68	3745 # Otherwise we're good:
jpayne@68	3746 names = kwargs.get("names", None)
jpayne@68	3747 if names is None and not disable_auto_names:
jpayne@68	3748 try:
jpayne@68	3749 _names = settings._column_names[self.file_type][: self.field_count()]
jpayne@68	3750 if len(_names) < self.field_count():
jpayne@68	3751 warn(
jpayne@68	3752 "Default names for filetype %s are:\n%s\nbut file has "
jpayne@68	3753 "%s fields; you can supply custom names with the "
jpayne@68	3754 "`names` kwarg" % (self.file_type, _names, self.field_count())
jpayne@68	3755 )
jpayne@68	3756 _names = None
jpayne@68	3757 except KeyError:
jpayne@68	3758 _names = None
jpayne@68	3759 kwargs["names"] = _names
jpayne@68	3760
jpayne@68	3761 if os.path.isfile(self.fn) and os.path.getsize(self.fn) > 0:
jpayne@68	3762 return pandas.read_csv(self.fn, args, sep="\t", *kwargs) # type: ignore
jpayne@68	3763 else:
jpayne@68	3764 return pandas.DataFrame()
jpayne@68	3765
jpayne@68	3766 def tail(self, lines:int = 10, as_string: bool = False) -> Optional[str]:
jpayne@68	3767 """
jpayne@68	3768 Like `head`, but prints last 10 lines of the file by default.
jpayne@68	3769
jpayne@68	3770 To avoid consuming iterables, this only works with file-based, non-BAM
jpayne@68	3771 BedTool objects.
jpayne@68	3772
jpayne@68	3773 Use `as_string=True` to return a string.
jpayne@68	3774 """
jpayne@68	3775 if self._isbam:
jpayne@68	3776 raise ValueError("tail() not yet implemented for BAM files")
jpayne@68	3777 if not isinstance(self.fn, str):
jpayne@68	3778 raise ValueError(
jpayne@68	3779 "tail() not implemented for non-file-based "
jpayne@68	3780 "BedTool objects. Please use saveas() first."
jpayne@68	3781 )
jpayne@68	3782 bufsize = 8192
jpayne@68	3783 offset = bufsize
jpayne@68	3784 f = open(self.fn, "rb")
jpayne@68	3785
jpayne@68	3786 # whence=2 arg means relative to end (i.e., go to the end)
jpayne@68	3787 f.seek(0, 2)
jpayne@68	3788 file_size = f.tell()
jpayne@68	3789 data = []
jpayne@68	3790 while True:
jpayne@68	3791 if file_size < bufsize:
jpayne@68	3792 offset = file_size
jpayne@68	3793 f.seek(-offset, 2)
jpayne@68	3794 chunk = f.read(offset)
jpayne@68	3795 data.extend(chunk.splitlines(True))
jpayne@68	3796 if len(data) >= lines or offset == file_size:
jpayne@68	3797 break
jpayne@68	3798 offset += bufsize
jpayne@68	3799
jpayne@68	3800 result = "".join([i.decode() for i in data[-lines:]])
jpayne@68	3801 if as_string:
jpayne@68	3802 return result
jpayne@68	3803 else:
jpayne@68	3804 print(result)
jpayne@68	3805
jpayne@68	3806
jpayne@68	3807 class BAM(object):
jpayne@68	3808 def __init__(self, stream):
jpayne@68	3809 """
jpayne@68	3810 Wraps pysam.Samfile so that it yields pybedtools.Interval objects when
jpayne@68	3811 iterated over.
jpayne@68	3812
jpayne@68	3813 The pysam.Samfile can be accessed via the .pysam_bamfile attribute.
jpayne@68	3814 """
jpayne@68	3815 self.stream = stream
jpayne@68	3816 if not isinstance(self.stream, str):
jpayne@68	3817 raise ValueError("Only files are supported, not streams")
jpayne@68	3818 self.pysam_bamfile = pysam.Samfile(self.stream)
jpayne@68	3819
jpayne@68	3820 def _aligned_segment_to_interval(self, r):
jpayne@68	3821
jpayne@68	3822 if r.rname >= 0:
jpayne@68	3823 rname = self.pysam_bamfile.get_reference_name(r.rname)
jpayne@68	3824 else:
jpayne@68	3825 rname = "*"
jpayne@68	3826
jpayne@68	3827 if r.rnext >= 0:
jpayne@68	3828 if r.rnext == r.rname:
jpayne@68	3829 rnext = "="
jpayne@68	3830 else:
jpayne@68	3831 rnext = self.pysam_bamfile.get_reference_name(r.rnext)
jpayne@68	3832 else:
jpayne@68	3833 rnext = "*"
jpayne@68	3834
jpayne@68	3835 # SAM spec says if unavailable should be set to 0. Pysam sets to -1.
jpayne@68	3836
jpayne@68	3837 if r.pnext <= 0:
jpayne@68	3838 pnext = "0"
jpayne@68	3839 else:
jpayne@68	3840 # +1 here because cbedtools.pyx expects SAM -- which is 1-based --
jpayne@68	3841 # but pysam uses 0-based.
jpayne@68	3842 pnext = str(r.pnext + 1)
jpayne@68	3843
jpayne@68	3844 if r.cigarstring:
jpayne@68	3845 cigarstring = r.cigarstring
jpayne@68	3846 else:
jpayne@68	3847 cigarstring = "*"
jpayne@68	3848
jpayne@68	3849 # Rudimentary support.
jpayne@68	3850 # TODO: remove when refactoring to new BAM iterating
jpayne@68	3851 tags = []
jpayne@68	3852 for k, v in r.tags:
jpayne@68	3853 if isinstance(v, int):
jpayne@68	3854 t = "i"
jpayne@68	3855 elif isinstance(v, float):
jpayne@68	3856 t = "f"
jpayne@68	3857 else:
jpayne@68	3858 t = "Z"
jpayne@68	3859 tags.append("{0}:{1}:{2}".format(k, t, v))
jpayne@68	3860
jpayne@68	3861 tags = "\t".join(tags)
jpayne@68	3862
jpayne@68	3863 if r.seq:
jpayne@68	3864 seq = r.seq
jpayne@68	3865 else:
jpayne@68	3866 seq = "*"
jpayne@68	3867
jpayne@68	3868 if r.qual:
jpayne@68	3869 qual = r.qual
jpayne@68	3870 else:
jpayne@68	3871 qual = "*"
jpayne@68	3872
jpayne@68	3873 fields = [
jpayne@68	3874 r.qname,
jpayne@68	3875 str(r.flag),
jpayne@68	3876 rname,
jpayne@68	3877 # +1 here because cbedtools.pyx expects SAM -- which is 1-based --
jpayne@68	3878 # but pysam uses 0-based.
jpayne@68	3879 str(r.pos + 1),
jpayne@68	3880 str(r.mapq),
jpayne@68	3881 cigarstring,
jpayne@68	3882 rnext,
jpayne@68	3883 pnext,
jpayne@68	3884 str(r.tlen),
jpayne@68	3885 seq,
jpayne@68	3886 qual,
jpayne@68	3887 ]
jpayne@68	3888 if tags:
jpayne@68	3889 fields.append(tags)
jpayne@68	3890
jpayne@68	3891 if None in fields:
jpayne@68	3892 raise ValueError("Found 'None' in fields: %s" % fields)
jpayne@68	3893 return create_interval_from_list(fields)
jpayne@68	3894
jpayne@68	3895 def __iter__(self):
jpayne@68	3896 return self
jpayne@68	3897
jpayne@68	3898 # TODO: this is PAINFUL but it ensures that existing tests work. Once all
jpayne@68	3899 # tests work, the new behavior will be to yield pysam AlignedSegment
jpayne@68	3900 # objects directly.
jpayne@68	3901 def __next__(self):
jpayne@68	3902 return self._aligned_segment_to_interval(next(self.pysam_bamfile))
jpayne@68	3903
jpayne@68	3904 def next(self):
jpayne@68	3905 return self.__next__()
jpayne@68	3906
jpayne@68	3907
jpayne@68	3908 class History(list):
jpayne@68	3909 def __init__(self):
jpayne@68	3910 """
jpayne@68	3911 Represents one or many HistorySteps. Mostly used for nicely formatting
jpayne@68	3912 a series of HistorySteps.
jpayne@68	3913 """
jpayne@68	3914 list.__init__(self)
jpayne@68	3915
jpayne@68	3916
jpayne@68	3917 class HistoryStep(object):
jpayne@68	3918 def __init__(self, method, args, kwargs, bedtool_instance, parent_tag, result_tag):
jpayne@68	3919 """
jpayne@68	3920 Class to represent one step in the history.
jpayne@68	3921
jpayne@68	3922 Mostly used for its __repr__ method, to try and exactly replicate code
jpayne@68	3923 that can be pasted to re-do history steps
jpayne@68	3924 """
jpayne@68	3925 try:
jpayne@68	3926 self.method = method._name
jpayne@68	3927 except AttributeError:
jpayne@68	3928 self.method = method.__name__
jpayne@68	3929
jpayne@68	3930 self.args = args
jpayne@68	3931 self.kwargs = kwargs
jpayne@68	3932 self.fn = bedtool_instance.fn
jpayne@68	3933 self.parent_tag = parent_tag
jpayne@68	3934 self.result_tag = result_tag
jpayne@68	3935
jpayne@68	3936 def _clean_arg(self, arg: str\|BedTool) -> str:
jpayne@68	3937 """
jpayne@68	3938 Wrap strings in quotes and convert bedtool instances to filenames.
jpayne@68	3939 """
jpayne@68	3940 if isinstance(arg, BedTool):
jpayne@68	3941 arg = arg.fn
jpayne@68	3942 if isinstance(arg, str):
jpayne@68	3943 arg = '"%s"' % arg
jpayne@68	3944 return arg
jpayne@68	3945
jpayne@68	3946 def __repr__(self):
jpayne@68	3947 # Still not sure whether to use pybedtools.bedtool() or bedtool()
jpayne@68	3948 s = ""
jpayne@68	3949 s += "<HistoryStep> "
jpayne@68	3950 if os.path.exists(self.fn):
jpayne@68	3951 s += 'BedTool("%(fn)s").%(method)s(%%s%%s)' % self.__dict__
jpayne@68	3952 else:
jpayne@68	3953 s += 'BedTool("MISSING FILE: %(fn)s")' % self.__dict__
jpayne@68	3954 s += ".%(method)s(%%s%%s)" % self.__dict__
jpayne@68	3955
jpayne@68	3956 # Format args and kwargs
jpayne@68	3957 args_string = ",".join(map(self._clean_arg, self.args))
jpayne@68	3958 kwargs_string = ",".join(
jpayne@68	3959 ["%s=%s" % (i[0], self._clean_arg(i[1])) for i in list(self.kwargs.items())]
jpayne@68	3960 )
jpayne@68	3961 # stick a comma on the end if there's something here
jpayne@68	3962 if len(args_string) > 0:
jpayne@68	3963 args_string += ", "
jpayne@68	3964
jpayne@68	3965 s = s % (args_string, kwargs_string)
jpayne@68	3966 s += ", parent tag: %s" % self.parent_tag
jpayne@68	3967 s += ", result tag: %s" % self.result_tag
jpayne@68	3968 return s
jpayne@68	3969
jpayne@68	3970
jpayne@68	3971 def example_bedtool(fn):
jpayne@68	3972 """
jpayne@68	3973 Return a bedtool using a bed file from the pybedtools examples directory.
jpayne@68	3974 Use :func:`list_example_files` to see a list of files that are included.
jpayne@68	3975 """
jpayne@68	3976 fn = os.path.join(filenames.data_dir(), fn)
jpayne@68	3977 if not os.path.exists(fn):
jpayne@68	3978 msg = "%s does not exist" % fn
jpayne@68	3979 raise FileNotFoundError(msg)
jpayne@68	3980 return BedTool(fn)
jpayne@68	3981
jpayne@68	3982
jpayne@68	3983 if __name__ == "__main__":
jpayne@68	3984 import doctest
jpayne@68	3985
jpayne@68	3986 doctest.testmod(optionflags=doctest.ELLIPSIS \| doctest.NORMALIZE_WHITESPACE)

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/bedtool.py @ 68:5028fdace37b