annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/helpers.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 import sys
jpayne@68 2 import os
jpayne@68 3 import gzip
jpayne@68 4 import tempfile
jpayne@68 5 import subprocess
jpayne@68 6 import glob
jpayne@68 7 import struct
jpayne@68 8 import atexit
jpayne@68 9 import re
jpayne@68 10 import urllib
jpayne@68 11 import urllib.error
jpayne@68 12 import urllib.request
jpayne@68 13
jpayne@68 14 try: # Use genomepy to determine chrom sizes if it is installed
jpayne@68 15 import genomepy
jpayne@68 16 except ImportError:
jpayne@68 17 pass
jpayne@68 18
jpayne@68 19 from . import settings
jpayne@68 20 from . import filenames
jpayne@68 21 from . import genome_registry
jpayne@68 22 from .logger import logger
jpayne@68 23 from .cbedtools import create_interval_from_list
jpayne@68 24
jpayne@68 25 BUFSIZE = -1
jpayne@68 26
jpayne@68 27 _tags = {}
jpayne@68 28
jpayne@68 29
jpayne@68 30 def set_bedtools_path(path=""):
jpayne@68 31 """
jpayne@68 32 Explicitly set path to `BEDTools` installation dir.
jpayne@68 33
jpayne@68 34 If BEDTools is not available on your system path, specify the path to the
jpayne@68 35 dir containing the BEDTools executables (intersectBed, subtractBed, etc)
jpayne@68 36 with this function.
jpayne@68 37
jpayne@68 38 To reset and use the default system path, call this function with no
jpayne@68 39 arguments or use path="".
jpayne@68 40 """
jpayne@68 41 from . import paths
jpayne@68 42
jpayne@68 43 paths._set_bedtools_path(path)
jpayne@68 44
jpayne@68 45
jpayne@68 46 def get_bedtools_path():
jpayne@68 47 """
jpayne@68 48 Returns the currently-set path to bedtools
jpayne@68 49 """
jpayne@68 50 from . import paths
jpayne@68 51
jpayne@68 52 return paths._get_bedtools_path()
jpayne@68 53
jpayne@68 54
jpayne@68 55 def set_R_path(path=""):
jpayne@68 56 """
jpayne@68 57 Explicitly set path to `R` installation dir.
jpayne@68 58
jpayne@68 59 If R is not available on the path, then it can be explicitly
jpayne@68 60 specified here.
jpayne@68 61
jpayne@68 62 Use path="" to reset to default system path.
jpayne@68 63 """
jpayne@68 64 from . import paths
jpayne@68 65
jpayne@68 66 paths._set_R_path(path)
jpayne@68 67
jpayne@68 68
jpayne@68 69 def _check_for_bedtools(force_check=False, verbose=False, override=None):
jpayne@68 70 """
jpayne@68 71 Checks installation as well as version (based on whether or not "bedtools
jpayne@68 72 intersect" works, or just "intersectBed")
jpayne@68 73 """
jpayne@68 74 if settings._bedtools_installed and not force_check:
jpayne@68 75 return True
jpayne@68 76
jpayne@68 77 if (len(settings.bedtools_version) == 0) or force_check:
jpayne@68 78 try:
jpayne@68 79 v = (
jpayne@68 80 subprocess.check_output(
jpayne@68 81 [os.path.join(settings._bedtools_path, "bedtools"), "--version"]
jpayne@68 82 )
jpayne@68 83 .decode("utf-8")
jpayne@68 84 .rstrip()
jpayne@68 85 )
jpayne@68 86
jpayne@68 87 if verbose:
jpayne@68 88 print("Found bedtools version '%s'" % v)
jpayne@68 89
jpayne@68 90 settings._bedtools_installed = True
jpayne@68 91
jpayne@68 92 # Override, used for testing
jpayne@68 93 if override is not None:
jpayne@68 94 v = override
jpayne@68 95
jpayne@68 96 # To allow more complicated versions as found in Linux distributions, e.g.:
jpayne@68 97 # bedtools v2.26.0
jpayne@68 98 # bedtools debian/2.28.0+dfsg-2-dirty
jpayne@68 99 m = re.search("^bedtools [^0-9]*([0-9][0-9.]*)", v)
jpayne@68 100 if not m:
jpayne@68 101 raise ValueError('Cannot identify version number from "{}"'.format(v))
jpayne@68 102 vv = m.group(1)
jpayne@68 103
jpayne@68 104 settings.bedtools_version = [int(i) for i in vv.split(".")]
jpayne@68 105
jpayne@68 106 settings._v_2_27_plus = (
jpayne@68 107 settings.bedtools_version[0] >= 2 and settings.bedtools_version[1] >= 27
jpayne@68 108 )
jpayne@68 109
jpayne@68 110 settings._v_2_15_plus = (
jpayne@68 111 settings.bedtools_version[0] >= 2 and settings.bedtools_version[1] >= 15
jpayne@68 112 )
jpayne@68 113
jpayne@68 114 except subprocess.CalledProcessError:
jpayne@68 115 if settings._bedtools_path:
jpayne@68 116 add_msg = "(tried path '%s')" % settings._bedtools_path
jpayne@68 117 else:
jpayne@68 118 add_msg = ""
jpayne@68 119 raise OSError(
jpayne@68 120 "Please make sure you have installed BEDTools"
jpayne@68 121 "(https://github.com/arq5x/bedtools) and that "
jpayne@68 122 "it's on the path. %s" % add_msg
jpayne@68 123 )
jpayne@68 124
jpayne@68 125
jpayne@68 126 def _check_for_R():
jpayne@68 127 try:
jpayne@68 128 p = subprocess.Popen(
jpayne@68 129 [os.path.join(settings._R_path, "R"), "--version"],
jpayne@68 130 stdout=subprocess.PIPE,
jpayne@68 131 stderr=subprocess.PIPE,
jpayne@68 132 )
jpayne@68 133 settings._R_installed = True
jpayne@68 134 except OSError:
jpayne@68 135 if settings._R_path:
jpayne@68 136 add_msg = "(tried path '%s')" % settings._R_path
jpayne@68 137 else:
jpayne@68 138 add_msg = ""
jpayne@68 139 raise ValueError("Please install R and ensure it is on your path %s" % add_msg)
jpayne@68 140
jpayne@68 141
jpayne@68 142 class Error(Exception):
jpayne@68 143 """Base class for this module's exceptions"""
jpayne@68 144
jpayne@68 145 pass
jpayne@68 146
jpayne@68 147
jpayne@68 148 class pybedtoolsError(Error):
jpayne@68 149 pass
jpayne@68 150
jpayne@68 151
jpayne@68 152 class BEDToolsError(Error):
jpayne@68 153 def __init__(self, cmd, msg):
jpayne@68 154 self.cmd = str(cmd)
jpayne@68 155 self.msg = str(msg)
jpayne@68 156
jpayne@68 157 def __str__(self):
jpayne@68 158 m = (
jpayne@68 159 "\nCommand was:\n\n\t"
jpayne@68 160 + self.cmd
jpayne@68 161 + "\n"
jpayne@68 162 + "\nError message was:\n"
jpayne@68 163 + self.msg
jpayne@68 164 )
jpayne@68 165 return m
jpayne@68 166
jpayne@68 167
jpayne@68 168 def isGZIP(fn):
jpayne@68 169 with open(fn, "rb") as f:
jpayne@68 170 start = f.read(3)
jpayne@68 171 if start == b"\x1f\x8b\x08":
jpayne@68 172 return True
jpayne@68 173 return False
jpayne@68 174
jpayne@68 175
jpayne@68 176 def isBGZIP(fn):
jpayne@68 177 """
jpayne@68 178 Reads a filename to see if it's a BGZIPed file or not.
jpayne@68 179 """
jpayne@68 180 with open(fn, "rb") as fh:
jpayne@68 181 header_str = fh.read(15)
jpayne@68 182
jpayne@68 183 if len(header_str) < 15:
jpayne@68 184 return False
jpayne@68 185
jpayne@68 186 header = struct.unpack_from("BBBBiBBHBBB", header_str)
jpayne@68 187
jpayne@68 188 id1, id2, cm, flg, mtime, xfl, os_, xlen, si1, si2, slen = header
jpayne@68 189 if (
jpayne@68 190 (id1 == 31)
jpayne@68 191 and (id2 == 139)
jpayne@68 192 and (cm == 8)
jpayne@68 193 and (flg == 4)
jpayne@68 194 and (si1 == 66)
jpayne@68 195 and (si2 == 67)
jpayne@68 196 and (slen == 2)
jpayne@68 197 ):
jpayne@68 198 return True
jpayne@68 199 return False
jpayne@68 200
jpayne@68 201
jpayne@68 202 def isBAM(fn):
jpayne@68 203 """
jpayne@68 204 Returns True if the file is both BGZIPed and the compressed contents have
jpayne@68 205 start with the magic number `BAM\\x01`, or if the file is CRAM format (see
jpayne@68 206 isCRAM()).
jpayne@68 207 """
jpayne@68 208 # Note: previously we were catching ValueError when trying to open
jpayne@68 209 # a non-BAM with pysam.Samfile. That started segfaulting, so now do it the
jpayne@68 210 # right way with magic number.
jpayne@68 211 with gzip.open(fn, "rb") as in_:
jpayne@68 212 if isBGZIP(fn) and (in_.read(4).decode() == "BAM\x01"):
jpayne@68 213 return True
jpayne@68 214 if isCRAM(fn):
jpayne@68 215 return True
jpayne@68 216
jpayne@68 217
jpayne@68 218 def isCRAM(fn):
jpayne@68 219 """
jpayne@68 220 Returns True if the file starts with the bytes for the characters "CRAM".
jpayne@68 221 """
jpayne@68 222 with open(fn, "rb") as in_:
jpayne@68 223 if in_.read(4).decode(errors="ignore") == "CRAM":
jpayne@68 224 return True
jpayne@68 225
jpayne@68 226
jpayne@68 227 def find_tagged(tag):
jpayne@68 228 """
jpayne@68 229 Returns the bedtool object with tagged with *tag*. Useful for tracking
jpayne@68 230 down bedtools you made previously.
jpayne@68 231 """
jpayne@68 232 for key, item in _tags.items():
jpayne@68 233 try:
jpayne@68 234 if item._tag == tag:
jpayne@68 235 return item
jpayne@68 236 except AttributeError:
jpayne@68 237 pass
jpayne@68 238 raise ValueError('tag "%s" not found' % tag)
jpayne@68 239
jpayne@68 240
jpayne@68 241 def _flatten_list(x):
jpayne@68 242 nested = True
jpayne@68 243 while nested:
jpayne@68 244 check_again = False
jpayne@68 245 flattened = []
jpayne@68 246
jpayne@68 247 for element in x:
jpayne@68 248 if isinstance(element, list):
jpayne@68 249 flattened.extend(element)
jpayne@68 250 check_again = True
jpayne@68 251 else:
jpayne@68 252 flattened.append(element)
jpayne@68 253 nested = check_again
jpayne@68 254 x = flattened[:]
jpayne@68 255 return x
jpayne@68 256
jpayne@68 257
jpayne@68 258 def set_tempdir(tempdir):
jpayne@68 259 """
jpayne@68 260 Set the directory for temp files.
jpayne@68 261
jpayne@68 262 Useful for clusters that use a /scratch partition rather than a /tmp dir.
jpayne@68 263 Convenience function to simply set tempfile.tempdir.
jpayne@68 264 """
jpayne@68 265 if not os.path.exists(tempdir):
jpayne@68 266 errstr = "The tempdir you specified, %s, does not exist" % tempdir
jpayne@68 267 raise FileNotFoundError(errstr)
jpayne@68 268 tempfile.tempdir = tempdir
jpayne@68 269
jpayne@68 270
jpayne@68 271 def get_tempdir():
jpayne@68 272 """
jpayne@68 273 Gets the current tempdir for the module.
jpayne@68 274 """
jpayne@68 275 return tempfile.gettempdir()
jpayne@68 276
jpayne@68 277
jpayne@68 278 def cleanup(verbose=False, remove_all=False):
jpayne@68 279 """
jpayne@68 280 Deletes all temp files from the current session (or optionally *all* \
jpayne@68 281 sessions)
jpayne@68 282
jpayne@68 283 If *verbose*, reports what it's doing
jpayne@68 284
jpayne@68 285 If *remove_all*, then ALL files matching "pybedtools.*.tmp" in the temp dir
jpayne@68 286 will be deleted.
jpayne@68 287 """
jpayne@68 288 if settings.KEEP_TEMPFILES:
jpayne@68 289 return
jpayne@68 290 for fn in filenames.TEMPFILES:
jpayne@68 291 if verbose:
jpayne@68 292 print("removing", fn)
jpayne@68 293 if os.path.exists(fn):
jpayne@68 294 os.unlink(fn)
jpayne@68 295 if remove_all:
jpayne@68 296 fns = glob.glob(os.path.join(get_tempdir(), "pybedtools.*.tmp"))
jpayne@68 297 for fn in fns:
jpayne@68 298 os.unlink(fn)
jpayne@68 299
jpayne@68 300
jpayne@68 301 def _version_2_15_plus_names(prog_name):
jpayne@68 302 if not settings._bedtools_installed:
jpayne@68 303 _check_for_bedtools()
jpayne@68 304 if not settings._v_2_15_plus:
jpayne@68 305 return [prog_name]
jpayne@68 306 try:
jpayne@68 307 prog_name = settings._prog_names[prog_name]
jpayne@68 308 except KeyError:
jpayne@68 309 if prog_name in settings._new_names:
jpayne@68 310 pass
jpayne@68 311 raise BEDToolsError(prog_name, prog_name + "not a recognized BEDTools program")
jpayne@68 312 return [os.path.join(settings._bedtools_path, "bedtools"), prog_name]
jpayne@68 313
jpayne@68 314
jpayne@68 315 def call_bedtools(
jpayne@68 316 cmds,
jpayne@68 317 tmpfn=None,
jpayne@68 318 stdin=None,
jpayne@68 319 check_stderr=None,
jpayne@68 320 decode_output=True,
jpayne@68 321 encode_input=True,
jpayne@68 322 ):
jpayne@68 323 """
jpayne@68 324 Use subprocess.Popen to call BEDTools and catch any errors.
jpayne@68 325
jpayne@68 326 Output goes to *tmpfn*, or, if None, output stays in subprocess.PIPE and
jpayne@68 327 can be iterated over.
jpayne@68 328
jpayne@68 329 *stdin* is an optional file-like object that will be sent to
jpayne@68 330 subprocess.Popen.
jpayne@68 331
jpayne@68 332 Prints some useful help upon getting common errors.
jpayne@68 333
jpayne@68 334 *check_stderr* is a function that takes the stderr string as input and
jpayne@68 335 returns True if it's OK (that is, it's not really an error). This is
jpayne@68 336 needed, e.g., for calling fastaFromBed which will report that it has to
jpayne@68 337 make a .fai for a fasta file.
jpayne@68 338
jpayne@68 339 *decode_output* should be set to False when you are iterating over a BAM
jpayne@68 340 file, where the data represent binary rather than text data.
jpayne@68 341 """
jpayne@68 342 input_is_stream = stdin is not None
jpayne@68 343 output_is_stream = tmpfn is None
jpayne@68 344
jpayne@68 345 _orig_cmds = cmds[:]
jpayne@68 346 cmds = []
jpayne@68 347 cmds.extend(_version_2_15_plus_names(_orig_cmds[0]))
jpayne@68 348 cmds.extend(_orig_cmds[1:])
jpayne@68 349
jpayne@68 350 try:
jpayne@68 351 # coming from an iterator, sending as iterator
jpayne@68 352 if input_is_stream and output_is_stream:
jpayne@68 353 logger.debug(
jpayne@68 354 "helpers.call_bedtools(): input is stream, output is " "stream"
jpayne@68 355 )
jpayne@68 356 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68 357 p = subprocess.Popen(
jpayne@68 358 cmds,
jpayne@68 359 stdout=subprocess.PIPE,
jpayne@68 360 stderr=subprocess.PIPE,
jpayne@68 361 stdin=subprocess.PIPE,
jpayne@68 362 bufsize=BUFSIZE,
jpayne@68 363 )
jpayne@68 364 if encode_input:
jpayne@68 365 for line in stdin:
jpayne@68 366 p.stdin.write(line.encode())
jpayne@68 367 else:
jpayne@68 368 for line in stdin:
jpayne@68 369 p.stdin.write(line)
jpayne@68 370
jpayne@68 371 # This is important to prevent deadlocks
jpayne@68 372 p.stdin.close()
jpayne@68 373
jpayne@68 374 if decode_output:
jpayne@68 375 output = (i.decode("UTF-8") for i in p.stdout)
jpayne@68 376 else:
jpayne@68 377 output = (i for i in p.stdout)
jpayne@68 378
jpayne@68 379 stderr = None
jpayne@68 380
jpayne@68 381 # coming from an iterator, writing to file
jpayne@68 382 if input_is_stream and not output_is_stream:
jpayne@68 383 logger.debug("helpers.call_bedtools(): input is stream, output is file")
jpayne@68 384 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68 385 outfile = open(tmpfn, "wb")
jpayne@68 386 p = subprocess.Popen(
jpayne@68 387 cmds,
jpayne@68 388 stdout=outfile,
jpayne@68 389 stderr=subprocess.PIPE,
jpayne@68 390 stdin=subprocess.PIPE,
jpayne@68 391 bufsize=BUFSIZE,
jpayne@68 392 )
jpayne@68 393 if hasattr(stdin, "read"):
jpayne@68 394 stdout, stderr = p.communicate(stdin.read())
jpayne@68 395 else:
jpayne@68 396 for item in stdin:
jpayne@68 397 p.stdin.write(item.encode())
jpayne@68 398 stdout, stderr = p.communicate()
jpayne@68 399 output = tmpfn
jpayne@68 400 outfile.close()
jpayne@68 401
jpayne@68 402 # coming from a file, sending as iterator
jpayne@68 403 if not input_is_stream and output_is_stream:
jpayne@68 404 logger.debug(
jpayne@68 405 "helpers.call_bedtools(): input is filename, " "output is stream"
jpayne@68 406 )
jpayne@68 407 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68 408 p = subprocess.Popen(
jpayne@68 409 cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68 410 )
jpayne@68 411 if decode_output:
jpayne@68 412 output = (i.decode("UTF-8") for i in p.stdout)
jpayne@68 413 else:
jpayne@68 414 output = (i for i in p.stdout)
jpayne@68 415 stderr = None
jpayne@68 416
jpayne@68 417 # file-to-file
jpayne@68 418 if not input_is_stream and not output_is_stream:
jpayne@68 419 logger.debug(
jpayne@68 420 "helpers.call_bedtools(): input is filename, output "
jpayne@68 421 "is filename (%s)",
jpayne@68 422 tmpfn,
jpayne@68 423 )
jpayne@68 424 cmds = list(map(str, cmds))
jpayne@68 425 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68 426 outfile = open(tmpfn, "wb")
jpayne@68 427 p = subprocess.Popen(
jpayne@68 428 cmds, stdout=outfile, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68 429 )
jpayne@68 430 stdout, stderr = p.communicate()
jpayne@68 431 output = tmpfn
jpayne@68 432 outfile.close()
jpayne@68 433
jpayne@68 434 # Check if it's OK using a provided function to check stderr. If it's
jpayne@68 435 # OK, dump it to sys.stderr so it's printed, and reset it to None so we
jpayne@68 436 # don't raise an exception
jpayne@68 437 if check_stderr is not None:
jpayne@68 438 if isinstance(stderr, bytes):
jpayne@68 439 stderr = stderr.decode("UTF_8")
jpayne@68 440 if check_stderr(stderr):
jpayne@68 441 sys.stderr.write(stderr)
jpayne@68 442 stderr = None
jpayne@68 443
jpayne@68 444 if stderr:
jpayne@68 445 # Fix for issue #147. In general, we consider warnings to not be
jpayne@68 446 # fatal, so just show 'em and continue on.
jpayne@68 447 #
jpayne@68 448 # bedtools source has several different ways of showing a warning,
jpayne@68 449 # but they seem to all have "WARNING" in the first 20 or so
jpayne@68 450 # characters
jpayne@68 451 if isinstance(stderr, bytes):
jpayne@68 452 stderr = stderr.decode("UTF_8")
jpayne@68 453 if len(stderr) > 20 and "WARNING" in stderr[:20].upper():
jpayne@68 454 sys.stderr.write(stderr)
jpayne@68 455 else:
jpayne@68 456 raise BEDToolsError(subprocess.list2cmdline(cmds), stderr)
jpayne@68 457
jpayne@68 458 except (OSError, IOError) as err:
jpayne@68 459 print("%s: %s" % (type(err), os.strerror(err.errno)))
jpayne@68 460 print("The command was:\n\n\t%s\n" % subprocess.list2cmdline(cmds))
jpayne@68 461
jpayne@68 462 problems = {
jpayne@68 463 2: (
jpayne@68 464 "* Did you spell the command correctly?",
jpayne@68 465 "* Do you have BEDTools installed and on the path?",
jpayne@68 466 ),
jpayne@68 467 13: (
jpayne@68 468 "* Do you have permission to write "
jpayne@68 469 'to the output file ("%s")?' % tmpfn,
jpayne@68 470 ),
jpayne@68 471 24: (
jpayne@68 472 "* Too many files open -- please submit "
jpayne@68 473 "a bug report so that this can be fixed",
jpayne@68 474 ),
jpayne@68 475 32: (
jpayne@68 476 "* Broken pipe -- if you passed a BedTool object "
jpayne@68 477 "that was created using a generator function, "
jpayne@68 478 "please try saving it to disk first using the "
jpayne@68 479 ".saveas() method before calling this bedtools "
jpayne@68 480 "command. See issue #49 for more.",
jpayne@68 481 ),
jpayne@68 482 }
jpayne@68 483
jpayne@68 484 print("Things to check:")
jpayne@68 485 print("\n\t" + "\n\t".join(problems[err.errno]))
jpayne@68 486 raise OSError("See above for commands that gave the error")
jpayne@68 487
jpayne@68 488 return output
jpayne@68 489
jpayne@68 490
jpayne@68 491 def _check_sequence_stderr(x):
jpayne@68 492 """
jpayne@68 493 If stderr created by fastaFromBed starts with 'index file', then don't
jpayne@68 494 consider it an error.
jpayne@68 495 """
jpayne@68 496 if isinstance(x, bytes):
jpayne@68 497 x = x.decode("UTF-8")
jpayne@68 498 if x.startswith("index file"):
jpayne@68 499 return True
jpayne@68 500 if x.startswith("WARNING"):
jpayne@68 501 return True
jpayne@68 502 return False
jpayne@68 503
jpayne@68 504
jpayne@68 505 def _call_randomintersect(
jpayne@68 506 _self,
jpayne@68 507 other,
jpayne@68 508 iterations,
jpayne@68 509 intersect_kwargs,
jpayne@68 510 shuffle_kwargs,
jpayne@68 511 report_iterations,
jpayne@68 512 debug,
jpayne@68 513 _orig_processes,
jpayne@68 514 ):
jpayne@68 515 """
jpayne@68 516 Helper function that list-ifies the output from randomintersection, s.t.
jpayne@68 517 it can be pickled across a multiprocess Pool.
jpayne@68 518 """
jpayne@68 519 return list(
jpayne@68 520 _self.randomintersection(
jpayne@68 521 other,
jpayne@68 522 iterations,
jpayne@68 523 intersect_kwargs=intersect_kwargs,
jpayne@68 524 shuffle_kwargs=shuffle_kwargs,
jpayne@68 525 report_iterations=report_iterations,
jpayne@68 526 debug=False,
jpayne@68 527 processes=None,
jpayne@68 528 _orig_processes=_orig_processes,
jpayne@68 529 )
jpayne@68 530 )
jpayne@68 531
jpayne@68 532
jpayne@68 533 def close_or_delete(*args):
jpayne@68 534 """
jpayne@68 535 Single function that can be used to get rid of a BedTool, whether it's a
jpayne@68 536 streaming or file-based version.
jpayne@68 537 """
jpayne@68 538 for x in args:
jpayne@68 539 if isinstance(x.fn, str):
jpayne@68 540 os.unlink(x.fn)
jpayne@68 541 elif hasattr(x.fn, "close"):
jpayne@68 542 x.fn.close()
jpayne@68 543 if hasattr(x.fn, "throw"):
jpayne@68 544 x.fn.throw(StopIteration)
jpayne@68 545
jpayne@68 546
jpayne@68 547 def n_open_fds():
jpayne@68 548 pid = os.getpid()
jpayne@68 549 procs = subprocess.check_output(["lsof", "-w", "-Ff", "-p", str(pid)])
jpayne@68 550 nprocs = 0
jpayne@68 551 for i in procs.splitlines():
jpayne@68 552 if i[1:].isdigit() and i[0] == "f":
jpayne@68 553 nprocs += 1
jpayne@68 554 return nprocs
jpayne@68 555
jpayne@68 556
jpayne@68 557 import re
jpayne@68 558
jpayne@68 559 coord_re = re.compile(
jpayne@68 560 r"""
jpayne@68 561 (?P<chrom>.+):
jpayne@68 562 (?P<start>\d+)-
jpayne@68 563 (?P<stop>\d+)
jpayne@68 564 (?:\[(?P<strand>.)\])?""",
jpayne@68 565 re.VERBOSE,
jpayne@68 566 )
jpayne@68 567
jpayne@68 568
jpayne@68 569 def string_to_interval(s):
jpayne@68 570 """
jpayne@68 571 Convert string of the form "chrom:start-stop" or "chrom:start-stop[strand]"
jpayne@68 572 to an interval.
jpayne@68 573
jpayne@68 574 Assumes zero-based coords.
jpayne@68 575
jpayne@68 576 If it's already an interval, then return it as-is.
jpayne@68 577 """
jpayne@68 578 if isinstance(s, str):
jpayne@68 579 m = coord_re.search(s)
jpayne@68 580 if m.group("strand"):
jpayne@68 581 return create_interval_from_list(
jpayne@68 582 [
jpayne@68 583 m.group("chrom"),
jpayne@68 584 m.group("start"),
jpayne@68 585 m.group("stop"),
jpayne@68 586 ".",
jpayne@68 587 "0",
jpayne@68 588 m.group("strand"),
jpayne@68 589 ]
jpayne@68 590 )
jpayne@68 591 else:
jpayne@68 592 return create_interval_from_list(
jpayne@68 593 [m.group("chrom"), m.group("start"), m.group("stop")]
jpayne@68 594 )
jpayne@68 595 return s
jpayne@68 596
jpayne@68 597
jpayne@68 598 class FisherOutput(object):
jpayne@68 599 def __init__(self, s, **kwargs):
jpayne@68 600 """
jpayne@68 601 fisher returns text results like::
jpayne@68 602
jpayne@68 603 # Contingency Table
jpayne@68 604 #_________________________________________
jpayne@68 605 # | not in -b | in -b |
jpayne@68 606 # not in -a | 3137160615 | 503 |
jpayne@68 607 # in -a | 100 | 46 |
jpayne@68 608 #_________________________________________
jpayne@68 609 # p-values for fisher's exact test
jpayne@68 610 left right two-tail ratio
jpayne@68 611 1.00000 0.00000 0.00000 2868973.922
jpayne@68 612
jpayne@68 613 """
jpayne@68 614 if isinstance(s, str):
jpayne@68 615 s = open(s).read()
jpayne@68 616 if hasattr(s, "next"):
jpayne@68 617 s = "".join(i for i in s)
jpayne@68 618 table = {
jpayne@68 619 "not in -a": {"not in -b": None, "in -b": None},
jpayne@68 620 "in -a": {"not in -b": None, "in -b": None},
jpayne@68 621 }
jpayne@68 622 self.text = s
jpayne@68 623 lines = s.splitlines()
jpayne@68 624 for i in lines:
jpayne@68 625 if "not in -a" in i:
jpayne@68 626 _, in_b, not_in_b, _ = i.strip().split("|")
jpayne@68 627 table["not in -a"]["not in -b"] = int(not_in_b)
jpayne@68 628 table["not in -a"]["in -b"] = int(in_b)
jpayne@68 629
jpayne@68 630 if " in -a" in i:
jpayne@68 631 _, in_b, not_in_b, _ = i.strip().split("|")
jpayne@68 632 table["in -a"]["not in -b"] = int(not_in_b)
jpayne@68 633 table["in -a"]["in -b"] = int(in_b)
jpayne@68 634 self.table = table
jpayne@68 635 left, right, two_tail, ratio = lines[-1].split()
jpayne@68 636 self.left_tail = float(left)
jpayne@68 637 self.right_tail = float(right)
jpayne@68 638 self.two_tail = float(two_tail)
jpayne@68 639 self.ratio = float(ratio)
jpayne@68 640
jpayne@68 641 def __str__(self):
jpayne@68 642 return self.text
jpayne@68 643
jpayne@68 644 def __repr__(self):
jpayne@68 645 return "<%s at %s>\n%s" % (self.__class__.__name__, id(self), self.text)
jpayne@68 646
jpayne@68 647
jpayne@68 648 class SplitOutput(object):
jpayne@68 649 def __init__(self, output, **kwargs):
jpayne@68 650 """
jpayne@68 651 Handles output from bedtools split, which sends a report of files to
jpayne@68 652 stdout. This class parses that list into something more convenient to
jpayne@68 653 use within pybedtools.
jpayne@68 654
jpayne@68 655 Most useful is probably the .bedtools attribute, which is a list of
jpayne@68 656 BedTool objects.
jpayne@68 657 """
jpayne@68 658 from .bedtool import BedTool
jpayne@68 659
jpayne@68 660 if isinstance(output, str):
jpayne@68 661 output = open(output).read()
jpayne@68 662 if hasattr(output, "next"):
jpayne@68 663 output = "".join(i for i in output)
jpayne@68 664
jpayne@68 665 #: store a copy of the output
jpayne@68 666 self.text = output
jpayne@68 667
jpayne@68 668 #: BedTool objects created from output
jpayne@68 669 self.bedtools = []
jpayne@68 670
jpayne@68 671 #: Filenames that were created from the split
jpayne@68 672 self.files = []
jpayne@68 673
jpayne@68 674 #: number of bases in each file
jpayne@68 675 self.nbases = []
jpayne@68 676
jpayne@68 677 #: number of features in each file
jpayne@68 678 self.counts = []
jpayne@68 679
jpayne@68 680 for line in output.splitlines():
jpayne@68 681 toks = line.split()
jpayne@68 682 self.files.append(toks[0])
jpayne@68 683 self.nbases.append(int(toks[1]))
jpayne@68 684 self.counts.append(int(toks[2]))
jpayne@68 685 self.bedtools.append(BedTool(toks[0]))
jpayne@68 686
jpayne@68 687
jpayne@68 688 def internet_on(timeout=1):
jpayne@68 689 try:
jpayne@68 690 response = urllib.request.urlopen("http://genome.ucsc.edu", timeout=timeout)
jpayne@68 691 return True
jpayne@68 692 except urllib.error.URLError as err:
jpayne@68 693 pass
jpayne@68 694 return False
jpayne@68 695
jpayne@68 696
jpayne@68 697 def get_chromsizes_from_ucsc(
jpayne@68 698 genome,
jpayne@68 699 saveas=None,
jpayne@68 700 mysql="mysql",
jpayne@68 701 fetchchromsizes="fetchChromSizes",
jpayne@68 702 timeout=None,
jpayne@68 703 host_url="genome-mysql.cse.ucsc.edu",
jpayne@68 704 ):
jpayne@68 705 """
jpayne@68 706 Download chrom size info for *genome* from UCSC and returns the dictionary.
jpayne@68 707
jpayne@68 708 Parameters
jpayne@68 709 ----------
jpayne@68 710
jpayne@68 711 genome : str
jpayne@68 712 Name of the genome assembly (e.g., "hg38")
jpayne@68 713
jpayne@68 714 saveas : str
jpayne@68 715 Filename to save output to. Dictionary will still be returned.
jpayne@68 716
jpayne@68 717 mysql, fetchchromsizes : str
jpayne@68 718 Paths to MySQL and fetchChromSizes.
jpayne@68 719
jpayne@68 720 timeout : float
jpayne@68 721 How long to wait for a response; mostly used for testing.
jpayne@68 722
jpayne@68 723 host_url : str
jpayne@68 724 URL of UCSC mirror MySQL server.
jpayne@68 725 """
jpayne@68 726 if not internet_on(timeout=timeout):
jpayne@68 727 raise ValueError(
jpayne@68 728 "It appears you don't have an internet connection "
jpayne@68 729 "-- unable to get chromsizes from UCSC"
jpayne@68 730 )
jpayne@68 731 cmds = [
jpayne@68 732 mysql,
jpayne@68 733 "--user=genome",
jpayne@68 734 "--host=" + host_url,
jpayne@68 735 "-A",
jpayne@68 736 "-e",
jpayne@68 737 "select chrom, size from %s.chromInfo" % genome,
jpayne@68 738 ]
jpayne@68 739 failures = []
jpayne@68 740 d = {}
jpayne@68 741 try:
jpayne@68 742 p = subprocess.Popen(
jpayne@68 743 cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68 744 )
jpayne@68 745 stdout, stderr = p.communicate()
jpayne@68 746 if stderr:
jpayne@68 747 print(stderr)
jpayne@68 748 print("Commands were:\n")
jpayne@68 749 print((subprocess.list2cmdline(cmds)))
jpayne@68 750
jpayne@68 751 lines = stdout.splitlines()[1:]
jpayne@68 752 for line in lines:
jpayne@68 753 if isinstance(line, bytes):
jpayne@68 754 line = line.decode("UTF-8")
jpayne@68 755 chrom, size = line.split()
jpayne@68 756 d[chrom] = (0, int(size))
jpayne@68 757
jpayne@68 758 if saveas is not None:
jpayne@68 759 chromsizes_to_file(d, saveas)
jpayne@68 760
jpayne@68 761 except OSError as err:
jpayne@68 762 if err.errno == 2:
jpayne@68 763 failures.append("Can't find mysql at path {0}".format(mysql))
jpayne@68 764 else:
jpayne@68 765 raise
jpayne@68 766 try:
jpayne@68 767 cmds = [fetchchromsizes, genome]
jpayne@68 768 p = subprocess.Popen(
jpayne@68 769 cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68 770 )
jpayne@68 771 stdout, stderr = p.communicate()
jpayne@68 772 if stderr:
jpayne@68 773 if "INFO: trying WGET" not in str(stderr):
jpayne@68 774 print(stderr)
jpayne@68 775 print("Commands were:\n")
jpayne@68 776 print((subprocess.list2cmdline(cmds)))
jpayne@68 777
jpayne@68 778 lines = stdout.splitlines()
jpayne@68 779 for line in lines:
jpayne@68 780 if isinstance(line, bytes):
jpayne@68 781 line = line.decode("UTF-8")
jpayne@68 782 chrom, size = line.split()
jpayne@68 783 d[chrom] = (0, int(size))
jpayne@68 784
jpayne@68 785 if saveas is not None:
jpayne@68 786 chromsizes_to_file(d, saveas)
jpayne@68 787
jpayne@68 788 except OSError as err:
jpayne@68 789 if err.errno == 2:
jpayne@68 790 failures.append("Can't find path to fetchChromsizes")
jpayne@68 791
jpayne@68 792 if not d:
jpayne@68 793 raise OSError(failures)
jpayne@68 794 return d
jpayne@68 795
jpayne@68 796
jpayne@68 797 def chromsizes_to_file(chrom_sizes, fn=None):
jpayne@68 798 """
jpayne@68 799 Converts a *chromsizes* dictionary to a file. If *fn* is None, then a
jpayne@68 800 tempfile is created (which can be deleted with pybedtools.cleanup()).
jpayne@68 801
jpayne@68 802 Returns the filename.
jpayne@68 803 """
jpayne@68 804 if fn is None:
jpayne@68 805 tmpfn = tempfile.NamedTemporaryFile(
jpayne@68 806 prefix="pybedtools.", suffix=".tmp", delete=False
jpayne@68 807 )
jpayne@68 808 tmpfn = tmpfn.name
jpayne@68 809 filenames.TEMPFILES.append(tmpfn)
jpayne@68 810 fn = tmpfn
jpayne@68 811 if isinstance(chrom_sizes, str):
jpayne@68 812 chrom_sizes = chromsizes(chrom_sizes)
jpayne@68 813 fout = open(fn, "wt")
jpayne@68 814 for chrom, bounds in chrom_sizes.items():
jpayne@68 815 line = chrom + "\t" + str(bounds[1]) + "\n"
jpayne@68 816 fout.write(line)
jpayne@68 817 fout.close()
jpayne@68 818 return fn
jpayne@68 819
jpayne@68 820
jpayne@68 821 def get_chromsizes_from_genomepy(
jpayne@68 822 genome, saveas=None,
jpayne@68 823 ):
jpayne@68 824 """
jpayne@68 825 Get chrom size info for *genome* from genomepy, if genomepy is installed.
jpayne@68 826
jpayne@68 827 Parameters
jpayne@68 828 ----------
jpayne@68 829
jpayne@68 830 genome : str
jpayne@68 831 Name of the genome assembly (e.g., "hg38")
jpayne@68 832
jpayne@68 833 saveas : str
jpayne@68 834 Filename to save output to. Dictionary will still be returned.
jpayne@68 835 """
jpayne@68 836 if "genomepy" not in sys.modules:
jpayne@68 837 return None
jpayne@68 838
jpayne@68 839 d = {}
jpayne@68 840 try:
jpayne@68 841 g = genomepy.Genome(genome)
jpayne@68 842 # Fail silently if the sizes file cannot be accessed
jpayne@68 843 if not hasattr(g, "sizes_file"):
jpayne@68 844 return None
jpayne@68 845 for line in open(g.sizes_file):
jpayne@68 846 chrom, size = line.split()
jpayne@68 847 d[chrom] = (0, int(size))
jpayne@68 848
jpayne@68 849 if saveas is not None:
jpayne@68 850 chromsizes_to_file(d, saveas)
jpayne@68 851 except FileNotFoundError:
jpayne@68 852 return None
jpayne@68 853
jpayne@68 854 return d
jpayne@68 855
jpayne@68 856
jpayne@68 857 def chromsizes(genome):
jpayne@68 858 """
jpayne@68 859 Looks for a *genome* already included in the genome registry; if not found
jpayne@68 860 it first tries to look it up via genomepy. If genomepy is not installed, or
jpayne@68 861 if this lookup fails then it looks it up on UCSC. Returns the dictionary of
jpayne@68 862 chromsize tuples where each tuple has (start,stop).
jpayne@68 863
jpayne@68 864 Chromsizes are described as (start, stop) tuples to allow randomization
jpayne@68 865 within specified regions; e. g., you can make a chromsizes dictionary that
jpayne@68 866 represents the extent of a tiling array.
jpayne@68 867
jpayne@68 868 Example usage:
jpayne@68 869
jpayne@68 870 >>> dm3_chromsizes = chromsizes('dm3')
jpayne@68 871 >>> for i in sorted(dm3_chromsizes.items()):
jpayne@68 872 ... print(i)
jpayne@68 873 ('chr2L', (0, 23011544))
jpayne@68 874 ('chr2LHet', (0, 368872))
jpayne@68 875 ('chr2R', (0, 21146708))
jpayne@68 876 ('chr2RHet', (0, 3288761))
jpayne@68 877 ('chr3L', (0, 24543557))
jpayne@68 878 ('chr3LHet', (0, 2555491))
jpayne@68 879 ('chr3R', (0, 27905053))
jpayne@68 880 ('chr3RHet', (0, 2517507))
jpayne@68 881 ('chr4', (0, 1351857))
jpayne@68 882 ('chrM', (0, 19517))
jpayne@68 883 ('chrU', (0, 10049037))
jpayne@68 884 ('chrUextra', (0, 29004656))
jpayne@68 885 ('chrX', (0, 22422827))
jpayne@68 886 ('chrXHet', (0, 204112))
jpayne@68 887 ('chrYHet', (0, 347038))
jpayne@68 888
jpayne@68 889
jpayne@68 890 """
jpayne@68 891 try:
jpayne@68 892 return getattr(genome_registry, genome)
jpayne@68 893 except AttributeError:
jpayne@68 894 chromsizes = get_chromsizes_from_genomepy(genome)
jpayne@68 895 if chromsizes is None:
jpayne@68 896 return get_chromsizes_from_ucsc(genome)
jpayne@68 897 else:
jpayne@68 898 return chromsizes
jpayne@68 899
jpayne@68 900
jpayne@68 901 def get_includes():
jpayne@68 902 """
jpayne@68 903 Returns a list of include directories with BEDTools headers
jpayne@68 904 """
jpayne@68 905 dirname = os.path.abspath(os.path.join(os.path.dirname(__file__)))
jpayne@68 906 return [dirname, os.path.join(dirname, "include")]
jpayne@68 907
jpayne@68 908
jpayne@68 909 atexit.register(cleanup)