csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/helpers.py annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/helpers.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children

rev	line source
jpayne@68	1 import sys
jpayne@68	2 import os
jpayne@68	3 import gzip
jpayne@68	4 import tempfile
jpayne@68	5 import subprocess
jpayne@68	6 import glob
jpayne@68	7 import struct
jpayne@68	8 import atexit
jpayne@68	9 import re
jpayne@68	10 import urllib
jpayne@68	11 import urllib.error
jpayne@68	12 import urllib.request
jpayne@68	13
jpayne@68	14 try: # Use genomepy to determine chrom sizes if it is installed
jpayne@68	15 import genomepy
jpayne@68	16 except ImportError:
jpayne@68	17 pass
jpayne@68	18
jpayne@68	19 from . import settings
jpayne@68	20 from . import filenames
jpayne@68	21 from . import genome_registry
jpayne@68	22 from .logger import logger
jpayne@68	23 from .cbedtools import create_interval_from_list
jpayne@68	24
jpayne@68	25 BUFSIZE = -1
jpayne@68	26
jpayne@68	27 _tags = {}
jpayne@68	28
jpayne@68	29
jpayne@68	30 def set_bedtools_path(path=""):
jpayne@68	31 """
jpayne@68	32 Explicitly set path to `BEDTools` installation dir.
jpayne@68	33
jpayne@68	34 If BEDTools is not available on your system path, specify the path to the
jpayne@68	35 dir containing the BEDTools executables (intersectBed, subtractBed, etc)
jpayne@68	36 with this function.
jpayne@68	37
jpayne@68	38 To reset and use the default system path, call this function with no
jpayne@68	39 arguments or use path="".
jpayne@68	40 """
jpayne@68	41 from . import paths
jpayne@68	42
jpayne@68	43 paths._set_bedtools_path(path)
jpayne@68	44
jpayne@68	45
jpayne@68	46 def get_bedtools_path():
jpayne@68	47 """
jpayne@68	48 Returns the currently-set path to bedtools
jpayne@68	49 """
jpayne@68	50 from . import paths
jpayne@68	51
jpayne@68	52 return paths._get_bedtools_path()
jpayne@68	53
jpayne@68	54
jpayne@68	55 def set_R_path(path=""):
jpayne@68	56 """
jpayne@68	57 Explicitly set path to `R` installation dir.
jpayne@68	58
jpayne@68	59 If R is not available on the path, then it can be explicitly
jpayne@68	60 specified here.
jpayne@68	61
jpayne@68	62 Use path="" to reset to default system path.
jpayne@68	63 """
jpayne@68	64 from . import paths
jpayne@68	65
jpayne@68	66 paths._set_R_path(path)
jpayne@68	67
jpayne@68	68
jpayne@68	69 def _check_for_bedtools(force_check=False, verbose=False, override=None):
jpayne@68	70 """
jpayne@68	71 Checks installation as well as version (based on whether or not "bedtools
jpayne@68	72 intersect" works, or just "intersectBed")
jpayne@68	73 """
jpayne@68	74 if settings._bedtools_installed and not force_check:
jpayne@68	75 return True
jpayne@68	76
jpayne@68	77 if (len(settings.bedtools_version) == 0) or force_check:
jpayne@68	78 try:
jpayne@68	79 v = (
jpayne@68	80 subprocess.check_output(
jpayne@68	81 [os.path.join(settings._bedtools_path, "bedtools"), "--version"]
jpayne@68	82 )
jpayne@68	83 .decode("utf-8")
jpayne@68	84 .rstrip()
jpayne@68	85 )
jpayne@68	86
jpayne@68	87 if verbose:
jpayne@68	88 print("Found bedtools version '%s'" % v)
jpayne@68	89
jpayne@68	90 settings._bedtools_installed = True
jpayne@68	91
jpayne@68	92 # Override, used for testing
jpayne@68	93 if override is not None:
jpayne@68	94 v = override
jpayne@68	95
jpayne@68	96 # To allow more complicated versions as found in Linux distributions, e.g.:
jpayne@68	97 # bedtools v2.26.0
jpayne@68	98 # bedtools debian/2.28.0+dfsg-2-dirty
jpayne@68	99 m = re.search("^bedtools [^0-9]([0-9][0-9.])", v)
jpayne@68	100 if not m:
jpayne@68	101 raise ValueError('Cannot identify version number from "{}"'.format(v))
jpayne@68	102 vv = m.group(1)
jpayne@68	103
jpayne@68	104 settings.bedtools_version = [int(i) for i in vv.split(".")]
jpayne@68	105
jpayne@68	106 settings._v_2_27_plus = (
jpayne@68	107 settings.bedtools_version[0] >= 2 and settings.bedtools_version[1] >= 27
jpayne@68	108 )
jpayne@68	109
jpayne@68	110 settings._v_2_15_plus = (
jpayne@68	111 settings.bedtools_version[0] >= 2 and settings.bedtools_version[1] >= 15
jpayne@68	112 )
jpayne@68	113
jpayne@68	114 except subprocess.CalledProcessError:
jpayne@68	115 if settings._bedtools_path:
jpayne@68	116 add_msg = "(tried path '%s')" % settings._bedtools_path
jpayne@68	117 else:
jpayne@68	118 add_msg = ""
jpayne@68	119 raise OSError(
jpayne@68	120 "Please make sure you have installed BEDTools"
jpayne@68	121 "(https://github.com/arq5x/bedtools) and that "
jpayne@68	122 "it's on the path. %s" % add_msg
jpayne@68	123 )
jpayne@68	124
jpayne@68	125
jpayne@68	126 def _check_for_R():
jpayne@68	127 try:
jpayne@68	128 p = subprocess.Popen(
jpayne@68	129 [os.path.join(settings._R_path, "R"), "--version"],
jpayne@68	130 stdout=subprocess.PIPE,
jpayne@68	131 stderr=subprocess.PIPE,
jpayne@68	132 )
jpayne@68	133 settings._R_installed = True
jpayne@68	134 except OSError:
jpayne@68	135 if settings._R_path:
jpayne@68	136 add_msg = "(tried path '%s')" % settings._R_path
jpayne@68	137 else:
jpayne@68	138 add_msg = ""
jpayne@68	139 raise ValueError("Please install R and ensure it is on your path %s" % add_msg)
jpayne@68	140
jpayne@68	141
jpayne@68	142 class Error(Exception):
jpayne@68	143 """Base class for this module's exceptions"""
jpayne@68	144
jpayne@68	145 pass
jpayne@68	146
jpayne@68	147
jpayne@68	148 class pybedtoolsError(Error):
jpayne@68	149 pass
jpayne@68	150
jpayne@68	151
jpayne@68	152 class BEDToolsError(Error):
jpayne@68	153 def __init__(self, cmd, msg):
jpayne@68	154 self.cmd = str(cmd)
jpayne@68	155 self.msg = str(msg)
jpayne@68	156
jpayne@68	157 def __str__(self):
jpayne@68	158 m = (
jpayne@68	159 "\nCommand was:\n\n\t"
jpayne@68	160 + self.cmd
jpayne@68	161 + "\n"
jpayne@68	162 + "\nError message was:\n"
jpayne@68	163 + self.msg
jpayne@68	164 )
jpayne@68	165 return m
jpayne@68	166
jpayne@68	167
jpayne@68	168 def isGZIP(fn):
jpayne@68	169 with open(fn, "rb") as f:
jpayne@68	170 start = f.read(3)
jpayne@68	171 if start == b"\x1f\x8b\x08":
jpayne@68	172 return True
jpayne@68	173 return False
jpayne@68	174
jpayne@68	175
jpayne@68	176 def isBGZIP(fn):
jpayne@68	177 """
jpayne@68	178 Reads a filename to see if it's a BGZIPed file or not.
jpayne@68	179 """
jpayne@68	180 with open(fn, "rb") as fh:
jpayne@68	181 header_str = fh.read(15)
jpayne@68	182
jpayne@68	183 if len(header_str) < 15:
jpayne@68	184 return False
jpayne@68	185
jpayne@68	186 header = struct.unpack_from("BBBBiBBHBBB", header_str)
jpayne@68	187
jpayne@68	188 id1, id2, cm, flg, mtime, xfl, os_, xlen, si1, si2, slen = header
jpayne@68	189 if (
jpayne@68	190 (id1 == 31)
jpayne@68	191 and (id2 == 139)
jpayne@68	192 and (cm == 8)
jpayne@68	193 and (flg == 4)
jpayne@68	194 and (si1 == 66)
jpayne@68	195 and (si2 == 67)
jpayne@68	196 and (slen == 2)
jpayne@68	197 ):
jpayne@68	198 return True
jpayne@68	199 return False
jpayne@68	200
jpayne@68	201
jpayne@68	202 def isBAM(fn):
jpayne@68	203 """
jpayne@68	204 Returns True if the file is both BGZIPed and the compressed contents have
jpayne@68	205 start with the magic number `BAM\\x01`, or if the file is CRAM format (see
jpayne@68	206 isCRAM()).
jpayne@68	207 """
jpayne@68	208 # Note: previously we were catching ValueError when trying to open
jpayne@68	209 # a non-BAM with pysam.Samfile. That started segfaulting, so now do it the
jpayne@68	210 # right way with magic number.
jpayne@68	211 with gzip.open(fn, "rb") as in_:
jpayne@68	212 if isBGZIP(fn) and (in_.read(4).decode() == "BAM\x01"):
jpayne@68	213 return True
jpayne@68	214 if isCRAM(fn):
jpayne@68	215 return True
jpayne@68	216
jpayne@68	217
jpayne@68	218 def isCRAM(fn):
jpayne@68	219 """
jpayne@68	220 Returns True if the file starts with the bytes for the characters "CRAM".
jpayne@68	221 """
jpayne@68	222 with open(fn, "rb") as in_:
jpayne@68	223 if in_.read(4).decode(errors="ignore") == "CRAM":
jpayne@68	224 return True
jpayne@68	225
jpayne@68	226
jpayne@68	227 def find_tagged(tag):
jpayne@68	228 """
jpayne@68	229 Returns the bedtool object with tagged with tag. Useful for tracking
jpayne@68	230 down bedtools you made previously.
jpayne@68	231 """
jpayne@68	232 for key, item in _tags.items():
jpayne@68	233 try:
jpayne@68	234 if item._tag == tag:
jpayne@68	235 return item
jpayne@68	236 except AttributeError:
jpayne@68	237 pass
jpayne@68	238 raise ValueError('tag "%s" not found' % tag)
jpayne@68	239
jpayne@68	240
jpayne@68	241 def _flatten_list(x):
jpayne@68	242 nested = True
jpayne@68	243 while nested:
jpayne@68	244 check_again = False
jpayne@68	245 flattened = []
jpayne@68	246
jpayne@68	247 for element in x:
jpayne@68	248 if isinstance(element, list):
jpayne@68	249 flattened.extend(element)
jpayne@68	250 check_again = True
jpayne@68	251 else:
jpayne@68	252 flattened.append(element)
jpayne@68	253 nested = check_again
jpayne@68	254 x = flattened[:]
jpayne@68	255 return x
jpayne@68	256
jpayne@68	257
jpayne@68	258 def set_tempdir(tempdir):
jpayne@68	259 """
jpayne@68	260 Set the directory for temp files.
jpayne@68	261
jpayne@68	262 Useful for clusters that use a /scratch partition rather than a /tmp dir.
jpayne@68	263 Convenience function to simply set tempfile.tempdir.
jpayne@68	264 """
jpayne@68	265 if not os.path.exists(tempdir):
jpayne@68	266 errstr = "The tempdir you specified, %s, does not exist" % tempdir
jpayne@68	267 raise FileNotFoundError(errstr)
jpayne@68	268 tempfile.tempdir = tempdir
jpayne@68	269
jpayne@68	270
jpayne@68	271 def get_tempdir():
jpayne@68	272 """
jpayne@68	273 Gets the current tempdir for the module.
jpayne@68	274 """
jpayne@68	275 return tempfile.gettempdir()
jpayne@68	276
jpayne@68	277
jpayne@68	278 def cleanup(verbose=False, remove_all=False):
jpayne@68	279 """
jpayne@68	280 Deletes all temp files from the current session (or optionally all \
jpayne@68	281 sessions)
jpayne@68	282
jpayne@68	283 If verbose, reports what it's doing
jpayne@68	284
jpayne@68	285 If remove_all, then ALL files matching "pybedtools.*.tmp" in the temp dir
jpayne@68	286 will be deleted.
jpayne@68	287 """
jpayne@68	288 if settings.KEEP_TEMPFILES:
jpayne@68	289 return
jpayne@68	290 for fn in filenames.TEMPFILES:
jpayne@68	291 if verbose:
jpayne@68	292 print("removing", fn)
jpayne@68	293 if os.path.exists(fn):
jpayne@68	294 os.unlink(fn)
jpayne@68	295 if remove_all:
jpayne@68	296 fns = glob.glob(os.path.join(get_tempdir(), "pybedtools.*.tmp"))
jpayne@68	297 for fn in fns:
jpayne@68	298 os.unlink(fn)
jpayne@68	299
jpayne@68	300
jpayne@68	301 def _version_2_15_plus_names(prog_name):
jpayne@68	302 if not settings._bedtools_installed:
jpayne@68	303 _check_for_bedtools()
jpayne@68	304 if not settings._v_2_15_plus:
jpayne@68	305 return [prog_name]
jpayne@68	306 try:
jpayne@68	307 prog_name = settings._prog_names[prog_name]
jpayne@68	308 except KeyError:
jpayne@68	309 if prog_name in settings._new_names:
jpayne@68	310 pass
jpayne@68	311 raise BEDToolsError(prog_name, prog_name + "not a recognized BEDTools program")
jpayne@68	312 return [os.path.join(settings._bedtools_path, "bedtools"), prog_name]
jpayne@68	313
jpayne@68	314
jpayne@68	315 def call_bedtools(
jpayne@68	316 cmds,
jpayne@68	317 tmpfn=None,
jpayne@68	318 stdin=None,
jpayne@68	319 check_stderr=None,
jpayne@68	320 decode_output=True,
jpayne@68	321 encode_input=True,
jpayne@68	322 ):
jpayne@68	323 """
jpayne@68	324 Use subprocess.Popen to call BEDTools and catch any errors.
jpayne@68	325
jpayne@68	326 Output goes to tmpfn, or, if None, output stays in subprocess.PIPE and
jpayne@68	327 can be iterated over.
jpayne@68	328
jpayne@68	329 stdin is an optional file-like object that will be sent to
jpayne@68	330 subprocess.Popen.
jpayne@68	331
jpayne@68	332 Prints some useful help upon getting common errors.
jpayne@68	333
jpayne@68	334 check_stderr is a function that takes the stderr string as input and
jpayne@68	335 returns True if it's OK (that is, it's not really an error). This is
jpayne@68	336 needed, e.g., for calling fastaFromBed which will report that it has to
jpayne@68	337 make a .fai for a fasta file.
jpayne@68	338
jpayne@68	339 decode_output should be set to False when you are iterating over a BAM
jpayne@68	340 file, where the data represent binary rather than text data.
jpayne@68	341 """
jpayne@68	342 input_is_stream = stdin is not None
jpayne@68	343 output_is_stream = tmpfn is None
jpayne@68	344
jpayne@68	345 _orig_cmds = cmds[:]
jpayne@68	346 cmds = []
jpayne@68	347 cmds.extend(_version_2_15_plus_names(_orig_cmds[0]))
jpayne@68	348 cmds.extend(_orig_cmds[1:])
jpayne@68	349
jpayne@68	350 try:
jpayne@68	351 # coming from an iterator, sending as iterator
jpayne@68	352 if input_is_stream and output_is_stream:
jpayne@68	353 logger.debug(
jpayne@68	354 "helpers.call_bedtools(): input is stream, output is " "stream"
jpayne@68	355 )
jpayne@68	356 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68	357 p = subprocess.Popen(
jpayne@68	358 cmds,
jpayne@68	359 stdout=subprocess.PIPE,
jpayne@68	360 stderr=subprocess.PIPE,
jpayne@68	361 stdin=subprocess.PIPE,
jpayne@68	362 bufsize=BUFSIZE,
jpayne@68	363 )
jpayne@68	364 if encode_input:
jpayne@68	365 for line in stdin:
jpayne@68	366 p.stdin.write(line.encode())
jpayne@68	367 else:
jpayne@68	368 for line in stdin:
jpayne@68	369 p.stdin.write(line)
jpayne@68	370
jpayne@68	371 # This is important to prevent deadlocks
jpayne@68	372 p.stdin.close()
jpayne@68	373
jpayne@68	374 if decode_output:
jpayne@68	375 output = (i.decode("UTF-8") for i in p.stdout)
jpayne@68	376 else:
jpayne@68	377 output = (i for i in p.stdout)
jpayne@68	378
jpayne@68	379 stderr = None
jpayne@68	380
jpayne@68	381 # coming from an iterator, writing to file
jpayne@68	382 if input_is_stream and not output_is_stream:
jpayne@68	383 logger.debug("helpers.call_bedtools(): input is stream, output is file")
jpayne@68	384 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68	385 outfile = open(tmpfn, "wb")
jpayne@68	386 p = subprocess.Popen(
jpayne@68	387 cmds,
jpayne@68	388 stdout=outfile,
jpayne@68	389 stderr=subprocess.PIPE,
jpayne@68	390 stdin=subprocess.PIPE,
jpayne@68	391 bufsize=BUFSIZE,
jpayne@68	392 )
jpayne@68	393 if hasattr(stdin, "read"):
jpayne@68	394 stdout, stderr = p.communicate(stdin.read())
jpayne@68	395 else:
jpayne@68	396 for item in stdin:
jpayne@68	397 p.stdin.write(item.encode())
jpayne@68	398 stdout, stderr = p.communicate()
jpayne@68	399 output = tmpfn
jpayne@68	400 outfile.close()
jpayne@68	401
jpayne@68	402 # coming from a file, sending as iterator
jpayne@68	403 if not input_is_stream and output_is_stream:
jpayne@68	404 logger.debug(
jpayne@68	405 "helpers.call_bedtools(): input is filename, " "output is stream"
jpayne@68	406 )
jpayne@68	407 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68	408 p = subprocess.Popen(
jpayne@68	409 cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68	410 )
jpayne@68	411 if decode_output:
jpayne@68	412 output = (i.decode("UTF-8") for i in p.stdout)
jpayne@68	413 else:
jpayne@68	414 output = (i for i in p.stdout)
jpayne@68	415 stderr = None
jpayne@68	416
jpayne@68	417 # file-to-file
jpayne@68	418 if not input_is_stream and not output_is_stream:
jpayne@68	419 logger.debug(
jpayne@68	420 "helpers.call_bedtools(): input is filename, output "
jpayne@68	421 "is filename (%s)",
jpayne@68	422 tmpfn,
jpayne@68	423 )
jpayne@68	424 cmds = list(map(str, cmds))
jpayne@68	425 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68	426 outfile = open(tmpfn, "wb")
jpayne@68	427 p = subprocess.Popen(
jpayne@68	428 cmds, stdout=outfile, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68	429 )
jpayne@68	430 stdout, stderr = p.communicate()
jpayne@68	431 output = tmpfn
jpayne@68	432 outfile.close()
jpayne@68	433
jpayne@68	434 # Check if it's OK using a provided function to check stderr. If it's
jpayne@68	435 # OK, dump it to sys.stderr so it's printed, and reset it to None so we
jpayne@68	436 # don't raise an exception
jpayne@68	437 if check_stderr is not None:
jpayne@68	438 if isinstance(stderr, bytes):
jpayne@68	439 stderr = stderr.decode("UTF_8")
jpayne@68	440 if check_stderr(stderr):
jpayne@68	441 sys.stderr.write(stderr)
jpayne@68	442 stderr = None
jpayne@68	443
jpayne@68	444 if stderr:
jpayne@68	445 # Fix for issue #147. In general, we consider warnings to not be
jpayne@68	446 # fatal, so just show 'em and continue on.
jpayne@68	447 #
jpayne@68	448 # bedtools source has several different ways of showing a warning,
jpayne@68	449 # but they seem to all have "WARNING" in the first 20 or so
jpayne@68	450 # characters
jpayne@68	451 if isinstance(stderr, bytes):
jpayne@68	452 stderr = stderr.decode("UTF_8")
jpayne@68	453 if len(stderr) > 20 and "WARNING" in stderr[:20].upper():
jpayne@68	454 sys.stderr.write(stderr)
jpayne@68	455 else:
jpayne@68	456 raise BEDToolsError(subprocess.list2cmdline(cmds), stderr)
jpayne@68	457
jpayne@68	458 except (OSError, IOError) as err:
jpayne@68	459 print("%s: %s" % (type(err), os.strerror(err.errno)))
jpayne@68	460 print("The command was:\n\n\t%s\n" % subprocess.list2cmdline(cmds))
jpayne@68	461
jpayne@68	462 problems = {
jpayne@68	463 2: (
jpayne@68	464 "* Did you spell the command correctly?",
jpayne@68	465 "* Do you have BEDTools installed and on the path?",
jpayne@68	466 ),
jpayne@68	467 13: (
jpayne@68	468 "* Do you have permission to write "
jpayne@68	469 'to the output file ("%s")?' % tmpfn,
jpayne@68	470 ),
jpayne@68	471 24: (
jpayne@68	472 "* Too many files open -- please submit "
jpayne@68	473 "a bug report so that this can be fixed",
jpayne@68	474 ),
jpayne@68	475 32: (
jpayne@68	476 "* Broken pipe -- if you passed a BedTool object "
jpayne@68	477 "that was created using a generator function, "
jpayne@68	478 "please try saving it to disk first using the "
jpayne@68	479 ".saveas() method before calling this bedtools "
jpayne@68	480 "command. See issue #49 for more.",
jpayne@68	481 ),
jpayne@68	482 }
jpayne@68	483
jpayne@68	484 print("Things to check:")
jpayne@68	485 print("\n\t" + "\n\t".join(problems[err.errno]))
jpayne@68	486 raise OSError("See above for commands that gave the error")
jpayne@68	487
jpayne@68	488 return output
jpayne@68	489
jpayne@68	490
jpayne@68	491 def _check_sequence_stderr(x):
jpayne@68	492 """
jpayne@68	493 If stderr created by fastaFromBed starts with 'index file', then don't
jpayne@68	494 consider it an error.
jpayne@68	495 """
jpayne@68	496 if isinstance(x, bytes):
jpayne@68	497 x = x.decode("UTF-8")
jpayne@68	498 if x.startswith("index file"):
jpayne@68	499 return True
jpayne@68	500 if x.startswith("WARNING"):
jpayne@68	501 return True
jpayne@68	502 return False
jpayne@68	503
jpayne@68	504
jpayne@68	505 def _call_randomintersect(
jpayne@68	506 _self,
jpayne@68	507 other,
jpayne@68	508 iterations,
jpayne@68	509 intersect_kwargs,
jpayne@68	510 shuffle_kwargs,
jpayne@68	511 report_iterations,
jpayne@68	512 debug,
jpayne@68	513 _orig_processes,
jpayne@68	514 ):
jpayne@68	515 """
jpayne@68	516 Helper function that list-ifies the output from randomintersection, s.t.
jpayne@68	517 it can be pickled across a multiprocess Pool.
jpayne@68	518 """
jpayne@68	519 return list(
jpayne@68	520 _self.randomintersection(
jpayne@68	521 other,
jpayne@68	522 iterations,
jpayne@68	523 intersect_kwargs=intersect_kwargs,
jpayne@68	524 shuffle_kwargs=shuffle_kwargs,
jpayne@68	525 report_iterations=report_iterations,
jpayne@68	526 debug=False,
jpayne@68	527 processes=None,
jpayne@68	528 _orig_processes=_orig_processes,
jpayne@68	529 )
jpayne@68	530 )
jpayne@68	531
jpayne@68	532
jpayne@68	533 def close_or_delete(*args):
jpayne@68	534 """
jpayne@68	535 Single function that can be used to get rid of a BedTool, whether it's a
jpayne@68	536 streaming or file-based version.
jpayne@68	537 """
jpayne@68	538 for x in args:
jpayne@68	539 if isinstance(x.fn, str):
jpayne@68	540 os.unlink(x.fn)
jpayne@68	541 elif hasattr(x.fn, "close"):
jpayne@68	542 x.fn.close()
jpayne@68	543 if hasattr(x.fn, "throw"):
jpayne@68	544 x.fn.throw(StopIteration)
jpayne@68	545
jpayne@68	546
jpayne@68	547 def n_open_fds():
jpayne@68	548 pid = os.getpid()
jpayne@68	549 procs = subprocess.check_output(["lsof", "-w", "-Ff", "-p", str(pid)])
jpayne@68	550 nprocs = 0
jpayne@68	551 for i in procs.splitlines():
jpayne@68	552 if i[1:].isdigit() and i[0] == "f":
jpayne@68	553 nprocs += 1
jpayne@68	554 return nprocs
jpayne@68	555
jpayne@68	556
jpayne@68	557 import re
jpayne@68	558
jpayne@68	559 coord_re = re.compile(
jpayne@68	560 r"""
jpayne@68	561 (?P<chrom>.+):
jpayne@68	562 (?P<start>\d+)-
jpayne@68	563 (?P<stop>\d+)
jpayne@68	564 (?:\[(?P<strand>.)\])?""",
jpayne@68	565 re.VERBOSE,
jpayne@68	566 )
jpayne@68	567
jpayne@68	568
jpayne@68	569 def string_to_interval(s):
jpayne@68	570 """
jpayne@68	571 Convert string of the form "chrom:start-stop" or "chrom:start-stop[strand]"
jpayne@68	572 to an interval.
jpayne@68	573
jpayne@68	574 Assumes zero-based coords.
jpayne@68	575
jpayne@68	576 If it's already an interval, then return it as-is.
jpayne@68	577 """
jpayne@68	578 if isinstance(s, str):
jpayne@68	579 m = coord_re.search(s)
jpayne@68	580 if m.group("strand"):
jpayne@68	581 return create_interval_from_list(
jpayne@68	582 [
jpayne@68	583 m.group("chrom"),
jpayne@68	584 m.group("start"),
jpayne@68	585 m.group("stop"),
jpayne@68	586 ".",
jpayne@68	587 "0",
jpayne@68	588 m.group("strand"),
jpayne@68	589 ]
jpayne@68	590 )
jpayne@68	591 else:
jpayne@68	592 return create_interval_from_list(
jpayne@68	593 [m.group("chrom"), m.group("start"), m.group("stop")]
jpayne@68	594 )
jpayne@68	595 return s
jpayne@68	596
jpayne@68	597
jpayne@68	598 class FisherOutput(object):
jpayne@68	599 def __init__(self, s, **kwargs):
jpayne@68	600 """
jpayne@68	601 fisher returns text results like::
jpayne@68	602
jpayne@68	603 # Contingency Table
jpayne@68	604 #_________________________________________
jpayne@68	605 # \| not in -b \| in -b \|
jpayne@68	606 # not in -a \| 3137160615 \| 503 \|
jpayne@68	607 # in -a \| 100 \| 46 \|
jpayne@68	608 #_________________________________________
jpayne@68	609 # p-values for fisher's exact test
jpayne@68	610 left right two-tail ratio
jpayne@68	611 1.00000 0.00000 0.00000 2868973.922
jpayne@68	612
jpayne@68	613 """
jpayne@68	614 if isinstance(s, str):
jpayne@68	615 s = open(s).read()
jpayne@68	616 if hasattr(s, "next"):
jpayne@68	617 s = "".join(i for i in s)
jpayne@68	618 table = {
jpayne@68	619 "not in -a": {"not in -b": None, "in -b": None},
jpayne@68	620 "in -a": {"not in -b": None, "in -b": None},
jpayne@68	621 }
jpayne@68	622 self.text = s
jpayne@68	623 lines = s.splitlines()
jpayne@68	624 for i in lines:
jpayne@68	625 if "not in -a" in i:
jpayne@68	626 _, in_b, not_in_b, _ = i.strip().split("\|")
jpayne@68	627 table["not in -a"]["not in -b"] = int(not_in_b)
jpayne@68	628 table["not in -a"]["in -b"] = int(in_b)
jpayne@68	629
jpayne@68	630 if " in -a" in i:
jpayne@68	631 _, in_b, not_in_b, _ = i.strip().split("\|")
jpayne@68	632 table["in -a"]["not in -b"] = int(not_in_b)
jpayne@68	633 table["in -a"]["in -b"] = int(in_b)
jpayne@68	634 self.table = table
jpayne@68	635 left, right, two_tail, ratio = lines[-1].split()
jpayne@68	636 self.left_tail = float(left)
jpayne@68	637 self.right_tail = float(right)
jpayne@68	638 self.two_tail = float(two_tail)
jpayne@68	639 self.ratio = float(ratio)
jpayne@68	640
jpayne@68	641 def __str__(self):
jpayne@68	642 return self.text
jpayne@68	643
jpayne@68	644 def __repr__(self):
jpayne@68	645 return "<%s at %s>\n%s" % (self.__class__.__name__, id(self), self.text)
jpayne@68	646
jpayne@68	647
jpayne@68	648 class SplitOutput(object):
jpayne@68	649 def __init__(self, output, **kwargs):
jpayne@68	650 """
jpayne@68	651 Handles output from bedtools split, which sends a report of files to
jpayne@68	652 stdout. This class parses that list into something more convenient to
jpayne@68	653 use within pybedtools.
jpayne@68	654
jpayne@68	655 Most useful is probably the .bedtools attribute, which is a list of
jpayne@68	656 BedTool objects.
jpayne@68	657 """
jpayne@68	658 from .bedtool import BedTool
jpayne@68	659
jpayne@68	660 if isinstance(output, str):
jpayne@68	661 output = open(output).read()
jpayne@68	662 if hasattr(output, "next"):
jpayne@68	663 output = "".join(i for i in output)
jpayne@68	664
jpayne@68	665 #: store a copy of the output
jpayne@68	666 self.text = output
jpayne@68	667
jpayne@68	668 #: BedTool objects created from output
jpayne@68	669 self.bedtools = []
jpayne@68	670
jpayne@68	671 #: Filenames that were created from the split
jpayne@68	672 self.files = []
jpayne@68	673
jpayne@68	674 #: number of bases in each file
jpayne@68	675 self.nbases = []
jpayne@68	676
jpayne@68	677 #: number of features in each file
jpayne@68	678 self.counts = []
jpayne@68	679
jpayne@68	680 for line in output.splitlines():
jpayne@68	681 toks = line.split()
jpayne@68	682 self.files.append(toks[0])
jpayne@68	683 self.nbases.append(int(toks[1]))
jpayne@68	684 self.counts.append(int(toks[2]))
jpayne@68	685 self.bedtools.append(BedTool(toks[0]))
jpayne@68	686
jpayne@68	687
jpayne@68	688 def internet_on(timeout=1):
jpayne@68	689 try:
jpayne@68	690 response = urllib.request.urlopen("http://genome.ucsc.edu", timeout=timeout)
jpayne@68	691 return True
jpayne@68	692 except urllib.error.URLError as err:
jpayne@68	693 pass
jpayne@68	694 return False
jpayne@68	695
jpayne@68	696
jpayne@68	697 def get_chromsizes_from_ucsc(
jpayne@68	698 genome,
jpayne@68	699 saveas=None,
jpayne@68	700 mysql="mysql",
jpayne@68	701 fetchchromsizes="fetchChromSizes",
jpayne@68	702 timeout=None,
jpayne@68	703 host_url="genome-mysql.cse.ucsc.edu",
jpayne@68	704 ):
jpayne@68	705 """
jpayne@68	706 Download chrom size info for genome from UCSC and returns the dictionary.
jpayne@68	707
jpayne@68	708 Parameters
jpayne@68	709 ----------
jpayne@68	710
jpayne@68	711 genome : str
jpayne@68	712 Name of the genome assembly (e.g., "hg38")
jpayne@68	713
jpayne@68	714 saveas : str
jpayne@68	715 Filename to save output to. Dictionary will still be returned.
jpayne@68	716
jpayne@68	717 mysql, fetchchromsizes : str
jpayne@68	718 Paths to MySQL and fetchChromSizes.
jpayne@68	719
jpayne@68	720 timeout : float
jpayne@68	721 How long to wait for a response; mostly used for testing.
jpayne@68	722
jpayne@68	723 host_url : str
jpayne@68	724 URL of UCSC mirror MySQL server.
jpayne@68	725 """
jpayne@68	726 if not internet_on(timeout=timeout):
jpayne@68	727 raise ValueError(
jpayne@68	728 "It appears you don't have an internet connection "
jpayne@68	729 "-- unable to get chromsizes from UCSC"
jpayne@68	730 )
jpayne@68	731 cmds = [
jpayne@68	732 mysql,
jpayne@68	733 "--user=genome",
jpayne@68	734 "--host=" + host_url,
jpayne@68	735 "-A",
jpayne@68	736 "-e",
jpayne@68	737 "select chrom, size from %s.chromInfo" % genome,
jpayne@68	738 ]
jpayne@68	739 failures = []
jpayne@68	740 d = {}
jpayne@68	741 try:
jpayne@68	742 p = subprocess.Popen(
jpayne@68	743 cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68	744 )
jpayne@68	745 stdout, stderr = p.communicate()
jpayne@68	746 if stderr:
jpayne@68	747 print(stderr)
jpayne@68	748 print("Commands were:\n")
jpayne@68	749 print((subprocess.list2cmdline(cmds)))
jpayne@68	750
jpayne@68	751 lines = stdout.splitlines()[1:]
jpayne@68	752 for line in lines:
jpayne@68	753 if isinstance(line, bytes):
jpayne@68	754 line = line.decode("UTF-8")
jpayne@68	755 chrom, size = line.split()
jpayne@68	756 d[chrom] = (0, int(size))
jpayne@68	757
jpayne@68	758 if saveas is not None:
jpayne@68	759 chromsizes_to_file(d, saveas)
jpayne@68	760
jpayne@68	761 except OSError as err:
jpayne@68	762 if err.errno == 2:
jpayne@68	763 failures.append("Can't find mysql at path {0}".format(mysql))
jpayne@68	764 else:
jpayne@68	765 raise
jpayne@68	766 try:
jpayne@68	767 cmds = [fetchchromsizes, genome]
jpayne@68	768 p = subprocess.Popen(
jpayne@68	769 cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68	770 )
jpayne@68	771 stdout, stderr = p.communicate()
jpayne@68	772 if stderr:
jpayne@68	773 if "INFO: trying WGET" not in str(stderr):
jpayne@68	774 print(stderr)
jpayne@68	775 print("Commands were:\n")
jpayne@68	776 print((subprocess.list2cmdline(cmds)))
jpayne@68	777
jpayne@68	778 lines = stdout.splitlines()
jpayne@68	779 for line in lines:
jpayne@68	780 if isinstance(line, bytes):
jpayne@68	781 line = line.decode("UTF-8")
jpayne@68	782 chrom, size = line.split()
jpayne@68	783 d[chrom] = (0, int(size))
jpayne@68	784
jpayne@68	785 if saveas is not None:
jpayne@68	786 chromsizes_to_file(d, saveas)
jpayne@68	787
jpayne@68	788 except OSError as err:
jpayne@68	789 if err.errno == 2:
jpayne@68	790 failures.append("Can't find path to fetchChromsizes")
jpayne@68	791
jpayne@68	792 if not d:
jpayne@68	793 raise OSError(failures)
jpayne@68	794 return d
jpayne@68	795
jpayne@68	796
jpayne@68	797 def chromsizes_to_file(chrom_sizes, fn=None):
jpayne@68	798 """
jpayne@68	799 Converts a chromsizes dictionary to a file. If fn is None, then a
jpayne@68	800 tempfile is created (which can be deleted with pybedtools.cleanup()).
jpayne@68	801
jpayne@68	802 Returns the filename.
jpayne@68	803 """
jpayne@68	804 if fn is None:
jpayne@68	805 tmpfn = tempfile.NamedTemporaryFile(
jpayne@68	806 prefix="pybedtools.", suffix=".tmp", delete=False
jpayne@68	807 )
jpayne@68	808 tmpfn = tmpfn.name
jpayne@68	809 filenames.TEMPFILES.append(tmpfn)
jpayne@68	810 fn = tmpfn
jpayne@68	811 if isinstance(chrom_sizes, str):
jpayne@68	812 chrom_sizes = chromsizes(chrom_sizes)
jpayne@68	813 fout = open(fn, "wt")
jpayne@68	814 for chrom, bounds in chrom_sizes.items():
jpayne@68	815 line = chrom + "\t" + str(bounds[1]) + "\n"
jpayne@68	816 fout.write(line)
jpayne@68	817 fout.close()
jpayne@68	818 return fn
jpayne@68	819
jpayne@68	820
jpayne@68	821 def get_chromsizes_from_genomepy(
jpayne@68	822 genome, saveas=None,
jpayne@68	823 ):
jpayne@68	824 """
jpayne@68	825 Get chrom size info for genome from genomepy, if genomepy is installed.
jpayne@68	826
jpayne@68	827 Parameters
jpayne@68	828 ----------
jpayne@68	829
jpayne@68	830 genome : str
jpayne@68	831 Name of the genome assembly (e.g., "hg38")
jpayne@68	832
jpayne@68	833 saveas : str
jpayne@68	834 Filename to save output to. Dictionary will still be returned.
jpayne@68	835 """
jpayne@68	836 if "genomepy" not in sys.modules:
jpayne@68	837 return None
jpayne@68	838
jpayne@68	839 d = {}
jpayne@68	840 try:
jpayne@68	841 g = genomepy.Genome(genome)
jpayne@68	842 # Fail silently if the sizes file cannot be accessed
jpayne@68	843 if not hasattr(g, "sizes_file"):
jpayne@68	844 return None
jpayne@68	845 for line in open(g.sizes_file):
jpayne@68	846 chrom, size = line.split()
jpayne@68	847 d[chrom] = (0, int(size))
jpayne@68	848
jpayne@68	849 if saveas is not None:
jpayne@68	850 chromsizes_to_file(d, saveas)
jpayne@68	851 except FileNotFoundError:
jpayne@68	852 return None
jpayne@68	853
jpayne@68	854 return d
jpayne@68	855
jpayne@68	856
jpayne@68	857 def chromsizes(genome):
jpayne@68	858 """
jpayne@68	859 Looks for a genome already included in the genome registry; if not found
jpayne@68	860 it first tries to look it up via genomepy. If genomepy is not installed, or
jpayne@68	861 if this lookup fails then it looks it up on UCSC. Returns the dictionary of
jpayne@68	862 chromsize tuples where each tuple has (start,stop).
jpayne@68	863
jpayne@68	864 Chromsizes are described as (start, stop) tuples to allow randomization
jpayne@68	865 within specified regions; e. g., you can make a chromsizes dictionary that
jpayne@68	866 represents the extent of a tiling array.
jpayne@68	867
jpayne@68	868 Example usage:
jpayne@68	869
jpayne@68	870 >>> dm3_chromsizes = chromsizes('dm3')
jpayne@68	871 >>> for i in sorted(dm3_chromsizes.items()):
jpayne@68	872 ... print(i)
jpayne@68	873 ('chr2L', (0, 23011544))
jpayne@68	874 ('chr2LHet', (0, 368872))
jpayne@68	875 ('chr2R', (0, 21146708))
jpayne@68	876 ('chr2RHet', (0, 3288761))
jpayne@68	877 ('chr3L', (0, 24543557))
jpayne@68	878 ('chr3LHet', (0, 2555491))
jpayne@68	879 ('chr3R', (0, 27905053))
jpayne@68	880 ('chr3RHet', (0, 2517507))
jpayne@68	881 ('chr4', (0, 1351857))
jpayne@68	882 ('chrM', (0, 19517))
jpayne@68	883 ('chrU', (0, 10049037))
jpayne@68	884 ('chrUextra', (0, 29004656))
jpayne@68	885 ('chrX', (0, 22422827))
jpayne@68	886 ('chrXHet', (0, 204112))
jpayne@68	887 ('chrYHet', (0, 347038))
jpayne@68	888
jpayne@68	889
jpayne@68	890 """
jpayne@68	891 try:
jpayne@68	892 return getattr(genome_registry, genome)
jpayne@68	893 except AttributeError:
jpayne@68	894 chromsizes = get_chromsizes_from_genomepy(genome)
jpayne@68	895 if chromsizes is None:
jpayne@68	896 return get_chromsizes_from_ucsc(genome)
jpayne@68	897 else:
jpayne@68	898 return chromsizes
jpayne@68	899
jpayne@68	900
jpayne@68	901 def get_includes():
jpayne@68	902 """
jpayne@68	903 Returns a list of include directories with BEDTools headers
jpayne@68	904 """
jpayne@68	905 dirname = os.path.abspath(os.path.join(os.path.dirname(__file__)))
jpayne@68	906 return [dirname, os.path.join(dirname, "include")]
jpayne@68	907
jpayne@68	908
jpayne@68	909 atexit.register(cleanup)

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/helpers.py @ 68:5028fdace37b