jpayne@68: import sys
jpayne@68: import os
jpayne@68: import gzip
jpayne@68: import tempfile
jpayne@68: import subprocess
jpayne@68: import glob
jpayne@68: import struct
jpayne@68: import atexit
jpayne@68: import re
jpayne@68: import urllib
jpayne@68: import urllib.error
jpayne@68: import urllib.request
jpayne@68: 
jpayne@68: try:  # Use genomepy to determine chrom sizes if it is installed
jpayne@68:     import genomepy
jpayne@68: except ImportError:
jpayne@68:     pass
jpayne@68: 
jpayne@68: from . import settings
jpayne@68: from . import filenames
jpayne@68: from . import genome_registry
jpayne@68: from .logger import logger
jpayne@68: from .cbedtools import create_interval_from_list
jpayne@68: 
jpayne@68: BUFSIZE = -1
jpayne@68: 
jpayne@68: _tags = {}
jpayne@68: 
jpayne@68: 
jpayne@68: def set_bedtools_path(path=""):
jpayne@68:     """
jpayne@68:     Explicitly set path to `BEDTools` installation dir.
jpayne@68: 
jpayne@68:     If BEDTools is not available on your system path, specify the path to the
jpayne@68:     dir containing the BEDTools executables (intersectBed, subtractBed, etc)
jpayne@68:     with this function.
jpayne@68: 
jpayne@68:     To reset and use the default system path, call this function with no
jpayne@68:     arguments or use path="".
jpayne@68:     """
jpayne@68:     from . import paths
jpayne@68: 
jpayne@68:     paths._set_bedtools_path(path)
jpayne@68: 
jpayne@68: 
jpayne@68: def get_bedtools_path():
jpayne@68:     """
jpayne@68:     Returns the currently-set path to bedtools
jpayne@68:     """
jpayne@68:     from . import paths
jpayne@68: 
jpayne@68:     return paths._get_bedtools_path()
jpayne@68: 
jpayne@68: 
jpayne@68: def set_R_path(path=""):
jpayne@68:     """
jpayne@68:     Explicitly set path to `R` installation dir.
jpayne@68: 
jpayne@68:     If R is not available on the path, then it can be explicitly
jpayne@68:     specified here.
jpayne@68: 
jpayne@68:     Use path="" to reset to default system path.
jpayne@68:     """
jpayne@68:     from . import paths
jpayne@68: 
jpayne@68:     paths._set_R_path(path)
jpayne@68: 
jpayne@68: 
jpayne@68: def _check_for_bedtools(force_check=False, verbose=False, override=None):
jpayne@68:     """
jpayne@68:     Checks installation as well as version (based on whether or not "bedtools
jpayne@68:     intersect" works, or just "intersectBed")
jpayne@68:     """
jpayne@68:     if settings._bedtools_installed and not force_check:
jpayne@68:         return True
jpayne@68: 
jpayne@68:     if (len(settings.bedtools_version) == 0) or force_check:
jpayne@68:         try:
jpayne@68:             v = (
jpayne@68:                 subprocess.check_output(
jpayne@68:                     [os.path.join(settings._bedtools_path, "bedtools"), "--version"]
jpayne@68:                 )
jpayne@68:                 .decode("utf-8")
jpayne@68:                 .rstrip()
jpayne@68:             )
jpayne@68: 
jpayne@68:             if verbose:
jpayne@68:                 print("Found bedtools version '%s'" % v)
jpayne@68: 
jpayne@68:             settings._bedtools_installed = True
jpayne@68: 
jpayne@68:             # Override, used for testing
jpayne@68:             if override is not None:
jpayne@68:                 v = override
jpayne@68: 
jpayne@68:             # To allow more complicated versions as found in Linux distributions, e.g.:
jpayne@68:             #  bedtools v2.26.0
jpayne@68:             #  bedtools debian/2.28.0+dfsg-2-dirty
jpayne@68:             m = re.search("^bedtools [^0-9]*([0-9][0-9.]*)", v)
jpayne@68:             if not m:
jpayne@68:                 raise ValueError('Cannot identify version number from "{}"'.format(v))
jpayne@68:             vv = m.group(1)
jpayne@68: 
jpayne@68:             settings.bedtools_version = [int(i) for i in vv.split(".")]
jpayne@68: 
jpayne@68:             settings._v_2_27_plus = (
jpayne@68:                 settings.bedtools_version[0] >= 2 and settings.bedtools_version[1] >= 27
jpayne@68:             )
jpayne@68: 
jpayne@68:             settings._v_2_15_plus = (
jpayne@68:                 settings.bedtools_version[0] >= 2 and settings.bedtools_version[1] >= 15
jpayne@68:             )
jpayne@68: 
jpayne@68:         except subprocess.CalledProcessError:
jpayne@68:             if settings._bedtools_path:
jpayne@68:                 add_msg = "(tried path '%s')" % settings._bedtools_path
jpayne@68:             else:
jpayne@68:                 add_msg = ""
jpayne@68:             raise OSError(
jpayne@68:                 "Please make sure you have installed BEDTools"
jpayne@68:                 "(https://github.com/arq5x/bedtools) and that "
jpayne@68:                 "it's on the path. %s" % add_msg
jpayne@68:             )
jpayne@68: 
jpayne@68: 
jpayne@68: def _check_for_R():
jpayne@68:     try:
jpayne@68:         p = subprocess.Popen(
jpayne@68:             [os.path.join(settings._R_path, "R"), "--version"],
jpayne@68:             stdout=subprocess.PIPE,
jpayne@68:             stderr=subprocess.PIPE,
jpayne@68:         )
jpayne@68:         settings._R_installed = True
jpayne@68:     except OSError:
jpayne@68:         if settings._R_path:
jpayne@68:             add_msg = "(tried path '%s')" % settings._R_path
jpayne@68:         else:
jpayne@68:             add_msg = ""
jpayne@68:         raise ValueError("Please install R and ensure it is on your path %s" % add_msg)
jpayne@68: 
jpayne@68: 
jpayne@68: class Error(Exception):
jpayne@68:     """Base class for this module's exceptions"""
jpayne@68: 
jpayne@68:     pass
jpayne@68: 
jpayne@68: 
jpayne@68: class pybedtoolsError(Error):
jpayne@68:     pass
jpayne@68: 
jpayne@68: 
jpayne@68: class BEDToolsError(Error):
jpayne@68:     def __init__(self, cmd, msg):
jpayne@68:         self.cmd = str(cmd)
jpayne@68:         self.msg = str(msg)
jpayne@68: 
jpayne@68:     def __str__(self):
jpayne@68:         m = (
jpayne@68:             "\nCommand was:\n\n\t"
jpayne@68:             + self.cmd
jpayne@68:             + "\n"
jpayne@68:             + "\nError message was:\n"
jpayne@68:             + self.msg
jpayne@68:         )
jpayne@68:         return m
jpayne@68: 
jpayne@68: 
jpayne@68: def isGZIP(fn):
jpayne@68:     with open(fn, "rb") as f:
jpayne@68:         start = f.read(3)
jpayne@68:         if start == b"\x1f\x8b\x08":
jpayne@68:             return True
jpayne@68:     return False
jpayne@68: 
jpayne@68: 
jpayne@68: def isBGZIP(fn):
jpayne@68:     """
jpayne@68:     Reads a filename to see if it's a BGZIPed file or not.
jpayne@68:     """
jpayne@68:     with open(fn, "rb") as fh:
jpayne@68:         header_str = fh.read(15)
jpayne@68: 
jpayne@68:     if len(header_str) < 15:
jpayne@68:         return False
jpayne@68: 
jpayne@68:     header = struct.unpack_from("BBBBiBBHBBB", header_str)
jpayne@68: 
jpayne@68:     id1, id2, cm, flg, mtime, xfl, os_, xlen, si1, si2, slen = header
jpayne@68:     if (
jpayne@68:         (id1 == 31)
jpayne@68:         and (id2 == 139)
jpayne@68:         and (cm == 8)
jpayne@68:         and (flg == 4)
jpayne@68:         and (si1 == 66)
jpayne@68:         and (si2 == 67)
jpayne@68:         and (slen == 2)
jpayne@68:     ):
jpayne@68:         return True
jpayne@68:     return False
jpayne@68: 
jpayne@68: 
jpayne@68: def isBAM(fn):
jpayne@68:     """
jpayne@68:     Returns True if the file is both BGZIPed and the compressed contents have
jpayne@68:     start with the magic number `BAM\\x01`, or if the file is CRAM format (see
jpayne@68:     isCRAM()).
jpayne@68:     """
jpayne@68:     # Note: previously we were catching ValueError when trying to open
jpayne@68:     # a non-BAM with pysam.Samfile. That started segfaulting, so now do it the
jpayne@68:     # right way with magic number.
jpayne@68:     with gzip.open(fn, "rb") as in_:
jpayne@68:         if isBGZIP(fn) and (in_.read(4).decode() == "BAM\x01"):
jpayne@68:             return True
jpayne@68:     if isCRAM(fn):
jpayne@68:         return True
jpayne@68: 
jpayne@68: 
jpayne@68: def isCRAM(fn):
jpayne@68:     """
jpayne@68:     Returns True if the file starts with the bytes for the characters "CRAM".
jpayne@68:     """
jpayne@68:     with open(fn, "rb") as in_:
jpayne@68:         if in_.read(4).decode(errors="ignore") == "CRAM":
jpayne@68:             return True
jpayne@68: 
jpayne@68: 
jpayne@68: def find_tagged(tag):
jpayne@68:     """
jpayne@68:     Returns the bedtool object with tagged with *tag*.  Useful for tracking
jpayne@68:     down bedtools you made previously.
jpayne@68:     """
jpayne@68:     for key, item in _tags.items():
jpayne@68:         try:
jpayne@68:             if item._tag == tag:
jpayne@68:                 return item
jpayne@68:         except AttributeError:
jpayne@68:             pass
jpayne@68:     raise ValueError('tag "%s" not found' % tag)
jpayne@68: 
jpayne@68: 
jpayne@68: def _flatten_list(x):
jpayne@68:     nested = True
jpayne@68:     while nested:
jpayne@68:         check_again = False
jpayne@68:         flattened = []
jpayne@68: 
jpayne@68:         for element in x:
jpayne@68:             if isinstance(element, list):
jpayne@68:                 flattened.extend(element)
jpayne@68:                 check_again = True
jpayne@68:             else:
jpayne@68:                 flattened.append(element)
jpayne@68:         nested = check_again
jpayne@68:         x = flattened[:]
jpayne@68:     return x
jpayne@68: 
jpayne@68: 
jpayne@68: def set_tempdir(tempdir):
jpayne@68:     """
jpayne@68:     Set the directory for temp files.
jpayne@68: 
jpayne@68:     Useful for clusters that use a /scratch partition rather than a /tmp dir.
jpayne@68:     Convenience function to simply set tempfile.tempdir.
jpayne@68:     """
jpayne@68:     if not os.path.exists(tempdir):
jpayne@68:         errstr = "The tempdir you specified, %s, does not exist" % tempdir
jpayne@68:         raise FileNotFoundError(errstr)
jpayne@68:     tempfile.tempdir = tempdir
jpayne@68: 
jpayne@68: 
jpayne@68: def get_tempdir():
jpayne@68:     """
jpayne@68:     Gets the current tempdir for the module.
jpayne@68:     """
jpayne@68:     return tempfile.gettempdir()
jpayne@68: 
jpayne@68: 
jpayne@68: def cleanup(verbose=False, remove_all=False):
jpayne@68:     """
jpayne@68:     Deletes all temp files from the current session (or optionally *all* \
jpayne@68:             sessions)
jpayne@68: 
jpayne@68:     If *verbose*, reports what it's doing
jpayne@68: 
jpayne@68:     If *remove_all*, then ALL files matching "pybedtools.*.tmp" in the temp dir
jpayne@68:     will be deleted.
jpayne@68:     """
jpayne@68:     if settings.KEEP_TEMPFILES:
jpayne@68:         return
jpayne@68:     for fn in filenames.TEMPFILES:
jpayne@68:         if verbose:
jpayne@68:             print("removing", fn)
jpayne@68:         if os.path.exists(fn):
jpayne@68:             os.unlink(fn)
jpayne@68:     if remove_all:
jpayne@68:         fns = glob.glob(os.path.join(get_tempdir(), "pybedtools.*.tmp"))
jpayne@68:         for fn in fns:
jpayne@68:             os.unlink(fn)
jpayne@68: 
jpayne@68: 
jpayne@68: def _version_2_15_plus_names(prog_name):
jpayne@68:     if not settings._bedtools_installed:
jpayne@68:         _check_for_bedtools()
jpayne@68:     if not settings._v_2_15_plus:
jpayne@68:         return [prog_name]
jpayne@68:     try:
jpayne@68:         prog_name = settings._prog_names[prog_name]
jpayne@68:     except KeyError:
jpayne@68:         if prog_name in settings._new_names:
jpayne@68:             pass
jpayne@68:         raise BEDToolsError(prog_name, prog_name + "not a recognized BEDTools program")
jpayne@68:     return [os.path.join(settings._bedtools_path, "bedtools"), prog_name]
jpayne@68: 
jpayne@68: 
jpayne@68: def call_bedtools(
jpayne@68:     cmds,
jpayne@68:     tmpfn=None,
jpayne@68:     stdin=None,
jpayne@68:     check_stderr=None,
jpayne@68:     decode_output=True,
jpayne@68:     encode_input=True,
jpayne@68: ):
jpayne@68:     """
jpayne@68:     Use subprocess.Popen to call BEDTools and catch any errors.
jpayne@68: 
jpayne@68:     Output goes to *tmpfn*, or, if None, output stays in subprocess.PIPE and
jpayne@68:     can be iterated over.
jpayne@68: 
jpayne@68:     *stdin* is an optional file-like object that will be sent to
jpayne@68:     subprocess.Popen.
jpayne@68: 
jpayne@68:     Prints some useful help upon getting common errors.
jpayne@68: 
jpayne@68:     *check_stderr* is a function that takes the stderr string as input and
jpayne@68:     returns True if it's OK (that is, it's not really an error).  This is
jpayne@68:     needed, e.g., for calling fastaFromBed which will report that it has to
jpayne@68:     make a .fai for a fasta file.
jpayne@68: 
jpayne@68:     *decode_output* should be set to False when you are iterating over a BAM
jpayne@68:     file, where the data represent binary rather than text data.
jpayne@68:     """
jpayne@68:     input_is_stream = stdin is not None
jpayne@68:     output_is_stream = tmpfn is None
jpayne@68: 
jpayne@68:     _orig_cmds = cmds[:]
jpayne@68:     cmds = []
jpayne@68:     cmds.extend(_version_2_15_plus_names(_orig_cmds[0]))
jpayne@68:     cmds.extend(_orig_cmds[1:])
jpayne@68: 
jpayne@68:     try:
jpayne@68:         # coming from an iterator, sending as iterator
jpayne@68:         if input_is_stream and output_is_stream:
jpayne@68:             logger.debug(
jpayne@68:                 "helpers.call_bedtools(): input is stream, output is " "stream"
jpayne@68:             )
jpayne@68:             logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68:             p = subprocess.Popen(
jpayne@68:                 cmds,
jpayne@68:                 stdout=subprocess.PIPE,
jpayne@68:                 stderr=subprocess.PIPE,
jpayne@68:                 stdin=subprocess.PIPE,
jpayne@68:                 bufsize=BUFSIZE,
jpayne@68:             )
jpayne@68:             if encode_input:
jpayne@68:                 for line in stdin:
jpayne@68:                     p.stdin.write(line.encode())
jpayne@68:             else:
jpayne@68:                 for line in stdin:
jpayne@68:                     p.stdin.write(line)
jpayne@68: 
jpayne@68:             # This is important to prevent deadlocks
jpayne@68:             p.stdin.close()
jpayne@68: 
jpayne@68:             if decode_output:
jpayne@68:                 output = (i.decode("UTF-8") for i in p.stdout)
jpayne@68:             else:
jpayne@68:                 output = (i for i in p.stdout)
jpayne@68: 
jpayne@68:             stderr = None
jpayne@68: 
jpayne@68:         # coming from an iterator, writing to file
jpayne@68:         if input_is_stream and not output_is_stream:
jpayne@68:             logger.debug("helpers.call_bedtools(): input is stream, output is file")
jpayne@68:             logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68:             outfile = open(tmpfn, "wb")
jpayne@68:             p = subprocess.Popen(
jpayne@68:                 cmds,
jpayne@68:                 stdout=outfile,
jpayne@68:                 stderr=subprocess.PIPE,
jpayne@68:                 stdin=subprocess.PIPE,
jpayne@68:                 bufsize=BUFSIZE,
jpayne@68:             )
jpayne@68:             if hasattr(stdin, "read"):
jpayne@68:                 stdout, stderr = p.communicate(stdin.read())
jpayne@68:             else:
jpayne@68:                 for item in stdin:
jpayne@68:                     p.stdin.write(item.encode())
jpayne@68:                 stdout, stderr = p.communicate()
jpayne@68:             output = tmpfn
jpayne@68:             outfile.close()
jpayne@68: 
jpayne@68:         # coming from a file, sending as iterator
jpayne@68:         if not input_is_stream and output_is_stream:
jpayne@68:             logger.debug(
jpayne@68:                 "helpers.call_bedtools(): input is filename, " "output is stream"
jpayne@68:             )
jpayne@68:             logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68:             p = subprocess.Popen(
jpayne@68:                 cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68:             )
jpayne@68:             if decode_output:
jpayne@68:                 output = (i.decode("UTF-8") for i in p.stdout)
jpayne@68:             else:
jpayne@68:                 output = (i for i in p.stdout)
jpayne@68:             stderr = None
jpayne@68: 
jpayne@68:         # file-to-file
jpayne@68:         if not input_is_stream and not output_is_stream:
jpayne@68:             logger.debug(
jpayne@68:                 "helpers.call_bedtools(): input is filename, output "
jpayne@68:                 "is filename (%s)",
jpayne@68:                 tmpfn,
jpayne@68:             )
jpayne@68:             cmds = list(map(str, cmds))
jpayne@68:             logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds))
jpayne@68:             outfile = open(tmpfn, "wb")
jpayne@68:             p = subprocess.Popen(
jpayne@68:                 cmds, stdout=outfile, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68:             )
jpayne@68:             stdout, stderr = p.communicate()
jpayne@68:             output = tmpfn
jpayne@68:             outfile.close()
jpayne@68: 
jpayne@68:         # Check if it's OK using a provided function to check stderr. If it's
jpayne@68:         # OK, dump it to sys.stderr so it's printed, and reset it to None so we
jpayne@68:         # don't raise an exception
jpayne@68:         if check_stderr is not None:
jpayne@68:             if isinstance(stderr, bytes):
jpayne@68:                 stderr = stderr.decode("UTF_8")
jpayne@68:             if check_stderr(stderr):
jpayne@68:                 sys.stderr.write(stderr)
jpayne@68:                 stderr = None
jpayne@68: 
jpayne@68:         if stderr:
jpayne@68:             # Fix for issue #147. In general, we consider warnings to not be
jpayne@68:             # fatal, so just show 'em and continue on.
jpayne@68:             #
jpayne@68:             # bedtools source has several different ways of showing a warning,
jpayne@68:             # but they seem to all have "WARNING" in the first 20 or so
jpayne@68:             # characters
jpayne@68:             if isinstance(stderr, bytes):
jpayne@68:                 stderr = stderr.decode("UTF_8")
jpayne@68:             if len(stderr) > 20 and "WARNING" in stderr[:20].upper():
jpayne@68:                 sys.stderr.write(stderr)
jpayne@68:             else:
jpayne@68:                 raise BEDToolsError(subprocess.list2cmdline(cmds), stderr)
jpayne@68: 
jpayne@68:     except (OSError, IOError) as err:
jpayne@68:         print("%s: %s" % (type(err), os.strerror(err.errno)))
jpayne@68:         print("The command was:\n\n\t%s\n" % subprocess.list2cmdline(cmds))
jpayne@68: 
jpayne@68:         problems = {
jpayne@68:             2: (
jpayne@68:                 "* Did you spell the command correctly?",
jpayne@68:                 "* Do you have BEDTools installed and on the path?",
jpayne@68:             ),
jpayne@68:             13: (
jpayne@68:                 "* Do you have permission to write "
jpayne@68:                 'to the output file ("%s")?' % tmpfn,
jpayne@68:             ),
jpayne@68:             24: (
jpayne@68:                 "* Too many files open -- please submit "
jpayne@68:                 "a bug report so that this can be fixed",
jpayne@68:             ),
jpayne@68:             32: (
jpayne@68:                 "* Broken pipe -- if you passed a BedTool object "
jpayne@68:                 "that was created using a generator function, "
jpayne@68:                 "please try saving it to disk first using the "
jpayne@68:                 ".saveas() method before calling this bedtools "
jpayne@68:                 "command. See issue #49 for more.",
jpayne@68:             ),
jpayne@68:         }
jpayne@68: 
jpayne@68:         print("Things to check:")
jpayne@68:         print("\n\t" + "\n\t".join(problems[err.errno]))
jpayne@68:         raise OSError("See above for commands that gave the error")
jpayne@68: 
jpayne@68:     return output
jpayne@68: 
jpayne@68: 
jpayne@68: def _check_sequence_stderr(x):
jpayne@68:     """
jpayne@68:     If stderr created by fastaFromBed starts with 'index file', then don't
jpayne@68:     consider it an error.
jpayne@68:     """
jpayne@68:     if isinstance(x, bytes):
jpayne@68:         x = x.decode("UTF-8")
jpayne@68:     if x.startswith("index file"):
jpayne@68:         return True
jpayne@68:     if x.startswith("WARNING"):
jpayne@68:         return True
jpayne@68:     return False
jpayne@68: 
jpayne@68: 
jpayne@68: def _call_randomintersect(
jpayne@68:     _self,
jpayne@68:     other,
jpayne@68:     iterations,
jpayne@68:     intersect_kwargs,
jpayne@68:     shuffle_kwargs,
jpayne@68:     report_iterations,
jpayne@68:     debug,
jpayne@68:     _orig_processes,
jpayne@68: ):
jpayne@68:     """
jpayne@68:     Helper function that list-ifies the output from randomintersection, s.t.
jpayne@68:     it can be pickled across a multiprocess Pool.
jpayne@68:     """
jpayne@68:     return list(
jpayne@68:         _self.randomintersection(
jpayne@68:             other,
jpayne@68:             iterations,
jpayne@68:             intersect_kwargs=intersect_kwargs,
jpayne@68:             shuffle_kwargs=shuffle_kwargs,
jpayne@68:             report_iterations=report_iterations,
jpayne@68:             debug=False,
jpayne@68:             processes=None,
jpayne@68:             _orig_processes=_orig_processes,
jpayne@68:         )
jpayne@68:     )
jpayne@68: 
jpayne@68: 
jpayne@68: def close_or_delete(*args):
jpayne@68:     """
jpayne@68:     Single function that can be used to get rid of a BedTool, whether it's a
jpayne@68:     streaming or file-based version.
jpayne@68:     """
jpayne@68:     for x in args:
jpayne@68:         if isinstance(x.fn, str):
jpayne@68:             os.unlink(x.fn)
jpayne@68:         elif hasattr(x.fn, "close"):
jpayne@68:             x.fn.close()
jpayne@68:         if hasattr(x.fn, "throw"):
jpayne@68:             x.fn.throw(StopIteration)
jpayne@68: 
jpayne@68: 
jpayne@68: def n_open_fds():
jpayne@68:     pid = os.getpid()
jpayne@68:     procs = subprocess.check_output(["lsof", "-w", "-Ff", "-p", str(pid)])
jpayne@68:     nprocs = 0
jpayne@68:     for i in procs.splitlines():
jpayne@68:         if i[1:].isdigit() and i[0] == "f":
jpayne@68:             nprocs += 1
jpayne@68:     return nprocs
jpayne@68: 
jpayne@68: 
jpayne@68: import re
jpayne@68: 
jpayne@68: coord_re = re.compile(
jpayne@68:     r"""
jpayne@68:     (?P<chrom>.+):
jpayne@68:     (?P<start>\d+)-
jpayne@68:     (?P<stop>\d+)
jpayne@68:     (?:\[(?P<strand>.)\])?""",
jpayne@68:     re.VERBOSE,
jpayne@68: )
jpayne@68: 
jpayne@68: 
jpayne@68: def string_to_interval(s):
jpayne@68:     """
jpayne@68:     Convert string of the form "chrom:start-stop" or "chrom:start-stop[strand]"
jpayne@68:     to an interval.
jpayne@68: 
jpayne@68:     Assumes zero-based coords.
jpayne@68: 
jpayne@68:     If it's already an interval, then return it as-is.
jpayne@68:     """
jpayne@68:     if isinstance(s, str):
jpayne@68:         m = coord_re.search(s)
jpayne@68:         if m.group("strand"):
jpayne@68:             return create_interval_from_list(
jpayne@68:                 [
jpayne@68:                     m.group("chrom"),
jpayne@68:                     m.group("start"),
jpayne@68:                     m.group("stop"),
jpayne@68:                     ".",
jpayne@68:                     "0",
jpayne@68:                     m.group("strand"),
jpayne@68:                 ]
jpayne@68:             )
jpayne@68:         else:
jpayne@68:             return create_interval_from_list(
jpayne@68:                 [m.group("chrom"), m.group("start"), m.group("stop")]
jpayne@68:             )
jpayne@68:     return s
jpayne@68: 
jpayne@68: 
jpayne@68: class FisherOutput(object):
jpayne@68:     def __init__(self, s, **kwargs):
jpayne@68:         """
jpayne@68:         fisher returns text results like::
jpayne@68: 
jpayne@68:             # Contingency Table
jpayne@68:             #_________________________________________
jpayne@68:             #           | not in -b    | in -b        |
jpayne@68:             # not in -a | 3137160615   | 503          |
jpayne@68:             #     in -a | 100          | 46           |
jpayne@68:             #_________________________________________
jpayne@68:             # p-values for fisher's exact test
jpayne@68:             left	right	two-tail	ratio
jpayne@68:             1.00000	0.00000	0.00000	2868973.922
jpayne@68: 
jpayne@68:         """
jpayne@68:         if isinstance(s, str):
jpayne@68:             s = open(s).read()
jpayne@68:         if hasattr(s, "next"):
jpayne@68:             s = "".join(i for i in s)
jpayne@68:         table = {
jpayne@68:             "not in -a": {"not in -b": None, "in -b": None},
jpayne@68:             "in -a": {"not in -b": None, "in -b": None},
jpayne@68:         }
jpayne@68:         self.text = s
jpayne@68:         lines = s.splitlines()
jpayne@68:         for i in lines:
jpayne@68:             if "not in -a" in i:
jpayne@68:                 _, in_b, not_in_b, _ = i.strip().split("|")
jpayne@68:                 table["not in -a"]["not in -b"] = int(not_in_b)
jpayne@68:                 table["not in -a"]["in -b"] = int(in_b)
jpayne@68: 
jpayne@68:             if "    in -a" in i:
jpayne@68:                 _, in_b, not_in_b, _ = i.strip().split("|")
jpayne@68:                 table["in -a"]["not in -b"] = int(not_in_b)
jpayne@68:                 table["in -a"]["in -b"] = int(in_b)
jpayne@68:         self.table = table
jpayne@68:         left, right, two_tail, ratio = lines[-1].split()
jpayne@68:         self.left_tail = float(left)
jpayne@68:         self.right_tail = float(right)
jpayne@68:         self.two_tail = float(two_tail)
jpayne@68:         self.ratio = float(ratio)
jpayne@68: 
jpayne@68:     def __str__(self):
jpayne@68:         return self.text
jpayne@68: 
jpayne@68:     def __repr__(self):
jpayne@68:         return "<%s at %s>\n%s" % (self.__class__.__name__, id(self), self.text)
jpayne@68: 
jpayne@68: 
jpayne@68: class SplitOutput(object):
jpayne@68:     def __init__(self, output, **kwargs):
jpayne@68:         """
jpayne@68:         Handles output from bedtools split, which sends a report of files to
jpayne@68:         stdout. This class parses that list into something more convenient to
jpayne@68:         use within pybedtools.
jpayne@68: 
jpayne@68:         Most useful is probably the .bedtools attribute, which is a list of
jpayne@68:         BedTool objects.
jpayne@68:         """
jpayne@68:         from .bedtool import BedTool
jpayne@68: 
jpayne@68:         if isinstance(output, str):
jpayne@68:             output = open(output).read()
jpayne@68:         if hasattr(output, "next"):
jpayne@68:             output = "".join(i for i in output)
jpayne@68: 
jpayne@68:         #: store a copy of the output
jpayne@68:         self.text = output
jpayne@68: 
jpayne@68:         #: BedTool objects created from output
jpayne@68:         self.bedtools = []
jpayne@68: 
jpayne@68:         #: Filenames that were created from the split
jpayne@68:         self.files = []
jpayne@68: 
jpayne@68:         #: number of bases in each file
jpayne@68:         self.nbases = []
jpayne@68: 
jpayne@68:         #: number of features in each file
jpayne@68:         self.counts = []
jpayne@68: 
jpayne@68:         for line in output.splitlines():
jpayne@68:             toks = line.split()
jpayne@68:             self.files.append(toks[0])
jpayne@68:             self.nbases.append(int(toks[1]))
jpayne@68:             self.counts.append(int(toks[2]))
jpayne@68:             self.bedtools.append(BedTool(toks[0]))
jpayne@68: 
jpayne@68: 
jpayne@68: def internet_on(timeout=1):
jpayne@68:     try:
jpayne@68:         response = urllib.request.urlopen("http://genome.ucsc.edu", timeout=timeout)
jpayne@68:         return True
jpayne@68:     except urllib.error.URLError as err:
jpayne@68:         pass
jpayne@68:     return False
jpayne@68: 
jpayne@68: 
jpayne@68: def get_chromsizes_from_ucsc(
jpayne@68:     genome,
jpayne@68:     saveas=None,
jpayne@68:     mysql="mysql",
jpayne@68:     fetchchromsizes="fetchChromSizes",
jpayne@68:     timeout=None,
jpayne@68:     host_url="genome-mysql.cse.ucsc.edu",
jpayne@68: ):
jpayne@68:     """
jpayne@68:     Download chrom size info for *genome* from UCSC and returns the dictionary.
jpayne@68: 
jpayne@68:     Parameters
jpayne@68:     ----------
jpayne@68: 
jpayne@68:     genome : str
jpayne@68:         Name of the genome assembly (e.g., "hg38")
jpayne@68: 
jpayne@68:     saveas : str
jpayne@68:         Filename to save output to. Dictionary will still be returned.
jpayne@68: 
jpayne@68:     mysql, fetchchromsizes : str
jpayne@68:         Paths to MySQL and fetchChromSizes.
jpayne@68: 
jpayne@68:     timeout : float
jpayne@68:         How long to wait for a response; mostly used for testing.
jpayne@68: 
jpayne@68:     host_url : str
jpayne@68:         URL of UCSC mirror MySQL server.
jpayne@68:     """
jpayne@68:     if not internet_on(timeout=timeout):
jpayne@68:         raise ValueError(
jpayne@68:             "It appears you don't have an internet connection "
jpayne@68:             "-- unable to get chromsizes from UCSC"
jpayne@68:         )
jpayne@68:     cmds = [
jpayne@68:         mysql,
jpayne@68:         "--user=genome",
jpayne@68:         "--host=" + host_url,
jpayne@68:         "-A",
jpayne@68:         "-e",
jpayne@68:         "select chrom, size from %s.chromInfo" % genome,
jpayne@68:     ]
jpayne@68:     failures = []
jpayne@68:     d = {}
jpayne@68:     try:
jpayne@68:         p = subprocess.Popen(
jpayne@68:             cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68:         )
jpayne@68:         stdout, stderr = p.communicate()
jpayne@68:         if stderr:
jpayne@68:             print(stderr)
jpayne@68:             print("Commands were:\n")
jpayne@68:             print((subprocess.list2cmdline(cmds)))
jpayne@68: 
jpayne@68:         lines = stdout.splitlines()[1:]
jpayne@68:         for line in lines:
jpayne@68:             if isinstance(line, bytes):
jpayne@68:                 line = line.decode("UTF-8")
jpayne@68:             chrom, size = line.split()
jpayne@68:             d[chrom] = (0, int(size))
jpayne@68: 
jpayne@68:         if saveas is not None:
jpayne@68:             chromsizes_to_file(d, saveas)
jpayne@68: 
jpayne@68:     except OSError as err:
jpayne@68:         if err.errno == 2:
jpayne@68:             failures.append("Can't find mysql at path {0}".format(mysql))
jpayne@68:         else:
jpayne@68:             raise
jpayne@68:     try:
jpayne@68:         cmds = [fetchchromsizes, genome]
jpayne@68:         p = subprocess.Popen(
jpayne@68:             cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE
jpayne@68:         )
jpayne@68:         stdout, stderr = p.communicate()
jpayne@68:         if stderr:
jpayne@68:             if "INFO: trying WGET" not in str(stderr):
jpayne@68:                 print(stderr)
jpayne@68:                 print("Commands were:\n")
jpayne@68:                 print((subprocess.list2cmdline(cmds)))
jpayne@68: 
jpayne@68:         lines = stdout.splitlines()
jpayne@68:         for line in lines:
jpayne@68:             if isinstance(line, bytes):
jpayne@68:                 line = line.decode("UTF-8")
jpayne@68:             chrom, size = line.split()
jpayne@68:             d[chrom] = (0, int(size))
jpayne@68: 
jpayne@68:         if saveas is not None:
jpayne@68:             chromsizes_to_file(d, saveas)
jpayne@68: 
jpayne@68:     except OSError as err:
jpayne@68:         if err.errno == 2:
jpayne@68:             failures.append("Can't find path to fetchChromsizes")
jpayne@68: 
jpayne@68:     if not d:
jpayne@68:         raise OSError(failures)
jpayne@68:     return d
jpayne@68: 
jpayne@68: 
jpayne@68: def chromsizes_to_file(chrom_sizes, fn=None):
jpayne@68:     """
jpayne@68:     Converts a *chromsizes* dictionary to a file.  If *fn* is None, then a
jpayne@68:     tempfile is created (which can be deleted with pybedtools.cleanup()).
jpayne@68: 
jpayne@68:     Returns the filename.
jpayne@68:     """
jpayne@68:     if fn is None:
jpayne@68:         tmpfn = tempfile.NamedTemporaryFile(
jpayne@68:             prefix="pybedtools.", suffix=".tmp", delete=False
jpayne@68:         )
jpayne@68:         tmpfn = tmpfn.name
jpayne@68:         filenames.TEMPFILES.append(tmpfn)
jpayne@68:         fn = tmpfn
jpayne@68:     if isinstance(chrom_sizes, str):
jpayne@68:         chrom_sizes = chromsizes(chrom_sizes)
jpayne@68:     fout = open(fn, "wt")
jpayne@68:     for chrom, bounds in chrom_sizes.items():
jpayne@68:         line = chrom + "\t" + str(bounds[1]) + "\n"
jpayne@68:         fout.write(line)
jpayne@68:     fout.close()
jpayne@68:     return fn
jpayne@68: 
jpayne@68: 
jpayne@68: def get_chromsizes_from_genomepy(
jpayne@68:     genome, saveas=None,
jpayne@68: ):
jpayne@68:     """
jpayne@68:     Get chrom size info for *genome* from genomepy, if genomepy is installed.
jpayne@68: 
jpayne@68:     Parameters
jpayne@68:     ----------
jpayne@68: 
jpayne@68:     genome : str
jpayne@68:         Name of the genome assembly (e.g., "hg38")
jpayne@68: 
jpayne@68:     saveas : str
jpayne@68:         Filename to save output to. Dictionary will still be returned.
jpayne@68:     """
jpayne@68:     if "genomepy" not in sys.modules:
jpayne@68:         return None
jpayne@68: 
jpayne@68:     d = {}
jpayne@68:     try:
jpayne@68:         g = genomepy.Genome(genome)
jpayne@68:         # Fail silently if the sizes file cannot be accessed
jpayne@68:         if not hasattr(g, "sizes_file"):
jpayne@68:             return None
jpayne@68:         for line in open(g.sizes_file):
jpayne@68:             chrom, size = line.split()
jpayne@68:             d[chrom] = (0, int(size))
jpayne@68: 
jpayne@68:         if saveas is not None:
jpayne@68:             chromsizes_to_file(d, saveas)
jpayne@68:     except FileNotFoundError:
jpayne@68:         return None
jpayne@68: 
jpayne@68:     return d
jpayne@68: 
jpayne@68: 
jpayne@68: def chromsizes(genome):
jpayne@68:     """
jpayne@68:     Looks for a *genome* already included in the genome registry; if not found
jpayne@68:     it first tries to look it up via genomepy. If genomepy is not installed, or
jpayne@68:     if this lookup fails then it looks it up on UCSC.  Returns the dictionary of
jpayne@68:     chromsize tuples where each tuple has (start,stop).
jpayne@68: 
jpayne@68:     Chromsizes are described as (start, stop) tuples to allow randomization
jpayne@68:     within specified regions; e. g., you can make a chromsizes dictionary that
jpayne@68:     represents the extent of a tiling array.
jpayne@68: 
jpayne@68:     Example usage:
jpayne@68: 
jpayne@68:         >>> dm3_chromsizes = chromsizes('dm3')
jpayne@68:         >>> for i in sorted(dm3_chromsizes.items()):
jpayne@68:         ...     print(i)
jpayne@68:         ('chr2L', (0, 23011544))
jpayne@68:         ('chr2LHet', (0, 368872))
jpayne@68:         ('chr2R', (0, 21146708))
jpayne@68:         ('chr2RHet', (0, 3288761))
jpayne@68:         ('chr3L', (0, 24543557))
jpayne@68:         ('chr3LHet', (0, 2555491))
jpayne@68:         ('chr3R', (0, 27905053))
jpayne@68:         ('chr3RHet', (0, 2517507))
jpayne@68:         ('chr4', (0, 1351857))
jpayne@68:         ('chrM', (0, 19517))
jpayne@68:         ('chrU', (0, 10049037))
jpayne@68:         ('chrUextra', (0, 29004656))
jpayne@68:         ('chrX', (0, 22422827))
jpayne@68:         ('chrXHet', (0, 204112))
jpayne@68:         ('chrYHet', (0, 347038))
jpayne@68: 
jpayne@68: 
jpayne@68:     """
jpayne@68:     try:
jpayne@68:         return getattr(genome_registry, genome)
jpayne@68:     except AttributeError:
jpayne@68:         chromsizes = get_chromsizes_from_genomepy(genome)
jpayne@68:         if chromsizes is None:
jpayne@68:             return get_chromsizes_from_ucsc(genome)
jpayne@68:         else:
jpayne@68:             return chromsizes
jpayne@68: 
jpayne@68: 
jpayne@68: def get_includes():
jpayne@68:     """
jpayne@68:     Returns a list of include directories with BEDTools headers
jpayne@68:     """
jpayne@68:     dirname = os.path.abspath(os.path.join(os.path.dirname(__file__)))
jpayne@68:     return [dirname, os.path.join(dirname, "include")]
jpayne@68: 
jpayne@68: 
jpayne@68: atexit.register(cleanup)