jpayne@68: import sys jpayne@68: import os jpayne@68: import gzip jpayne@68: import tempfile jpayne@68: import subprocess jpayne@68: import glob jpayne@68: import struct jpayne@68: import atexit jpayne@68: import re jpayne@68: import urllib jpayne@68: import urllib.error jpayne@68: import urllib.request jpayne@68: jpayne@68: try: # Use genomepy to determine chrom sizes if it is installed jpayne@68: import genomepy jpayne@68: except ImportError: jpayne@68: pass jpayne@68: jpayne@68: from . import settings jpayne@68: from . import filenames jpayne@68: from . import genome_registry jpayne@68: from .logger import logger jpayne@68: from .cbedtools import create_interval_from_list jpayne@68: jpayne@68: BUFSIZE = -1 jpayne@68: jpayne@68: _tags = {} jpayne@68: jpayne@68: jpayne@68: def set_bedtools_path(path=""): jpayne@68: """ jpayne@68: Explicitly set path to `BEDTools` installation dir. jpayne@68: jpayne@68: If BEDTools is not available on your system path, specify the path to the jpayne@68: dir containing the BEDTools executables (intersectBed, subtractBed, etc) jpayne@68: with this function. jpayne@68: jpayne@68: To reset and use the default system path, call this function with no jpayne@68: arguments or use path="". jpayne@68: """ jpayne@68: from . import paths jpayne@68: jpayne@68: paths._set_bedtools_path(path) jpayne@68: jpayne@68: jpayne@68: def get_bedtools_path(): jpayne@68: """ jpayne@68: Returns the currently-set path to bedtools jpayne@68: """ jpayne@68: from . import paths jpayne@68: jpayne@68: return paths._get_bedtools_path() jpayne@68: jpayne@68: jpayne@68: def set_R_path(path=""): jpayne@68: """ jpayne@68: Explicitly set path to `R` installation dir. jpayne@68: jpayne@68: If R is not available on the path, then it can be explicitly jpayne@68: specified here. jpayne@68: jpayne@68: Use path="" to reset to default system path. jpayne@68: """ jpayne@68: from . import paths jpayne@68: jpayne@68: paths._set_R_path(path) jpayne@68: jpayne@68: jpayne@68: def _check_for_bedtools(force_check=False, verbose=False, override=None): jpayne@68: """ jpayne@68: Checks installation as well as version (based on whether or not "bedtools jpayne@68: intersect" works, or just "intersectBed") jpayne@68: """ jpayne@68: if settings._bedtools_installed and not force_check: jpayne@68: return True jpayne@68: jpayne@68: if (len(settings.bedtools_version) == 0) or force_check: jpayne@68: try: jpayne@68: v = ( jpayne@68: subprocess.check_output( jpayne@68: [os.path.join(settings._bedtools_path, "bedtools"), "--version"] jpayne@68: ) jpayne@68: .decode("utf-8") jpayne@68: .rstrip() jpayne@68: ) jpayne@68: jpayne@68: if verbose: jpayne@68: print("Found bedtools version '%s'" % v) jpayne@68: jpayne@68: settings._bedtools_installed = True jpayne@68: jpayne@68: # Override, used for testing jpayne@68: if override is not None: jpayne@68: v = override jpayne@68: jpayne@68: # To allow more complicated versions as found in Linux distributions, e.g.: jpayne@68: # bedtools v2.26.0 jpayne@68: # bedtools debian/2.28.0+dfsg-2-dirty jpayne@68: m = re.search("^bedtools [^0-9]*([0-9][0-9.]*)", v) jpayne@68: if not m: jpayne@68: raise ValueError('Cannot identify version number from "{}"'.format(v)) jpayne@68: vv = m.group(1) jpayne@68: jpayne@68: settings.bedtools_version = [int(i) for i in vv.split(".")] jpayne@68: jpayne@68: settings._v_2_27_plus = ( jpayne@68: settings.bedtools_version[0] >= 2 and settings.bedtools_version[1] >= 27 jpayne@68: ) jpayne@68: jpayne@68: settings._v_2_15_plus = ( jpayne@68: settings.bedtools_version[0] >= 2 and settings.bedtools_version[1] >= 15 jpayne@68: ) jpayne@68: jpayne@68: except subprocess.CalledProcessError: jpayne@68: if settings._bedtools_path: jpayne@68: add_msg = "(tried path '%s')" % settings._bedtools_path jpayne@68: else: jpayne@68: add_msg = "" jpayne@68: raise OSError( jpayne@68: "Please make sure you have installed BEDTools" jpayne@68: "(https://github.com/arq5x/bedtools) and that " jpayne@68: "it's on the path. %s" % add_msg jpayne@68: ) jpayne@68: jpayne@68: jpayne@68: def _check_for_R(): jpayne@68: try: jpayne@68: p = subprocess.Popen( jpayne@68: [os.path.join(settings._R_path, "R"), "--version"], jpayne@68: stdout=subprocess.PIPE, jpayne@68: stderr=subprocess.PIPE, jpayne@68: ) jpayne@68: settings._R_installed = True jpayne@68: except OSError: jpayne@68: if settings._R_path: jpayne@68: add_msg = "(tried path '%s')" % settings._R_path jpayne@68: else: jpayne@68: add_msg = "" jpayne@68: raise ValueError("Please install R and ensure it is on your path %s" % add_msg) jpayne@68: jpayne@68: jpayne@68: class Error(Exception): jpayne@68: """Base class for this module's exceptions""" jpayne@68: jpayne@68: pass jpayne@68: jpayne@68: jpayne@68: class pybedtoolsError(Error): jpayne@68: pass jpayne@68: jpayne@68: jpayne@68: class BEDToolsError(Error): jpayne@68: def __init__(self, cmd, msg): jpayne@68: self.cmd = str(cmd) jpayne@68: self.msg = str(msg) jpayne@68: jpayne@68: def __str__(self): jpayne@68: m = ( jpayne@68: "\nCommand was:\n\n\t" jpayne@68: + self.cmd jpayne@68: + "\n" jpayne@68: + "\nError message was:\n" jpayne@68: + self.msg jpayne@68: ) jpayne@68: return m jpayne@68: jpayne@68: jpayne@68: def isGZIP(fn): jpayne@68: with open(fn, "rb") as f: jpayne@68: start = f.read(3) jpayne@68: if start == b"\x1f\x8b\x08": jpayne@68: return True jpayne@68: return False jpayne@68: jpayne@68: jpayne@68: def isBGZIP(fn): jpayne@68: """ jpayne@68: Reads a filename to see if it's a BGZIPed file or not. jpayne@68: """ jpayne@68: with open(fn, "rb") as fh: jpayne@68: header_str = fh.read(15) jpayne@68: jpayne@68: if len(header_str) < 15: jpayne@68: return False jpayne@68: jpayne@68: header = struct.unpack_from("BBBBiBBHBBB", header_str) jpayne@68: jpayne@68: id1, id2, cm, flg, mtime, xfl, os_, xlen, si1, si2, slen = header jpayne@68: if ( jpayne@68: (id1 == 31) jpayne@68: and (id2 == 139) jpayne@68: and (cm == 8) jpayne@68: and (flg == 4) jpayne@68: and (si1 == 66) jpayne@68: and (si2 == 67) jpayne@68: and (slen == 2) jpayne@68: ): jpayne@68: return True jpayne@68: return False jpayne@68: jpayne@68: jpayne@68: def isBAM(fn): jpayne@68: """ jpayne@68: Returns True if the file is both BGZIPed and the compressed contents have jpayne@68: start with the magic number `BAM\\x01`, or if the file is CRAM format (see jpayne@68: isCRAM()). jpayne@68: """ jpayne@68: # Note: previously we were catching ValueError when trying to open jpayne@68: # a non-BAM with pysam.Samfile. That started segfaulting, so now do it the jpayne@68: # right way with magic number. jpayne@68: with gzip.open(fn, "rb") as in_: jpayne@68: if isBGZIP(fn) and (in_.read(4).decode() == "BAM\x01"): jpayne@68: return True jpayne@68: if isCRAM(fn): jpayne@68: return True jpayne@68: jpayne@68: jpayne@68: def isCRAM(fn): jpayne@68: """ jpayne@68: Returns True if the file starts with the bytes for the characters "CRAM". jpayne@68: """ jpayne@68: with open(fn, "rb") as in_: jpayne@68: if in_.read(4).decode(errors="ignore") == "CRAM": jpayne@68: return True jpayne@68: jpayne@68: jpayne@68: def find_tagged(tag): jpayne@68: """ jpayne@68: Returns the bedtool object with tagged with *tag*. Useful for tracking jpayne@68: down bedtools you made previously. jpayne@68: """ jpayne@68: for key, item in _tags.items(): jpayne@68: try: jpayne@68: if item._tag == tag: jpayne@68: return item jpayne@68: except AttributeError: jpayne@68: pass jpayne@68: raise ValueError('tag "%s" not found' % tag) jpayne@68: jpayne@68: jpayne@68: def _flatten_list(x): jpayne@68: nested = True jpayne@68: while nested: jpayne@68: check_again = False jpayne@68: flattened = [] jpayne@68: jpayne@68: for element in x: jpayne@68: if isinstance(element, list): jpayne@68: flattened.extend(element) jpayne@68: check_again = True jpayne@68: else: jpayne@68: flattened.append(element) jpayne@68: nested = check_again jpayne@68: x = flattened[:] jpayne@68: return x jpayne@68: jpayne@68: jpayne@68: def set_tempdir(tempdir): jpayne@68: """ jpayne@68: Set the directory for temp files. jpayne@68: jpayne@68: Useful for clusters that use a /scratch partition rather than a /tmp dir. jpayne@68: Convenience function to simply set tempfile.tempdir. jpayne@68: """ jpayne@68: if not os.path.exists(tempdir): jpayne@68: errstr = "The tempdir you specified, %s, does not exist" % tempdir jpayne@68: raise FileNotFoundError(errstr) jpayne@68: tempfile.tempdir = tempdir jpayne@68: jpayne@68: jpayne@68: def get_tempdir(): jpayne@68: """ jpayne@68: Gets the current tempdir for the module. jpayne@68: """ jpayne@68: return tempfile.gettempdir() jpayne@68: jpayne@68: jpayne@68: def cleanup(verbose=False, remove_all=False): jpayne@68: """ jpayne@68: Deletes all temp files from the current session (or optionally *all* \ jpayne@68: sessions) jpayne@68: jpayne@68: If *verbose*, reports what it's doing jpayne@68: jpayne@68: If *remove_all*, then ALL files matching "pybedtools.*.tmp" in the temp dir jpayne@68: will be deleted. jpayne@68: """ jpayne@68: if settings.KEEP_TEMPFILES: jpayne@68: return jpayne@68: for fn in filenames.TEMPFILES: jpayne@68: if verbose: jpayne@68: print("removing", fn) jpayne@68: if os.path.exists(fn): jpayne@68: os.unlink(fn) jpayne@68: if remove_all: jpayne@68: fns = glob.glob(os.path.join(get_tempdir(), "pybedtools.*.tmp")) jpayne@68: for fn in fns: jpayne@68: os.unlink(fn) jpayne@68: jpayne@68: jpayne@68: def _version_2_15_plus_names(prog_name): jpayne@68: if not settings._bedtools_installed: jpayne@68: _check_for_bedtools() jpayne@68: if not settings._v_2_15_plus: jpayne@68: return [prog_name] jpayne@68: try: jpayne@68: prog_name = settings._prog_names[prog_name] jpayne@68: except KeyError: jpayne@68: if prog_name in settings._new_names: jpayne@68: pass jpayne@68: raise BEDToolsError(prog_name, prog_name + "not a recognized BEDTools program") jpayne@68: return [os.path.join(settings._bedtools_path, "bedtools"), prog_name] jpayne@68: jpayne@68: jpayne@68: def call_bedtools( jpayne@68: cmds, jpayne@68: tmpfn=None, jpayne@68: stdin=None, jpayne@68: check_stderr=None, jpayne@68: decode_output=True, jpayne@68: encode_input=True, jpayne@68: ): jpayne@68: """ jpayne@68: Use subprocess.Popen to call BEDTools and catch any errors. jpayne@68: jpayne@68: Output goes to *tmpfn*, or, if None, output stays in subprocess.PIPE and jpayne@68: can be iterated over. jpayne@68: jpayne@68: *stdin* is an optional file-like object that will be sent to jpayne@68: subprocess.Popen. jpayne@68: jpayne@68: Prints some useful help upon getting common errors. jpayne@68: jpayne@68: *check_stderr* is a function that takes the stderr string as input and jpayne@68: returns True if it's OK (that is, it's not really an error). This is jpayne@68: needed, e.g., for calling fastaFromBed which will report that it has to jpayne@68: make a .fai for a fasta file. jpayne@68: jpayne@68: *decode_output* should be set to False when you are iterating over a BAM jpayne@68: file, where the data represent binary rather than text data. jpayne@68: """ jpayne@68: input_is_stream = stdin is not None jpayne@68: output_is_stream = tmpfn is None jpayne@68: jpayne@68: _orig_cmds = cmds[:] jpayne@68: cmds = [] jpayne@68: cmds.extend(_version_2_15_plus_names(_orig_cmds[0])) jpayne@68: cmds.extend(_orig_cmds[1:]) jpayne@68: jpayne@68: try: jpayne@68: # coming from an iterator, sending as iterator jpayne@68: if input_is_stream and output_is_stream: jpayne@68: logger.debug( jpayne@68: "helpers.call_bedtools(): input is stream, output is " "stream" jpayne@68: ) jpayne@68: logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds)) jpayne@68: p = subprocess.Popen( jpayne@68: cmds, jpayne@68: stdout=subprocess.PIPE, jpayne@68: stderr=subprocess.PIPE, jpayne@68: stdin=subprocess.PIPE, jpayne@68: bufsize=BUFSIZE, jpayne@68: ) jpayne@68: if encode_input: jpayne@68: for line in stdin: jpayne@68: p.stdin.write(line.encode()) jpayne@68: else: jpayne@68: for line in stdin: jpayne@68: p.stdin.write(line) jpayne@68: jpayne@68: # This is important to prevent deadlocks jpayne@68: p.stdin.close() jpayne@68: jpayne@68: if decode_output: jpayne@68: output = (i.decode("UTF-8") for i in p.stdout) jpayne@68: else: jpayne@68: output = (i for i in p.stdout) jpayne@68: jpayne@68: stderr = None jpayne@68: jpayne@68: # coming from an iterator, writing to file jpayne@68: if input_is_stream and not output_is_stream: jpayne@68: logger.debug("helpers.call_bedtools(): input is stream, output is file") jpayne@68: logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds)) jpayne@68: outfile = open(tmpfn, "wb") jpayne@68: p = subprocess.Popen( jpayne@68: cmds, jpayne@68: stdout=outfile, jpayne@68: stderr=subprocess.PIPE, jpayne@68: stdin=subprocess.PIPE, jpayne@68: bufsize=BUFSIZE, jpayne@68: ) jpayne@68: if hasattr(stdin, "read"): jpayne@68: stdout, stderr = p.communicate(stdin.read()) jpayne@68: else: jpayne@68: for item in stdin: jpayne@68: p.stdin.write(item.encode()) jpayne@68: stdout, stderr = p.communicate() jpayne@68: output = tmpfn jpayne@68: outfile.close() jpayne@68: jpayne@68: # coming from a file, sending as iterator jpayne@68: if not input_is_stream and output_is_stream: jpayne@68: logger.debug( jpayne@68: "helpers.call_bedtools(): input is filename, " "output is stream" jpayne@68: ) jpayne@68: logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds)) jpayne@68: p = subprocess.Popen( jpayne@68: cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE jpayne@68: ) jpayne@68: if decode_output: jpayne@68: output = (i.decode("UTF-8") for i in p.stdout) jpayne@68: else: jpayne@68: output = (i for i in p.stdout) jpayne@68: stderr = None jpayne@68: jpayne@68: # file-to-file jpayne@68: if not input_is_stream and not output_is_stream: jpayne@68: logger.debug( jpayne@68: "helpers.call_bedtools(): input is filename, output " jpayne@68: "is filename (%s)", jpayne@68: tmpfn, jpayne@68: ) jpayne@68: cmds = list(map(str, cmds)) jpayne@68: logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds)) jpayne@68: outfile = open(tmpfn, "wb") jpayne@68: p = subprocess.Popen( jpayne@68: cmds, stdout=outfile, stderr=subprocess.PIPE, bufsize=BUFSIZE jpayne@68: ) jpayne@68: stdout, stderr = p.communicate() jpayne@68: output = tmpfn jpayne@68: outfile.close() jpayne@68: jpayne@68: # Check if it's OK using a provided function to check stderr. If it's jpayne@68: # OK, dump it to sys.stderr so it's printed, and reset it to None so we jpayne@68: # don't raise an exception jpayne@68: if check_stderr is not None: jpayne@68: if isinstance(stderr, bytes): jpayne@68: stderr = stderr.decode("UTF_8") jpayne@68: if check_stderr(stderr): jpayne@68: sys.stderr.write(stderr) jpayne@68: stderr = None jpayne@68: jpayne@68: if stderr: jpayne@68: # Fix for issue #147. In general, we consider warnings to not be jpayne@68: # fatal, so just show 'em and continue on. jpayne@68: # jpayne@68: # bedtools source has several different ways of showing a warning, jpayne@68: # but they seem to all have "WARNING" in the first 20 or so jpayne@68: # characters jpayne@68: if isinstance(stderr, bytes): jpayne@68: stderr = stderr.decode("UTF_8") jpayne@68: if len(stderr) > 20 and "WARNING" in stderr[:20].upper(): jpayne@68: sys.stderr.write(stderr) jpayne@68: else: jpayne@68: raise BEDToolsError(subprocess.list2cmdline(cmds), stderr) jpayne@68: jpayne@68: except (OSError, IOError) as err: jpayne@68: print("%s: %s" % (type(err), os.strerror(err.errno))) jpayne@68: print("The command was:\n\n\t%s\n" % subprocess.list2cmdline(cmds)) jpayne@68: jpayne@68: problems = { jpayne@68: 2: ( jpayne@68: "* Did you spell the command correctly?", jpayne@68: "* Do you have BEDTools installed and on the path?", jpayne@68: ), jpayne@68: 13: ( jpayne@68: "* Do you have permission to write " jpayne@68: 'to the output file ("%s")?' % tmpfn, jpayne@68: ), jpayne@68: 24: ( jpayne@68: "* Too many files open -- please submit " jpayne@68: "a bug report so that this can be fixed", jpayne@68: ), jpayne@68: 32: ( jpayne@68: "* Broken pipe -- if you passed a BedTool object " jpayne@68: "that was created using a generator function, " jpayne@68: "please try saving it to disk first using the " jpayne@68: ".saveas() method before calling this bedtools " jpayne@68: "command. See issue #49 for more.", jpayne@68: ), jpayne@68: } jpayne@68: jpayne@68: print("Things to check:") jpayne@68: print("\n\t" + "\n\t".join(problems[err.errno])) jpayne@68: raise OSError("See above for commands that gave the error") jpayne@68: jpayne@68: return output jpayne@68: jpayne@68: jpayne@68: def _check_sequence_stderr(x): jpayne@68: """ jpayne@68: If stderr created by fastaFromBed starts with 'index file', then don't jpayne@68: consider it an error. jpayne@68: """ jpayne@68: if isinstance(x, bytes): jpayne@68: x = x.decode("UTF-8") jpayne@68: if x.startswith("index file"): jpayne@68: return True jpayne@68: if x.startswith("WARNING"): jpayne@68: return True jpayne@68: return False jpayne@68: jpayne@68: jpayne@68: def _call_randomintersect( jpayne@68: _self, jpayne@68: other, jpayne@68: iterations, jpayne@68: intersect_kwargs, jpayne@68: shuffle_kwargs, jpayne@68: report_iterations, jpayne@68: debug, jpayne@68: _orig_processes, jpayne@68: ): jpayne@68: """ jpayne@68: Helper function that list-ifies the output from randomintersection, s.t. jpayne@68: it can be pickled across a multiprocess Pool. jpayne@68: """ jpayne@68: return list( jpayne@68: _self.randomintersection( jpayne@68: other, jpayne@68: iterations, jpayne@68: intersect_kwargs=intersect_kwargs, jpayne@68: shuffle_kwargs=shuffle_kwargs, jpayne@68: report_iterations=report_iterations, jpayne@68: debug=False, jpayne@68: processes=None, jpayne@68: _orig_processes=_orig_processes, jpayne@68: ) jpayne@68: ) jpayne@68: jpayne@68: jpayne@68: def close_or_delete(*args): jpayne@68: """ jpayne@68: Single function that can be used to get rid of a BedTool, whether it's a jpayne@68: streaming or file-based version. jpayne@68: """ jpayne@68: for x in args: jpayne@68: if isinstance(x.fn, str): jpayne@68: os.unlink(x.fn) jpayne@68: elif hasattr(x.fn, "close"): jpayne@68: x.fn.close() jpayne@68: if hasattr(x.fn, "throw"): jpayne@68: x.fn.throw(StopIteration) jpayne@68: jpayne@68: jpayne@68: def n_open_fds(): jpayne@68: pid = os.getpid() jpayne@68: procs = subprocess.check_output(["lsof", "-w", "-Ff", "-p", str(pid)]) jpayne@68: nprocs = 0 jpayne@68: for i in procs.splitlines(): jpayne@68: if i[1:].isdigit() and i[0] == "f": jpayne@68: nprocs += 1 jpayne@68: return nprocs jpayne@68: jpayne@68: jpayne@68: import re jpayne@68: jpayne@68: coord_re = re.compile( jpayne@68: r""" jpayne@68: (?P.+): jpayne@68: (?P\d+)- jpayne@68: (?P\d+) jpayne@68: (?:\[(?P.)\])?""", jpayne@68: re.VERBOSE, jpayne@68: ) jpayne@68: jpayne@68: jpayne@68: def string_to_interval(s): jpayne@68: """ jpayne@68: Convert string of the form "chrom:start-stop" or "chrom:start-stop[strand]" jpayne@68: to an interval. jpayne@68: jpayne@68: Assumes zero-based coords. jpayne@68: jpayne@68: If it's already an interval, then return it as-is. jpayne@68: """ jpayne@68: if isinstance(s, str): jpayne@68: m = coord_re.search(s) jpayne@68: if m.group("strand"): jpayne@68: return create_interval_from_list( jpayne@68: [ jpayne@68: m.group("chrom"), jpayne@68: m.group("start"), jpayne@68: m.group("stop"), jpayne@68: ".", jpayne@68: "0", jpayne@68: m.group("strand"), jpayne@68: ] jpayne@68: ) jpayne@68: else: jpayne@68: return create_interval_from_list( jpayne@68: [m.group("chrom"), m.group("start"), m.group("stop")] jpayne@68: ) jpayne@68: return s jpayne@68: jpayne@68: jpayne@68: class FisherOutput(object): jpayne@68: def __init__(self, s, **kwargs): jpayne@68: """ jpayne@68: fisher returns text results like:: jpayne@68: jpayne@68: # Contingency Table jpayne@68: #_________________________________________ jpayne@68: # | not in -b | in -b | jpayne@68: # not in -a | 3137160615 | 503 | jpayne@68: # in -a | 100 | 46 | jpayne@68: #_________________________________________ jpayne@68: # p-values for fisher's exact test jpayne@68: left right two-tail ratio jpayne@68: 1.00000 0.00000 0.00000 2868973.922 jpayne@68: jpayne@68: """ jpayne@68: if isinstance(s, str): jpayne@68: s = open(s).read() jpayne@68: if hasattr(s, "next"): jpayne@68: s = "".join(i for i in s) jpayne@68: table = { jpayne@68: "not in -a": {"not in -b": None, "in -b": None}, jpayne@68: "in -a": {"not in -b": None, "in -b": None}, jpayne@68: } jpayne@68: self.text = s jpayne@68: lines = s.splitlines() jpayne@68: for i in lines: jpayne@68: if "not in -a" in i: jpayne@68: _, in_b, not_in_b, _ = i.strip().split("|") jpayne@68: table["not in -a"]["not in -b"] = int(not_in_b) jpayne@68: table["not in -a"]["in -b"] = int(in_b) jpayne@68: jpayne@68: if " in -a" in i: jpayne@68: _, in_b, not_in_b, _ = i.strip().split("|") jpayne@68: table["in -a"]["not in -b"] = int(not_in_b) jpayne@68: table["in -a"]["in -b"] = int(in_b) jpayne@68: self.table = table jpayne@68: left, right, two_tail, ratio = lines[-1].split() jpayne@68: self.left_tail = float(left) jpayne@68: self.right_tail = float(right) jpayne@68: self.two_tail = float(two_tail) jpayne@68: self.ratio = float(ratio) jpayne@68: jpayne@68: def __str__(self): jpayne@68: return self.text jpayne@68: jpayne@68: def __repr__(self): jpayne@68: return "<%s at %s>\n%s" % (self.__class__.__name__, id(self), self.text) jpayne@68: jpayne@68: jpayne@68: class SplitOutput(object): jpayne@68: def __init__(self, output, **kwargs): jpayne@68: """ jpayne@68: Handles output from bedtools split, which sends a report of files to jpayne@68: stdout. This class parses that list into something more convenient to jpayne@68: use within pybedtools. jpayne@68: jpayne@68: Most useful is probably the .bedtools attribute, which is a list of jpayne@68: BedTool objects. jpayne@68: """ jpayne@68: from .bedtool import BedTool jpayne@68: jpayne@68: if isinstance(output, str): jpayne@68: output = open(output).read() jpayne@68: if hasattr(output, "next"): jpayne@68: output = "".join(i for i in output) jpayne@68: jpayne@68: #: store a copy of the output jpayne@68: self.text = output jpayne@68: jpayne@68: #: BedTool objects created from output jpayne@68: self.bedtools = [] jpayne@68: jpayne@68: #: Filenames that were created from the split jpayne@68: self.files = [] jpayne@68: jpayne@68: #: number of bases in each file jpayne@68: self.nbases = [] jpayne@68: jpayne@68: #: number of features in each file jpayne@68: self.counts = [] jpayne@68: jpayne@68: for line in output.splitlines(): jpayne@68: toks = line.split() jpayne@68: self.files.append(toks[0]) jpayne@68: self.nbases.append(int(toks[1])) jpayne@68: self.counts.append(int(toks[2])) jpayne@68: self.bedtools.append(BedTool(toks[0])) jpayne@68: jpayne@68: jpayne@68: def internet_on(timeout=1): jpayne@68: try: jpayne@68: response = urllib.request.urlopen("http://genome.ucsc.edu", timeout=timeout) jpayne@68: return True jpayne@68: except urllib.error.URLError as err: jpayne@68: pass jpayne@68: return False jpayne@68: jpayne@68: jpayne@68: def get_chromsizes_from_ucsc( jpayne@68: genome, jpayne@68: saveas=None, jpayne@68: mysql="mysql", jpayne@68: fetchchromsizes="fetchChromSizes", jpayne@68: timeout=None, jpayne@68: host_url="genome-mysql.cse.ucsc.edu", jpayne@68: ): jpayne@68: """ jpayne@68: Download chrom size info for *genome* from UCSC and returns the dictionary. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: jpayne@68: genome : str jpayne@68: Name of the genome assembly (e.g., "hg38") jpayne@68: jpayne@68: saveas : str jpayne@68: Filename to save output to. Dictionary will still be returned. jpayne@68: jpayne@68: mysql, fetchchromsizes : str jpayne@68: Paths to MySQL and fetchChromSizes. jpayne@68: jpayne@68: timeout : float jpayne@68: How long to wait for a response; mostly used for testing. jpayne@68: jpayne@68: host_url : str jpayne@68: URL of UCSC mirror MySQL server. jpayne@68: """ jpayne@68: if not internet_on(timeout=timeout): jpayne@68: raise ValueError( jpayne@68: "It appears you don't have an internet connection " jpayne@68: "-- unable to get chromsizes from UCSC" jpayne@68: ) jpayne@68: cmds = [ jpayne@68: mysql, jpayne@68: "--user=genome", jpayne@68: "--host=" + host_url, jpayne@68: "-A", jpayne@68: "-e", jpayne@68: "select chrom, size from %s.chromInfo" % genome, jpayne@68: ] jpayne@68: failures = [] jpayne@68: d = {} jpayne@68: try: jpayne@68: p = subprocess.Popen( jpayne@68: cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE jpayne@68: ) jpayne@68: stdout, stderr = p.communicate() jpayne@68: if stderr: jpayne@68: print(stderr) jpayne@68: print("Commands were:\n") jpayne@68: print((subprocess.list2cmdline(cmds))) jpayne@68: jpayne@68: lines = stdout.splitlines()[1:] jpayne@68: for line in lines: jpayne@68: if isinstance(line, bytes): jpayne@68: line = line.decode("UTF-8") jpayne@68: chrom, size = line.split() jpayne@68: d[chrom] = (0, int(size)) jpayne@68: jpayne@68: if saveas is not None: jpayne@68: chromsizes_to_file(d, saveas) jpayne@68: jpayne@68: except OSError as err: jpayne@68: if err.errno == 2: jpayne@68: failures.append("Can't find mysql at path {0}".format(mysql)) jpayne@68: else: jpayne@68: raise jpayne@68: try: jpayne@68: cmds = [fetchchromsizes, genome] jpayne@68: p = subprocess.Popen( jpayne@68: cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE jpayne@68: ) jpayne@68: stdout, stderr = p.communicate() jpayne@68: if stderr: jpayne@68: if "INFO: trying WGET" not in str(stderr): jpayne@68: print(stderr) jpayne@68: print("Commands were:\n") jpayne@68: print((subprocess.list2cmdline(cmds))) jpayne@68: jpayne@68: lines = stdout.splitlines() jpayne@68: for line in lines: jpayne@68: if isinstance(line, bytes): jpayne@68: line = line.decode("UTF-8") jpayne@68: chrom, size = line.split() jpayne@68: d[chrom] = (0, int(size)) jpayne@68: jpayne@68: if saveas is not None: jpayne@68: chromsizes_to_file(d, saveas) jpayne@68: jpayne@68: except OSError as err: jpayne@68: if err.errno == 2: jpayne@68: failures.append("Can't find path to fetchChromsizes") jpayne@68: jpayne@68: if not d: jpayne@68: raise OSError(failures) jpayne@68: return d jpayne@68: jpayne@68: jpayne@68: def chromsizes_to_file(chrom_sizes, fn=None): jpayne@68: """ jpayne@68: Converts a *chromsizes* dictionary to a file. If *fn* is None, then a jpayne@68: tempfile is created (which can be deleted with pybedtools.cleanup()). jpayne@68: jpayne@68: Returns the filename. jpayne@68: """ jpayne@68: if fn is None: jpayne@68: tmpfn = tempfile.NamedTemporaryFile( jpayne@68: prefix="pybedtools.", suffix=".tmp", delete=False jpayne@68: ) jpayne@68: tmpfn = tmpfn.name jpayne@68: filenames.TEMPFILES.append(tmpfn) jpayne@68: fn = tmpfn jpayne@68: if isinstance(chrom_sizes, str): jpayne@68: chrom_sizes = chromsizes(chrom_sizes) jpayne@68: fout = open(fn, "wt") jpayne@68: for chrom, bounds in chrom_sizes.items(): jpayne@68: line = chrom + "\t" + str(bounds[1]) + "\n" jpayne@68: fout.write(line) jpayne@68: fout.close() jpayne@68: return fn jpayne@68: jpayne@68: jpayne@68: def get_chromsizes_from_genomepy( jpayne@68: genome, saveas=None, jpayne@68: ): jpayne@68: """ jpayne@68: Get chrom size info for *genome* from genomepy, if genomepy is installed. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: jpayne@68: genome : str jpayne@68: Name of the genome assembly (e.g., "hg38") jpayne@68: jpayne@68: saveas : str jpayne@68: Filename to save output to. Dictionary will still be returned. jpayne@68: """ jpayne@68: if "genomepy" not in sys.modules: jpayne@68: return None jpayne@68: jpayne@68: d = {} jpayne@68: try: jpayne@68: g = genomepy.Genome(genome) jpayne@68: # Fail silently if the sizes file cannot be accessed jpayne@68: if not hasattr(g, "sizes_file"): jpayne@68: return None jpayne@68: for line in open(g.sizes_file): jpayne@68: chrom, size = line.split() jpayne@68: d[chrom] = (0, int(size)) jpayne@68: jpayne@68: if saveas is not None: jpayne@68: chromsizes_to_file(d, saveas) jpayne@68: except FileNotFoundError: jpayne@68: return None jpayne@68: jpayne@68: return d jpayne@68: jpayne@68: jpayne@68: def chromsizes(genome): jpayne@68: """ jpayne@68: Looks for a *genome* already included in the genome registry; if not found jpayne@68: it first tries to look it up via genomepy. If genomepy is not installed, or jpayne@68: if this lookup fails then it looks it up on UCSC. Returns the dictionary of jpayne@68: chromsize tuples where each tuple has (start,stop). jpayne@68: jpayne@68: Chromsizes are described as (start, stop) tuples to allow randomization jpayne@68: within specified regions; e. g., you can make a chromsizes dictionary that jpayne@68: represents the extent of a tiling array. jpayne@68: jpayne@68: Example usage: jpayne@68: jpayne@68: >>> dm3_chromsizes = chromsizes('dm3') jpayne@68: >>> for i in sorted(dm3_chromsizes.items()): jpayne@68: ... print(i) jpayne@68: ('chr2L', (0, 23011544)) jpayne@68: ('chr2LHet', (0, 368872)) jpayne@68: ('chr2R', (0, 21146708)) jpayne@68: ('chr2RHet', (0, 3288761)) jpayne@68: ('chr3L', (0, 24543557)) jpayne@68: ('chr3LHet', (0, 2555491)) jpayne@68: ('chr3R', (0, 27905053)) jpayne@68: ('chr3RHet', (0, 2517507)) jpayne@68: ('chr4', (0, 1351857)) jpayne@68: ('chrM', (0, 19517)) jpayne@68: ('chrU', (0, 10049037)) jpayne@68: ('chrUextra', (0, 29004656)) jpayne@68: ('chrX', (0, 22422827)) jpayne@68: ('chrXHet', (0, 204112)) jpayne@68: ('chrYHet', (0, 347038)) jpayne@68: jpayne@68: jpayne@68: """ jpayne@68: try: jpayne@68: return getattr(genome_registry, genome) jpayne@68: except AttributeError: jpayne@68: chromsizes = get_chromsizes_from_genomepy(genome) jpayne@68: if chromsizes is None: jpayne@68: return get_chromsizes_from_ucsc(genome) jpayne@68: else: jpayne@68: return chromsizes jpayne@68: jpayne@68: jpayne@68: def get_includes(): jpayne@68: """ jpayne@68: Returns a list of include directories with BEDTools headers jpayne@68: """ jpayne@68: dirname = os.path.abspath(os.path.join(os.path.dirname(__file__))) jpayne@68: return [dirname, os.path.join(dirname, "include")] jpayne@68: jpayne@68: jpayne@68: atexit.register(cleanup)