Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/helpers.py @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 import sys | |
2 import os | |
3 import gzip | |
4 import tempfile | |
5 import subprocess | |
6 import glob | |
7 import struct | |
8 import atexit | |
9 import re | |
10 import urllib | |
11 import urllib.error | |
12 import urllib.request | |
13 | |
14 try: # Use genomepy to determine chrom sizes if it is installed | |
15 import genomepy | |
16 except ImportError: | |
17 pass | |
18 | |
19 from . import settings | |
20 from . import filenames | |
21 from . import genome_registry | |
22 from .logger import logger | |
23 from .cbedtools import create_interval_from_list | |
24 | |
25 BUFSIZE = -1 | |
26 | |
27 _tags = {} | |
28 | |
29 | |
30 def set_bedtools_path(path=""): | |
31 """ | |
32 Explicitly set path to `BEDTools` installation dir. | |
33 | |
34 If BEDTools is not available on your system path, specify the path to the | |
35 dir containing the BEDTools executables (intersectBed, subtractBed, etc) | |
36 with this function. | |
37 | |
38 To reset and use the default system path, call this function with no | |
39 arguments or use path="". | |
40 """ | |
41 from . import paths | |
42 | |
43 paths._set_bedtools_path(path) | |
44 | |
45 | |
46 def get_bedtools_path(): | |
47 """ | |
48 Returns the currently-set path to bedtools | |
49 """ | |
50 from . import paths | |
51 | |
52 return paths._get_bedtools_path() | |
53 | |
54 | |
55 def set_R_path(path=""): | |
56 """ | |
57 Explicitly set path to `R` installation dir. | |
58 | |
59 If R is not available on the path, then it can be explicitly | |
60 specified here. | |
61 | |
62 Use path="" to reset to default system path. | |
63 """ | |
64 from . import paths | |
65 | |
66 paths._set_R_path(path) | |
67 | |
68 | |
69 def _check_for_bedtools(force_check=False, verbose=False, override=None): | |
70 """ | |
71 Checks installation as well as version (based on whether or not "bedtools | |
72 intersect" works, or just "intersectBed") | |
73 """ | |
74 if settings._bedtools_installed and not force_check: | |
75 return True | |
76 | |
77 if (len(settings.bedtools_version) == 0) or force_check: | |
78 try: | |
79 v = ( | |
80 subprocess.check_output( | |
81 [os.path.join(settings._bedtools_path, "bedtools"), "--version"] | |
82 ) | |
83 .decode("utf-8") | |
84 .rstrip() | |
85 ) | |
86 | |
87 if verbose: | |
88 print("Found bedtools version '%s'" % v) | |
89 | |
90 settings._bedtools_installed = True | |
91 | |
92 # Override, used for testing | |
93 if override is not None: | |
94 v = override | |
95 | |
96 # To allow more complicated versions as found in Linux distributions, e.g.: | |
97 # bedtools v2.26.0 | |
98 # bedtools debian/2.28.0+dfsg-2-dirty | |
99 m = re.search("^bedtools [^0-9]*([0-9][0-9.]*)", v) | |
100 if not m: | |
101 raise ValueError('Cannot identify version number from "{}"'.format(v)) | |
102 vv = m.group(1) | |
103 | |
104 settings.bedtools_version = [int(i) for i in vv.split(".")] | |
105 | |
106 settings._v_2_27_plus = ( | |
107 settings.bedtools_version[0] >= 2 and settings.bedtools_version[1] >= 27 | |
108 ) | |
109 | |
110 settings._v_2_15_plus = ( | |
111 settings.bedtools_version[0] >= 2 and settings.bedtools_version[1] >= 15 | |
112 ) | |
113 | |
114 except subprocess.CalledProcessError: | |
115 if settings._bedtools_path: | |
116 add_msg = "(tried path '%s')" % settings._bedtools_path | |
117 else: | |
118 add_msg = "" | |
119 raise OSError( | |
120 "Please make sure you have installed BEDTools" | |
121 "(https://github.com/arq5x/bedtools) and that " | |
122 "it's on the path. %s" % add_msg | |
123 ) | |
124 | |
125 | |
126 def _check_for_R(): | |
127 try: | |
128 p = subprocess.Popen( | |
129 [os.path.join(settings._R_path, "R"), "--version"], | |
130 stdout=subprocess.PIPE, | |
131 stderr=subprocess.PIPE, | |
132 ) | |
133 settings._R_installed = True | |
134 except OSError: | |
135 if settings._R_path: | |
136 add_msg = "(tried path '%s')" % settings._R_path | |
137 else: | |
138 add_msg = "" | |
139 raise ValueError("Please install R and ensure it is on your path %s" % add_msg) | |
140 | |
141 | |
142 class Error(Exception): | |
143 """Base class for this module's exceptions""" | |
144 | |
145 pass | |
146 | |
147 | |
148 class pybedtoolsError(Error): | |
149 pass | |
150 | |
151 | |
152 class BEDToolsError(Error): | |
153 def __init__(self, cmd, msg): | |
154 self.cmd = str(cmd) | |
155 self.msg = str(msg) | |
156 | |
157 def __str__(self): | |
158 m = ( | |
159 "\nCommand was:\n\n\t" | |
160 + self.cmd | |
161 + "\n" | |
162 + "\nError message was:\n" | |
163 + self.msg | |
164 ) | |
165 return m | |
166 | |
167 | |
168 def isGZIP(fn): | |
169 with open(fn, "rb") as f: | |
170 start = f.read(3) | |
171 if start == b"\x1f\x8b\x08": | |
172 return True | |
173 return False | |
174 | |
175 | |
176 def isBGZIP(fn): | |
177 """ | |
178 Reads a filename to see if it's a BGZIPed file or not. | |
179 """ | |
180 with open(fn, "rb") as fh: | |
181 header_str = fh.read(15) | |
182 | |
183 if len(header_str) < 15: | |
184 return False | |
185 | |
186 header = struct.unpack_from("BBBBiBBHBBB", header_str) | |
187 | |
188 id1, id2, cm, flg, mtime, xfl, os_, xlen, si1, si2, slen = header | |
189 if ( | |
190 (id1 == 31) | |
191 and (id2 == 139) | |
192 and (cm == 8) | |
193 and (flg == 4) | |
194 and (si1 == 66) | |
195 and (si2 == 67) | |
196 and (slen == 2) | |
197 ): | |
198 return True | |
199 return False | |
200 | |
201 | |
202 def isBAM(fn): | |
203 """ | |
204 Returns True if the file is both BGZIPed and the compressed contents have | |
205 start with the magic number `BAM\\x01`, or if the file is CRAM format (see | |
206 isCRAM()). | |
207 """ | |
208 # Note: previously we were catching ValueError when trying to open | |
209 # a non-BAM with pysam.Samfile. That started segfaulting, so now do it the | |
210 # right way with magic number. | |
211 with gzip.open(fn, "rb") as in_: | |
212 if isBGZIP(fn) and (in_.read(4).decode() == "BAM\x01"): | |
213 return True | |
214 if isCRAM(fn): | |
215 return True | |
216 | |
217 | |
218 def isCRAM(fn): | |
219 """ | |
220 Returns True if the file starts with the bytes for the characters "CRAM". | |
221 """ | |
222 with open(fn, "rb") as in_: | |
223 if in_.read(4).decode(errors="ignore") == "CRAM": | |
224 return True | |
225 | |
226 | |
227 def find_tagged(tag): | |
228 """ | |
229 Returns the bedtool object with tagged with *tag*. Useful for tracking | |
230 down bedtools you made previously. | |
231 """ | |
232 for key, item in _tags.items(): | |
233 try: | |
234 if item._tag == tag: | |
235 return item | |
236 except AttributeError: | |
237 pass | |
238 raise ValueError('tag "%s" not found' % tag) | |
239 | |
240 | |
241 def _flatten_list(x): | |
242 nested = True | |
243 while nested: | |
244 check_again = False | |
245 flattened = [] | |
246 | |
247 for element in x: | |
248 if isinstance(element, list): | |
249 flattened.extend(element) | |
250 check_again = True | |
251 else: | |
252 flattened.append(element) | |
253 nested = check_again | |
254 x = flattened[:] | |
255 return x | |
256 | |
257 | |
258 def set_tempdir(tempdir): | |
259 """ | |
260 Set the directory for temp files. | |
261 | |
262 Useful for clusters that use a /scratch partition rather than a /tmp dir. | |
263 Convenience function to simply set tempfile.tempdir. | |
264 """ | |
265 if not os.path.exists(tempdir): | |
266 errstr = "The tempdir you specified, %s, does not exist" % tempdir | |
267 raise FileNotFoundError(errstr) | |
268 tempfile.tempdir = tempdir | |
269 | |
270 | |
271 def get_tempdir(): | |
272 """ | |
273 Gets the current tempdir for the module. | |
274 """ | |
275 return tempfile.gettempdir() | |
276 | |
277 | |
278 def cleanup(verbose=False, remove_all=False): | |
279 """ | |
280 Deletes all temp files from the current session (or optionally *all* \ | |
281 sessions) | |
282 | |
283 If *verbose*, reports what it's doing | |
284 | |
285 If *remove_all*, then ALL files matching "pybedtools.*.tmp" in the temp dir | |
286 will be deleted. | |
287 """ | |
288 if settings.KEEP_TEMPFILES: | |
289 return | |
290 for fn in filenames.TEMPFILES: | |
291 if verbose: | |
292 print("removing", fn) | |
293 if os.path.exists(fn): | |
294 os.unlink(fn) | |
295 if remove_all: | |
296 fns = glob.glob(os.path.join(get_tempdir(), "pybedtools.*.tmp")) | |
297 for fn in fns: | |
298 os.unlink(fn) | |
299 | |
300 | |
301 def _version_2_15_plus_names(prog_name): | |
302 if not settings._bedtools_installed: | |
303 _check_for_bedtools() | |
304 if not settings._v_2_15_plus: | |
305 return [prog_name] | |
306 try: | |
307 prog_name = settings._prog_names[prog_name] | |
308 except KeyError: | |
309 if prog_name in settings._new_names: | |
310 pass | |
311 raise BEDToolsError(prog_name, prog_name + "not a recognized BEDTools program") | |
312 return [os.path.join(settings._bedtools_path, "bedtools"), prog_name] | |
313 | |
314 | |
315 def call_bedtools( | |
316 cmds, | |
317 tmpfn=None, | |
318 stdin=None, | |
319 check_stderr=None, | |
320 decode_output=True, | |
321 encode_input=True, | |
322 ): | |
323 """ | |
324 Use subprocess.Popen to call BEDTools and catch any errors. | |
325 | |
326 Output goes to *tmpfn*, or, if None, output stays in subprocess.PIPE and | |
327 can be iterated over. | |
328 | |
329 *stdin* is an optional file-like object that will be sent to | |
330 subprocess.Popen. | |
331 | |
332 Prints some useful help upon getting common errors. | |
333 | |
334 *check_stderr* is a function that takes the stderr string as input and | |
335 returns True if it's OK (that is, it's not really an error). This is | |
336 needed, e.g., for calling fastaFromBed which will report that it has to | |
337 make a .fai for a fasta file. | |
338 | |
339 *decode_output* should be set to False when you are iterating over a BAM | |
340 file, where the data represent binary rather than text data. | |
341 """ | |
342 input_is_stream = stdin is not None | |
343 output_is_stream = tmpfn is None | |
344 | |
345 _orig_cmds = cmds[:] | |
346 cmds = [] | |
347 cmds.extend(_version_2_15_plus_names(_orig_cmds[0])) | |
348 cmds.extend(_orig_cmds[1:]) | |
349 | |
350 try: | |
351 # coming from an iterator, sending as iterator | |
352 if input_is_stream and output_is_stream: | |
353 logger.debug( | |
354 "helpers.call_bedtools(): input is stream, output is " "stream" | |
355 ) | |
356 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds)) | |
357 p = subprocess.Popen( | |
358 cmds, | |
359 stdout=subprocess.PIPE, | |
360 stderr=subprocess.PIPE, | |
361 stdin=subprocess.PIPE, | |
362 bufsize=BUFSIZE, | |
363 ) | |
364 if encode_input: | |
365 for line in stdin: | |
366 p.stdin.write(line.encode()) | |
367 else: | |
368 for line in stdin: | |
369 p.stdin.write(line) | |
370 | |
371 # This is important to prevent deadlocks | |
372 p.stdin.close() | |
373 | |
374 if decode_output: | |
375 output = (i.decode("UTF-8") for i in p.stdout) | |
376 else: | |
377 output = (i for i in p.stdout) | |
378 | |
379 stderr = None | |
380 | |
381 # coming from an iterator, writing to file | |
382 if input_is_stream and not output_is_stream: | |
383 logger.debug("helpers.call_bedtools(): input is stream, output is file") | |
384 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds)) | |
385 outfile = open(tmpfn, "wb") | |
386 p = subprocess.Popen( | |
387 cmds, | |
388 stdout=outfile, | |
389 stderr=subprocess.PIPE, | |
390 stdin=subprocess.PIPE, | |
391 bufsize=BUFSIZE, | |
392 ) | |
393 if hasattr(stdin, "read"): | |
394 stdout, stderr = p.communicate(stdin.read()) | |
395 else: | |
396 for item in stdin: | |
397 p.stdin.write(item.encode()) | |
398 stdout, stderr = p.communicate() | |
399 output = tmpfn | |
400 outfile.close() | |
401 | |
402 # coming from a file, sending as iterator | |
403 if not input_is_stream and output_is_stream: | |
404 logger.debug( | |
405 "helpers.call_bedtools(): input is filename, " "output is stream" | |
406 ) | |
407 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds)) | |
408 p = subprocess.Popen( | |
409 cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE | |
410 ) | |
411 if decode_output: | |
412 output = (i.decode("UTF-8") for i in p.stdout) | |
413 else: | |
414 output = (i for i in p.stdout) | |
415 stderr = None | |
416 | |
417 # file-to-file | |
418 if not input_is_stream and not output_is_stream: | |
419 logger.debug( | |
420 "helpers.call_bedtools(): input is filename, output " | |
421 "is filename (%s)", | |
422 tmpfn, | |
423 ) | |
424 cmds = list(map(str, cmds)) | |
425 logger.debug("helpers.call_bedtools(): cmds=%s", " ".join(cmds)) | |
426 outfile = open(tmpfn, "wb") | |
427 p = subprocess.Popen( | |
428 cmds, stdout=outfile, stderr=subprocess.PIPE, bufsize=BUFSIZE | |
429 ) | |
430 stdout, stderr = p.communicate() | |
431 output = tmpfn | |
432 outfile.close() | |
433 | |
434 # Check if it's OK using a provided function to check stderr. If it's | |
435 # OK, dump it to sys.stderr so it's printed, and reset it to None so we | |
436 # don't raise an exception | |
437 if check_stderr is not None: | |
438 if isinstance(stderr, bytes): | |
439 stderr = stderr.decode("UTF_8") | |
440 if check_stderr(stderr): | |
441 sys.stderr.write(stderr) | |
442 stderr = None | |
443 | |
444 if stderr: | |
445 # Fix for issue #147. In general, we consider warnings to not be | |
446 # fatal, so just show 'em and continue on. | |
447 # | |
448 # bedtools source has several different ways of showing a warning, | |
449 # but they seem to all have "WARNING" in the first 20 or so | |
450 # characters | |
451 if isinstance(stderr, bytes): | |
452 stderr = stderr.decode("UTF_8") | |
453 if len(stderr) > 20 and "WARNING" in stderr[:20].upper(): | |
454 sys.stderr.write(stderr) | |
455 else: | |
456 raise BEDToolsError(subprocess.list2cmdline(cmds), stderr) | |
457 | |
458 except (OSError, IOError) as err: | |
459 print("%s: %s" % (type(err), os.strerror(err.errno))) | |
460 print("The command was:\n\n\t%s\n" % subprocess.list2cmdline(cmds)) | |
461 | |
462 problems = { | |
463 2: ( | |
464 "* Did you spell the command correctly?", | |
465 "* Do you have BEDTools installed and on the path?", | |
466 ), | |
467 13: ( | |
468 "* Do you have permission to write " | |
469 'to the output file ("%s")?' % tmpfn, | |
470 ), | |
471 24: ( | |
472 "* Too many files open -- please submit " | |
473 "a bug report so that this can be fixed", | |
474 ), | |
475 32: ( | |
476 "* Broken pipe -- if you passed a BedTool object " | |
477 "that was created using a generator function, " | |
478 "please try saving it to disk first using the " | |
479 ".saveas() method before calling this bedtools " | |
480 "command. See issue #49 for more.", | |
481 ), | |
482 } | |
483 | |
484 print("Things to check:") | |
485 print("\n\t" + "\n\t".join(problems[err.errno])) | |
486 raise OSError("See above for commands that gave the error") | |
487 | |
488 return output | |
489 | |
490 | |
491 def _check_sequence_stderr(x): | |
492 """ | |
493 If stderr created by fastaFromBed starts with 'index file', then don't | |
494 consider it an error. | |
495 """ | |
496 if isinstance(x, bytes): | |
497 x = x.decode("UTF-8") | |
498 if x.startswith("index file"): | |
499 return True | |
500 if x.startswith("WARNING"): | |
501 return True | |
502 return False | |
503 | |
504 | |
505 def _call_randomintersect( | |
506 _self, | |
507 other, | |
508 iterations, | |
509 intersect_kwargs, | |
510 shuffle_kwargs, | |
511 report_iterations, | |
512 debug, | |
513 _orig_processes, | |
514 ): | |
515 """ | |
516 Helper function that list-ifies the output from randomintersection, s.t. | |
517 it can be pickled across a multiprocess Pool. | |
518 """ | |
519 return list( | |
520 _self.randomintersection( | |
521 other, | |
522 iterations, | |
523 intersect_kwargs=intersect_kwargs, | |
524 shuffle_kwargs=shuffle_kwargs, | |
525 report_iterations=report_iterations, | |
526 debug=False, | |
527 processes=None, | |
528 _orig_processes=_orig_processes, | |
529 ) | |
530 ) | |
531 | |
532 | |
533 def close_or_delete(*args): | |
534 """ | |
535 Single function that can be used to get rid of a BedTool, whether it's a | |
536 streaming or file-based version. | |
537 """ | |
538 for x in args: | |
539 if isinstance(x.fn, str): | |
540 os.unlink(x.fn) | |
541 elif hasattr(x.fn, "close"): | |
542 x.fn.close() | |
543 if hasattr(x.fn, "throw"): | |
544 x.fn.throw(StopIteration) | |
545 | |
546 | |
547 def n_open_fds(): | |
548 pid = os.getpid() | |
549 procs = subprocess.check_output(["lsof", "-w", "-Ff", "-p", str(pid)]) | |
550 nprocs = 0 | |
551 for i in procs.splitlines(): | |
552 if i[1:].isdigit() and i[0] == "f": | |
553 nprocs += 1 | |
554 return nprocs | |
555 | |
556 | |
557 import re | |
558 | |
559 coord_re = re.compile( | |
560 r""" | |
561 (?P<chrom>.+): | |
562 (?P<start>\d+)- | |
563 (?P<stop>\d+) | |
564 (?:\[(?P<strand>.)\])?""", | |
565 re.VERBOSE, | |
566 ) | |
567 | |
568 | |
569 def string_to_interval(s): | |
570 """ | |
571 Convert string of the form "chrom:start-stop" or "chrom:start-stop[strand]" | |
572 to an interval. | |
573 | |
574 Assumes zero-based coords. | |
575 | |
576 If it's already an interval, then return it as-is. | |
577 """ | |
578 if isinstance(s, str): | |
579 m = coord_re.search(s) | |
580 if m.group("strand"): | |
581 return create_interval_from_list( | |
582 [ | |
583 m.group("chrom"), | |
584 m.group("start"), | |
585 m.group("stop"), | |
586 ".", | |
587 "0", | |
588 m.group("strand"), | |
589 ] | |
590 ) | |
591 else: | |
592 return create_interval_from_list( | |
593 [m.group("chrom"), m.group("start"), m.group("stop")] | |
594 ) | |
595 return s | |
596 | |
597 | |
598 class FisherOutput(object): | |
599 def __init__(self, s, **kwargs): | |
600 """ | |
601 fisher returns text results like:: | |
602 | |
603 # Contingency Table | |
604 #_________________________________________ | |
605 # | not in -b | in -b | | |
606 # not in -a | 3137160615 | 503 | | |
607 # in -a | 100 | 46 | | |
608 #_________________________________________ | |
609 # p-values for fisher's exact test | |
610 left right two-tail ratio | |
611 1.00000 0.00000 0.00000 2868973.922 | |
612 | |
613 """ | |
614 if isinstance(s, str): | |
615 s = open(s).read() | |
616 if hasattr(s, "next"): | |
617 s = "".join(i for i in s) | |
618 table = { | |
619 "not in -a": {"not in -b": None, "in -b": None}, | |
620 "in -a": {"not in -b": None, "in -b": None}, | |
621 } | |
622 self.text = s | |
623 lines = s.splitlines() | |
624 for i in lines: | |
625 if "not in -a" in i: | |
626 _, in_b, not_in_b, _ = i.strip().split("|") | |
627 table["not in -a"]["not in -b"] = int(not_in_b) | |
628 table["not in -a"]["in -b"] = int(in_b) | |
629 | |
630 if " in -a" in i: | |
631 _, in_b, not_in_b, _ = i.strip().split("|") | |
632 table["in -a"]["not in -b"] = int(not_in_b) | |
633 table["in -a"]["in -b"] = int(in_b) | |
634 self.table = table | |
635 left, right, two_tail, ratio = lines[-1].split() | |
636 self.left_tail = float(left) | |
637 self.right_tail = float(right) | |
638 self.two_tail = float(two_tail) | |
639 self.ratio = float(ratio) | |
640 | |
641 def __str__(self): | |
642 return self.text | |
643 | |
644 def __repr__(self): | |
645 return "<%s at %s>\n%s" % (self.__class__.__name__, id(self), self.text) | |
646 | |
647 | |
648 class SplitOutput(object): | |
649 def __init__(self, output, **kwargs): | |
650 """ | |
651 Handles output from bedtools split, which sends a report of files to | |
652 stdout. This class parses that list into something more convenient to | |
653 use within pybedtools. | |
654 | |
655 Most useful is probably the .bedtools attribute, which is a list of | |
656 BedTool objects. | |
657 """ | |
658 from .bedtool import BedTool | |
659 | |
660 if isinstance(output, str): | |
661 output = open(output).read() | |
662 if hasattr(output, "next"): | |
663 output = "".join(i for i in output) | |
664 | |
665 #: store a copy of the output | |
666 self.text = output | |
667 | |
668 #: BedTool objects created from output | |
669 self.bedtools = [] | |
670 | |
671 #: Filenames that were created from the split | |
672 self.files = [] | |
673 | |
674 #: number of bases in each file | |
675 self.nbases = [] | |
676 | |
677 #: number of features in each file | |
678 self.counts = [] | |
679 | |
680 for line in output.splitlines(): | |
681 toks = line.split() | |
682 self.files.append(toks[0]) | |
683 self.nbases.append(int(toks[1])) | |
684 self.counts.append(int(toks[2])) | |
685 self.bedtools.append(BedTool(toks[0])) | |
686 | |
687 | |
688 def internet_on(timeout=1): | |
689 try: | |
690 response = urllib.request.urlopen("http://genome.ucsc.edu", timeout=timeout) | |
691 return True | |
692 except urllib.error.URLError as err: | |
693 pass | |
694 return False | |
695 | |
696 | |
697 def get_chromsizes_from_ucsc( | |
698 genome, | |
699 saveas=None, | |
700 mysql="mysql", | |
701 fetchchromsizes="fetchChromSizes", | |
702 timeout=None, | |
703 host_url="genome-mysql.cse.ucsc.edu", | |
704 ): | |
705 """ | |
706 Download chrom size info for *genome* from UCSC and returns the dictionary. | |
707 | |
708 Parameters | |
709 ---------- | |
710 | |
711 genome : str | |
712 Name of the genome assembly (e.g., "hg38") | |
713 | |
714 saveas : str | |
715 Filename to save output to. Dictionary will still be returned. | |
716 | |
717 mysql, fetchchromsizes : str | |
718 Paths to MySQL and fetchChromSizes. | |
719 | |
720 timeout : float | |
721 How long to wait for a response; mostly used for testing. | |
722 | |
723 host_url : str | |
724 URL of UCSC mirror MySQL server. | |
725 """ | |
726 if not internet_on(timeout=timeout): | |
727 raise ValueError( | |
728 "It appears you don't have an internet connection " | |
729 "-- unable to get chromsizes from UCSC" | |
730 ) | |
731 cmds = [ | |
732 mysql, | |
733 "--user=genome", | |
734 "--host=" + host_url, | |
735 "-A", | |
736 "-e", | |
737 "select chrom, size from %s.chromInfo" % genome, | |
738 ] | |
739 failures = [] | |
740 d = {} | |
741 try: | |
742 p = subprocess.Popen( | |
743 cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE | |
744 ) | |
745 stdout, stderr = p.communicate() | |
746 if stderr: | |
747 print(stderr) | |
748 print("Commands were:\n") | |
749 print((subprocess.list2cmdline(cmds))) | |
750 | |
751 lines = stdout.splitlines()[1:] | |
752 for line in lines: | |
753 if isinstance(line, bytes): | |
754 line = line.decode("UTF-8") | |
755 chrom, size = line.split() | |
756 d[chrom] = (0, int(size)) | |
757 | |
758 if saveas is not None: | |
759 chromsizes_to_file(d, saveas) | |
760 | |
761 except OSError as err: | |
762 if err.errno == 2: | |
763 failures.append("Can't find mysql at path {0}".format(mysql)) | |
764 else: | |
765 raise | |
766 try: | |
767 cmds = [fetchchromsizes, genome] | |
768 p = subprocess.Popen( | |
769 cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=BUFSIZE | |
770 ) | |
771 stdout, stderr = p.communicate() | |
772 if stderr: | |
773 if "INFO: trying WGET" not in str(stderr): | |
774 print(stderr) | |
775 print("Commands were:\n") | |
776 print((subprocess.list2cmdline(cmds))) | |
777 | |
778 lines = stdout.splitlines() | |
779 for line in lines: | |
780 if isinstance(line, bytes): | |
781 line = line.decode("UTF-8") | |
782 chrom, size = line.split() | |
783 d[chrom] = (0, int(size)) | |
784 | |
785 if saveas is not None: | |
786 chromsizes_to_file(d, saveas) | |
787 | |
788 except OSError as err: | |
789 if err.errno == 2: | |
790 failures.append("Can't find path to fetchChromsizes") | |
791 | |
792 if not d: | |
793 raise OSError(failures) | |
794 return d | |
795 | |
796 | |
797 def chromsizes_to_file(chrom_sizes, fn=None): | |
798 """ | |
799 Converts a *chromsizes* dictionary to a file. If *fn* is None, then a | |
800 tempfile is created (which can be deleted with pybedtools.cleanup()). | |
801 | |
802 Returns the filename. | |
803 """ | |
804 if fn is None: | |
805 tmpfn = tempfile.NamedTemporaryFile( | |
806 prefix="pybedtools.", suffix=".tmp", delete=False | |
807 ) | |
808 tmpfn = tmpfn.name | |
809 filenames.TEMPFILES.append(tmpfn) | |
810 fn = tmpfn | |
811 if isinstance(chrom_sizes, str): | |
812 chrom_sizes = chromsizes(chrom_sizes) | |
813 fout = open(fn, "wt") | |
814 for chrom, bounds in chrom_sizes.items(): | |
815 line = chrom + "\t" + str(bounds[1]) + "\n" | |
816 fout.write(line) | |
817 fout.close() | |
818 return fn | |
819 | |
820 | |
821 def get_chromsizes_from_genomepy( | |
822 genome, saveas=None, | |
823 ): | |
824 """ | |
825 Get chrom size info for *genome* from genomepy, if genomepy is installed. | |
826 | |
827 Parameters | |
828 ---------- | |
829 | |
830 genome : str | |
831 Name of the genome assembly (e.g., "hg38") | |
832 | |
833 saveas : str | |
834 Filename to save output to. Dictionary will still be returned. | |
835 """ | |
836 if "genomepy" not in sys.modules: | |
837 return None | |
838 | |
839 d = {} | |
840 try: | |
841 g = genomepy.Genome(genome) | |
842 # Fail silently if the sizes file cannot be accessed | |
843 if not hasattr(g, "sizes_file"): | |
844 return None | |
845 for line in open(g.sizes_file): | |
846 chrom, size = line.split() | |
847 d[chrom] = (0, int(size)) | |
848 | |
849 if saveas is not None: | |
850 chromsizes_to_file(d, saveas) | |
851 except FileNotFoundError: | |
852 return None | |
853 | |
854 return d | |
855 | |
856 | |
857 def chromsizes(genome): | |
858 """ | |
859 Looks for a *genome* already included in the genome registry; if not found | |
860 it first tries to look it up via genomepy. If genomepy is not installed, or | |
861 if this lookup fails then it looks it up on UCSC. Returns the dictionary of | |
862 chromsize tuples where each tuple has (start,stop). | |
863 | |
864 Chromsizes are described as (start, stop) tuples to allow randomization | |
865 within specified regions; e. g., you can make a chromsizes dictionary that | |
866 represents the extent of a tiling array. | |
867 | |
868 Example usage: | |
869 | |
870 >>> dm3_chromsizes = chromsizes('dm3') | |
871 >>> for i in sorted(dm3_chromsizes.items()): | |
872 ... print(i) | |
873 ('chr2L', (0, 23011544)) | |
874 ('chr2LHet', (0, 368872)) | |
875 ('chr2R', (0, 21146708)) | |
876 ('chr2RHet', (0, 3288761)) | |
877 ('chr3L', (0, 24543557)) | |
878 ('chr3LHet', (0, 2555491)) | |
879 ('chr3R', (0, 27905053)) | |
880 ('chr3RHet', (0, 2517507)) | |
881 ('chr4', (0, 1351857)) | |
882 ('chrM', (0, 19517)) | |
883 ('chrU', (0, 10049037)) | |
884 ('chrUextra', (0, 29004656)) | |
885 ('chrX', (0, 22422827)) | |
886 ('chrXHet', (0, 204112)) | |
887 ('chrYHet', (0, 347038)) | |
888 | |
889 | |
890 """ | |
891 try: | |
892 return getattr(genome_registry, genome) | |
893 except AttributeError: | |
894 chromsizes = get_chromsizes_from_genomepy(genome) | |
895 if chromsizes is None: | |
896 return get_chromsizes_from_ucsc(genome) | |
897 else: | |
898 return chromsizes | |
899 | |
900 | |
901 def get_includes(): | |
902 """ | |
903 Returns a list of include directories with BEDTools headers | |
904 """ | |
905 dirname = os.path.abspath(os.path.join(os.path.dirname(__file__))) | |
906 return [dirname, os.path.join(dirname, "include")] | |
907 | |
908 | |
909 atexit.register(cleanup) |