Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/bedtool.py @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 from __future__ import annotations | |
2 import tempfile | |
3 from textwrap import dedent | |
4 import shutil | |
5 import subprocess | |
6 import operator | |
7 import os | |
8 import sys | |
9 import random | |
10 import string | |
11 import pprint | |
12 from itertools import islice | |
13 from multiprocessing import Pool | |
14 import gzip | |
15 from typing import Any, Callable, Iterable, Iterator, Literal, Optional, TYPE_CHECKING, cast | |
16 import pysam | |
17 from warnings import warn | |
18 import pathlib | |
19 from pathlib import Path | |
20 | |
21 from .helpers import ( | |
22 get_tempdir, | |
23 _tags, | |
24 call_bedtools, | |
25 _flatten_list, | |
26 _check_sequence_stderr, | |
27 isBAM, | |
28 isBGZIP, | |
29 isGZIP, | |
30 BEDToolsError, | |
31 pybedtoolsError, | |
32 _call_randomintersect, | |
33 SplitOutput, | |
34 FisherOutput, | |
35 ) | |
36 from . import helpers | |
37 from .cbedtools import ( | |
38 IntervalFile, | |
39 IntervalIterator, | |
40 Interval, | |
41 create_interval_from_list, | |
42 BedToolsFileError, | |
43 ) | |
44 import pybedtools | |
45 from . import settings | |
46 from . import filenames | |
47 | |
48 if TYPE_CHECKING: | |
49 import pandas as pd | |
50 import matplotlib.colors as mcolors | |
51 | |
52 _implicit_registry = {} | |
53 _other_registry = {} | |
54 _bam_registry = {} | |
55 | |
56 | |
57 def _jaccard_output_to_dict(s, **kwargs) -> dict: | |
58 """ | |
59 jaccard method doesn't return an interval file, rather, it returns a short | |
60 summary of results. Here, we simply parse it into a dict for convenience. | |
61 """ | |
62 if isinstance(s, str): | |
63 _s = open(s).read() | |
64 elif hasattr(s, "next") or hasattr(s, "__next__"): | |
65 _s = "".join([i for i in s]) | |
66 else: | |
67 raise ValueError("Unexpected object %r" % s) | |
68 header, data = _s.splitlines() | |
69 header = header.split() | |
70 data = data.split() | |
71 data[0] = int(data[0]) | |
72 data[1] = int(data[1]) | |
73 data[2] = float(data[2]) | |
74 data[3] = int(data[3]) | |
75 return dict(list(zip(header, data))) | |
76 | |
77 | |
78 def _reldist_output_handler(s, **kwargs): | |
79 """ | |
80 reldist, if called with -detail, returns a valid BED file with the relative | |
81 distance as the last field. In that case, return the BedTool immediately. | |
82 If not -detail, then the results are a table, in which case here we parse | |
83 into a dict for convenience. | |
84 """ | |
85 if "detail" in kwargs: | |
86 return BedTool(s) | |
87 if isinstance(s, str): | |
88 iterable = open(s) | |
89 if hasattr(s, "next"): | |
90 iterable = s | |
91 header = next(iterable).split() | |
92 results = {} | |
93 for h in header: | |
94 results[h] = [] | |
95 for i in iterable: | |
96 reldist, count, total, fraction = i.split() | |
97 data = [float(reldist), int(count), int(total), float(fraction)] | |
98 for h, d in zip(header, data): | |
99 results[h].append(d) | |
100 return results | |
101 | |
102 | |
103 def _wraps( | |
104 prog: Optional[str] = None, | |
105 implicit: Optional[str] = None, | |
106 bam: Optional[str] = None, | |
107 other: Optional[str] = None, | |
108 uses_genome: bool = False, | |
109 make_tempfile_for: Optional[str] = None, | |
110 check_stderr:Optional[Callable]=None, | |
111 add_to_bedtool:Optional[dict] = None, | |
112 nonbam: Optional[Literal["ALL"] | str | list[str]] = None, | |
113 force_bam: bool = False, | |
114 genome_none_if: Optional[list[str]] =None, | |
115 genome_if: Optional[list[str]] = None, | |
116 genome_ok_if: Optional[list[str]] = None, | |
117 does_not_return_bedtool: Optional[Callable] =None, | |
118 arg_order: Optional[list[str]] = None, | |
119 ): | |
120 """ | |
121 Do-it-all wrapper, to be used as a decorator. | |
122 | |
123 *prog* is the name of the BEDTools program that will be called. The help | |
124 for this program will also be added to the decorated method's docstring. | |
125 | |
126 *implicit* is the BEDTools program arg that should be filled in | |
127 automatically. | |
128 | |
129 *bam* will disable the implicit substitution if *bam* is in the kwargs. | |
130 This is typically 'abam' or 'ibam' if the program accepts BAM input. | |
131 | |
132 *other* is the BEDTools program arg that is passed in as the second input, | |
133 if supported. Within the semantics of BEDTools, the typical case will be | |
134 that if implicit='a' then other='b'; if implicit='i' then other=None. | |
135 | |
136 *uses_genome*, if True, will check for 'g' and/or 'genome' args and | |
137 retrieve the corresponding genome files as needed. | |
138 | |
139 *make_tempfile_for* is used for the sequence methods and indicates which | |
140 kwarg should have a tempfile made for it if it's not provided ('fo' for the | |
141 sequence methods) | |
142 | |
143 *check_stderr*, if not None, is a function that accepts a string (which | |
144 will be anything written to stdout when calling the wrapped program). This | |
145 function should return True if the string is OK, and False if it should | |
146 truly be considered an error. This is needed for wrapping fastaFromBed, | |
147 which will report to stderr that it's creating an index file. | |
148 | |
149 *add_to_bedtool* is used for sequence methods. It is a dictionary mapping | |
150 kwargs to attributes to be created in the resulting BedTool. Typically it | |
151 is {'fo':'seqfn'} which will add the resulting sequence name to the | |
152 BedTool's .seqfn attribute. If *add_to_bedtool* is not None, then the | |
153 returned BedTool will be *self* with the added attribute. If a key is | |
154 "stdout" (e.g., {"stdout": attr_name}), then save the stdout of the command | |
155 as a tempfile and store the tempfile's name in the attribute. This is | |
156 required for linksBed and bedToIgv. | |
157 | |
158 *nonbam* is a kwarg that even if the input file was a BAM, the output will | |
159 *not* be BAM format. For example, the `-bed` arg for intersectBed will | |
160 cause the output to be in BED format, not BAM. If not None, this can be a | |
161 string, a list of strings, or the special string "ALL", which means that | |
162 the wrapped program will never return BAM output. | |
163 | |
164 *force_bam*, if True, will force the output to be BAM. This is used for | |
165 bedToBam. | |
166 | |
167 *genome_none_if* is a list of arguments that will ignore the requirement | |
168 for a genome. This is needed for window_maker, where -b and -g are | |
169 mutually exclusive. | |
170 | |
171 *genome_ok_if* is a list of arguments that, if they are in | |
172 *genome_none_if*, are still OK to pass in. This is needed for bedtool | |
173 genomecov, where -g is not needed if -ibam is specified...but it's still OK | |
174 if the user passes a genome arg. | |
175 | |
176 *genome_if* is a list of arguments that will trigger the requirement for | |
177 a genome; otherwise no genome needs to be specified. | |
178 | |
179 *does_not_return_bedtool*, if not None, should be a function that handles | |
180 the returned output. Its signature should be ``func(output, kwargs)``, | |
181 where `output` is the output from the [possibly streaming] call to BEDTools | |
182 and `kwargs` are passed verbatim from the wrapped method call. Some | |
183 examples of methods that use this are jaccard, reldist, fisher, and split | |
184 methods. | |
185 | |
186 *arg_order*, if not None, is a sorted list of arguments. This is used by | |
187 handle_kwargs() to deal with things like issues 81 and 345, where some | |
188 BEDTools programs are sensitive to argument order. | |
189 """ | |
190 | |
191 # NOTE: We are calling each BEDTools program to get its help and adding | |
192 # that to the docstring of each method. This is run at import time. However | |
193 # if BEDTools is not on the path at import time, `not_implemented` is set | |
194 # to True and isn't reset later until the module is reloaded. | |
195 # | |
196 # helpers.set_bedtools_path therefore will trigger a module reload. | |
197 not_implemented = False | |
198 | |
199 # Call the program with -h to get help, which prints to stderr. | |
200 try: | |
201 p = subprocess.Popen( | |
202 helpers._version_2_15_plus_names(prog) + ["-h"], | |
203 stdout=subprocess.PIPE, | |
204 stderr=subprocess.PIPE, | |
205 ) | |
206 help_str = p.communicate()[1].decode() | |
207 | |
208 # underscores throw off ReStructuredText syntax of docstrings, so | |
209 # replace 'em | |
210 help_str = help_str.replace("_", "**") | |
211 | |
212 # indent | |
213 help_str = help_str.split("\n") | |
214 help_str = ["\n\n**Original BEDTools help:**::"] + ["\t" + i for i in help_str] | |
215 help_str = "\n".join(help_str) + "\n" | |
216 | |
217 # If the program can't be found, then we'll eventually replace the method | |
218 # with a version that does nothing but raise a NotImplementedError (plus | |
219 # a helpful message). | |
220 except OSError: | |
221 help_str = ( | |
222 '"%s" does not appear to be installed ' | |
223 "or on the path, so this method is " | |
224 "disabled. Please install a more recent " | |
225 "version of BEDTools and re-import to " | |
226 "use this method." % prog | |
227 ) | |
228 not_implemented = True | |
229 | |
230 def decorator(func): | |
231 """ | |
232 Accepts a function to be wrapped; discards the original and returns a | |
233 new, rebuilt-from-scratch function based on the kwargs passed to | |
234 _wraps(). | |
235 """ | |
236 # Register the implicit (as well as bam and other) args in the global | |
237 # registry. BedTool.handle_kwargs() will access these at runtime. The | |
238 # registry is keyed by program name (like intersectBed). | |
239 _implicit_registry[prog] = implicit | |
240 if other is not None: | |
241 _other_registry[prog] = other | |
242 if bam is not None: | |
243 _bam_registry[prog] = bam | |
244 | |
245 # Here's where we replace an unable-to-be-found program's method with | |
246 # one that only returns a NotImplementedError | |
247 if not_implemented: | |
248 | |
249 def not_implemented_func(*args, **kwargs): | |
250 raise NotImplementedError(help_str) | |
251 | |
252 return not_implemented_func | |
253 | |
254 _add_doc = [] | |
255 if implicit: | |
256 _add_doc.append( | |
257 dedent( | |
258 """ | |
259 For convenience, the file or stream this BedTool points to | |
260 is implicitly passed as the `-%s` argument to `%s` | |
261 """ | |
262 % (implicit, prog) | |
263 ) | |
264 ) | |
265 | |
266 if uses_genome: | |
267 _add_doc.append( | |
268 dedent( | |
269 """ | |
270 There are two alternatives for supplying a genome. Use | |
271 `g="genome.filename"` if you have a genome's chrom sizes | |
272 saved as a file. This is the what BEDTools expects when | |
273 using it from the command line. Alternatively, use the | |
274 `genome="assembly.name"` (for example, `genome="hg19"`) to | |
275 use chrom sizes for that assembly without having to manage | |
276 a separate file. The `genome` argument triggers a call | |
277 `pybedtools.chromsizes`, so see that method for more | |
278 details. | |
279 """ | |
280 ) | |
281 ) | |
282 | |
283 def wrapped(self, *args, **kwargs): | |
284 """ | |
285 A newly created function that will be returned by the _wraps() | |
286 decorator | |
287 """ | |
288 | |
289 # Only one non-keyword argument is supported; this is then assumed | |
290 # to be "other" (e.g., `-b` for intersectBed) | |
291 if len(args) > 0: | |
292 assert len(args) == 1 | |
293 kwargs[other] = args[0] | |
294 | |
295 # Add the implicit values to kwargs. If the current BedTool is | |
296 # BAM, it will automatically be passed to the appropriate | |
297 # BAM-support arg (like `-abam`). But this also allows the user to | |
298 # explicitly specify the abam kwarg, which will override the | |
299 # auto-substitution. | |
300 # Note: here, `implicit` is something like "a"; `bam` is something | |
301 # like "abam" | |
302 if ( | |
303 (implicit not in kwargs) | |
304 and (bam not in kwargs) | |
305 and (implicit is not None) | |
306 ): | |
307 if not self._isbam: | |
308 kwargs[implicit] = self.fn | |
309 else: | |
310 # It is a bam file. If this program supports BAM as the | |
311 # first input, then we set it here | |
312 if bam is not None: | |
313 kwargs[bam] = self.fn | |
314 | |
315 # Otherwise, BEDTools can't currently handle it, so raise | |
316 # an exception. | |
317 else: | |
318 raise pybedtoolsError( | |
319 '"%s" currently can\'t handle BAM ' | |
320 "input, please use bam_to_bed() first." % prog | |
321 ) | |
322 | |
323 # Should this function handle genome files? | |
324 check_for_genome = uses_genome | |
325 if uses_genome: | |
326 if genome_none_if: | |
327 for i in genome_none_if: | |
328 if i in kwargs or i == implicit: | |
329 check_for_genome = False | |
330 | |
331 # for genomecov, if -ibam then -g is optional. So it's OK | |
332 # for the user to provide genome or g kwargs, even if | |
333 # -ibam. | |
334 if genome_ok_if: | |
335 for i in genome_ok_if: | |
336 if i in kwargs or i == implicit: | |
337 if ("g" in kwargs) or ("genome" in kwargs): | |
338 check_for_genome = True | |
339 if genome_if: | |
340 check_for_genome = False | |
341 for i in genome_if: | |
342 if (i in kwargs) or (i == implicit): | |
343 check_for_genome = True | |
344 if check_for_genome: | |
345 kwargs = self.check_genome(**kwargs) | |
346 | |
347 # For sequence methods, we may need to make a tempfile that will | |
348 # hold the resulting sequence. For example, fastaFromBed needs to | |
349 # make a tempfile for 'fo' if no 'fo' was explicitly specified by | |
350 # the user. | |
351 if make_tempfile_for is not None: | |
352 if make_tempfile_for not in kwargs: | |
353 kwargs[make_tempfile_for] = self._tmp() | |
354 | |
355 # At runtime, this will parse the kwargs, convert streams to | |
356 # tempfiles if needed, and return all the goodies | |
357 cmds, tmp, stdin = self.handle_kwargs(prog=prog, | |
358 arg_order=arg_order, | |
359 **kwargs) | |
360 | |
361 # Decide whether the output is BAM format or not. | |
362 result_is_bam = False | |
363 | |
364 # By default, if the current BedTool is BAM, then the result should | |
365 # be, too. | |
366 if self._isbam: | |
367 result_is_bam = True | |
368 | |
369 # If nonbam is "ALL", then this method will never return BAM | |
370 # output. | |
371 if nonbam == "ALL": | |
372 result_is_bam = False | |
373 | |
374 # If any of the `nonbam` args are found in kwargs, then result is | |
375 # not a BAM. Side note: the _nonbam name mangling is necessary to | |
376 # keep the nonbam arg passed into the original _wraps() decorator | |
377 # in scope. | |
378 if nonbam is not None and nonbam != "ALL": | |
379 if isinstance(nonbam, str): | |
380 _nonbam = [nonbam] | |
381 else: | |
382 _nonbam = nonbam | |
383 for i in _nonbam: | |
384 if i in kwargs: | |
385 result_is_bam = False | |
386 break | |
387 | |
388 if force_bam: | |
389 result_is_bam = True | |
390 | |
391 decode_output = not result_is_bam | |
392 | |
393 # Do the actual call | |
394 stream = call_bedtools( | |
395 cmds, | |
396 tmp, | |
397 stdin=stdin, | |
398 check_stderr=check_stderr, | |
399 decode_output=decode_output, | |
400 ) | |
401 | |
402 if does_not_return_bedtool: | |
403 return does_not_return_bedtool(stream, **kwargs) | |
404 | |
405 # Post-hoc editing of the BedTool -- for example, this is used for | |
406 # the sequence methods to add a `seqfn` attribute to the resulting | |
407 # BedTool. | |
408 if add_to_bedtool is not None: | |
409 for kw, attr in list(add_to_bedtool.items()): | |
410 if kw == "stdout": | |
411 value = stream | |
412 else: | |
413 value = kwargs[kw] | |
414 setattr(self, attr, value) | |
415 result = self | |
416 else: | |
417 result = BedTool(stream) | |
418 | |
419 result._isbam = result_is_bam | |
420 result._cmds = cmds | |
421 del kwargs | |
422 return result | |
423 | |
424 # Now add the edited docstring (original Python docstring plus BEDTools | |
425 # help) to the newly created method above | |
426 if func.__doc__ is None: | |
427 orig = "" | |
428 else: | |
429 orig = func.__doc__ | |
430 | |
431 wrapped.__doc__ = orig + "\n".join(_add_doc) + help_str | |
432 | |
433 # Add the original method's name to a new attribute so we can access it | |
434 # when logging history | |
435 wrapped._name = func.__name__ # type: ignore | |
436 | |
437 return wrapped | |
438 | |
439 return decorator | |
440 | |
441 | |
442 def _log_to_history(method: Callable): | |
443 """ | |
444 Decorator to add a method and its kwargs to the history. | |
445 | |
446 Assumes that you only add this decorator to bedtool instances that | |
447 return other bedtool instances | |
448 """ | |
449 | |
450 def decorated(self, *args, **kwargs): | |
451 | |
452 # this calls the actual method in the first place; *result* is | |
453 # whatever you get back | |
454 result = method(self, *args, **kwargs) | |
455 | |
456 # add appropriate tags | |
457 parent_tag = self._tag | |
458 result_tag = result._tag | |
459 | |
460 history_step = HistoryStep( | |
461 method, args, kwargs, self, parent_tag, result_tag | |
462 ) | |
463 | |
464 # only add the current history to the new bedtool if there's | |
465 # something to add | |
466 if len(self.history) > 0: | |
467 result.history.append(self.history) | |
468 | |
469 # but either way, add this history step to the result. | |
470 result.history.append(history_step) | |
471 | |
472 return result | |
473 | |
474 decorated.__doc__ = method.__doc__ | |
475 return decorated | |
476 | |
477 | |
478 class BedTool(object): | |
479 TEMPFILES = filenames.TEMPFILES | |
480 | |
481 def __init__(self, fn: Optional[Any] = None, | |
482 from_string: bool = False, | |
483 remote: bool = False): | |
484 """ | |
485 Wrapper around Aaron Quinlan's ``BEDtools`` suite of programs | |
486 (https://github.com/arq5x/bedtools); also contains many useful | |
487 methods for more detailed work with BED files. | |
488 | |
489 *fn* is typically the name of a BED-like file, but can also be | |
490 one of the following: | |
491 | |
492 * a string filename | |
493 * another BedTool object | |
494 * an iterable of Interval objects | |
495 * an open file object | |
496 * a "file contents" string (see below) | |
497 | |
498 If *from_string* is True, then you can pass a string that contains | |
499 the contents of the BedTool you want to create. This will treat all | |
500 spaces as TABs and write to tempfile, treating whatever you pass as | |
501 *fn* as the contents of the bed file. This also strips empty lines. | |
502 | |
503 Typical usage is to point to an existing file:: | |
504 | |
505 a = BedTool('a.bed') | |
506 | |
507 But you can also create one from scratch from a string:: | |
508 | |
509 >>> s = ''' | |
510 ... chrX 1 100 | |
511 ... chrX 25 800 | |
512 ... ''' | |
513 >>> a = BedTool(s, from_string=True) | |
514 | |
515 Or use examples that come with pybedtools:: | |
516 | |
517 >>> example_files = pybedtools.list_example_files() | |
518 >>> assert 'a.bed' in example_files | |
519 >>> a = pybedtools.example_bedtool('a.bed') | |
520 | |
521 """ | |
522 if remote: | |
523 raise ValueError( | |
524 "Remote BAM no longer supported (since BEDTools does not " "support it)" | |
525 ) | |
526 self.remote = remote | |
527 self._isbam = False | |
528 self._bam_header = "" | |
529 self._cmds = [] | |
530 if from_string: | |
531 if fn is None or not isinstance(fn, str): | |
532 raise ValueError("from_string=True requires a string to parse") | |
533 bed_contents = fn | |
534 fn = self._tmp() | |
535 fout = open(fn, "w") | |
536 for line in bed_contents.splitlines(): | |
537 if len(line.strip()) == 0: | |
538 continue | |
539 line = "\t".join(line.split()) + "\n" | |
540 fout.write(line) | |
541 fout.close() | |
542 | |
543 else: | |
544 # if fn is a Path object, we have to use its string representation | |
545 if isinstance(fn, pathlib.PurePath): | |
546 fn = str(fn) | |
547 | |
548 # our work is already done | |
549 if isinstance(fn, BedTool): | |
550 fn = fn.fn | |
551 | |
552 # from_string=False, so assume it's a filename | |
553 elif isinstance(fn, str): | |
554 if remote: | |
555 self._isbam = True | |
556 else: | |
557 if not os.path.exists(fn): | |
558 msg = 'File "%s" does not exist' % fn | |
559 raise FileNotFoundError(msg) | |
560 self._isbam = isBAM(fn) | |
561 | |
562 # TODO: we dont' really need this, but it's added here for | |
563 # compatibility with existing tests | |
564 if self._isbam: | |
565 header = pysam.Samfile(fn).header.to_dict() | |
566 # For example: | |
567 # { | |
568 # 'HD': {'VN': '1.0', 'SO': 'coordinate'}, | |
569 # 'SQ': [ | |
570 # {'LN': 23011544, | |
571 # 'SN': 'chr2L'}, | |
572 # {'LN': 21146708, | |
573 # 'SN': 'chr2R'}, | |
574 # {'LN': 24543557, | |
575 # 'SN': 'chr3L'}, | |
576 # {'LN': 27905053, | |
577 # 'SN': 'chr3R'}, | |
578 # {'LN': 1351857, | |
579 # 'SN': 'chr4'}, | |
580 # {'LN': 22422827, | |
581 # 'SN': 'chrX'} | |
582 # ] | |
583 # } | |
584 | |
585 txt_header = [] | |
586 for k, v in header.items(): | |
587 if isinstance(v, list): | |
588 for i in v: | |
589 if isinstance(i, dict): | |
590 txt_header.append( | |
591 "\t".join( | |
592 ["@" + k] | |
593 + [ | |
594 ":".join(map(str, j)) | |
595 for j in sorted(i.items(), reverse=True) | |
596 ] | |
597 ) | |
598 ) | |
599 elif isinstance(i, str): | |
600 txt_header.append(i) | |
601 | |
602 elif isinstance(v, dict): | |
603 txt_header.append( | |
604 "\t".join( | |
605 ["@" + k] | |
606 + [ | |
607 ":".join(map(str, j)) | |
608 for j in sorted(v.items(), reverse=True) | |
609 ] | |
610 ) | |
611 ) | |
612 else: | |
613 raise ValueError("unhandled type in BAM header") | |
614 self._bam_header = "\n".join(txt_header) + "\n" | |
615 | |
616 # If tuple or list, then save as file first | |
617 # (fixes #73) | |
618 elif isinstance(fn, (list, tuple)): | |
619 fn = BedTool(iter(fn)).saveas().fn | |
620 | |
621 # Otherwise assume iterator, say an open file as from | |
622 # subprocess.PIPE | |
623 else: | |
624 fn = fn | |
625 | |
626 self.fn = fn | |
627 tag = "".join([random.choice(string.ascii_lowercase) for _ in range(8)]) | |
628 self._tag = tag | |
629 _tags[tag] = self | |
630 self._hascounts = False | |
631 self._file_type = None | |
632 self.seqfn = None | |
633 self.fastq = None | |
634 self.igv_script = None | |
635 self.links_html = None | |
636 self.history = History() | |
637 | |
638 @classmethod | |
639 def from_dataframe( | |
640 cls, | |
641 df: pd.DataFrame, | |
642 outfile: Optional[str] =None, | |
643 sep: str ="\t", | |
644 header: bool =False, | |
645 na_rep:str =".", | |
646 index:bool =False, | |
647 **kwargs | |
648 ) -> BedTool: | |
649 """ | |
650 Creates a BedTool from a pandas.DataFrame. | |
651 | |
652 If `outfile` is None, a temporary file will be used. Otherwise it can | |
653 be a specific filename or an open file handle. Additional kwargs will | |
654 be passed to `pandas.DataFrame.to_csv`. | |
655 | |
656 The fields of the resulting BedTool will match the order of columns in | |
657 the dataframe. | |
658 """ | |
659 try: | |
660 import pandas | |
661 except ImportError: | |
662 raise ImportError("pandas must be installed to use dataframes") | |
663 if outfile is None: | |
664 outfile = cls._tmp() | |
665 default_kwargs = dict(sep=sep, header=header, na_rep=na_rep, index=index) | |
666 default_kwargs.update(kwargs) | |
667 df.to_csv(outfile, **default_kwargs) | |
668 | |
669 if isinstance(outfile, str): | |
670 fn = outfile | |
671 else: | |
672 try: | |
673 fn = outfile.name | |
674 except AttributeError: | |
675 raise ValueError( | |
676 "`outfile` is not a string and doesn't have a `name` attribute. " | |
677 "Unable to determine filename." | |
678 ) | |
679 return BedTool(fn) | |
680 | |
681 def split(self, func: Callable, *args, **kwargs) -> BedTool: | |
682 """ | |
683 Split each feature using a user-defined function. | |
684 | |
685 Calls the provided function `func` with each interval. In contrast to | |
686 `each` (which does something similar), this method expects `func` to | |
687 return an *iterable* of Interval objects. | |
688 | |
689 args and kwargs are passed directly to `func`. | |
690 | |
691 Returns a new BedTool. | |
692 """ | |
693 | |
694 def generator(): | |
695 for orig_interval in self: | |
696 for interval in func(orig_interval, *args, **kwargs): | |
697 yield interval | |
698 | |
699 return BedTool(generator()) | |
700 | |
701 def truncate_to_chrom(self, genome: str | dict) -> BedTool: | |
702 """ | |
703 Ensure all features fall within chromosome limits. | |
704 | |
705 Some peak-callers extend peaks such that the boundaries overstep | |
706 chromosome coordinates. Upon uploading such a file to a genome browser | |
707 like UCSC, this results in an error like:: | |
708 | |
709 Error line 101 of custom track: chromEnd larger than chrom chr2 | |
710 size | |
711 | |
712 Use this method to clean your file, truncating any out-of-bounds | |
713 features to fit within the chromosome coordinates of `genome`. | |
714 | |
715 `genome` can be either an assembly name ('dm3') or a dictionary where | |
716 keys are chrom and values are (start, stop) tuples. | |
717 """ | |
718 if isinstance(genome, dict): | |
719 chromdict = genome | |
720 else: | |
721 assert isinstance(genome, str) | |
722 chromdict = helpers.chromsizes(genome) | |
723 | |
724 tmp = self._tmp() | |
725 with open(tmp, "w") as fout: | |
726 for chrom, coords in list(chromdict.items()): | |
727 start, stop = coords | |
728 start = str(start) | |
729 stop = str(stop) | |
730 fout.write("\t".join([chrom, start, stop]) + "\n") | |
731 return self.intersect(tmp) | |
732 | |
733 def tabix_intervals(self, interval_or_string: Interval | str, check_coordinates: bool=False) -> BedTool: | |
734 """ | |
735 Retrieve all intervals within coordinates from a "tabixed" BedTool. | |
736 | |
737 Given either a string in "chrom:start-stop" format, or an interval-like | |
738 object with chrom, start, stop attributes, return a *streaming* BedTool | |
739 of the features in this BedTool that overlap the provided interval. | |
740 | |
741 If the coordinates are invalid, an empty generator is returned unless | |
742 `check_coordinates=True` in which case a ValueError will be raised. | |
743 """ | |
744 if not self._tabixed(): | |
745 raise ValueError( | |
746 "This BedTool has not been indexed for tabix " | |
747 "-- please use the .tabix() method" | |
748 ) | |
749 | |
750 # tabix expects 1-based coords, but BEDTools works with | |
751 # zero-based. pybedtools and pysam also work with zero-based. So we can | |
752 # pass zero-based directly to the pysam tabix interface. | |
753 tbx = pysam.TabixFile(self.fn) | |
754 | |
755 # If an interval is passed, use its coordinates directly | |
756 if isinstance(interval_or_string, Interval): | |
757 interval: Interval = interval_or_string | |
758 chrom, start, end = interval.chrom, interval.start, interval.stop | |
759 # Parse string directly instead of relying on Interval, in order to | |
760 # permit full chromosome fetching | |
761 else: | |
762 match = helpers.coord_re.search(interval_or_string) | |
763 # Assume string is contig if it doesn't fit chrom:start-end format | |
764 if match is None: | |
765 chrom = interval_or_string | |
766 start, end = None, None | |
767 # Otherwise parse the coordinates | |
768 else: | |
769 chrom, start, end = match.group(1, 2, 3) | |
770 start, end = int(start), int(end) | |
771 | |
772 # Fetch results. | |
773 try: | |
774 results = tbx.fetch(str(chrom), start, end) | |
775 except ValueError: | |
776 if check_coordinates: | |
777 raise | |
778 else: | |
779 results = [] | |
780 | |
781 # pysam.ctabix.TabixIterator does not include newlines when yielding so | |
782 # we need to add them. | |
783 def gen(): | |
784 for i in results: | |
785 yield i + "\n" | |
786 | |
787 # xref #190 | |
788 x = BedTool(gen()).saveas() | |
789 tbx.close() | |
790 return x | |
791 | |
792 def tabix_contigs(self): | |
793 """ | |
794 Returns a list of contigs from the tabix index. | |
795 """ | |
796 if not self._tabixed(): | |
797 raise ValueError( | |
798 "This BedTool has not been indexed for tabix " | |
799 "-- please use the .tabix() method" | |
800 ) | |
801 | |
802 tbx = pysam.TabixFile(self.fn) | |
803 return tbx.contigs | |
804 | |
805 def tabix(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> BedTool: | |
806 """ | |
807 Prepare a BedTool for use with Tabix. | |
808 | |
809 Returns a new BedTool that has been BGZIP compressed | |
810 and indexed by tabix. | |
811 | |
812 Parameters | |
813 ---------- | |
814 | |
815 in_place : bool | |
816 If True (default), then assume the file is already sorted and | |
817 replace the existing file with the BGZIPed version. | |
818 | |
819 force : bool | |
820 If True (default is False), then overwrite both the index and the | |
821 BGZIP file. | |
822 | |
823 is_sorted : bool | |
824 If True (default is False), then assume the file is already sorted | |
825 so that BedTool.bgzip() doesn't have to do that work. | |
826 """ | |
827 # Return quickly if nothing to do | |
828 if self._tabixed() and not force: | |
829 return self | |
830 | |
831 # Make sure it's BGZIPed | |
832 fn = self.bgzip(in_place=in_place, force=force, is_sorted=is_sorted) | |
833 if self.file_type is not None and self.file_type not in ["bam", "empty"]: | |
834 pysam.tabix_index(fn, force=force, preset=self.file_type) # type: ignore | |
835 return BedTool(fn) | |
836 | |
837 def _tabixed(self): | |
838 """ | |
839 Verifies that we're working with a tabixed file: a string filename | |
840 pointing to a BGZIPed file with a .tbi file in the same dir. | |
841 """ | |
842 if ( | |
843 isinstance(self.fn, str) | |
844 and isBGZIP(self.fn) | |
845 and os.path.exists(self.fn + ".tbi") | |
846 ): | |
847 return True | |
848 | |
849 def bgzip(self, in_place: bool = True, force: bool = False, is_sorted: bool = False) -> str: | |
850 """ | |
851 Helper function for more control over "tabixed" BedTools. | |
852 | |
853 Checks to see if we already have a BGZIP file; if not then prepare one. | |
854 Always leaves the original file alone. You can always just make a | |
855 BedTool out of an already sorted and BGZIPed file to avoid this step. | |
856 | |
857 `in_place` will put the BGZIPed file in the same dir (possibly after | |
858 sorting to tempfile). | |
859 | |
860 If `is_sorted`, then assume the file is already sorted. Otherwise call | |
861 bedtools sort with the `-header` option. | |
862 | |
863 `force` will overwrite without asking. | |
864 """ | |
865 # It may already be BGZIPed... | |
866 if isinstance(self.fn, str) and not force: | |
867 if isBGZIP(self.fn): | |
868 return self.fn | |
869 | |
870 # If not in_place, then make a tempfile for the BGZIPed version | |
871 if not in_place: | |
872 # Get tempfile name, sorted or not | |
873 if not is_sorted: | |
874 fn = self.sort(header=True).fn | |
875 else: | |
876 fn = self._tmp() | |
877 | |
878 # Register for later deletion | |
879 outfn = fn + ".gz" | |
880 BedTool.TEMPFILES.append(outfn) | |
881 | |
882 # Creates tempfile.gz | |
883 pysam.tabix_compress(fn, outfn, force=force) | |
884 return outfn | |
885 | |
886 # Otherwise, make sure the BGZIPed version has a similar name to the | |
887 # current BedTool's file | |
888 if in_place: | |
889 if not is_sorted: | |
890 fn = self.sort(header=True).saveas().fn | |
891 else: | |
892 fn = self.fn | |
893 outfn = self.fn + ".gz" | |
894 pysam.tabix_compress(fn, outfn, force=force) | |
895 return outfn | |
896 | |
897 def delete_temporary_history(self, ask: bool = True, raw_input_func=None): | |
898 """ | |
899 Use at your own risk! This method will delete temp files. You will be | |
900 prompted for deletion of files unless you specify *ask=False*. | |
901 | |
902 Deletes all temporary files created during the history of this BedTool | |
903 up to but not including the file this current BedTool points to. | |
904 | |
905 Any filenames that are in the history and have the following pattern | |
906 will be deleted:: | |
907 | |
908 <TEMP_DIR>/pybedtools.*.tmp | |
909 | |
910 (where <TEMP_DIR> is the result from get_tempdir() and is by default | |
911 "/tmp") | |
912 | |
913 Any files that don't have this format will be left alone. | |
914 | |
915 (*raw_input_func* is used for testing) | |
916 """ | |
917 flattened_history = _flatten_list(self.history) | |
918 to_delete = [] | |
919 tempdir = get_tempdir() | |
920 for i in flattened_history: | |
921 fn = i.fn | |
922 if fn.startswith(os.path.join(os.path.abspath(tempdir), "pybedtools")): | |
923 if fn.endswith(".tmp"): | |
924 to_delete.append(fn) | |
925 | |
926 if raw_input_func is None: | |
927 raw_input_func = input | |
928 | |
929 str_fns = "\n\t".join(to_delete) | |
930 if ask: | |
931 answer = raw_input_func("Delete these files?\n\t%s\n(y/N) " % str_fns) | |
932 | |
933 if not answer.lower()[0] == "y": | |
934 print("OK, not deleting.") | |
935 return | |
936 for fn in to_delete: | |
937 os.unlink(fn) | |
938 return | |
939 | |
940 def filter(self, func: Callable, *args, **kwargs) -> BedTool: | |
941 """ | |
942 Filter features by user-defined function. | |
943 | |
944 Takes a function *func* that is called for each feature in the | |
945 `BedTool` object and returns only those for which the function returns | |
946 True. | |
947 | |
948 *args and **kwargs are passed directly to *func*. | |
949 | |
950 Returns a streaming BedTool; if you want the filename then use the | |
951 .saveas() method. | |
952 | |
953 >>> a = pybedtools.example_bedtool('a.bed') | |
954 >>> subset = a.filter(lambda b: b.chrom == 'chr1' and b.start < 150) | |
955 >>> len(a), len(subset) | |
956 (4, 2) | |
957 | |
958 so it has extracted 2 records from the original 4. | |
959 | |
960 """ | |
961 return BedTool((f for f in self if func(f, *args, **kwargs))) | |
962 | |
963 def field_count(self, n:int=10) -> int: | |
964 """ | |
965 Number of fields in each line of this BedTool (checks `n` lines) | |
966 | |
967 Return the number of fields in the features this file contains. Checks | |
968 the first *n* features. | |
969 | |
970 >>> a = pybedtools.example_bedtool('a.bed') | |
971 >>> a.field_count() | |
972 6 | |
973 """ | |
974 if self.file_type == "empty": | |
975 return 0 | |
976 i = 0 | |
977 fields = set([]) | |
978 for feat in self: | |
979 if i > n: | |
980 break | |
981 i += 1 | |
982 # TODO: make this more efficient. | |
983 fields.update([len(feat.fields)]) | |
984 assert len(fields) == 1, fields | |
985 return list(fields)[0] | |
986 | |
987 def each(self, func: Callable, *args, **kwargs) -> BedTool: | |
988 """ | |
989 Modify each feature with a user-defined function. | |
990 | |
991 Applies user-defined function *func* to each feature. *func* must | |
992 accept an Interval as its first argument; *args and **kwargs will be | |
993 passed to *func*. | |
994 | |
995 *func* must return an Interval object OR a value that evaluates to | |
996 False, in which case the original feature will be removed from the | |
997 output. This way, an additional "filter" call is not necessary. | |
998 | |
999 >>> def truncate_feature(feature, limit=0): | |
1000 ... feature.score = str(len(feature)) | |
1001 ... if len(feature) > limit: | |
1002 ... feature.stop = feature.start + limit | |
1003 ... feature.name = feature.name + '.short' | |
1004 ... return feature | |
1005 | |
1006 >>> a = pybedtools.example_bedtool('a.bed') | |
1007 >>> b = a.each(truncate_feature, limit=100) | |
1008 >>> print(b) #doctest: +NORMALIZE_WHITESPACE | |
1009 chr1 1 100 feature1 99 + | |
1010 chr1 100 200 feature2 100 + | |
1011 chr1 150 250 feature3.short 350 - | |
1012 chr1 900 950 feature4 50 + | |
1013 <BLANKLINE> | |
1014 | |
1015 """ | |
1016 | |
1017 def _generator(): | |
1018 for f in self: | |
1019 result = func(f, *args, **kwargs) | |
1020 if result: | |
1021 yield result | |
1022 | |
1023 return BedTool(_generator()) | |
1024 | |
1025 def introns(self, gene: str = "gene", exon: str = "exon") -> BedTool: | |
1026 """ | |
1027 Create intron features (requires specific input format). | |
1028 | |
1029 NOTE: this method assumes a simple file with non-overlapping exons. For | |
1030 more sophisticated features, consider the gffutils package instead. | |
1031 | |
1032 Given a BED12 or a GFF with exons, create a new `BedTool` with just | |
1033 introns. The output is a bed6 file with the score column (5) being one | |
1034 of 'intron'/'utr5'/'utr3' | |
1035 """ | |
1036 # iterate over all the features in the gene. | |
1037 s = self.sort() | |
1038 if self.file_type == "gff": | |
1039 exon_iter = BedTool((f for f in s if f[2] == exon)).saveas() | |
1040 gene_iter = BedTool((f for f in s if f[2] == gene)).saveas() | |
1041 | |
1042 elif self.file_type == "bed": | |
1043 if s.field_count() == 12: | |
1044 exon_iter = s.bed6().saveas() | |
1045 gene_iter = s.saveas() | |
1046 else: | |
1047 # TODO: bed6. groupby on name and find smallest start, | |
1048 # largest stop. | |
1049 exon_iter = s | |
1050 gene_iter = None | |
1051 raise NotImplementedError( | |
1052 ".introns() only supported for bed12" "and GFF" | |
1053 ) | |
1054 | |
1055 else: | |
1056 raise NotImplementedError(".introns() only supported for BED and GFF") | |
1057 | |
1058 with open(BedTool._tmp(), "w") as fh: | |
1059 # group on the name. | |
1060 exon_intervals = IntervalFile(exon_iter.fn) | |
1061 for g in gene_iter: | |
1062 # search finds all, but we just want the ones that completely | |
1063 # overlap this gene. | |
1064 exons = [ | |
1065 e | |
1066 for e in exon_intervals.search(g, same_strand=True) | |
1067 if e.start >= g.start and e.end <= g.end | |
1068 ] | |
1069 | |
1070 for i, exon_instance in enumerate(exons): | |
1071 exon_instance: pybedtools.Interval | |
1072 # 5' utr between gene start and first intron | |
1073 if i == 0 and exon_instance.start > g.start: | |
1074 utr = {"+": "utr5", "-": "utr3"}[g.strand] | |
1075 print( | |
1076 "%s\t%i\t%i\t%s\t%s\t%s" | |
1077 % (g.chrom, g.start, exon_instance.start, g.name, utr, g.strand), | |
1078 file=fh, | |
1079 ) | |
1080 elif i == len(exons) - 1 and exon_instance.end < g.end: | |
1081 utr = {"+": "utr3", "-": "utr5"}[g.strand] | |
1082 print( | |
1083 "%s\t%i\t%i\t%s\t%s\t%s" | |
1084 % (g.chrom, exon_instance.end, g.end, g.name, utr, g.strand), | |
1085 file=fh, | |
1086 ) | |
1087 elif i != len(exons) - 1: | |
1088 istart = exon_instance.end | |
1089 iend = exons[i + 1].start | |
1090 print( | |
1091 "%s\t%i\t%i\t%s\tintron\t%s" | |
1092 % (g.chrom, istart, iend, g.name, g.strand), | |
1093 file=fh, | |
1094 ) | |
1095 return BedTool(fh.name) | |
1096 | |
1097 def features(self): | |
1098 """ | |
1099 Returns an iterable of features | |
1100 """ | |
1101 if hasattr(self, "next") or hasattr(self, "__next__"): | |
1102 return self | |
1103 return iter(self) | |
1104 | |
1105 FileType = Literal['bed', 'vcf', 'gff', 'bam', 'sam', 'empty'] | |
1106 | |
1107 @property | |
1108 def file_type(self) -> Optional[FileType]: | |
1109 """ | |
1110 Return the type of the current file. One of ('bed','vcf','gff', 'bam', | |
1111 'sam', 'empty'). | |
1112 | |
1113 >>> a = pybedtools.example_bedtool('a.bed') | |
1114 >>> print(a.file_type) | |
1115 bed | |
1116 """ | |
1117 if not isinstance(self.fn, str): | |
1118 raise ValueError( | |
1119 "Checking file_type not supported for " | |
1120 "non-file BedTools. Use .saveas() to " | |
1121 "save as a temp file first." | |
1122 ) | |
1123 if self._isbam: | |
1124 self._file_type = "bam" | |
1125 else: | |
1126 try: | |
1127 self._file_type = next(iter(self)).file_type | |
1128 except StopIteration: | |
1129 self._file_type = "empty" | |
1130 | |
1131 return self._file_type | |
1132 | |
1133 def cut(self, indexes: list[int], stream: bool = False) -> BedTool: | |
1134 """ | |
1135 Analogous to unix `cut`. | |
1136 | |
1137 Similar to unix `cut` except indexes are 0-based, must be a list | |
1138 and the columns are returned in the order requested. | |
1139 | |
1140 This method returns a BedTool of results, which means that the indexes | |
1141 returned must be valid GFF/GTF/BED/SAM features. | |
1142 | |
1143 If you would like arbitrary columns -- say, just chrom and featuretype | |
1144 of a GFF, which would not comprise a valid feature -- then instead of | |
1145 this method, simply use indexes on each feature, e.g, | |
1146 | |
1147 >>> gff = pybedtools.example_bedtool('d.gff') | |
1148 >>> results = [(f[0], f[2]) for f in gff] | |
1149 | |
1150 In addition, `indexes` can contain keys of the GFF/GTF attributes, in | |
1151 which case the values are returned. e.g. 'gene_name' will return the | |
1152 corresponding name from a GTF, or 'start' will return the start | |
1153 attribute of a BED Interval. | |
1154 """ | |
1155 if stream: | |
1156 return BedTool(([f[attr] for attr in indexes] for f in self)) | |
1157 else: | |
1158 with open(self._tmp(), "w") as fh: | |
1159 for f in self: | |
1160 print("\t".join(map(str, [f[attr] for attr in indexes])), file=fh) | |
1161 return BedTool(fh.name) | |
1162 | |
1163 @classmethod | |
1164 def _tmp(cls) -> str: | |
1165 """ | |
1166 Makes a tempfile and registers it in the BedTool.TEMPFILES class | |
1167 variable. Adds a "pybedtools." prefix and ".tmp" extension for easy | |
1168 deletion if you forget to call pybedtools.cleanup(). | |
1169 """ | |
1170 tmpfn = tempfile.NamedTemporaryFile( | |
1171 prefix=settings.tempfile_prefix, | |
1172 suffix=settings.tempfile_suffix, | |
1173 delete=False, | |
1174 ) | |
1175 tmpfn = tmpfn.name | |
1176 cls.TEMPFILES.append(tmpfn) | |
1177 return tmpfn | |
1178 | |
1179 def __iter__(self): | |
1180 """ | |
1181 Dispatches the right iterator depending on how this BedTool was | |
1182 created | |
1183 """ | |
1184 if self._isbam: | |
1185 # Note: BAM class takes filename or stream, so self.fn is OK | |
1186 # here | |
1187 return BAM(self.fn) | |
1188 | |
1189 # Plain ol' filename | |
1190 if isinstance(self.fn, str): | |
1191 if not os.path.exists(self.fn): | |
1192 raise BedToolsFileError("{0} does not exist".format(self.fn)) | |
1193 if isGZIP(self.fn): | |
1194 return IntervalIterator(gzip.open(self.fn, "rt")) | |
1195 else: | |
1196 return IntervalIterator(open(self.fn, "r")) | |
1197 # Any other kind of input (streaming string from stdout; iterable of | |
1198 # Intervals, iterable of (chrom, start, stop) tuples, etc are handled | |
1199 # appropriately by IntervalIterator. | |
1200 else: | |
1201 return IntervalIterator(self.fn) | |
1202 | |
1203 @property | |
1204 def intervals(self): | |
1205 if isinstance(self.fn, str): | |
1206 return IntervalFile(self.fn) | |
1207 else: | |
1208 raise ValueError("Please convert to a file-based BedTool using saveas") | |
1209 | |
1210 def __repr__(self): | |
1211 if isinstance(self.fn, str): | |
1212 if os.path.exists(self.fn) or self.remote: | |
1213 return "<BedTool(%s)>" % self.fn | |
1214 else: | |
1215 return "<BedTool(MISSING FILE: %s)>" % self.fn | |
1216 elif isinstance(self.fn, BedTool): | |
1217 return repr(self.fn) | |
1218 else: | |
1219 return "<BedTool(%s)>" % repr(self.fn) | |
1220 | |
1221 def __str__(self): | |
1222 """ | |
1223 Returns the string representation of the whole `BedTool` | |
1224 """ | |
1225 items = [] | |
1226 for i in iter(self): | |
1227 i = str(i) | |
1228 if isinstance(i, bytes): | |
1229 i = i.decode("UTF-8") | |
1230 items.append(i) | |
1231 return "".join(items) | |
1232 | |
1233 def __len__(self): | |
1234 return self.count() | |
1235 | |
1236 def __eq__(self, other: object) -> bool: | |
1237 if isinstance(other, BedTool): | |
1238 if not isinstance(self.fn, str) or not isinstance( | |
1239 other.fn, str | |
1240 ): | |
1241 raise NotImplementedError( | |
1242 "Testing equality only supported for" | |
1243 " BedTools that point to files" | |
1244 ) | |
1245 elif not isinstance(other, str): | |
1246 raise NotImplementedError( | |
1247 "Testing equality only supported for" | |
1248 " BedTools that point to files or str of content" | |
1249 ) | |
1250 return str(self) == str(other) | |
1251 | |
1252 def __ne__(self, other:object): | |
1253 return not self.__eq__(other) | |
1254 | |
1255 def __getitem__(self, key: slice|int): | |
1256 if isinstance(key, slice): | |
1257 return islice(self, key.start, key.stop, key.step) | |
1258 elif isinstance(key, int): | |
1259 return list(islice(self, key, key + 1))[0] | |
1260 else: | |
1261 raise ValueError( | |
1262 "Only slices or integers allowed for indexing " "into a BedTool" | |
1263 ) | |
1264 | |
1265 def __add__(self, other: BedTool) -> BedTool: | |
1266 try: | |
1267 result = self.intersect(other, u=True) | |
1268 except BEDToolsError as e: | |
1269 # BEDTools versions <2.20 would raise BEDToolsError | |
1270 if (self.file_type == "empty") or (other.file_type == "empty"): | |
1271 result = pybedtools.BedTool("", from_string=True) | |
1272 else: | |
1273 raise e | |
1274 return result | |
1275 | |
1276 def __sub__(self, other: BedTool) -> BedTool: | |
1277 result = None | |
1278 | |
1279 try: | |
1280 result = self.intersect(other, v=True) | |
1281 except BEDToolsError: | |
1282 # BEDTools versions <2.20 would raise BEDToolsError | |
1283 | |
1284 if (self.file_type == "empty") and (other.file_type == "empty"): | |
1285 result = pybedtools.BedTool("", from_string=True) | |
1286 elif other.file_type == "empty": | |
1287 result = self.saveas() | |
1288 elif self.file_type == "empty": | |
1289 result = pybedtools.BedTool("", from_string=True) | |
1290 if result is None: | |
1291 raise ValueError("Subtraction operation failed.") | |
1292 | |
1293 return result | |
1294 | |
1295 def head(self, n: int = 10, as_string: bool = False): | |
1296 """ | |
1297 Prints the first *n* lines or returns them if as_string is True | |
1298 | |
1299 Note that this only opens the underlying file (gzipped or not), so it | |
1300 does not check to see if the file is a valid BED file. | |
1301 | |
1302 >>> a = pybedtools.example_bedtool('a.bed') | |
1303 >>> a.head(2) #doctest: +NORMALIZE_WHITESPACE | |
1304 chr1 1 100 feature1 0 + | |
1305 chr1 100 200 feature2 0 + | |
1306 <BLANKLINE> | |
1307 | |
1308 """ | |
1309 if not isinstance(self.fn, str): | |
1310 raise NotImplementedError( | |
1311 "head() not supported for non file-based BedTools" | |
1312 ) | |
1313 if as_string: | |
1314 return "".join(str(line) for line in self[:n]) | |
1315 if self._isbam: | |
1316 raise NotImplementedError("head() not supported for BAM") | |
1317 else: | |
1318 if isGZIP(self.fn): | |
1319 openfunc = gzip.open | |
1320 openmode = "rt" | |
1321 else: | |
1322 openfunc = open | |
1323 openmode = "r" | |
1324 with openfunc(self.fn, openmode) as fin: | |
1325 for i, line in enumerate(fin): | |
1326 if i == (n): | |
1327 break | |
1328 print(line, end=" ") | |
1329 | |
1330 def set_chromsizes(self, chromsizes: str | dict): | |
1331 """ | |
1332 Prepare BedTool for operations that require chromosome coords. | |
1333 | |
1334 Set the chromsizes for this genome. If *chromsizes* is a string, it | |
1335 will be considered a genome assembly name. If that assembly name is | |
1336 not available in pybedtools.genome_registry, then it will be searched | |
1337 for on the UCSC Genome Browser. | |
1338 | |
1339 Example usage: | |
1340 | |
1341 >>> hg19 = pybedtools.chromsizes('hg19') | |
1342 >>> a = pybedtools.example_bedtool('a.bed') | |
1343 >>> a = a.set_chromsizes(hg19) | |
1344 >>> print(a.chromsizes['chr1']) | |
1345 (0, 249250621) | |
1346 | |
1347 """ | |
1348 if isinstance(chromsizes, str): | |
1349 self.chromsizes = pybedtools.chromsizes(chromsizes) | |
1350 elif isinstance(chromsizes, dict): | |
1351 self.chromsizes = chromsizes | |
1352 else: | |
1353 raise ValueError( | |
1354 "Need to specify chromsizes either as a string" | |
1355 " (assembly name) or a dictionary" | |
1356 ) | |
1357 return self | |
1358 | |
1359 def _collapse( | |
1360 self, | |
1361 iterable: Iterable, | |
1362 fn: Optional[str] = None, | |
1363 trackline: Optional[str] = None, | |
1364 in_compressed: bool = False, | |
1365 out_compressed: bool = False, | |
1366 ) -> str: | |
1367 """ | |
1368 Collapses an iterable into file *fn* (or a new tempfile if *fn* is | |
1369 None). | |
1370 | |
1371 Returns the newly created filename. | |
1372 | |
1373 Parameters | |
1374 ---------- | |
1375 | |
1376 iterable : iter | |
1377 Any iterable object whose items can be converted to an Interval. | |
1378 | |
1379 fn : str | |
1380 Output filename, if None then creates a temp file for output | |
1381 | |
1382 trackline : str | |
1383 If not None, string to be added to the top of the output. Newline | |
1384 will be added. | |
1385 | |
1386 in_compressed : bool | |
1387 Indicates whether the input is compressed | |
1388 | |
1389 out_compressed : bool | |
1390 Indicates whether the output should be compressed | |
1391 """ | |
1392 if fn is None: | |
1393 fn = self._tmp() | |
1394 | |
1395 in_open_func = gzip.open if in_compressed else open | |
1396 out_open_func = gzip.open if out_compressed else open | |
1397 | |
1398 # special case: if BAM-format BedTool is provided, no trackline should | |
1399 # be supplied, and don't iterate -- copy the file wholesale | |
1400 if isinstance(iterable, BedTool) and iterable._isbam: | |
1401 if trackline: | |
1402 raise ValueError( | |
1403 "trackline provided, but input is a BAM " | |
1404 "file, which takes no track line" | |
1405 ) | |
1406 with open(fn, "wb") as out_: | |
1407 out_.write(open(self.fn, "rb").read()) | |
1408 return fn | |
1409 | |
1410 # If we're just working with filename-based BedTool objects, just copy | |
1411 # the files directly | |
1412 if isinstance(iterable, BedTool) and isinstance(iterable.fn, str): | |
1413 with out_open_func(fn, "wt") as out_: | |
1414 if sys.version_info > (3,0): | |
1415 in_ = in_open_func(iterable.fn, "rt", errors="ignore") | |
1416 else: | |
1417 in_ = in_open_func(iterable.fn, "rt") | |
1418 if trackline: | |
1419 out_.write(trackline.strip() + "\n") | |
1420 out_.writelines(in_) | |
1421 in_.close() | |
1422 else: | |
1423 with out_open_func(fn, "wt") as out_: | |
1424 for i in iterable: | |
1425 if isinstance(i, (list, tuple)): | |
1426 i = create_interval_from_list(list(i)) | |
1427 out_.write(str(i)) | |
1428 return fn | |
1429 | |
1430 def handle_kwargs(self, prog:str, arg_order: Optional[list[str]] = None, **kwargs): | |
1431 """ | |
1432 Handle most cases of BEDTool program calls, but leave the specifics | |
1433 up to individual methods. | |
1434 | |
1435 *prog* is a BEDTools program name, e.g., 'intersectBed'. | |
1436 | |
1437 *arg_order* lists any arguments that are sensitive to order. Everything | |
1438 else will be reverse-sorted. | |
1439 | |
1440 *kwargs* are passed directly from the calling method (like | |
1441 self.intersect). | |
1442 | |
1443 This method figures out, given how this BedTool was constructed, what | |
1444 to send to BEDTools programs -- for example, an open file to stdin with | |
1445 the `-` argument, or a filename with the `-a` argument. | |
1446 """ | |
1447 pybedtools.logger.debug( | |
1448 "BedTool.handle_kwargs() got these kwargs:\n%s", pprint.pformat(kwargs) | |
1449 ) | |
1450 | |
1451 # If you pass in a list, how should it be converted to a BedTools arg? | |
1452 default_list_delimiter = " " | |
1453 list_delimiters = { | |
1454 "annotateBed": " ", | |
1455 "getOverlap": ",", | |
1456 "groupBy": ",", | |
1457 "multiIntersectBed": " ", | |
1458 "mergeBed": ",", | |
1459 "intersectBed": " ", | |
1460 "mapBed": ",", | |
1461 } | |
1462 stdin = None | |
1463 | |
1464 # ----------------------------------------------------------------- | |
1465 # Decide how to send instream1 to BEDTools. If there's no implicit | |
1466 # instream1 arg, then do nothing. | |
1467 # | |
1468 try: | |
1469 # e.g., 'a' for intersectBed | |
1470 if self._isbam: | |
1471 inarg1 = _bam_registry[prog] | |
1472 else: | |
1473 inarg1 = _implicit_registry[prog] | |
1474 | |
1475 # e.g., self.fn or 'a.bed' or an iterator... | |
1476 instream1 = kwargs[inarg1] | |
1477 | |
1478 # If it's a BedTool, then get underlying stream | |
1479 if isinstance(instream1, BedTool): | |
1480 instream1 = instream1.fn | |
1481 | |
1482 # Filename? No pipe, just provide the file | |
1483 if isinstance(instream1, str): | |
1484 kwargs[inarg1] = instream1 | |
1485 stdin = None | |
1486 | |
1487 # Open file? Pipe it | |
1488 # elif isinstance(instream1, file): | |
1489 # kwargs[inarg1] = 'stdin' | |
1490 # stdin = instream1 | |
1491 | |
1492 # A generator or iterator: pipe it as a generator of lines | |
1493 else: | |
1494 kwargs[inarg1] = "stdin" | |
1495 stdin = (str(i) for i in instream1) | |
1496 except KeyError: | |
1497 pass | |
1498 | |
1499 # ----------------------------------------------------------------- | |
1500 # Decide how to send instream2 to BEDTools. | |
1501 try: | |
1502 # e.g., 'b' for intersectBed | |
1503 inarg2 = _other_registry[prog] | |
1504 | |
1505 # e.g., another BedTool | |
1506 instream2 = kwargs[inarg2] | |
1507 | |
1508 # Get stream if BedTool | |
1509 if isinstance(instream2, BedTool): | |
1510 instream2 = instream2.fn | |
1511 | |
1512 # Filename | |
1513 if isinstance(instream2, str): | |
1514 kwargs[inarg2] = instream2 | |
1515 | |
1516 # If it's a list of strings, then we need to figure out if it's | |
1517 # a list of filenames or a list of intervals (see issue #156) | |
1518 # | |
1519 # Several options: | |
1520 # | |
1521 # - assume intervals have tabs but filenames don't | |
1522 # - assume that, upon being split on tabs, an interval is >=3 fields | |
1523 # - try creating an interval out of the first thing, success means interval | |
1524 # | |
1525 # The last seems the most robust. It does allow filenames with | |
1526 # tabs; deciding whether or not such filenames are a good idea is | |
1527 # left to the user. | |
1528 # | |
1529 elif isinstance(instream2, (list, tuple)) and isinstance( | |
1530 instream2[0], str | |
1531 ): | |
1532 try: | |
1533 _ = create_interval_from_list(instream2[0].split("\t")) | |
1534 kwargs[inarg2] = self._collapse(instream2) | |
1535 except IndexError: | |
1536 kwargs[inarg2] = instream2 | |
1537 | |
1538 # Otherwise we need to collapse it in order to send to BEDTools | |
1539 # programs | |
1540 else: | |
1541 kwargs[inarg2] = self._collapse(instream2) | |
1542 | |
1543 except KeyError: | |
1544 pass | |
1545 | |
1546 # If stream not specified, then a tempfile will be created | |
1547 if kwargs.pop("stream", None): | |
1548 tmp = None | |
1549 else: | |
1550 output = kwargs.pop("output", None) | |
1551 if output: | |
1552 tmp = output | |
1553 else: | |
1554 tmp = self._tmp() | |
1555 | |
1556 additional_args = kwargs.pop("additional_args", None) | |
1557 | |
1558 # Parse the kwargs into BEDTools-ready args | |
1559 cmds = [prog] | |
1560 | |
1561 # arg_order mechanism added to fix #345 | |
1562 if arg_order is None: | |
1563 arg_order = [] | |
1564 | |
1565 for arg in arg_order: | |
1566 if arg in kwargs: | |
1567 val = kwargs.pop(arg) | |
1568 cmds.append("-" + arg) | |
1569 cmds.append(val) | |
1570 | |
1571 # The reverse-sort is a temp fix for issue #81 | |
1572 for key, value in sorted(list(kwargs.items()), reverse=True): | |
1573 if isinstance(value, bool): | |
1574 if value: | |
1575 cmds.append("-" + key) | |
1576 else: | |
1577 continue | |
1578 elif isinstance(value, list) or isinstance(value, tuple): | |
1579 value = list(map(str, value)) | |
1580 try: | |
1581 delim = list_delimiters[prog] | |
1582 except KeyError: | |
1583 delim = default_list_delimiter | |
1584 | |
1585 if delim == " ": | |
1586 cmds.append("-" + key) | |
1587 cmds.extend(value) | |
1588 | |
1589 # make comma-separated list if that's what's needed | |
1590 else: | |
1591 cmds.append("-" + key) | |
1592 cmds.append(delim.join(value)) | |
1593 | |
1594 else: | |
1595 cmds.append("-" + key) | |
1596 cmds.append(str(value)) | |
1597 | |
1598 if additional_args: | |
1599 cmds.append(additional_args) | |
1600 | |
1601 return cmds, tmp, stdin | |
1602 | |
1603 def check_genome(self, **kwargs): | |
1604 """ | |
1605 Handles the different ways of specifying a genome in kwargs: | |
1606 | |
1607 g='genome.file' specifies a file directly | |
1608 genome='dm3' gets the file from genome registry | |
1609 self.chromsizes could be a dict.\ | |
1610 """ | |
1611 | |
1612 # If both g and genome are missing, assume self.chromsizes | |
1613 if ("g" not in kwargs) and ("genome" not in kwargs): | |
1614 if hasattr(self, "chromsizes"): | |
1615 kwargs["g"] = self.chromsizes | |
1616 else: | |
1617 raise ValueError( | |
1618 'No genome specified. Use the "g" or ' | |
1619 '"genome" kwargs, or use the ' | |
1620 ".set_chromsizes() method" | |
1621 ) | |
1622 | |
1623 # If both specified, rather than make an implicit decision, raise an | |
1624 # exception | |
1625 if "g" in kwargs and "genome" in kwargs: | |
1626 raise ValueError('Cannot specify both "g" and "genome"') | |
1627 | |
1628 # Something like genome='dm3' was specified | |
1629 if "g" not in kwargs and "genome" in kwargs: | |
1630 if isinstance(kwargs["genome"], dict): | |
1631 genome_dict = kwargs["genome"] | |
1632 else: | |
1633 genome_dict = pybedtools.chromsizes(kwargs["genome"]) | |
1634 genome_file = pybedtools.chromsizes_to_file(genome_dict) | |
1635 kwargs["g"] = genome_file | |
1636 del kwargs["genome"] | |
1637 | |
1638 # By the time we get here, 'g' is specified. | |
1639 | |
1640 # If a dict was provided, convert to tempfile here | |
1641 if isinstance(kwargs["g"], dict): | |
1642 kwargs["g"] = pybedtools.chromsizes_to_file(kwargs["g"]) | |
1643 | |
1644 if not os.path.exists(kwargs["g"]): | |
1645 msg = 'Genome file "%s" does not exist' % (kwargs["g"]) | |
1646 raise FileNotFoundError(msg) | |
1647 | |
1648 return kwargs | |
1649 | |
1650 @_log_to_history | |
1651 def remove_invalid(self): | |
1652 """ | |
1653 Remove invalid features that may break BEDTools programs. | |
1654 | |
1655 >>> a = pybedtools.BedTool("chr1 10 100\\nchr1 10 1", | |
1656 ... from_string=True) | |
1657 >>> print(a.remove_invalid()) #doctest: +NORMALIZE_WHITESPACE | |
1658 chr1 10 100 | |
1659 <BLANKLINE> | |
1660 | |
1661 """ | |
1662 tmp = self._tmp() | |
1663 | |
1664 # If it's a file-based BedTool -- which is likely, if we're trying to | |
1665 # remove invalid features -- then we need to parse it line by line. | |
1666 if isinstance(self.fn, str): | |
1667 i = IntervalIterator(open(self.fn, "r")) | |
1668 else: | |
1669 tmp = self.saveas() | |
1670 i = IntervalIterator(open(tmp.fn, "r")) | |
1671 | |
1672 def _generator(): | |
1673 while True: | |
1674 try: | |
1675 feature = next(i) | |
1676 if feature.start <= feature.stop: | |
1677 yield feature | |
1678 else: | |
1679 continue | |
1680 except pybedtools.MalformedBedLineError: | |
1681 continue | |
1682 except OverflowError: | |
1683 # This can happen if coords are negative | |
1684 continue | |
1685 except IndexError: | |
1686 continue | |
1687 except StopIteration: | |
1688 break | |
1689 | |
1690 return BedTool(_generator()) | |
1691 | |
1692 def all_hits(self, interval: Interval, same_strand: bool = False, overlap: float = 0.0): | |
1693 """ | |
1694 Return all intervals that overlap `interval`. | |
1695 | |
1696 Calls the `all_hits` method of an IntervalFile to return all intervals | |
1697 in this current BedTool that overlap `interval`. | |
1698 | |
1699 Require that overlaps have the same strand with same_strand=True. | |
1700 | |
1701 Notes: | |
1702 If this current BedTool is generator-based, it will be | |
1703 converted into a file first. | |
1704 | |
1705 If this current BedTool refers to a BAM file, it will be | |
1706 converted to a BED file first using default arguments. If you | |
1707 don't want this to happen, please convert to BED first before | |
1708 using this method. | |
1709 """ | |
1710 if not isinstance(interval, Interval): | |
1711 raise ValueError("Need an Interval instance") | |
1712 fn = self.fn | |
1713 if not isinstance(fn, str): | |
1714 fn = self.saveas().fn | |
1715 if self._isbam: | |
1716 fn = self.bam_to_bed().fn | |
1717 interval_file = pybedtools.IntervalFile(fn) | |
1718 return interval_file.all_hits(interval, same_strand, overlap) | |
1719 | |
1720 def any_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0): | |
1721 """ | |
1722 Return whether or not any intervals overlap `interval`. | |
1723 | |
1724 Calls the `any_hits` method of an IntervalFile. If there were any hits | |
1725 within `interval` in this BedTool, then return 1; otherwise 0. | |
1726 | |
1727 Require that overlaps have the same strand with same_strand=True. | |
1728 | |
1729 Notes: | |
1730 If this current BedTool is generator-based, it will be | |
1731 converted into a file first. | |
1732 | |
1733 If this current BedTool refers to a BAM file, it will be | |
1734 converted to a BED file first using default arguments. If you | |
1735 don't want this to happen, please convert to BED first before | |
1736 using this method. | |
1737 """ | |
1738 if not isinstance(interval, Interval): | |
1739 raise ValueError("Need an Interval instance") | |
1740 fn = self.fn | |
1741 if not isinstance(fn, str): | |
1742 fn = self.saveas().fn | |
1743 if self._isbam: | |
1744 fn = self.bam_to_bed().fn | |
1745 interval_file = pybedtools.IntervalFile(fn) | |
1746 return interval_file.any_hits(interval, same_strand, overlap) | |
1747 | |
1748 def count_hits(self, interval: Interval, same_strand: bool = False, overlap: float=0.0) -> int: | |
1749 """ | |
1750 Return the number of intervals that overlap `interval`. | |
1751 | |
1752 Calls the `count_hits` method of an IntervalFile. Returns the number | |
1753 of valid hits in this BedTool that overlap `interval`. | |
1754 | |
1755 Require that overlaps have the same strand with same_strand=True. | |
1756 | |
1757 Notes: | |
1758 If this current BedTool is generator-based, it will be | |
1759 converted into a file first. | |
1760 | |
1761 If this current BedTool refers to a BAM file, it will be | |
1762 converted to a BED file first using default arguments. If you | |
1763 don't want this to happen, please convert to BED first before | |
1764 using this method. | |
1765 """ | |
1766 if not isinstance(interval, Interval): | |
1767 raise ValueError("Need an Interval instance") | |
1768 fn = self.fn | |
1769 if not isinstance(fn, str): | |
1770 fn = self.saveas().fn | |
1771 if self._isbam: | |
1772 fn = self.bam_to_bed().fn | |
1773 interval_file = pybedtools.IntervalFile(fn) | |
1774 return interval_file.count_hits(interval, same_strand, overlap) | |
1775 | |
1776 @_log_to_history | |
1777 @_wraps(prog="bed12ToBed6", implicit="i", bam=None, other=None) | |
1778 def bed6(self, *args, **kwargs) -> BedTool: # type: ignore | |
1779 """ | |
1780 Wraps `bedtools bed12tobed6`. | |
1781 """ | |
1782 | |
1783 # Alias for backward compatibility | |
1784 bed12tobed6 = bed6 | |
1785 | |
1786 @_log_to_history | |
1787 @_wraps(prog="bamToBed", implicit="i", other=None, nonbam="ALL", bam="i") | |
1788 def bam_to_bed(self, *args, **kwargs) -> BedTool: # type: ignore | |
1789 """ | |
1790 Wraps `bedtools bamtobed`. | |
1791 """ | |
1792 | |
1793 # Alias for backward compatibility | |
1794 bamtobed = bam_to_bed | |
1795 | |
1796 @_wraps(prog="bedToBam", implicit="i", uses_genome=True, force_bam=True) | |
1797 def _bed_to_bam(self, *args, **kwargs): | |
1798 """ | |
1799 Wraps bedToBam and is called internally for BED/GFF/VCF files by | |
1800 self.to_bam (which needs to do something different for SAM files...) | |
1801 """ | |
1802 | |
1803 @_log_to_history | |
1804 def to_bam(self, **kwargs): | |
1805 """ | |
1806 Wraps `bedtools bedtobam` | |
1807 | |
1808 If self.fn is in BED/VCF/GFF format, call BEDTools' bedToBam. If | |
1809 self.fn is in SAM format, then create a header out of the genome file | |
1810 and then convert using `samtools`. | |
1811 """ | |
1812 if self.file_type == "bam": | |
1813 return self | |
1814 if self.file_type in ("bed", "gff", "vcf"): | |
1815 return self._bed_to_bam(**kwargs) | |
1816 | |
1817 # TODO: to maintain backwards compatibility we go from Interval to | |
1818 # AlignedSegment. | |
1819 if self.file_type == "sam": | |
1820 | |
1821 # Use pysam, but construct the header out of a provided genome | |
1822 # file. | |
1823 | |
1824 # construct a genome out of whatever kwargs were passed in | |
1825 kwargs = self.check_genome(**kwargs) | |
1826 | |
1827 # Build a header that we can use for the output BAM file. | |
1828 genome = dict(i.split() for i in open(kwargs["g"])) | |
1829 SQ = [] | |
1830 ref_ids = {} | |
1831 text_header = ["@HD\tVN:1.0"] | |
1832 | |
1833 for i, (k, v) in enumerate(genome.items()): | |
1834 SQ.append(dict(SN=k, LN=int(v))) | |
1835 ref_ids[k] = i | |
1836 text_header.append("@SQ\tSN:{0}\tLN:{1}".format(k, v)) | |
1837 | |
1838 # And the text-format header | |
1839 text_header = "\n".join(text_header) + "\n" | |
1840 | |
1841 # The strategy is to write an actual SAM file to disk, along with | |
1842 # a header, and then read that back in. | |
1843 # | |
1844 # Painfully inefficient, but this will change once all py2 tests | |
1845 # pass. | |
1846 sam_tmp = self._tmp() | |
1847 bam_tmp = self._tmp() | |
1848 with open(sam_tmp, "w") as fout: | |
1849 fout.write(text_header) | |
1850 for interval in self: | |
1851 fout.write("\t".join(map(str, interval.fields)) + "\n") | |
1852 | |
1853 samfile = pysam.AlignmentFile(sam_tmp, "r") | |
1854 bamfile = pysam.AlignmentFile(bam_tmp, "wb", template=samfile) | |
1855 for alignment in samfile: | |
1856 bamfile.write(alignment) | |
1857 | |
1858 samfile.close() | |
1859 bamfile.close() | |
1860 new_bedtool = BedTool(bam_tmp) | |
1861 new_bedtool._isbam = True | |
1862 return new_bedtool | |
1863 | |
1864 # Alias for backward compatibility | |
1865 bedtobam = to_bam | |
1866 | |
1867 @_log_to_history | |
1868 @_wraps(prog="intersectBed", implicit="a", other="b", bam="abam", | |
1869 nonbam="bed", arg_order=["a", "abam"]) | |
1870 def intersect(self, *args, **kwargs) -> BedTool: # type: ignore | |
1871 """ | |
1872 Wraps `bedtools intersect`. | |
1873 """ | |
1874 | |
1875 @_log_to_history | |
1876 @_wraps( | |
1877 prog="fastaFromBed", | |
1878 implicit="bed", | |
1879 bam=None, | |
1880 other="fi", | |
1881 make_tempfile_for="fo", | |
1882 check_stderr=_check_sequence_stderr, | |
1883 add_to_bedtool={"fo": "seqfn"}, | |
1884 ) | |
1885 def sequence(self, *args, **kwargs) -> BedTool: # type: ignore | |
1886 ''' | |
1887 Wraps `bedtools getfasta`. | |
1888 | |
1889 *fi* is passed in by the user; *bed* is automatically passed in as the | |
1890 bedfile of this object; *fo* by default is a temp file. Use | |
1891 save_seqs() to save as a file. | |
1892 | |
1893 The end result is that this BedTool will assign a value to the attribute , self.seqfn, | |
1894 that points to the new fasta file. | |
1895 | |
1896 Example usage: | |
1897 | |
1898 >>> a = pybedtools.BedTool(""" | |
1899 ... chr1 1 10 | |
1900 ... chr1 50 55""", from_string=True) | |
1901 >>> fasta = pybedtools.example_filename('test.fa') | |
1902 >>> a = a.sequence(fi=fasta) | |
1903 >>> print(open(a.seqfn).read()) | |
1904 >chr1:1-10 | |
1905 GATGAGTCT | |
1906 >chr1:50-55 | |
1907 CCATC | |
1908 <BLANKLINE> | |
1909 | |
1910 ''' | |
1911 | |
1912 # Alias for backwards compatibility | |
1913 getfasta = sequence | |
1914 | |
1915 @staticmethod | |
1916 def seq(loc, fasta) -> str: | |
1917 """ | |
1918 Return just the sequence from a region string or a single location | |
1919 >>> fn = pybedtools.example_filename('test.fa') | |
1920 >>> BedTool.seq('chr1:2-10', fn) | |
1921 'GATGAGTCT' | |
1922 >>> BedTool.seq(('chr1', 1, 10), fn) | |
1923 'GATGAGTCT' | |
1924 """ | |
1925 if isinstance(loc, str): | |
1926 chrom, start_end = loc.split(":") | |
1927 start, end = list(map(int, start_end.split("-"))) | |
1928 start -= 1 | |
1929 else: | |
1930 chrom, start, end = loc[0], loc[1], loc[2] | |
1931 | |
1932 loc = BedTool("%s\t%i\t%i" % (chrom, start, end), from_string=True) | |
1933 lseq = loc.sequence(fi=fasta) | |
1934 return "".join([l.rstrip() for l in open(lseq.seqfn, "r") if l[0] != ">"]) | |
1935 | |
1936 @_log_to_history | |
1937 @_wraps( | |
1938 prog="nucBed", implicit="bed", other="fi", check_stderr=_check_sequence_stderr | |
1939 ) | |
1940 def nucleotide_content(self) -> BedTool: # type: ignore | |
1941 """ | |
1942 Wraps `bedtools nuc`. | |
1943 | |
1944 Profiles nucleotide content. The returned BED file contains extra | |
1945 information about the nucleotide content | |
1946 """ | |
1947 | |
1948 # Alias for backwards compatibility | |
1949 nuc = nucleotide_content | |
1950 | |
1951 @_log_to_history | |
1952 @_wraps(prog="multiBamCov", implicit="bed") | |
1953 def multi_bam_coverage(self) -> BedTool: # type: ignore | |
1954 """ | |
1955 Wraps `bedtools multicov`. | |
1956 | |
1957 Pass a list of sorted and indexed BAM files as `bams` | |
1958 """ | |
1959 | |
1960 # Alias for backwards compatibility | |
1961 multicov = multi_bam_coverage | |
1962 | |
1963 @_log_to_history | |
1964 @_wraps(prog="subtractBed", implicit="a", other="b", bam=None) | |
1965 def subtract(self, *args, **kwargs) -> BedTool: # type: ignore | |
1966 """ | |
1967 Wraps `bedtools subtract`. | |
1968 | |
1969 Subtracts from another BED file and returns a new BedTool object. | |
1970 | |
1971 Example usage: | |
1972 | |
1973 >>> a = pybedtools.example_bedtool('a.bed') | |
1974 >>> b = pybedtools.example_bedtool('b.bed') | |
1975 | |
1976 Do a "stranded" subtraction: | |
1977 | |
1978 >>> c = a.subtract(b, s=True) | |
1979 | |
1980 Require 50% of features in `a` to overlap: | |
1981 | |
1982 >>> c = a.subtract(b, f=0.5) | |
1983 """ | |
1984 if "a" not in kwargs: | |
1985 kwargs["a"] = self.fn | |
1986 | |
1987 if "b" not in kwargs: | |
1988 if len(args) > 0: | |
1989 kwargs["b"] = args[0] | |
1990 else: | |
1991 raise ValueError("Must specify a BED file to subtract, either as a positional argument or as the 'b' keyword argument.") | |
1992 | |
1993 cmds, tmp, stdin = self.handle_kwargs(prog="subtractBed", **kwargs) | |
1994 stream = call_bedtools(cmds, tmp, stdin=stdin) | |
1995 return BedTool(stream) | |
1996 | |
1997 @_log_to_history | |
1998 @_wraps(prog="slopBed", implicit="i", other=None, bam=None, uses_genome=True) | |
1999 def slop(self, *args, **kwargs) -> BedTool: # type: ignore | |
2000 """ | |
2001 Wraps `bedtools slop`. | |
2002 """ | |
2003 | |
2004 @_log_to_history | |
2005 @_wraps(prog="shiftBed", implicit="i", other=None, bam=None, uses_genome=True) | |
2006 def shift(self, *args, **kwargs) -> BedTool: # type: ignore | |
2007 """ | |
2008 Wraps `bedtools shift`. | |
2009 | |
2010 Shift each feature by user-defined number of bases. Returns a new BedTool object. | |
2011 | |
2012 Example usage: | |
2013 | |
2014 >>> a = pybedtools.example_bedtool('a.bed') | |
2015 | |
2016 Shift every feature by 5bp: | |
2017 | |
2018 >>> b = a.shift(genome='hg19', s=5) | |
2019 >>> print(b) #doctest: +NORMALIZE_WHITESPACE | |
2020 chr1 6 105 feature1 0 + | |
2021 chr1 105 205 feature2 0 + | |
2022 chr1 155 505 feature3 0 - | |
2023 chr1 905 955 feature4 0 + | |
2024 <BLANKLINE> | |
2025 | |
2026 Shift features on the '+' strand by -1bp and on '-' strand by +3bp: | |
2027 | |
2028 >>> b = a.shift(genome='hg19', p=-1, m=3) | |
2029 >>> print(b) #doctest: +NORMALIZE_WHITESPACE | |
2030 chr1 0 99 feature1 0 + | |
2031 chr1 99 199 feature2 0 + | |
2032 chr1 153 503 feature3 0 - | |
2033 chr1 899 949 feature4 0 + | |
2034 <BLANKLINE> | |
2035 | |
2036 # Disabling, see https://github.com/arq5x/bedtools2/issues/807 | |
2037 Shift features by a fraction of their length (0.50): | |
2038 | |
2039 #>>> b = a.shift(genome='hg19', pct=True, s=0.50) | |
2040 #>>> print(b) #doctest: +NORMALIZE_WHITESPACE | |
2041 #chr1 50 149 feature1 0 + | |
2042 #chr1 150 250 feature2 0 + | |
2043 #chr1 325 675 feature3 0 - | |
2044 #chr1 925 975 feature4 0 + | |
2045 #<BLANKLINE> | |
2046 | |
2047 """ | |
2048 | |
2049 @_log_to_history | |
2050 @_wraps(prog="mergeBed", implicit="i", other=None, bam=None) | |
2051 def merge(self, *args, **kwargs) -> BedTool: # type: ignore | |
2052 """ | |
2053 Wraps `bedtools merge`. | |
2054 | |
2055 Merge overlapping features together. Returns a new BedTool object. | |
2056 | |
2057 Example usage: | |
2058 | |
2059 >>> a = pybedtools.example_bedtool('a.bed') | |
2060 | |
2061 Merge: | |
2062 | |
2063 >>> c = a.merge() | |
2064 | |
2065 Allow merging of features 500 bp apart: | |
2066 | |
2067 >>> c = a.merge(d=500) | |
2068 | |
2069 """ | |
2070 | |
2071 @_log_to_history | |
2072 @_wraps(prog="closestBed", implicit="a", other="b", bam=None) | |
2073 def closest(self, *args, **kwargs) -> BedTool: # type: ignore | |
2074 """ | |
2075 Wraps `bedtools closest`. | |
2076 | |
2077 Return a new BedTool object containing closest features in *b*. Note | |
2078 that the resulting file is no longer a valid BED format; use the | |
2079 special "_closest" methods to work with the resulting file. | |
2080 | |
2081 Example usage:: | |
2082 | |
2083 a = BedTool('in.bed') | |
2084 | |
2085 # get the closest feature in 'other.bed' on the same strand | |
2086 b = a.closest('other.bed', s=True) | |
2087 | |
2088 """ | |
2089 | |
2090 @_log_to_history | |
2091 @_wraps(prog="windowBed", implicit="a", other="b", bam="abam", nonbam="bed") | |
2092 def window(self, *args, **kwargs) -> BedTool: # type: ignore | |
2093 """ | |
2094 Wraps `bedtools window`. | |
2095 | |
2096 Example usage:: | |
2097 | |
2098 >>> a = pybedtools.example_bedtool('a.bed') | |
2099 >>> b = pybedtools.example_bedtool('b.bed') | |
2100 >>> print(a.window(b, w=1000)) #doctest: +NORMALIZE_WHITESPACE | |
2101 chr1 1 100 feature1 0 + chr1 155 200 feature5 0 - | |
2102 chr1 1 100 feature1 0 + chr1 800 901 feature6 0 + | |
2103 chr1 100 200 feature2 0 + chr1 155 200 feature5 0 - | |
2104 chr1 100 200 feature2 0 + chr1 800 901 feature6 0 + | |
2105 chr1 150 500 feature3 0 - chr1 155 200 feature5 0 - | |
2106 chr1 150 500 feature3 0 - chr1 800 901 feature6 0 + | |
2107 chr1 900 950 feature4 0 + chr1 155 200 feature5 0 - | |
2108 chr1 900 950 feature4 0 + chr1 800 901 feature6 0 + | |
2109 <BLANKLINE> | |
2110 """ | |
2111 | |
2112 @_log_to_history | |
2113 @_wraps(prog="shuffleBed", implicit="i", other=None, bam=None, uses_genome=True) | |
2114 def shuffle(self, *args, **kwargs) -> BedTool: # type: ignore | |
2115 """ | |
2116 Wraps `bedtools shuffle`. | |
2117 | |
2118 Example usage: | |
2119 | |
2120 >>> a = pybedtools.example_bedtool('a.bed') | |
2121 >>> seed = 1 # so this test always returns the same results | |
2122 >>> b = a.shuffle(genome='hg19', chrom=True, seed=seed) | |
2123 >>> print(b) #doctest: +NORMALIZE_WHITESPACE | |
2124 chr1 123081365 123081464 feature1 0 + | |
2125 chr1 243444570 243444670 feature2 0 + | |
2126 chr1 194620241 194620591 feature3 0 - | |
2127 chr1 172792873 172792923 feature4 0 + | |
2128 <BLANKLINE> | |
2129 """ | |
2130 | |
2131 @_log_to_history | |
2132 @_wraps(prog="sortBed", implicit="i", uses_genome=True, genome_if=["g", "genome"]) | |
2133 def sort(self, *args, **kwargs) -> BedTool: # type: ignore | |
2134 """ | |
2135 Wraps `bedtools sort`. | |
2136 | |
2137 Note that chromosomes are sorted lexicographically, so chr12 will come | |
2138 before chr9. | |
2139 | |
2140 Example usage: | |
2141 | |
2142 >>> a = pybedtools.BedTool(''' | |
2143 ... chr9 300 400 | |
2144 ... chr1 100 200 | |
2145 ... chr1 1 50 | |
2146 ... chr12 1 100 | |
2147 ... chr9 500 600 | |
2148 ... ''', from_string=True) | |
2149 >>> print(a.sort()) #doctest: +NORMALIZE_WHITESPACE | |
2150 chr1 1 50 | |
2151 chr1 100 200 | |
2152 chr12 1 100 | |
2153 chr9 300 400 | |
2154 chr9 500 600 | |
2155 <BLANKLINE> | |
2156 """ | |
2157 | |
2158 @_log_to_history | |
2159 @_wraps(prog="annotateBed", implicit="i") | |
2160 def annotate(self, *args, **kwargs) -> BedTool: # type: ignore | |
2161 """ | |
2162 Wraps `bedtools annotate`. | |
2163 | |
2164 Annotate this BedTool with a list of other files. | |
2165 Example usage: | |
2166 | |
2167 >>> a = pybedtools.example_bedtool('a.bed') | |
2168 >>> b_fn = pybedtools.example_filename('b.bed') | |
2169 >>> print(a.annotate(files=b_fn)) #doctest: +NORMALIZE_WHITESPACE | |
2170 chr1 1 100 feature1 0 + 0.000000 | |
2171 chr1 100 200 feature2 0 + 0.450000 | |
2172 chr1 150 500 feature3 0 - 0.128571 | |
2173 chr1 900 950 feature4 0 + 0.020000 | |
2174 <BLANKLINE> | |
2175 """ | |
2176 | |
2177 @_log_to_history | |
2178 @_wraps(prog="flankBed", implicit="i", uses_genome=True) | |
2179 def flank(self, *args, **kwargs) -> BedTool: | |
2180 """ | |
2181 Wraps `bedtools flank`. | |
2182 | |
2183 Example usage: | |
2184 | |
2185 >>> a = pybedtools.example_bedtool('a.bed') | |
2186 >>> print(a.flank(genome='hg19', b=100)) #doctest: +NORMALIZE_WHITESPACE | |
2187 chr1 0 1 feature1 0 + | |
2188 chr1 100 200 feature1 0 + | |
2189 chr1 0 100 feature2 0 + | |
2190 chr1 200 300 feature2 0 + | |
2191 chr1 50 150 feature3 0 - | |
2192 chr1 500 600 feature3 0 - | |
2193 chr1 800 900 feature4 0 + | |
2194 chr1 950 1050 feature4 0 + | |
2195 <BLANKLINE> | |
2196 | |
2197 """ | |
2198 kwargs = self.check_genome(**kwargs) | |
2199 | |
2200 if "i" not in kwargs: | |
2201 kwargs["i"] = self.fn | |
2202 | |
2203 cmds, tmp, stdin = self.handle_kwargs(prog="flankBed", **kwargs) | |
2204 stream = call_bedtools(cmds, tmp, stdin=stdin) | |
2205 return BedTool(stream) | |
2206 | |
2207 @_log_to_history | |
2208 @_wraps( | |
2209 prog="genomeCoverageBed", | |
2210 implicit="i", | |
2211 bam="ibam", | |
2212 genome_none_if=["ibam"], | |
2213 genome_ok_if=["ibam"], | |
2214 uses_genome=True, | |
2215 nonbam="ALL", | |
2216 ) | |
2217 def genome_coverage(self, *args, **kwargs): | |
2218 """ | |
2219 Wraps `bedtools genomecov`. | |
2220 | |
2221 Note that some invocations of `bedtools genomecov` do not result in | |
2222 a properly-formatted BED file. For example, the default behavior is to | |
2223 report a histogram of coverage. Iterating over the resulting, | |
2224 non-BED-format file will raise exceptions in pybedtools' parser. | |
2225 | |
2226 Consider using the `BedTool.to_dataframe` method to convert these | |
2227 non-BED files into a pandas DataFrame for further use. | |
2228 | |
2229 Example usage: | |
2230 | |
2231 BAM file input does not require a genome: | |
2232 | |
2233 >>> a = pybedtools.example_bedtool('x.bam') | |
2234 >>> b = a.genome_coverage(bg=True) | |
2235 >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE | |
2236 chr2L 9329 9365 1 | |
2237 chr2L 10212 10248 1 | |
2238 chr2L 10255 10291 1 | |
2239 | |
2240 Other input does require a genome: | |
2241 | |
2242 >>> a = pybedtools.example_bedtool('x.bed') | |
2243 >>> b = a.genome_coverage(bg=True, genome='dm3') | |
2244 >>> b.head(3) #doctest: +NORMALIZE_WHITESPACE | |
2245 chr2L 9329 9365 1 | |
2246 chr2L 10212 10248 1 | |
2247 chr2L 10255 10291 1 | |
2248 | |
2249 Non-BED format results: | |
2250 >>> a = pybedtools.example_bedtool('x.bed') | |
2251 >>> b = a.genome_coverage(genome='dm3') | |
2252 >>> df = b.to_dataframe(names=['chrom', 'depth', 'n', 'chromsize', 'fraction']) | |
2253 """ | |
2254 | |
2255 # Alias for backwards compatibility | |
2256 genomecov = genome_coverage | |
2257 | |
2258 @_log_to_history | |
2259 @_wraps(prog="coverageBed", implicit="a", other="b", bam="abam", nonbam="ALL") | |
2260 def coverage(self, *args, **kwargs) -> BedTool: # type: ignore | |
2261 """ | |
2262 Wraps `bedtools coverage`. | |
2263 | |
2264 Note that starting in version 2.24.0, BEDTools swapped the semantics of | |
2265 the "a" and "b" files. | |
2266 | |
2267 Example usage: | |
2268 | |
2269 >>> a = pybedtools.example_bedtool('a.bed') | |
2270 >>> b = pybedtools.example_bedtool('b.bed') | |
2271 >>> c = b.coverage(a) | |
2272 >>> c.head(3) #doctest: +NORMALIZE_WHITESPACE | |
2273 chr1 155 200 feature5 0 - 2 45 45 1.0000000 | |
2274 chr1 800 901 feature6 0 + 1 1 101 0.0099010 | |
2275 """ | |
2276 | |
2277 @_log_to_history | |
2278 @_wraps( | |
2279 prog="maskFastaFromBed", | |
2280 implicit="bed", | |
2281 other="fi", | |
2282 make_tempfile_for="fo", | |
2283 add_to_bedtool={"fo": "seqfn"}, | |
2284 check_stderr=_check_sequence_stderr, | |
2285 ) | |
2286 def mask_fasta(self, *args, **kwargs) -> BedTool: # type: ignore | |
2287 """ | |
2288 Wraps `bedtools maskfasta`. | |
2289 | |
2290 Masks a fasta file at the positions in a BED file and saves result as | |
2291 'out' and stores the filename in seqfn. | |
2292 | |
2293 >>> a = pybedtools.BedTool('chr1 100 110', from_string=True) | |
2294 >>> fasta_fn = pybedtools.example_filename('test.fa') | |
2295 >>> a = a.mask_fasta(fi=fasta_fn, fo='masked.fa.example') | |
2296 >>> b = a.slop(b=2, genome='hg19') | |
2297 >>> b = b.sequence(fi=a.seqfn) | |
2298 >>> print(open(b.seqfn).read()) | |
2299 >chr1:98-112 | |
2300 TTNNNNNNNNNNAT | |
2301 <BLANKLINE> | |
2302 >>> os.unlink('masked.fa.example') | |
2303 >>> if os.path.exists('masked.fa.example.fai'): | |
2304 ... os.unlink('masked.fa.example.fai') | |
2305 """ | |
2306 | |
2307 # Alias for backwards compatibility | |
2308 maskfasta = mask_fasta | |
2309 | |
2310 @_log_to_history | |
2311 @_wraps(prog="complementBed", implicit="i", uses_genome=True) | |
2312 def complement(self, *args, **kwargs) -> BedTool: # type: ignore | |
2313 """ | |
2314 Wraps `bedtools complement`. | |
2315 Example usage: | |
2316 | |
2317 >>> a = pybedtools.example_bedtool('a.bed') | |
2318 >>> a.complement(genome='hg19').head(5) #doctest: +NORMALIZE_WHITESPACE | |
2319 chr1 0 1 | |
2320 chr1 500 900 | |
2321 chr1 950 249250621 | |
2322 chr2 0 243199373 | |
2323 chr3 0 198022430 | |
2324 """ | |
2325 | |
2326 @_log_to_history | |
2327 @_wraps(prog="getOverlap", implicit="i") | |
2328 def overlap(self, *args, **kwargs) -> BedTool: # type: ignore | |
2329 """ | |
2330 Wraps `bedtools overlap`. | |
2331 | |
2332 Example usage: | |
2333 | |
2334 >>> a = pybedtools.example_bedtool('a.bed') | |
2335 >>> b = pybedtools.example_bedtool('b.bed') | |
2336 >>> c = a.window(b, w=10).overlap(cols=[2,3,8,9]) | |
2337 >>> print(c) #doctest: +NORMALIZE_WHITESPACE | |
2338 chr1 100 200 feature2 0 + chr1 155 200 feature5 0 - 45 | |
2339 chr1 150 500 feature3 0 - chr1 155 200 feature5 0 - 45 | |
2340 chr1 900 950 feature4 0 + chr1 800 901 feature6 0 + 1 | |
2341 <BLANKLINE> | |
2342 """ | |
2343 | |
2344 # TODO: needs test files and doctests written | |
2345 @_log_to_history | |
2346 @_wraps(prog="pairToBed", implicit="a", other="b", bam="abam", nonbam="bedpe") | |
2347 def pair_to_bed(self, *args, **kwargs) -> BedTool: # type: ignore | |
2348 """ | |
2349 Wraps `bedtools pairtobed`. | |
2350 """ | |
2351 | |
2352 # Alias for backwards compatibility | |
2353 pairtobed = pair_to_bed | |
2354 | |
2355 @_log_to_history | |
2356 @_wraps(prog="pairToPair", implicit="a", other="b") | |
2357 def pair_to_pair(self) -> BedTool: # type: ignore | |
2358 """ | |
2359 Wraps `bedtools pairtopair`. | |
2360 """ | |
2361 | |
2362 # Alias for backwards compatibility | |
2363 pairtopair = pair_to_pair | |
2364 | |
2365 @_log_to_history | |
2366 @_wraps(prog="groupBy", implicit="i") | |
2367 def groupby(self, *args, **kwargs) -> BedTool: | |
2368 """ | |
2369 Wraps `bedtools groupby`. | |
2370 | |
2371 Example usage: | |
2372 | |
2373 >>> a = pybedtools.example_bedtool('gdc.gff') | |
2374 >>> b = pybedtools.example_bedtool('gdc.bed') | |
2375 >>> c = a.intersect(b, c=True) | |
2376 >>> d = c.groupby(g=[1, 4, 5], c=10, o=['sum']) | |
2377 >>> print(d) #doctest: +NORMALIZE_WHITESPACE | |
2378 chr2L 41 70 0 | |
2379 chr2L 71 130 2 | |
2380 chr2L 131 170 4 | |
2381 chr2L 171 200 0 | |
2382 chr2L 201 220 1 | |
2383 chr2L 41 130 2 | |
2384 chr2L 171 220 1 | |
2385 chr2L 41 220 7 | |
2386 chr2L 161 230 6 | |
2387 chr2L 41 220 7 | |
2388 <BLANKLINE> | |
2389 | |
2390 """ | |
2391 | |
2392 @_log_to_history | |
2393 @_wraps(prog="tagBam", implicit="i", bam="i") | |
2394 def tag_bam(self, *args, **kwargs) -> pysam.AlignmentFile: # type: ignore | |
2395 """ | |
2396 Wraps `bedtools tag`. | |
2397 | |
2398 `files` and `labels` should lists of equal length. | |
2399 | |
2400 """ | |
2401 if "i" in kwargs: | |
2402 if "labels" in kwargs: | |
2403 if len(kwargs["i"]) != len(kwargs["labels"]): | |
2404 raise ValueError("files and labels must be lists of equal length") | |
2405 | |
2406 | |
2407 # Alias for backwards compatibility | |
2408 tag = tag_bam | |
2409 | |
2410 @_log_to_history | |
2411 @_wraps(prog="mapBed", implicit="a", other="b") | |
2412 def map(self, *args, **kwargs) -> BedTool: # type: ignore | |
2413 """ | |
2414 Wraps `bedtools map`; See also :meth:`BedTool.each`. | |
2415 """ | |
2416 | |
2417 @_log_to_history | |
2418 @_wraps(prog="multiIntersectBed", uses_genome=True, genome_if=["empty"]) | |
2419 def multi_intersect(self, *args, **kwargs) -> BedTool: # type: ignore | |
2420 """ | |
2421 Wraps `bedtools multiintersect`. | |
2422 | |
2423 Provide a list of filenames as the "i" argument. e.g. if you already | |
2424 have BedTool objects then use their `.fn` attribute, like this:: | |
2425 | |
2426 >>> x = pybedtools.BedTool() | |
2427 >>> a = pybedtools.example_bedtool('a.bed') | |
2428 >>> b = pybedtools.example_bedtool('b.bed') | |
2429 >>> result = x.multi_intersect(i=[a.fn, b.fn]) | |
2430 >>> print(result) #doctest: +NORMALIZE_WHITESPACE | |
2431 chr1 1 155 1 1 1 0 | |
2432 chr1 155 200 2 1,2 1 1 | |
2433 chr1 200 500 1 1 1 0 | |
2434 chr1 800 900 1 2 0 1 | |
2435 chr1 900 901 2 1,2 1 1 | |
2436 chr1 901 950 1 1 1 0 | |
2437 <BLANKLINE> | |
2438 | |
2439 """ | |
2440 | |
2441 # Alias for backwards compatibility | |
2442 multiinter = multi_intersect | |
2443 | |
2444 @_log_to_history | |
2445 @_wraps(prog="randomBed", uses_genome=True) | |
2446 def random(self, *args, **kwargs) -> BedTool: # type: ignore | |
2447 """ | |
2448 Wraps `bedtools random`. | |
2449 | |
2450 Since this method does not operate on an existing file, create | |
2451 a BedTool with no arguments and then call this method, e.g., | |
2452 | |
2453 >>> x = BedTool() | |
2454 >>> y = x.random(l=100, n=10, genome='hg19') | |
2455 """ | |
2456 | |
2457 @_log_to_history | |
2458 @_wraps("bedpeToBam", implicit="i", uses_genome=True, force_bam=True) | |
2459 def bedpe_to_bam(self, *args, **kwargs) -> pysam.AlignmentFile: # type: ignore | |
2460 """ | |
2461 Wraps `bedtools bedpetobam`. | |
2462 """ | |
2463 | |
2464 # Alias for backwards compatibility | |
2465 bedpetobam = bedpe_to_bam | |
2466 | |
2467 @_log_to_history | |
2468 @_wraps(prog="clusterBed", implicit="i") | |
2469 def cluster(self, *args, **kwargs) -> BedTool: # type: ignore | |
2470 """ | |
2471 Wraps `bedtools cluster`. | |
2472 """ | |
2473 | |
2474 @_log_to_history | |
2475 @_wraps(prog="unionBedGraphs") | |
2476 def union_bedgraphs(self, *args, **kwargs) -> BedTool: # type: ignore | |
2477 """ | |
2478 Wraps `bedtools unionbedg`. | |
2479 | |
2480 Warning: using the `header=True` kwarg will result in a file that is | |
2481 not in true BED format, which may break downstream analysis. | |
2482 """ | |
2483 if "header" in kwargs: | |
2484 if kwargs["header"] is True: | |
2485 warn("Using header=True with unionbedg will result in a file that is not in true BED format, which may break downstream analysis.") | |
2486 | |
2487 | |
2488 # Alias for backwards compatibility | |
2489 unionbedg = union_bedgraphs | |
2490 | |
2491 @_log_to_history | |
2492 @_wraps(prog="windowMaker", uses_genome=True, genome_none_if=["b"], other="b", arg_order=["w"]) | |
2493 def window_maker(self, *args, **kwargs) -> BedTool: # type: ignore | |
2494 """ | |
2495 Wraps `bedtools makewindows`. | |
2496 """ | |
2497 | |
2498 # Alias for backwards compatibility | |
2499 makewindows = window_maker | |
2500 | |
2501 @_log_to_history | |
2502 @_wraps(prog="expandCols", implicit="i") | |
2503 def expand(self, *args, **kwargs) -> BedTool: # type: ignore | |
2504 """ | |
2505 Wraps `bedtools expand` | |
2506 """ | |
2507 | |
2508 @_log_to_history | |
2509 @_wraps(prog="linksBed", implicit="i", add_to_bedtool={"stdout": "links_html"}) | |
2510 def links(self, *args, **kwargs) -> BedTool: # type: ignore | |
2511 """ | |
2512 Wraps `linksBed`. | |
2513 | |
2514 The resulting BedTool will assign a value to the attribute `links_html`. This | |
2515 attribute is a temp filename containing the HTML links. | |
2516 """ | |
2517 | |
2518 @_log_to_history | |
2519 @_wraps(prog="bedToIgv", implicit="i", add_to_bedtool={"stdout": "igv_script"}) | |
2520 def igv(self, *args, **kwargs) -> BedTool: # type: ignore | |
2521 """ | |
2522 Wraps `bedtools igv`. | |
2523 | |
2524 The resulting BedTool will assign a value to the attribute `igv_script`. This | |
2525 attribute is a temp filename containing the IGV script. | |
2526 """ | |
2527 | |
2528 @_log_to_history | |
2529 @_wraps( | |
2530 prog="bamToFastq", | |
2531 implicit="i", | |
2532 bam="i", | |
2533 make_tempfile_for="fq", | |
2534 add_to_bedtool={"fq": "fastq"}, | |
2535 ) | |
2536 def bam_to_fastq(self, *args, **kwargs) -> BedTool: # type: ignore | |
2537 """ | |
2538 Wraps `bedtools bamtofastq`. | |
2539 | |
2540 The `fq` argument is required. | |
2541 | |
2542 The resulting BedTool will assign a value to the attribute `fastq`. | |
2543 """ | |
2544 | |
2545 # Alias for backwards compatibility | |
2546 bamtofastq = bam_to_fastq | |
2547 | |
2548 @_wraps( | |
2549 prog="jaccard", | |
2550 implicit="a", | |
2551 other="b", | |
2552 does_not_return_bedtool=_jaccard_output_to_dict, | |
2553 ) | |
2554 def jaccard(self, *args, **kwargs) -> dict[str, Any]: # type: ignore | |
2555 """ | |
2556 Returns a dictionary with keys (intersection, union, jaccard). | |
2557 """ | |
2558 | |
2559 @_wraps( | |
2560 prog="reldist", | |
2561 implicit="a", | |
2562 other="b", | |
2563 does_not_return_bedtool=_reldist_output_handler, | |
2564 ) | |
2565 def reldist(self, *args, **kwargs) -> BedTool | dict[str, Any]: # type: ignore | |
2566 """ | |
2567 If detail=False, then return a dictionary with keys (reldist, count, | |
2568 total, fraction), which is the summary of the bedtools reldist. | |
2569 | |
2570 Otherwise return a BedTool, with the relative distance for each | |
2571 interval in A in the last column. | |
2572 """ | |
2573 if "detail" in kwargs: | |
2574 if kwargs["detail"] is False: | |
2575 warn("Using detail=False with reldist will return a dictionary with keys (reldist, count, total, fraction), which is the summary of the bedtools reldist." | |
2576 "Not a BedTool object.") | |
2577 | |
2578 | |
2579 @_wraps(prog="sample", implicit="i", bam="i") | |
2580 def sample(self, *args, **kwargs) -> BedTool: # type: ignore | |
2581 """ | |
2582 Wraps 'sample'. | |
2583 """ | |
2584 | |
2585 @_wraps( | |
2586 prog="fisher", | |
2587 implicit="a", | |
2588 other="b", | |
2589 uses_genome=True, | |
2590 does_not_return_bedtool=FisherOutput, | |
2591 ) | |
2592 def fisher(self, *args, **kwargs) -> FisherOutput: # type: ignore | |
2593 """ | |
2594 Wraps 'fisher'. Returns an object representing the output. | |
2595 | |
2596 >>> a = pybedtools.example_bedtool('a.bed') | |
2597 >>> b = pybedtools.example_bedtool('b.bed') | |
2598 >>> f = a.fisher(b, genome='hg19') | |
2599 >>> print(f) # doctest: +NORMALIZE_WHITESPACE | |
2600 # Number of query intervals: 4 | |
2601 # Number of db intervals: 2 | |
2602 # Number of overlaps: 3 | |
2603 # Number of possible intervals (estimated): 13958448 | |
2604 # phyper(3 - 1, 4, 13958448 - 4, 2, lower.tail=F) | |
2605 # Contingency Table Of Counts | |
2606 #_________________________________________ | |
2607 # | in -b | not in -b | | |
2608 # in -a | 3 | 1 | | |
2609 # not in -a | 0 | 13958444 | | |
2610 #_________________________________________ | |
2611 # p-values for fisher's exact test | |
2612 left right two-tail ratio | |
2613 1 8.8247e-21 8.8247e-21 inf | |
2614 <BLANKLINE> | |
2615 | |
2616 | |
2617 >>> f.table['not in -a']['in -b'] | |
2618 0 | |
2619 | |
2620 >>> f.table['not in -a']['not in -b'] | |
2621 13958444 | |
2622 | |
2623 >>> f.table['in -a']['in -b'] | |
2624 3 | |
2625 | |
2626 >>> f.table['in -a']['not in -b'] | |
2627 1 | |
2628 | |
2629 >>> f.two_tail | |
2630 8.8247e-21 | |
2631 """ | |
2632 | |
2633 @_wraps(prog="split", implicit="i", does_not_return_bedtool=SplitOutput) | |
2634 def splitbed(self, *args, **kwargs) -> SplitOutput: # type: ignore | |
2635 """ | |
2636 Wraps 'bedtools split'. | |
2637 | |
2638 BedTool objects have long had a `split` method which splits intervals | |
2639 according to a custom function. Now that BEDTools has a `split` tool, | |
2640 the method name conflicts. To maintain backwards compatibility, the | |
2641 method wrapping the BEDTools command is called `splitbed`. | |
2642 | |
2643 Since this tool does not return a single BED file, the method parses | |
2644 the output and returns a SplitOutput object, which includes an | |
2645 attribute, `bedtools`, that is a list of BedTool objects created from | |
2646 the split files. | |
2647 | |
2648 To keep the working directory clean, you may want to consider using | |
2649 `prefix=BedTool._tmp()` to get a temp file that will be deleted when | |
2650 Python exits cleanly. | |
2651 | |
2652 >>> a = pybedtools.example_bedtool('a.bed') | |
2653 >>> s = a.splitbed(n=2, p="split") | |
2654 >>> assert len(a) == 4, len(a) | |
2655 >>> assert len(s.bedtools) == 2 | |
2656 >>> print(s.bedtools[0]) # doctest: +NORMALIZE_WHITESPACE | |
2657 chr1 150 500 feature3 0 - | |
2658 <BLANKLINE> | |
2659 >>> print(s.bedtools[1]) # doctest: +NORMALIZE_WHITESPACE | |
2660 chr1 100 200 feature2 0 + | |
2661 chr1 1 100 feature1 0 + | |
2662 chr1 900 950 feature4 0 + | |
2663 <BLANKLINE> | |
2664 """ | |
2665 | |
2666 @_wraps(prog="spacing", implicit="i") | |
2667 def spacing(self, *args, **kwargs) -> BedTool: # type: ignore | |
2668 """ | |
2669 Wraps `bedtools spacing` | |
2670 | |
2671 >>> a = pybedtools.example_bedtool('a.bed') | |
2672 >>> print(a.spacing()) # doctest: +NORMALIZE_WHITESPACE | |
2673 chr1 1 100 feature1 0 + . | |
2674 chr1 100 200 feature2 0 + 0 | |
2675 chr1 150 500 feature3 0 - -1 | |
2676 chr1 900 950 feature4 0 + 400 | |
2677 """ | |
2678 | |
2679 def count(self) -> int: | |
2680 """ | |
2681 Count the number features in this BedTool. | |
2682 | |
2683 Number of features in BED file. Does the same thing as len(self), which | |
2684 actually just calls this method. | |
2685 | |
2686 Only counts the actual features. Ignores any track lines, browser | |
2687 lines, lines starting with a "#", or blank lines. | |
2688 | |
2689 Example usage: | |
2690 | |
2691 >>> a = pybedtools.example_bedtool('a.bed') | |
2692 >>> a.count() | |
2693 4 | |
2694 """ | |
2695 if hasattr(self, "next") or hasattr(self, "__next__"): | |
2696 return sum(1 for _ in self) | |
2697 return sum(1 for _ in iter(self)) | |
2698 | |
2699 def print_sequence(self) -> str: | |
2700 """ | |
2701 Print the sequence that was retrieved by BedTool.sequence. | |
2702 See usage example in :meth:`BedTool.sequence`. | |
2703 | |
2704 Returns: | |
2705 str: The sequence as a string. | |
2706 | |
2707 Raises: | |
2708 ValueError: If the sequence has not been generated using .sequence(fasta). | |
2709 """ | |
2710 if self.seqfn is None: | |
2711 raise ValueError("Use .sequence(fasta) to get the sequence first") | |
2712 f = open(self.seqfn) | |
2713 s = f.read() | |
2714 f.close() | |
2715 return s | |
2716 | |
2717 def save_seqs(self, fn:str) -> BedTool: | |
2718 """ | |
2719 Save sequences, after calling BedTool.sequence. | |
2720 | |
2721 In order to use this function, you need to have called | |
2722 the :meth:`BedTool.sequence()` method. | |
2723 | |
2724 A new BedTool object is returned which references the newly saved file. | |
2725 | |
2726 Example usage: | |
2727 | |
2728 >>> a = pybedtools.BedTool(''' | |
2729 ... chr1 1 10 | |
2730 ... chr1 50 55''', from_string=True) | |
2731 >>> fasta = pybedtools.example_filename('test.fa') | |
2732 >>> a = a.sequence(fi=fasta) | |
2733 >>> print(open(a.seqfn).read()) | |
2734 >chr1:1-10 | |
2735 GATGAGTCT | |
2736 >chr1:50-55 | |
2737 CCATC | |
2738 <BLANKLINE> | |
2739 >>> b = a.save_seqs('example.fa') | |
2740 >>> assert open(b.fn).read() == open(a.fn).read() | |
2741 >>> if os.path.exists('example.fa'): | |
2742 ... os.unlink('example.fa') | |
2743 """ | |
2744 | |
2745 if self.seqfn is None: | |
2746 raise ValueError("Use .sequence(fasta) to get the sequence first") | |
2747 | |
2748 with open(fn, "w") as fout: | |
2749 if self.seqfn is None: | |
2750 raise ValueError("Use .sequence(fasta) to get the sequence first") | |
2751 with open(self.seqfn) as seqfile: | |
2752 fout.write(seqfile.read()) | |
2753 | |
2754 new_bedtool = BedTool(self.fn) | |
2755 new_bedtool.seqfn = fn | |
2756 return new_bedtool | |
2757 | |
2758 def randomstats( | |
2759 self, | |
2760 other: BedTool, | |
2761 iterations: int, | |
2762 new: bool = False, | |
2763 genome_fn: Optional[str] = None, | |
2764 include_distribution: bool = False, | |
2765 **kwargs | |
2766 ) -> dict[str, Any]: | |
2767 """ | |
2768 Dictionary of results from many randomly shuffled intersections. | |
2769 | |
2770 Sends args and kwargs to :meth:`BedTool.randomintersection` and | |
2771 compiles results into a dictionary with useful stats. Requires | |
2772 numpy. | |
2773 | |
2774 If `include_distribution` is True, then the dictionary will include the | |
2775 full distribution; otherwise, the distribution is deleted and cleaned | |
2776 up to save on memory usage. | |
2777 | |
2778 This is one possible way of assigning significance to overlaps between | |
2779 two files. See, for example: | |
2780 | |
2781 Negre N, Brown CD, Shah PK, Kheradpour P, Morrison CA, et al. 2010 | |
2782 A Comprehensive Map of Insulator Elements for the Drosophila | |
2783 Genome. PLoS Genet 6(1): e1000814. doi:10.1371/journal.pgen.1000814 | |
2784 | |
2785 Example usage: | |
2786 | |
2787 Make chromsizes a very small genome for this example: | |
2788 | |
2789 >>> chromsizes = {'chr1':(1,1000)} | |
2790 >>> a = pybedtools.example_bedtool('a.bed').set_chromsizes(chromsizes) | |
2791 >>> b = pybedtools.example_bedtool('b.bed') | |
2792 >>> try: | |
2793 ... results = a.randomstats(b, 100, debug=True) | |
2794 ... except ImportError: | |
2795 ... pass | |
2796 | |
2797 *results* is a dictionary that you can inspect. | |
2798 | |
2799 (Note that the following examples are not run as part of the doctests | |
2800 to avoid forcing users to install NumPy just to pass tests) | |
2801 | |
2802 The actual overlap:: | |
2803 | |
2804 print(results['actual']) | |
2805 3 | |
2806 | |
2807 The median of all randomized overlaps:: | |
2808 | |
2809 print(results['median randomized']) | |
2810 2.0 | |
2811 | |
2812 The percentile of the actual overlap in the distribution of randomized | |
2813 overlaps, which can be used to get an empirical p-value:: | |
2814 | |
2815 print(results['percentile']) | |
2816 90.0 | |
2817 """ | |
2818 if ("intersect_kwargs" not in kwargs) or (kwargs["intersect_kwargs"] is None): | |
2819 kwargs["intersect_kwargs"] = {"u": True} | |
2820 try: | |
2821 import numpy as np | |
2822 except ImportError: | |
2823 raise ImportError("Need to install NumPy for stats...") | |
2824 | |
2825 def percentileofscore(a, score): | |
2826 """ | |
2827 copied from scipy.stats.percentileofscore, to avoid dependency on | |
2828 scipy. | |
2829 """ | |
2830 a = np.array(a) | |
2831 n = len(a) | |
2832 | |
2833 if not (np.any(a == score)): | |
2834 a = np.append(a, score) | |
2835 a_len = np.array(list(range(len(a)))) | |
2836 else: | |
2837 a_len = np.array(list(range(len(a)))) + 1.0 | |
2838 | |
2839 a = np.sort(a) | |
2840 idx = tuple([a == score]) | |
2841 pct = (np.mean(a_len[idx]) / n) * 100.0 | |
2842 return pct | |
2843 | |
2844 if isinstance(other, str): | |
2845 other = BedTool(other) | |
2846 else: | |
2847 assert isinstance( | |
2848 other, BedTool | |
2849 ), "Either filename or another BedTool instance required" | |
2850 | |
2851 # Actual (unshuffled) counts. | |
2852 i_kwargs = kwargs["intersect_kwargs"] | |
2853 actual = len(self.intersect(other, **i_kwargs)) | |
2854 | |
2855 # List of counts from randomly shuffled versions. | |
2856 # Length of counts == *iterations*. | |
2857 | |
2858 if not new: | |
2859 distribution = self.randomintersection( | |
2860 other, iterations=iterations, **kwargs | |
2861 ) | |
2862 else: | |
2863 # use new mechanism | |
2864 if genome_fn is None: | |
2865 raise ValueError( | |
2866 "`genome_fn` must be provided if using the " | |
2867 "new _randomintersection mechanism" | |
2868 ) | |
2869 distribution = self._randomintersection( | |
2870 other, iterations=iterations, genome_fn=genome_fn, **kwargs | |
2871 ) | |
2872 | |
2873 distribution = np.array(list(distribution)) | |
2874 | |
2875 # Median of distribution | |
2876 med_count = np.median(distribution) | |
2877 | |
2878 n = float(len(distribution)) | |
2879 | |
2880 frac_above = sum(distribution > actual) / n | |
2881 frac_below = sum(distribution < actual) / n | |
2882 | |
2883 normalized = actual / med_count | |
2884 | |
2885 lower_thresh = 2.5 | |
2886 upper_thresh = 97.5 | |
2887 lower, upper = np.percentile(distribution, [lower_thresh, upper_thresh]) | |
2888 | |
2889 actual_percentile = percentileofscore(distribution, actual) | |
2890 d = { | |
2891 "iterations": iterations, | |
2892 "actual": actual, | |
2893 "file_a": self.fn, | |
2894 "file_b": other.fn, | |
2895 self.fn: len(self), | |
2896 other.fn: len(other), | |
2897 "self": len(self), | |
2898 "other": len(other), | |
2899 "frac randomized above actual": frac_above, | |
2900 "frac randomized below actual": frac_below, | |
2901 "median randomized": med_count, | |
2902 "normalized": normalized, | |
2903 "percentile": actual_percentile, | |
2904 "lower_%sth" % lower_thresh: lower, | |
2905 "upper_%sth" % upper_thresh: upper, | |
2906 } | |
2907 if include_distribution: | |
2908 d["distribution"] = distribution | |
2909 else: | |
2910 del distribution | |
2911 return d | |
2912 | |
2913 def random_op(self, *args, **kwargs): | |
2914 """ | |
2915 For backwards compatibility; see BedTool.parallel_apply instead. | |
2916 """ | |
2917 return self.parallel_apply(*args, **kwargs) | |
2918 | |
2919 def parallel_apply( | |
2920 self, | |
2921 iterations: int, | |
2922 func: Callable, | |
2923 func_args: Iterable, | |
2924 func_kwargs: dict, | |
2925 processes: int=1, | |
2926 _orig_pool: Optional[Pool] = None # type: ignore | |
2927 ): | |
2928 """ | |
2929 Generalized method for applying a function in parallel. | |
2930 | |
2931 Typically used when having to do many random shufflings. | |
2932 | |
2933 `func_args` and `func_kwargs` will be passed to `func` each time in | |
2934 `iterations`, and these iterations will be split across `processes` | |
2935 processes. | |
2936 | |
2937 Notes on the function, `func`: | |
2938 | |
2939 * the function should manually remove any tempfiles created. This | |
2940 is because the BedTool.TEMPFILES list of auto-created tempfiles | |
2941 does not share state across processes, so things will not get | |
2942 cleaned up automatically as they do in a single-process | |
2943 pybedtools session. | |
2944 | |
2945 * this includes deleting any "chromsizes" or genome files -- | |
2946 generally it will be best to require a genome filename in | |
2947 `func_kwargs` if you'll be using any BedTool methods that accept | |
2948 the `g` kwarg. | |
2949 | |
2950 * the function should be a module-level function (rather than a | |
2951 class method) because class methods can't be pickled across | |
2952 process boundaries | |
2953 | |
2954 * the function can have any signature and have any return value | |
2955 | |
2956 `_orig_pool` can be a previously-created multiprocessing.Pool instance; | |
2957 otherwise, a new Pool will be created with `processes` | |
2958 """ | |
2959 if processes == 1: | |
2960 for _ in range(iterations): | |
2961 yield func(*func_args, **func_kwargs) | |
2962 raise StopIteration | |
2963 | |
2964 if _orig_pool: | |
2965 p = _orig_pool | |
2966 else: | |
2967 p = Pool(processes) | |
2968 iterations_each = [iterations / processes] * processes | |
2969 iterations_each[-1] += iterations % processes | |
2970 | |
2971 # FYI some useful info on apply_async: | |
2972 # http://stackoverflow.com/questions/8533318/ | |
2973 # python-multiprocessing-pool-when-to-use-apply-apply-async-or-map | |
2974 # | |
2975 # Here, we don't care about the order, and don't want the subprocesses | |
2976 # to block. | |
2977 results = [ | |
2978 p.apply_async(func, func_args, func_kwargs) for _ in range(iterations) | |
2979 ] | |
2980 for r in results: | |
2981 yield r.get() | |
2982 raise StopIteration | |
2983 | |
2984 def random_jaccard( | |
2985 self, | |
2986 other: BedTool, | |
2987 genome_fn: Optional[str] = None, | |
2988 iterations: Optional[int] = None, | |
2989 processes: int = 1, | |
2990 _orig_pool: Optional[Pool] = None, | |
2991 shuffle_kwargs: Optional[dict[str, Any]] = None, | |
2992 jaccard_kwargs: Optional[dict[str, Any]] = None, | |
2993 ) -> list: | |
2994 """ | |
2995 Computes the naive Jaccard statistic (intersection divided by union). | |
2996 | |
2997 .. note:: | |
2998 | |
2999 If you don't need the randomization functionality of this method, | |
3000 you can use the simpler BedTool.jaccard method instead. | |
3001 | |
3002 See Favorov et al. (2012) PLoS Comput Biol 8(5): e1002529 for more | |
3003 info on the Jaccard statistic for intersections. | |
3004 | |
3005 If `iterations` is None, then do not perform random shufflings. | |
3006 | |
3007 If `iterations` is an integer, perform `iterations` random shufflings, | |
3008 each time computing the Jaccard statistic to build an empirical | |
3009 distribution. `genome_fn` will also be needed; optional `processes` | |
3010 will split the iterations across multiple CPUs. | |
3011 | |
3012 Returns a tuple of the observed Jaccard statistic and a list of the | |
3013 randomized statistics (which will be an empty list if `iterations` was | |
3014 None). | |
3015 """ | |
3016 if shuffle_kwargs is None: | |
3017 shuffle_kwargs = {} | |
3018 if jaccard_kwargs is None: | |
3019 jaccard_kwargs = {} | |
3020 if not genome_fn: | |
3021 raise ValueError("Need a genome filename in order to perform randomization") | |
3022 return list( | |
3023 self.parallel_apply( | |
3024 iterations=iterations if iterations else 1, | |
3025 func=pybedtools.stats.random_jaccard, | |
3026 func_args=(self, other), | |
3027 func_kwargs=dict( | |
3028 genome_fn=genome_fn, | |
3029 shuffle_kwargs=shuffle_kwargs, | |
3030 jaccard_kwargs=jaccard_kwargs, | |
3031 ), | |
3032 processes=processes, | |
3033 _orig_pool=_orig_pool, | |
3034 ) | |
3035 ) | |
3036 | |
3037 def _randomintersection( | |
3038 self, | |
3039 other: BedTool, | |
3040 iterations: int, | |
3041 genome_fn: str, | |
3042 intersect_kwargs: Optional[dict[str, Any]] = None, | |
3043 _orig_pool:Optional[Pool] = None, | |
3044 shuffle_kwargs: Optional[dict[str, Any]] = None, | |
3045 processes: int = 1, | |
3046 ): | |
3047 """ | |
3048 Re-implementation of BedTool.randomintersection using the new | |
3049 `random_op` method | |
3050 """ | |
3051 if shuffle_kwargs is None: | |
3052 shuffle_kwargs = {} | |
3053 if intersect_kwargs is None: | |
3054 intersect_kwargs = dict(u=True) | |
3055 if not genome_fn: | |
3056 raise ValueError("Need a genome filename in order to perform randomization") | |
3057 return list( | |
3058 self.parallel_apply( | |
3059 iterations=iterations, | |
3060 func=pybedtools.stats.random_intersection, | |
3061 func_args=(self, other), | |
3062 func_kwargs=dict( | |
3063 genome_fn=genome_fn, | |
3064 shuffle_kwargs=shuffle_kwargs, | |
3065 intersect_kwargs=intersect_kwargs, | |
3066 ), | |
3067 processes=processes, | |
3068 _orig_pool=_orig_pool, | |
3069 ) | |
3070 ) | |
3071 | |
3072 def randomintersection_bp( | |
3073 self, | |
3074 other: BedTool, | |
3075 iterations: int, | |
3076 genome_fn: str, | |
3077 intersect_kwargs: Optional[dict[str, Any]] = None, | |
3078 shuffle_kwargs: Optional[dict[str, Any]] = None, | |
3079 processes: int = 1, | |
3080 _orig_pool:Optional[Pool] = None, | |
3081 ) -> list[int]: | |
3082 """ | |
3083 Like randomintersection, but return the bp overlap instead of the | |
3084 number of intersecting intervals. | |
3085 """ | |
3086 if shuffle_kwargs is None: | |
3087 shuffle_kwargs = {} | |
3088 if intersect_kwargs is None: | |
3089 intersect_kwargs = {} | |
3090 if not genome_fn: | |
3091 raise ValueError("Need a genome filename in order to perform randomization") | |
3092 return list( | |
3093 self.parallel_apply( | |
3094 iterations=iterations, | |
3095 func=pybedtools.stats.random_intersection_bp, | |
3096 func_args=(self, other), | |
3097 func_kwargs=dict( | |
3098 genome_fn=genome_fn, | |
3099 shuffle_kwargs=shuffle_kwargs, | |
3100 intersect_kwargs=intersect_kwargs, | |
3101 ), | |
3102 processes=processes, | |
3103 _orig_pool=_orig_pool, | |
3104 ) | |
3105 ) | |
3106 | |
3107 def randomintersection( | |
3108 self, | |
3109 other: BedTool, | |
3110 iterations: int, | |
3111 intersect_kwargs: Optional[dict[str, Any]] = None, | |
3112 shuffle_kwargs: Optional[dict[str, Any]] = None, | |
3113 debug: bool = False, | |
3114 report_iterations: bool = False, | |
3115 processes: Optional[int] = None, | |
3116 _orig_processes: Optional[int] = None, | |
3117 ) -> Iterator[int]: | |
3118 """ | |
3119 Perform `iterations` shufflings, each time intersecting with `other`. | |
3120 | |
3121 Returns a generator of integers where each integer is the number of | |
3122 intersections of a shuffled file with *other*. This distribution can | |
3123 be used in downstream analysis for things like empirical p-values. | |
3124 | |
3125 *intersect_kwargs* and *shuffle_kwargs* are passed to self.intersect() | |
3126 and self.shuffle() respectively. By default for intersect, u=True is | |
3127 specified -- but s=True might be a useful option for strand-specific | |
3128 work. | |
3129 | |
3130 Useful kwargs for *shuffle_kwargs* are chrom, excl, or incl. If you | |
3131 use the "seed" kwarg, that seed will be used *each* time shuffleBed is | |
3132 called -- so all your randomization results will be identical for each | |
3133 iteration. To get around this and to allow for tests, debug=True will | |
3134 set the seed to the iteration number. You may also break up the | |
3135 intersections across multiple processes with *processes* > 1. | |
3136 | |
3137 Example usage: | |
3138 | |
3139 >>> chromsizes = {'chr1':(0, 1000)} | |
3140 >>> a = pybedtools.example_bedtool('a.bed') | |
3141 >>> a = a.set_chromsizes(chromsizes) | |
3142 >>> b = pybedtools.example_bedtool('b.bed') | |
3143 >>> results = a.randomintersection(b, 10, debug=True) | |
3144 >>> print(list(results)) | |
3145 [1, 0, 1, 2, 4, 2, 2, 1, 2, 4] | |
3146 | |
3147 """ | |
3148 if processes is not None: | |
3149 p = Pool(processes) | |
3150 iterations_each = [iterations // processes] * processes | |
3151 iterations_each[-1] += iterations % processes | |
3152 results = [ | |
3153 p.apply_async( | |
3154 _call_randomintersect, | |
3155 (self, other, it), | |
3156 dict( | |
3157 intersect_kwargs=intersect_kwargs, | |
3158 shuffle_kwargs=shuffle_kwargs, | |
3159 debug=debug, | |
3160 report_iterations=report_iterations, | |
3161 _orig_processes=processes, | |
3162 ), | |
3163 ) | |
3164 for it in iterations_each | |
3165 ] | |
3166 for r in results: | |
3167 for value in r.get(): | |
3168 yield value | |
3169 raise StopIteration | |
3170 | |
3171 if shuffle_kwargs is None: | |
3172 shuffle_kwargs = {} | |
3173 if intersect_kwargs is None: | |
3174 intersect_kwargs = {"u": True} | |
3175 | |
3176 if "u" not in intersect_kwargs: | |
3177 intersect_kwargs["u"] = True | |
3178 | |
3179 resort = intersect_kwargs.get("sorted", False) | |
3180 | |
3181 for i in range(iterations): | |
3182 if debug: | |
3183 shuffle_kwargs["seed"] = i | |
3184 if report_iterations: | |
3185 if _orig_processes > 1: | |
3186 msg = "\rapprox (total across %s processes): %s" % ( | |
3187 _orig_processes, | |
3188 i * _orig_processes, | |
3189 ) | |
3190 else: | |
3191 msg = "\r%s" % i | |
3192 sys.stderr.write(msg) | |
3193 sys.stderr.flush() | |
3194 | |
3195 # Re-sort if sorted=True in kwargs | |
3196 if resort: | |
3197 tmp0 = self.shuffle(**shuffle_kwargs) | |
3198 tmp = tmp0.sort() | |
3199 else: | |
3200 tmp = self.shuffle(**shuffle_kwargs) | |
3201 | |
3202 tmp2 = tmp.intersect(other, stream=True, **intersect_kwargs) | |
3203 | |
3204 yield len(tmp2) | |
3205 | |
3206 # Close the open stdouts from subprocess.Popen calls. Note: doing | |
3207 # this in self.__del__ doesn't fix the open file limit bug; it | |
3208 # needs to be done here. | |
3209 # if resort: | |
3210 # tmp0.fn.close() | |
3211 # tmp.fn.close() | |
3212 tmp2.fn.close() | |
3213 del tmp | |
3214 del tmp2 | |
3215 | |
3216 @_log_to_history | |
3217 def cat(self, *others: Iterable[str|Path|BedTool], **kwargs) -> BedTool: | |
3218 """ | |
3219 Concatenate interval files together. | |
3220 | |
3221 Concatenates two BedTool objects (or an object and a file) and does an | |
3222 optional post-merge of the features. | |
3223 | |
3224 *postmerge=True* by default; use *postmerge=False* if you want to keep | |
3225 features separate. | |
3226 | |
3227 *force_truncate=False* by default; *force_truncate=True* to truncate | |
3228 all files to chrom, start, stop. | |
3229 | |
3230 When *force_truncate=False* and *postmerge=False*, the output will | |
3231 contain the smallest number of fields observed across all inputs. This | |
3232 maintains compatibility with BEDTools programs, which assume constant | |
3233 number of fields in all lines of a file. | |
3234 | |
3235 Other kwargs are sent to :meth:`BedTool.merge` (and assuming that | |
3236 *postmerge=True*). | |
3237 | |
3238 Example usage: | |
3239 | |
3240 >>> a = pybedtools.example_bedtool('a.bed') | |
3241 >>> b = pybedtools.example_bedtool('b.bed') | |
3242 >>> print(a.cat(b)) #doctest: +NORMALIZE_WHITESPACE | |
3243 chr1 1 500 | |
3244 chr1 800 950 | |
3245 <BLANKLINE> | |
3246 >>> print(a.cat(*[b,b], | |
3247 ... postmerge=False)) #doctest: +NORMALIZE_WHITESPACE | |
3248 chr1 1 100 feature1 0 + | |
3249 chr1 100 200 feature2 0 + | |
3250 chr1 150 500 feature3 0 - | |
3251 chr1 900 950 feature4 0 + | |
3252 chr1 155 200 feature5 0 - | |
3253 chr1 800 901 feature6 0 + | |
3254 chr1 155 200 feature5 0 - | |
3255 chr1 800 901 feature6 0 + | |
3256 <BLANKLINE> | |
3257 """ | |
3258 same_type = None | |
3259 same_field_num = None | |
3260 field_nums = set() | |
3261 | |
3262 assert len(others) > 0, "You must specify at least one other bedfile!" | |
3263 other_beds = [] | |
3264 for other in others: | |
3265 if isinstance(other, (str, Path)): | |
3266 other = BedTool(other) | |
3267 else: | |
3268 assert isinstance( | |
3269 other, BedTool | |
3270 ), "Either filename or another BedTool instance required" | |
3271 other_beds.append(other) | |
3272 | |
3273 # postmerge and force_truncate don't get passed on to merge | |
3274 postmerge = kwargs.pop("postmerge", True) | |
3275 force_truncate = kwargs.pop("force_truncate", False) | |
3276 stream_merge = kwargs.get("stream", False) | |
3277 if stream_merge and postmerge: | |
3278 raise ValueError( | |
3279 "The post-merge step in the `cat()` method " | |
3280 "performs a sort, which uses stream=True. Using " | |
3281 "stream=True for the merge as well will result in a " | |
3282 "deadlock!" | |
3283 ) | |
3284 | |
3285 # if filetypes and field counts are the same, don't truncate | |
3286 if not force_truncate: | |
3287 try: | |
3288 filetypes = set( | |
3289 [self.file_type] + [i.file_type for i in other_beds] | |
3290 ).difference(["empty"]) | |
3291 field_nums = ( | |
3292 set([self.field_count()] + [i.field_count() for i in other_beds]) | |
3293 .difference([None]) | |
3294 .difference([0]) | |
3295 ) | |
3296 same_field_num = len(field_nums) == 1 | |
3297 same_type = len(set(filetypes)) == 1 | |
3298 except ValueError: | |
3299 raise ValueError( | |
3300 "Can't check filetype or field count -- " | |
3301 "is one of the files you're merging a 'streaming' " | |
3302 "BedTool? If so, use .saveas() to save to file first" | |
3303 ) | |
3304 | |
3305 tmp = self._tmp() | |
3306 | |
3307 if not force_truncate and same_type and same_field_num: | |
3308 with open(tmp, "w") as TMP: | |
3309 for f in self: | |
3310 TMP.write(str(f)) | |
3311 for other in other_beds: | |
3312 for f in other: | |
3313 TMP.write(str(f)) | |
3314 | |
3315 # Types match, so we can use the min number of fields observed across | |
3316 # all inputs | |
3317 elif not force_truncate and same_type: | |
3318 minfields = min(field_nums) | |
3319 with open(tmp, "w") as TMP: | |
3320 for f in self: | |
3321 TMP.write("\t".join(f.fields[:minfields]) + "\n") | |
3322 for other in other_beds: | |
3323 for f in other: | |
3324 TMP.write("\t".join(f.fields[:minfields]) + "\n") | |
3325 | |
3326 # Otherwise, use the zero-based chrom/start/stop to create a BED3, | |
3327 # which will work when catting a GFF and a BED together. | |
3328 else: | |
3329 with open(tmp, "w") as TMP: | |
3330 for f in self: | |
3331 TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end)) | |
3332 for other in other_beds: | |
3333 for f in other: | |
3334 TMP.write("%s\t%i\t%i\n" % (f.chrom, f.start, f.end)) | |
3335 | |
3336 c = BedTool(tmp) | |
3337 if postmerge: | |
3338 d = c.sort(stream=True).merge(**kwargs) | |
3339 | |
3340 # Explicitly delete -- needed when using multiprocessing | |
3341 os.unlink(tmp) | |
3342 return d | |
3343 else: | |
3344 return c | |
3345 | |
3346 @_log_to_history | |
3347 def saveas(self, fn: Optional[str] = None, trackline: Optional[str] = None, compressed: Optional[bool] = None) -> BedTool: | |
3348 """ | |
3349 Make a copy of the BedTool. | |
3350 | |
3351 Optionally adds `trackline` to the beginning of the file. | |
3352 | |
3353 Optionally compresses output using gzip. | |
3354 | |
3355 if the filename extension is .gz, or compressed=True, | |
3356 the output is compressed using gzip | |
3357 | |
3358 Returns a new BedTool for the newly saved file. | |
3359 | |
3360 A newline is automatically added to the trackline if it does not | |
3361 already have one. | |
3362 | |
3363 Example usage: | |
3364 | |
3365 >>> a = pybedtools.example_bedtool('a.bed') | |
3366 >>> b = a.saveas('other.bed') | |
3367 >>> b.fn | |
3368 'other.bed' | |
3369 >>> print(b == a) | |
3370 True | |
3371 | |
3372 >>> b = a.saveas('other.bed', trackline="name='test run' color=0,55,0") | |
3373 >>> open(b.fn).readline() | |
3374 "name='test run' color=0,55,0\\n" | |
3375 >>> if os.path.exists('other.bed'): | |
3376 ... os.unlink('other.bed') | |
3377 """ | |
3378 if fn is None: | |
3379 fn = self._tmp() | |
3380 | |
3381 # Default to compressed if extension is .gz | |
3382 if compressed is None: | |
3383 __, extension = os.path.splitext(fn) | |
3384 if extension == ".gz": | |
3385 compressed = True | |
3386 else: | |
3387 compressed = False | |
3388 | |
3389 in_compressed = isinstance(self.fn, str) and isGZIP(self.fn) | |
3390 | |
3391 fn = self._collapse( | |
3392 self, | |
3393 fn=fn, | |
3394 trackline=trackline, | |
3395 in_compressed=in_compressed, | |
3396 out_compressed=compressed, | |
3397 ) | |
3398 return BedTool(fn) | |
3399 | |
3400 @_log_to_history | |
3401 def moveto(self, fn: Optional[str]=None) -> BedTool: | |
3402 """ | |
3403 Move to a new filename (can be much quicker than BedTool.saveas()) | |
3404 | |
3405 Move BED file to new filename, `fn`. | |
3406 | |
3407 Returns a new BedTool for the new file. | |
3408 | |
3409 Example usage: | |
3410 | |
3411 >>> # make a copy so we don't mess up the example file | |
3412 >>> a = pybedtools.example_bedtool('a.bed').saveas() | |
3413 >>> a_contents = str(a) | |
3414 >>> b = a.moveto('other.bed') | |
3415 >>> b.fn | |
3416 'other.bed' | |
3417 >>> b == a_contents | |
3418 True | |
3419 """ | |
3420 if not isinstance(self.fn, str): | |
3421 fn = self._collapse(self, fn=fn) | |
3422 else: | |
3423 shutil.move(self.fn, fn) | |
3424 return BedTool(fn) | |
3425 | |
3426 @_log_to_history | |
3427 def random_subset(self, n: Optional[int] = None, f: Optional[float] = None, seed: Optional[float|int]=None) -> BedTool: | |
3428 """ | |
3429 Return a BedTool containing a random subset. | |
3430 | |
3431 NOTE: using `n` will be slower and use more memory than using `f`. | |
3432 | |
3433 Parameters | |
3434 ---------- | |
3435 | |
3436 n : int | |
3437 Number of features to return. Only one of `n` or `f` can be provided. | |
3438 | |
3439 f : float, 0 <= f <= 1 | |
3440 Fraction of features to return. Cannot be provided with `n`. | |
3441 | |
3442 seed : float or int | |
3443 Set random.seed | |
3444 | |
3445 Example | |
3446 ------- | |
3447 | |
3448 >>> seed = 0 # only for test, otherwise use None | |
3449 | |
3450 `n` will always give the same number of returned features, but will be | |
3451 slower since it is creating an index and then shuffling it. | |
3452 | |
3453 >>> a = pybedtools.example_bedtool('a.bed') | |
3454 >>> b = a.random_subset(n=2) | |
3455 >>> len(b) | |
3456 2 | |
3457 | |
3458 Using a fraction `f` will be faster but depending on seed will result | |
3459 in slightly different total numbers. | |
3460 | |
3461 >>> a = pybedtools.example_bedtool('x.bam') | |
3462 >>> len(a) | |
3463 45593 | |
3464 >>> b = a.random_subset(f=0.4, seed=seed) | |
3465 >>> len(b) | |
3466 18316 | |
3467 | |
3468 Check that we have approximately the right fraction | |
3469 >>> print('{0:.2f}'.format(len(b) / len(a))) | |
3470 0.40 | |
3471 | |
3472 """ | |
3473 if ((n is None) and (f is None)) or ((n is not None) and (f is not None)): | |
3474 raise ValueError("Exactly one of `n` or `f` must be provided") | |
3475 | |
3476 tmpfn = self._tmp() | |
3477 if seed is not None: | |
3478 random.seed(seed) | |
3479 | |
3480 if n: | |
3481 idxs = list(range(len(self))) | |
3482 random.shuffle(idxs) | |
3483 idxs = idxs[:n] | |
3484 with open(tmpfn, "w") as tmp: | |
3485 for i, feature in enumerate(self): | |
3486 if i in idxs: | |
3487 tmp.write(str(feature)) | |
3488 | |
3489 elif f: | |
3490 with open(tmpfn, "w") as tmp: | |
3491 for i in self: | |
3492 if random.random() <= f: | |
3493 tmp.write(str(i)) | |
3494 | |
3495 return BedTool(tmpfn) | |
3496 | |
3497 def total_coverage(self) -> int: | |
3498 """ | |
3499 Return the total number of bases covered by this interval file. | |
3500 | |
3501 Does a self.merge() first to remove potentially multiple-counting | |
3502 bases. | |
3503 | |
3504 Example usage: | |
3505 | |
3506 >>> a = pybedtools.example_bedtool('a.bed') | |
3507 | |
3508 This does a merge() first, so this is what the total coverage is | |
3509 counting: | |
3510 | |
3511 >>> print(a.merge()) #doctest: +NORMALIZE_WHITESPACE | |
3512 chr1 1 500 | |
3513 chr1 900 950 | |
3514 <BLANKLINE> | |
3515 | |
3516 >>> print(a.total_coverage()) | |
3517 549 | |
3518 """ | |
3519 b = self.merge() | |
3520 total_bp = 0 | |
3521 for feature in b.features(): | |
3522 total_bp += len(feature) | |
3523 return total_bp | |
3524 | |
3525 @_log_to_history | |
3526 def with_attrs(self, **kwargs) -> BedTool: | |
3527 """ | |
3528 Helper method for adding attributes in the middle of a pipeline. | |
3529 | |
3530 Given arbitrary keyword arguments, turns the keys and values into | |
3531 attributes. Useful for labeling BedTools at creation time. | |
3532 | |
3533 Example usage: | |
3534 | |
3535 >>> # add a "label" attribute to each BedTool | |
3536 >>> a = pybedtools.example_bedtool('a.bed')\ | |
3537 .with_attrs(label='transcription factor 1') | |
3538 >>> b = pybedtools.example_bedtool('b.bed')\ | |
3539 .with_attrs(label='transcription factor 2') | |
3540 >>> for i in [a, b]: | |
3541 ... print('{0} features for {1}'.format(i.count(), i.label)) | |
3542 4 features for transcription factor 1 | |
3543 2 features for transcription factor 2 | |
3544 | |
3545 """ | |
3546 for key, value in list(kwargs.items()): | |
3547 setattr(self, key, value) | |
3548 return self | |
3549 | |
3550 def as_intervalfile(self) -> IntervalFile: | |
3551 """ | |
3552 Returns an IntervalFile of this BedTool for low-level interface. | |
3553 """ | |
3554 if not isinstance(self.fn, str): | |
3555 fn = self._collapse(self.fn) | |
3556 else: | |
3557 fn = self.fn | |
3558 return IntervalFile(fn) | |
3559 | |
3560 def liftover(self, chainfile: str, unmapped: Optional[str] = None, liftover_args: str = "") -> BedTool: | |
3561 """ | |
3562 Returns a new BedTool of the liftedOver features, saving the unmapped | |
3563 ones as `unmapped`. If `unmapped` is None, then discards the unmapped | |
3564 features. | |
3565 | |
3566 `liftover_args` is a string of additional args that is passed, | |
3567 verbatim, to liftOver. | |
3568 | |
3569 Needs `liftOver` from UCSC to be on the path and a `chainfile` | |
3570 downloaded from UCSC. | |
3571 """ | |
3572 result = BedTool._tmp() | |
3573 if unmapped is None: | |
3574 unmapped = BedTool._tmp() | |
3575 cmds = ["liftOver", liftover_args, self.fn, chainfile, result, unmapped] | |
3576 os.system(" ".join(cmds)) | |
3577 return BedTool(result) | |
3578 | |
3579 def absolute_distance(self, other: BedTool, closest_kwargs: Optional[dict[str, Any]]=None, use_midpoints: bool=False) -> Iterator[int]: | |
3580 """ | |
3581 Returns an iterator of the *absolute* distances between features in | |
3582 self and other. | |
3583 | |
3584 If `use_midpoints` is True, then only use the midpoints of features | |
3585 (which will return values where features are overlapping). Otherwise, | |
3586 when features overlap the value will always be zero. | |
3587 | |
3588 `closest_kwargs` are passed to self.closest(); either `d` or | |
3589 'D` are required in order to get back distance values (`d=True` is | |
3590 default) | |
3591 """ | |
3592 from .featurefuncs import midpoint | |
3593 | |
3594 if closest_kwargs is None: | |
3595 closest_kwargs = {"d": True} | |
3596 | |
3597 if "D" not in closest_kwargs: | |
3598 closest_kwargs.update(dict(d=True)) | |
3599 | |
3600 if use_midpoints: | |
3601 mid_self = self.each(midpoint).saveas() | |
3602 mid_other = other.each(midpoint).saveas() | |
3603 c = mid_self.closest(mid_other, stream=True, **closest_kwargs) | |
3604 else: | |
3605 c = self.closest(other, stream=True, **closest_kwargs) | |
3606 for i in c: | |
3607 yield int(i[-1]) | |
3608 | |
3609 def relative_distance(self, other: BedTool, genome:Optional[dict|str] =None, g: Optional[str]=None) -> Iterator[float]: | |
3610 """ | |
3611 Returns an iterator of relative distances between features in self and | |
3612 other. | |
3613 | |
3614 First computes the midpoints of self and other, then returns distances | |
3615 of each feature in `other` relative to the distance between `self` | |
3616 features. | |
3617 | |
3618 Requires either `genome` (dictionary of chromsizes or assembly name) or | |
3619 `g` (filename of chromsizes file). | |
3620 """ | |
3621 g_dict = {} | |
3622 if (genome is None) and (g is None): | |
3623 raise ValueError("Need either `genome` or `g` arg for relative distance") | |
3624 elif genome and g: | |
3625 raise ValueError("Please specify only one of `genome` or `g`") | |
3626 elif genome: | |
3627 g_dict = dict(genome=genome) | |
3628 elif g: | |
3629 g_dict = dict(g=g) | |
3630 | |
3631 from .featurefuncs import midpoint | |
3632 | |
3633 # This gets the space between features in self. | |
3634 c = self.each(midpoint).complement(**g_dict) | |
3635 | |
3636 hits = c.intersect(other, wao=True, stream=True) # TODO: should this be other or mid_other? | |
3637 for i in hits: | |
3638 yield float(i[-1]) / len(i) | |
3639 | |
3640 def colormap_normalize(self, | |
3641 vmin: Optional[float|int]=None, | |
3642 vmax: Optional[float|int]=None, | |
3643 percentile: bool=False, | |
3644 log: bool=False) -> mcolors.LogNorm|mcolors.Normalize: | |
3645 """ | |
3646 Returns a normalization instance for use by featurefuncs.add_color(). | |
3647 | |
3648 Parameters | |
3649 ---------- | |
3650 vmin, vmax : float, int, or None | |
3651 `vmin` and `vmax` set the colormap bounds; if None then | |
3652 these will be determined from the scores in the BED file. | |
3653 | |
3654 log : bool | |
3655 If True, put the scores on a log scale; of course be careful | |
3656 if you have negative scores | |
3657 | |
3658 percentile : bool | |
3659 If True, interpret vmin and vmax as a percentile in the range | |
3660 [0,100] rather than absolute values. | |
3661 """ | |
3662 field_count = self.field_count() | |
3663 if (self.file_type != "bed") or (field_count < 5): | |
3664 raise ValueError("colorizing only works for BED files with score " "fields") | |
3665 try: | |
3666 import matplotlib.colors as mcolors | |
3667 except ImportError: | |
3668 raise ImportError("matplotlib.colors must be installed to use colormap_normalize") | |
3669 | |
3670 try: | |
3671 import numpy as np | |
3672 except ImportError: | |
3673 raise ImportError("numpy must be installed to use colormap_normalize") | |
3674 | |
3675 if log: | |
3676 norm = mcolors.LogNorm() | |
3677 else: | |
3678 norm = mcolors.Normalize() | |
3679 | |
3680 scores = np.array([i.score for i in self], dtype=float) | |
3681 scores = scores[np.isfinite(scores)] | |
3682 norm.autoscale(scores) | |
3683 | |
3684 if vmin is not None: | |
3685 if percentile: | |
3686 vmin = float(np.percentile(scores, vmin)) | |
3687 norm.vmin = vmin | |
3688 if vmax is not None: | |
3689 if percentile: | |
3690 vmax = float(np.percentile(scores, vmax)) | |
3691 norm.vmax = vmax | |
3692 | |
3693 return norm | |
3694 | |
3695 def at(self, inds: list[int]) -> BedTool: | |
3696 """ | |
3697 Returns a new BedTool with only intervals at lines `inds` | |
3698 | |
3699 Parameters | |
3700 ---------- | |
3701 inds : List[int] | |
3702 List of line numbers | |
3703 | |
3704 Returns | |
3705 ------- | |
3706 BedTool | |
3707 New BedTool with only intervals at `inds` | |
3708 """ | |
3709 length = len(inds) | |
3710 | |
3711 def _gen(): | |
3712 k = 0 | |
3713 for i, feature in enumerate(self): | |
3714 if i == inds[k]: | |
3715 yield feature | |
3716 k += 1 | |
3717 if k == length: | |
3718 break | |
3719 | |
3720 return BedTool(_gen()).saveas() | |
3721 | |
3722 def to_dataframe(self, disable_auto_names: bool = False, *args, **kwargs) -> pd.DataFrame: | |
3723 """ | |
3724 Create a pandas.DataFrame, passing args and kwargs to pandas.read_csv | |
3725 The separator kwarg `sep` is given a tab `\\t` as value by default. | |
3726 | |
3727 Parameters | |
3728 ---------- | |
3729 disable_auto_names : bool | |
3730 By default, the created dataframe fills in column names | |
3731 automatically according to the detected filetype (e.g., "chrom", | |
3732 "start", "end" for a BED3 file). Set this argument to True to | |
3733 disable this behavior. | |
3734 """ | |
3735 # Complain if BAM or if not a file | |
3736 if self._isbam: | |
3737 raise ValueError("BAM not supported for converting to DataFrame") | |
3738 if not isinstance(self.fn, str): | |
3739 raise ValueError("use .saveas() to make sure self.fn is a file") | |
3740 | |
3741 try: | |
3742 import pandas | |
3743 except ImportError: | |
3744 raise ImportError("pandas must be installed to convert to pandas.DataFrame") | |
3745 # Otherwise we're good: | |
3746 names = kwargs.get("names", None) | |
3747 if names is None and not disable_auto_names: | |
3748 try: | |
3749 _names = settings._column_names[self.file_type][: self.field_count()] | |
3750 if len(_names) < self.field_count(): | |
3751 warn( | |
3752 "Default names for filetype %s are:\n%s\nbut file has " | |
3753 "%s fields; you can supply custom names with the " | |
3754 "`names` kwarg" % (self.file_type, _names, self.field_count()) | |
3755 ) | |
3756 _names = None | |
3757 except KeyError: | |
3758 _names = None | |
3759 kwargs["names"] = _names | |
3760 | |
3761 if os.path.isfile(self.fn) and os.path.getsize(self.fn) > 0: | |
3762 return pandas.read_csv(self.fn, *args, sep="\t", **kwargs) # type: ignore | |
3763 else: | |
3764 return pandas.DataFrame() | |
3765 | |
3766 def tail(self, lines:int = 10, as_string: bool = False) -> Optional[str]: | |
3767 """ | |
3768 Like `head`, but prints last 10 lines of the file by default. | |
3769 | |
3770 To avoid consuming iterables, this only works with file-based, non-BAM | |
3771 BedTool objects. | |
3772 | |
3773 Use `as_string=True` to return a string. | |
3774 """ | |
3775 if self._isbam: | |
3776 raise ValueError("tail() not yet implemented for BAM files") | |
3777 if not isinstance(self.fn, str): | |
3778 raise ValueError( | |
3779 "tail() not implemented for non-file-based " | |
3780 "BedTool objects. Please use saveas() first." | |
3781 ) | |
3782 bufsize = 8192 | |
3783 offset = bufsize | |
3784 f = open(self.fn, "rb") | |
3785 | |
3786 # whence=2 arg means relative to end (i.e., go to the end) | |
3787 f.seek(0, 2) | |
3788 file_size = f.tell() | |
3789 data = [] | |
3790 while True: | |
3791 if file_size < bufsize: | |
3792 offset = file_size | |
3793 f.seek(-offset, 2) | |
3794 chunk = f.read(offset) | |
3795 data.extend(chunk.splitlines(True)) | |
3796 if len(data) >= lines or offset == file_size: | |
3797 break | |
3798 offset += bufsize | |
3799 | |
3800 result = "".join([i.decode() for i in data[-lines:]]) | |
3801 if as_string: | |
3802 return result | |
3803 else: | |
3804 print(result) | |
3805 | |
3806 | |
3807 class BAM(object): | |
3808 def __init__(self, stream): | |
3809 """ | |
3810 Wraps pysam.Samfile so that it yields pybedtools.Interval objects when | |
3811 iterated over. | |
3812 | |
3813 The pysam.Samfile can be accessed via the .pysam_bamfile attribute. | |
3814 """ | |
3815 self.stream = stream | |
3816 if not isinstance(self.stream, str): | |
3817 raise ValueError("Only files are supported, not streams") | |
3818 self.pysam_bamfile = pysam.Samfile(self.stream) | |
3819 | |
3820 def _aligned_segment_to_interval(self, r): | |
3821 | |
3822 if r.rname >= 0: | |
3823 rname = self.pysam_bamfile.get_reference_name(r.rname) | |
3824 else: | |
3825 rname = "*" | |
3826 | |
3827 if r.rnext >= 0: | |
3828 if r.rnext == r.rname: | |
3829 rnext = "=" | |
3830 else: | |
3831 rnext = self.pysam_bamfile.get_reference_name(r.rnext) | |
3832 else: | |
3833 rnext = "*" | |
3834 | |
3835 # SAM spec says if unavailable should be set to 0. Pysam sets to -1. | |
3836 | |
3837 if r.pnext <= 0: | |
3838 pnext = "0" | |
3839 else: | |
3840 # +1 here because cbedtools.pyx expects SAM -- which is 1-based -- | |
3841 # but pysam uses 0-based. | |
3842 pnext = str(r.pnext + 1) | |
3843 | |
3844 if r.cigarstring: | |
3845 cigarstring = r.cigarstring | |
3846 else: | |
3847 cigarstring = "*" | |
3848 | |
3849 # Rudimentary support. | |
3850 # TODO: remove when refactoring to new BAM iterating | |
3851 tags = [] | |
3852 for k, v in r.tags: | |
3853 if isinstance(v, int): | |
3854 t = "i" | |
3855 elif isinstance(v, float): | |
3856 t = "f" | |
3857 else: | |
3858 t = "Z" | |
3859 tags.append("{0}:{1}:{2}".format(k, t, v)) | |
3860 | |
3861 tags = "\t".join(tags) | |
3862 | |
3863 if r.seq: | |
3864 seq = r.seq | |
3865 else: | |
3866 seq = "*" | |
3867 | |
3868 if r.qual: | |
3869 qual = r.qual | |
3870 else: | |
3871 qual = "*" | |
3872 | |
3873 fields = [ | |
3874 r.qname, | |
3875 str(r.flag), | |
3876 rname, | |
3877 # +1 here because cbedtools.pyx expects SAM -- which is 1-based -- | |
3878 # but pysam uses 0-based. | |
3879 str(r.pos + 1), | |
3880 str(r.mapq), | |
3881 cigarstring, | |
3882 rnext, | |
3883 pnext, | |
3884 str(r.tlen), | |
3885 seq, | |
3886 qual, | |
3887 ] | |
3888 if tags: | |
3889 fields.append(tags) | |
3890 | |
3891 if None in fields: | |
3892 raise ValueError("Found 'None' in fields: %s" % fields) | |
3893 return create_interval_from_list(fields) | |
3894 | |
3895 def __iter__(self): | |
3896 return self | |
3897 | |
3898 # TODO: this is PAINFUL but it ensures that existing tests work. Once all | |
3899 # tests work, the new behavior will be to yield pysam AlignedSegment | |
3900 # objects directly. | |
3901 def __next__(self): | |
3902 return self._aligned_segment_to_interval(next(self.pysam_bamfile)) | |
3903 | |
3904 def next(self): | |
3905 return self.__next__() | |
3906 | |
3907 | |
3908 class History(list): | |
3909 def __init__(self): | |
3910 """ | |
3911 Represents one or many HistorySteps. Mostly used for nicely formatting | |
3912 a series of HistorySteps. | |
3913 """ | |
3914 list.__init__(self) | |
3915 | |
3916 | |
3917 class HistoryStep(object): | |
3918 def __init__(self, method, args, kwargs, bedtool_instance, parent_tag, result_tag): | |
3919 """ | |
3920 Class to represent one step in the history. | |
3921 | |
3922 Mostly used for its __repr__ method, to try and exactly replicate code | |
3923 that can be pasted to re-do history steps | |
3924 """ | |
3925 try: | |
3926 self.method = method._name | |
3927 except AttributeError: | |
3928 self.method = method.__name__ | |
3929 | |
3930 self.args = args | |
3931 self.kwargs = kwargs | |
3932 self.fn = bedtool_instance.fn | |
3933 self.parent_tag = parent_tag | |
3934 self.result_tag = result_tag | |
3935 | |
3936 def _clean_arg(self, arg: str|BedTool) -> str: | |
3937 """ | |
3938 Wrap strings in quotes and convert bedtool instances to filenames. | |
3939 """ | |
3940 if isinstance(arg, BedTool): | |
3941 arg = arg.fn | |
3942 if isinstance(arg, str): | |
3943 arg = '"%s"' % arg | |
3944 return arg | |
3945 | |
3946 def __repr__(self): | |
3947 # Still not sure whether to use pybedtools.bedtool() or bedtool() | |
3948 s = "" | |
3949 s += "<HistoryStep> " | |
3950 if os.path.exists(self.fn): | |
3951 s += 'BedTool("%(fn)s").%(method)s(%%s%%s)' % self.__dict__ | |
3952 else: | |
3953 s += 'BedTool("MISSING FILE: %(fn)s")' % self.__dict__ | |
3954 s += ".%(method)s(%%s%%s)" % self.__dict__ | |
3955 | |
3956 # Format args and kwargs | |
3957 args_string = ",".join(map(self._clean_arg, self.args)) | |
3958 kwargs_string = ",".join( | |
3959 ["%s=%s" % (i[0], self._clean_arg(i[1])) for i in list(self.kwargs.items())] | |
3960 ) | |
3961 # stick a comma on the end if there's something here | |
3962 if len(args_string) > 0: | |
3963 args_string += ", " | |
3964 | |
3965 s = s % (args_string, kwargs_string) | |
3966 s += ", parent tag: %s" % self.parent_tag | |
3967 s += ", result tag: %s" % self.result_tag | |
3968 return s | |
3969 | |
3970 | |
3971 def example_bedtool(fn): | |
3972 """ | |
3973 Return a bedtool using a bed file from the pybedtools examples directory. | |
3974 Use :func:`list_example_files` to see a list of files that are included. | |
3975 """ | |
3976 fn = os.path.join(filenames.data_dir(), fn) | |
3977 if not os.path.exists(fn): | |
3978 msg = "%s does not exist" % fn | |
3979 raise FileNotFoundError(msg) | |
3980 return BedTool(fn) | |
3981 | |
3982 | |
3983 if __name__ == "__main__": | |
3984 import doctest | |
3985 | |
3986 doctest.testmod(optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) |