comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/BioSeqDatabase.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 # Copyright 2002 by Andrew Dalke. All rights reserved.
2 # Revisions 2007-2016 copyright by Peter Cock. All rights reserved.
3 # Revisions 2009 copyright by Cymon J. Cox. All rights reserved.
4 # Revisions 2013-2014 copyright by Tiago Antao. All rights reserved.
5 #
6 # This file is part of the Biopython distribution and governed by your
7 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
8 # Please see the LICENSE file that should have been included as part of this
9 # package.
10 #
11 # Note that BioSQL (including the database schema and scripts) is
12 # available and licensed separately. Please consult www.biosql.org
13 """Connect with a BioSQL database and load Biopython like objects from it.
14
15 This provides interfaces for loading biological objects from a relational
16 database, and is compatible with the BioSQL standards.
17 """
18 import os
19
20 from . import BioSeq
21 from . import Loader
22 from . import DBUtils
23
24
25 _POSTGRES_RULES_PRESENT = False # Hack for BioSQL Bug 2839
26
27
28 def open_database(driver="MySQLdb", **kwargs):
29 """Load an existing BioSQL-style database.
30
31 This function is the easiest way to retrieve a connection to a
32 database, doing something like::
33
34 from BioSQL import BioSeqDatabase
35 server = BioSeqDatabase.open_database(user="root", db="minidb")
36
37 Arguments:
38 - driver - The name of the database driver to use for connecting. The
39 driver should implement the python DB API. By default, the MySQLdb
40 driver is used.
41 - user -the username to connect to the database with.
42 - password, passwd - the password to connect with
43 - host - the hostname of the database
44 - database or db - the name of the database
45
46 """
47 if driver == "psycopg":
48 raise ValueError(
49 "Using BioSQL with psycopg (version one) is no "
50 "longer supported. Use psycopg2 instead."
51 )
52
53 if os.name == "java":
54 from com.ziclix.python.sql import zxJDBC
55
56 module = zxJDBC
57 if driver in ["MySQLdb"]:
58 jdbc_driver = "com.mysql.jdbc.Driver"
59 url_pref = "jdbc:mysql://" + kwargs["host"] + "/"
60 elif driver in ["psycopg2"]:
61 jdbc_driver = "org.postgresql.Driver"
62 url_pref = "jdbc:postgresql://" + kwargs["host"] + "/"
63
64 else:
65 module = __import__(driver, fromlist=["connect"])
66 connect = module.connect
67
68 # Different drivers use different keywords...
69 kw = kwargs.copy()
70 if driver in ["MySQLdb", "mysql.connector"] and os.name != "java":
71 if "database" in kw:
72 kw["db"] = kw["database"]
73 del kw["database"]
74 if "password" in kw:
75 kw["passwd"] = kw["password"]
76 del kw["password"]
77 # kw["charset"] = "utf8"
78 # kw["use_unicode"] = True
79 else:
80 # DB-API recommendations
81 if "db" in kw:
82 kw["database"] = kw["db"]
83 del kw["db"]
84 if "passwd" in kw:
85 kw["password"] = kw["passwd"]
86 del kw["passwd"]
87 if driver in ["psycopg2", "pgdb"] and not kw.get("database"):
88 kw["database"] = "template1"
89 # SQLite connect takes the database name as input
90 if os.name == "java":
91 if driver in ["MySQLdb"]:
92 conn = connect(
93 url_pref + kw.get("database", "mysql"),
94 kw["user"],
95 kw["password"],
96 jdbc_driver,
97 )
98 elif driver in ["psycopg2"]:
99 conn = connect(
100 url_pref + kw.get("database", "postgresql") + "?stringtype=unspecified",
101 kw["user"],
102 kw["password"],
103 jdbc_driver,
104 )
105 elif driver in ["sqlite3"]:
106 conn = connect(kw["database"])
107 else:
108 conn = connect(**kw)
109
110 if os.name == "java":
111 server = DBServer(conn, module, driver)
112 else:
113 server = DBServer(conn, module)
114
115 # Sets MySQL to allow double quotes, rather than only backticks
116 if driver in ["MySQLdb", "mysql.connector"]:
117 server.adaptor.execute("SET sql_mode='ANSI_QUOTES';")
118
119 # TODO - Remove the following once BioSQL Bug 2839 is fixed.
120 # Test for RULES in PostgreSQL schema, see also Bug 2833.
121 if driver in ["psycopg2", "pgdb"]:
122 sql = (
123 "SELECT ev_class FROM pg_rewrite WHERE "
124 "rulename='rule_bioentry_i1' OR "
125 "rulename='rule_bioentry_i2';"
126 )
127 if server.adaptor.execute_and_fetchall(sql):
128 import warnings
129 from Bio import BiopythonWarning
130
131 warnings.warn(
132 "Your BioSQL PostgreSQL schema includes some rules "
133 "currently required for bioperl-db but which may"
134 "cause problems loading data using Biopython (see "
135 "BioSQL's RedMine Bug 2839 aka GitHub Issue 4 "
136 "https://github.com/biosql/biosql/issues/4). "
137 "If you do not use BioPerl, please remove these "
138 "rules. Biopython should cope with the rules "
139 "present, but with a performance penalty when "
140 "loading new records.",
141 BiopythonWarning,
142 )
143 global _POSTGRES_RULES_PRESENT
144 _POSTGRES_RULES_PRESENT = True
145
146 elif driver == "sqlite3":
147 # Tell SQLite that we want to use foreign keys
148 # https://www.sqlite.org/foreignkeys.html#fk_enable
149 server.adaptor.execute("PRAGMA foreign_keys = ON")
150
151 return server
152
153
154 class DBServer:
155 """Represents a BioSQL database containing namespaces (sub-databases).
156
157 This acts like a Python dictionary, giving access to each namespace
158 (defined by a row in the biodatabase table) as a BioSeqDatabase object.
159 """
160
161 def __init__(self, conn, module, module_name=None):
162 """Create a DBServer object.
163
164 Arguments:
165 - conn - A database connection object
166 - module - The module used to create the database connection
167 - module_name - Optionally, the name of the module. Default: module.__name__
168
169 Normally you would not want to create a DBServer object yourself.
170 Instead use the open_database function, which returns an instance of DBServer.
171 """
172 self.module = module
173 if module_name is None:
174 module_name = module.__name__
175 if module_name == "mysql.connector":
176 wrap_cursor = True
177 else:
178 wrap_cursor = False
179 # Get module specific Adaptor or the base (general) Adaptor
180 Adapt = _interface_specific_adaptors.get(module_name, Adaptor)
181 self.adaptor = Adapt(
182 conn, DBUtils.get_dbutils(module_name), wrap_cursor=wrap_cursor
183 )
184 self.module_name = module_name
185
186 def __repr__(self):
187 """Return a short description of the class name and database connection."""
188 return f"{self.__class__.__name__}({self.adaptor.conn!r})"
189
190 def __getitem__(self, name):
191 """Return a BioSeqDatabase object.
192
193 Arguments:
194 - name - The name of the BioSeqDatabase
195
196 """
197 return BioSeqDatabase(self.adaptor, name)
198
199 def __len__(self):
200 """Return number of namespaces (sub-databases) in this database."""
201 sql = "SELECT COUNT(name) FROM biodatabase;"
202 return int(self.adaptor.execute_and_fetch_col0(sql)[0])
203
204 def __contains__(self, value):
205 """Check if a namespace (sub-database) in this database."""
206 sql = "SELECT COUNT(name) FROM biodatabase WHERE name=%s;"
207 return bool(self.adaptor.execute_and_fetch_col0(sql, (value,))[0])
208
209 def __iter__(self):
210 """Iterate over namespaces (sub-databases) in the database."""
211 # TODO - Iterate over the cursor, much more efficient
212 return iter(self.adaptor.list_biodatabase_names())
213
214 def keys(self):
215 """Iterate over namespaces (sub-databases) in the database."""
216 return iter(self)
217
218 def values(self):
219 """Iterate over BioSeqDatabase objects in the database."""
220 for key in self:
221 yield self[key]
222
223 def items(self):
224 """Iterate over (namespace, BioSeqDatabase) in the database."""
225 for key in self:
226 yield key, self[key]
227
228 def __delitem__(self, name):
229 """Remove a namespace and all its entries."""
230 if name not in self:
231 raise KeyError(name)
232 db_id = self.adaptor.fetch_dbid_by_dbname(name)
233 remover = Loader.DatabaseRemover(self.adaptor, db_id)
234 remover.remove()
235
236 def new_database(self, db_name, authority=None, description=None):
237 """Add a new database to the server and return it."""
238 # make the database
239 sql = (
240 "INSERT INTO biodatabase (name, authority, description)"
241 " VALUES (%s, %s, %s)"
242 )
243 self.adaptor.execute(sql, (db_name, authority, description))
244 return BioSeqDatabase(self.adaptor, db_name)
245
246 def load_database_sql(self, sql_file):
247 """Load a database schema into the given database.
248
249 This is used to create tables, etc when a database is first created.
250 sql_file should specify the complete path to a file containing
251 SQL entries for building the tables.
252 """
253 # Not sophisticated enough for PG schema. Is it needed by MySQL?
254 # Looks like we need this more complicated way for both. Leaving it
255 # the default and removing the simple-minded approach.
256
257 # read the file with all comment lines removed
258 sql = ""
259 with open(sql_file) as sql_handle:
260 for line in sql_handle:
261 if line.startswith("--"): # don't include comment lines
262 pass
263 elif line.startswith("#"): # ditto for MySQL comments
264 pass
265 elif line.strip(): # only include non-blank lines
266 sql += line.strip() + " "
267
268 # two ways to load the SQL
269 # 1. PostgreSQL can load it all at once and actually needs to
270 # due to FUNCTION defines at the end of the SQL which mess up
271 # the splitting by semicolons
272 if self.module_name in ["psycopg2", "pgdb"]:
273 self.adaptor.cursor.execute(sql)
274 # 2. MySQL needs the database loading split up into single lines of
275 # SQL executed one at a time
276 elif self.module_name in ["mysql.connector", "MySQLdb", "sqlite3"]:
277 sql_parts = sql.split(";") # one line per sql command
278 # don't use the last item, it's blank
279 for sql_line in sql_parts[:-1]:
280 self.adaptor.cursor.execute(sql_line)
281 else:
282 raise ValueError(f"Module {self.module_name} not supported by the loader.")
283
284 def commit(self):
285 """Commit the current transaction to the database."""
286 return self.adaptor.commit()
287
288 def rollback(self):
289 """Roll-back the current transaction."""
290 return self.adaptor.rollback()
291
292 def close(self):
293 """Close the connection. No further activity possible."""
294 return self.adaptor.close()
295
296
297 class _CursorWrapper:
298 """A wrapper for mysql.connector resolving bytestring representations."""
299
300 def __init__(self, real_cursor):
301 self.real_cursor = real_cursor
302
303 def execute(self, operation, params=None, multi=False):
304 """Execute a sql statement."""
305 self.real_cursor.execute(operation, params, multi)
306
307 def executemany(self, operation, params):
308 """Execute many sql statements."""
309 self.real_cursor.executemany(operation, params)
310
311 def _convert_tuple(self, tuple_):
312 """Decode any bytestrings present in the row (PRIVATE)."""
313 tuple_list = list(tuple_)
314 for i, elem in enumerate(tuple_list):
315 if isinstance(elem, bytes):
316 tuple_list[i] = elem.decode("utf-8")
317 return tuple(tuple_list)
318
319 def _convert_list(self, lst):
320 ret_lst = []
321 for tuple_ in lst:
322 new_tuple = self._convert_tuple(tuple_)
323 ret_lst.append(new_tuple)
324 return ret_lst
325
326 def fetchall(self):
327 rv = self.real_cursor.fetchall()
328 return self._convert_list(rv)
329
330 def fetchone(self):
331 tuple_ = self.real_cursor.fetchone()
332 return self._convert_tuple(tuple_)
333
334
335 class Adaptor:
336 """High level wrapper for a database connection and cursor.
337
338 Most database calls in BioSQL are done indirectly though this adaptor
339 class. This provides helper methods for fetching data and executing
340 sql.
341 """
342
343 def __init__(self, conn, dbutils, wrap_cursor=False):
344 """Create an Adaptor object.
345
346 Arguments:
347 - conn - A database connection
348 - dbutils - A BioSQL.DBUtils object
349 - wrap_cursor - Optional, whether to wrap the cursor object
350
351 """
352 self.conn = conn
353 if wrap_cursor:
354 self.cursor = _CursorWrapper(conn.cursor())
355 else:
356 self.cursor = conn.cursor()
357 self.dbutils = dbutils
358
359 def last_id(self, table):
360 """Return the last row id for the selected table."""
361 return self.dbutils.last_id(self.cursor, table)
362
363 def autocommit(self, y=True):
364 """Set the autocommit mode. True values enable; False value disable."""
365 return self.dbutils.autocommit(self.conn, y)
366
367 def commit(self):
368 """Commit the current transaction."""
369 return self.conn.commit()
370
371 def rollback(self):
372 """Roll-back the current transaction."""
373 return self.conn.rollback()
374
375 def close(self):
376 """Close the connection. No further activity possible."""
377 return self.conn.close()
378
379 def fetch_dbid_by_dbname(self, dbname):
380 """Return the internal id for the sub-database using its name."""
381 self.execute(
382 "select biodatabase_id from biodatabase where name = %s", (dbname,)
383 )
384 rv = self.cursor.fetchall()
385 if not rv:
386 raise KeyError(f"Cannot find biodatabase with name {dbname!r}")
387 return rv[0][0]
388
389 def fetch_seqid_by_display_id(self, dbid, name):
390 """Return the internal id for a sequence using its display id.
391
392 Arguments:
393 - dbid - the internal id for the sub-database
394 - name - the name of the sequence. Corresponds to the
395 name column of the bioentry table of the SQL schema
396
397 """
398 sql = "select bioentry_id from bioentry where name = %s"
399 fields = [name]
400 if dbid:
401 sql += " and biodatabase_id = %s"
402 fields.append(dbid)
403 self.execute(sql, fields)
404 rv = self.cursor.fetchall()
405 if not rv:
406 raise IndexError(f"Cannot find display id {name!r}")
407 if len(rv) > 1:
408 raise IndexError(f"More than one entry with display id {name!r}")
409 return rv[0][0]
410
411 def fetch_seqid_by_accession(self, dbid, name):
412 """Return the internal id for a sequence using its accession.
413
414 Arguments:
415 - dbid - the internal id for the sub-database
416 - name - the accession of the sequence. Corresponds to the
417 accession column of the bioentry table of the SQL schema
418
419 """
420 sql = "select bioentry_id from bioentry where accession = %s"
421 fields = [name]
422 if dbid:
423 sql += " and biodatabase_id = %s"
424 fields.append(dbid)
425 self.execute(sql, fields)
426 rv = self.cursor.fetchall()
427 if not rv:
428 raise IndexError(f"Cannot find accession {name!r}")
429 if len(rv) > 1:
430 raise IndexError(f"More than one entry with accession {name!r}")
431 return rv[0][0]
432
433 def fetch_seqids_by_accession(self, dbid, name):
434 """Return a list internal ids using an accession.
435
436 Arguments:
437 - dbid - the internal id for the sub-database
438 - name - the accession of the sequence. Corresponds to the
439 accession column of the bioentry table of the SQL schema
440
441 """
442 sql = "select bioentry_id from bioentry where accession = %s"
443 fields = [name]
444 if dbid:
445 sql += " and biodatabase_id = %s"
446 fields.append(dbid)
447 return self.execute_and_fetch_col0(sql, fields)
448
449 def fetch_seqid_by_version(self, dbid, name):
450 """Return the internal id for a sequence using its accession and version.
451
452 Arguments:
453 - dbid - the internal id for the sub-database
454 - name - the accession of the sequence containing a version number.
455 Must correspond to <accession>.<version>
456
457 """
458 acc_version = name.split(".")
459 if len(acc_version) > 2:
460 raise IndexError(f"Bad version {name!r}")
461 acc = acc_version[0]
462 if len(acc_version) == 2:
463 version = acc_version[1]
464 else:
465 version = "0"
466 sql = "SELECT bioentry_id FROM bioentry WHERE accession = %s AND version = %s"
467 fields = [acc, version]
468 if dbid:
469 sql += " and biodatabase_id = %s"
470 fields.append(dbid)
471 self.execute(sql, fields)
472 rv = self.cursor.fetchall()
473 if not rv:
474 raise IndexError(f"Cannot find version {name!r}")
475 if len(rv) > 1:
476 raise IndexError(f"More than one entry with version {name!r}")
477 return rv[0][0]
478
479 def fetch_seqid_by_identifier(self, dbid, identifier):
480 """Return the internal id for a sequence using its identifier.
481
482 Arguments:
483 - dbid - the internal id for the sub-database
484 - identifier - the identifier of the sequence. Corresponds to
485 the identifier column of the bioentry table in the SQL schema.
486
487 """
488 # YB: was fetch_seqid_by_seqid
489 sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
490 fields = [identifier]
491 if dbid:
492 sql += " and biodatabase_id = %s"
493 fields.append(dbid)
494 self.execute(sql, fields)
495 rv = self.cursor.fetchall()
496 if not rv:
497 raise IndexError(f"Cannot find display id {identifier!r}")
498 return rv[0][0]
499
500 def list_biodatabase_names(self):
501 """Return a list of all of the sub-databases."""
502 return self.execute_and_fetch_col0("SELECT name FROM biodatabase")
503
504 def list_bioentry_ids(self, dbid):
505 """Return a list of internal ids for all of the sequences in a sub-databae.
506
507 Arguments:
508 - dbid - The internal id for a sub-database
509
510 """
511 return self.execute_and_fetch_col0(
512 "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s", (dbid,)
513 )
514
515 def list_bioentry_display_ids(self, dbid):
516 """Return a list of all sequence names in a sub-databae.
517
518 Arguments:
519 - dbid - The internal id for a sub-database
520
521 """
522 return self.execute_and_fetch_col0(
523 "SELECT name FROM bioentry WHERE biodatabase_id = %s", (dbid,)
524 )
525
526 def list_any_ids(self, sql, args):
527 """Return ids given a SQL statement to select for them.
528
529 This assumes that the given SQL does a SELECT statement that
530 returns a list of items. This parses them out of the 2D list
531 they come as and just returns them in a list.
532 """
533 return self.execute_and_fetch_col0(sql, args)
534
535 def execute_one(self, sql, args=None):
536 """Execute sql that returns 1 record, and return the record."""
537 self.execute(sql, args or ())
538 rv = self.cursor.fetchall()
539 if len(rv) != 1:
540 raise ValueError(f"Expected 1 response, got {len(rv)}.")
541 return rv[0]
542
543 def execute(self, sql, args=None):
544 """Just execute an sql command."""
545 if os.name == "java":
546 sql = sql.replace("%s", "?")
547 self.dbutils.execute(self.cursor, sql, args)
548
549 def executemany(self, sql, args):
550 """Execute many sql commands."""
551 if os.name == "java":
552 sql = sql.replace("%s", "?")
553 self.dbutils.executemany(self.cursor, sql, args)
554
555 def get_subseq_as_string(self, seqid, start, end):
556 """Return a substring of a sequence.
557
558 Arguments:
559 - seqid - The internal id for the sequence
560 - start - The start position of the sequence; 0-indexed
561 - end - The end position of the sequence
562
563 """
564 length = end - start
565 # XXX Check this on MySQL and PostgreSQL. substr should be general,
566 # does it need dbutils?
567 # return self.execute_one(
568 # """select SUBSTRING(seq FROM %s FOR %s)
569 # from biosequence where bioentry_id = %s""",
570 # (start+1, length, seqid))[0]
571 return self.execute_one(
572 "SELECT SUBSTR(seq, %s, %s) FROM biosequence WHERE bioentry_id = %s",
573 (start + 1, length, seqid),
574 )[0]
575
576 def execute_and_fetch_col0(self, sql, args=None):
577 """Return a list of values from the first column in the row."""
578 self.execute(sql, args or ())
579 return [field[0] for field in self.cursor.fetchall()]
580
581 def execute_and_fetchall(self, sql, args=None):
582 """Return a list of tuples of all rows."""
583 self.execute(sql, args or ())
584 return self.cursor.fetchall()
585
586
587 class MysqlConnectorAdaptor(Adaptor):
588 """A BioSQL Adaptor class with fixes for the MySQL interface.
589
590 BioSQL was failing due to returns of bytearray objects from
591 the mysql-connector-python database connector. This adaptor
592 class scrubs returns of bytearrays and of byte strings converting
593 them to string objects instead. This adaptor class was made in
594 response to backwards incompatible changes added to
595 mysql-connector-python in release 2.0.0 of the package.
596 """
597
598 @staticmethod
599 def _bytearray_to_str(s):
600 """If s is bytes or bytearray, convert to a string (PRIVATE)."""
601 if isinstance(s, (bytes, bytearray)):
602 return s.decode()
603 return s
604
605 def execute_one(self, sql, args=None):
606 """Execute sql that returns 1 record, and return the record."""
607 out = super().execute_one(sql, args)
608 return tuple(self._bytearray_to_str(v) for v in out)
609
610 def execute_and_fetch_col0(self, sql, args=None):
611 """Return a list of values from the first column in the row."""
612 out = super().execute_and_fetch_col0(sql, args)
613 return [self._bytearray_to_str(column) for column in out]
614
615 def execute_and_fetchall(self, sql, args=None):
616 """Return a list of tuples of all rows."""
617 out = super().execute_and_fetchall(sql, args)
618 return [tuple(self._bytearray_to_str(v) for v in o) for o in out]
619
620
621 _interface_specific_adaptors = {
622 # If SQL interfaces require a specific adaptor, use this to map the adaptor
623 "mysql.connector": MysqlConnectorAdaptor,
624 "MySQLdb": MysqlConnectorAdaptor,
625 }
626
627 _allowed_lookups = {
628 # Lookup name / function name to get id, function to list all ids
629 "primary_id": "fetch_seqid_by_identifier",
630 "gi": "fetch_seqid_by_identifier",
631 "display_id": "fetch_seqid_by_display_id",
632 "name": "fetch_seqid_by_display_id",
633 "accession": "fetch_seqid_by_accession",
634 "version": "fetch_seqid_by_version",
635 }
636
637
638 class BioSeqDatabase:
639 """Represents a namespace (sub-database) within the BioSQL database.
640
641 i.e. One row in the biodatabase table, and all all rows in the bioentry
642 table associated with it.
643 """
644
645 def __init__(self, adaptor, name):
646 """Create a BioDatabase object.
647
648 Arguments:
649 - adaptor - A BioSQL.Adaptor object
650 - name - The name of the sub-database (namespace)
651
652 """
653 self.adaptor = adaptor
654 self.name = name
655 self.dbid = self.adaptor.fetch_dbid_by_dbname(name)
656
657 def __repr__(self):
658 """Return a short summary of the BioSeqDatabase."""
659 return f"BioSeqDatabase({self.adaptor!r}, {self.name!r})"
660
661 def get_Seq_by_id(self, name):
662 """Get a DBSeqRecord object by its name.
663
664 Example: seq_rec = db.get_Seq_by_id('ROA1_HUMAN')
665
666 The name of this method is misleading since it returns a DBSeqRecord
667 rather than a Seq object, and presumably was to mirror BioPerl.
668 """
669 seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
670 return BioSeq.DBSeqRecord(self.adaptor, seqid)
671
672 def get_Seq_by_acc(self, name):
673 """Get a DBSeqRecord object by accession number.
674
675 Example: seq_rec = db.get_Seq_by_acc('X77802')
676
677 The name of this method is misleading since it returns a DBSeqRecord
678 rather than a Seq object, and presumably was to mirror BioPerl.
679 """
680 seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
681 return BioSeq.DBSeqRecord(self.adaptor, seqid)
682
683 def get_Seq_by_ver(self, name):
684 """Get a DBSeqRecord object by version number.
685
686 Example: seq_rec = db.get_Seq_by_ver('X77802.1')
687
688 The name of this method is misleading since it returns a DBSeqRecord
689 rather than a Seq object, and presumably was to mirror BioPerl.
690 """
691 seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
692 return BioSeq.DBSeqRecord(self.adaptor, seqid)
693
694 def get_Seqs_by_acc(self, name):
695 """Get a list of DBSeqRecord objects by accession number.
696
697 Example: seq_recs = db.get_Seq_by_acc('X77802')
698
699 The name of this method is misleading since it returns a list of
700 DBSeqRecord objects rather than a list of Seq objects, and presumably
701 was to mirror BioPerl.
702 """
703 seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
704 return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]
705
706 def __getitem__(self, key):
707 """Return a DBSeqRecord for one of the sequences in the sub-database.
708
709 Arguments:
710 - key - The internal id for the sequence
711
712 """
713 record = BioSeq.DBSeqRecord(self.adaptor, key)
714 if record._biodatabase_id != self.dbid:
715 raise KeyError(f"Entry {key!r} does exist, but not in current name space")
716 return record
717
718 def __delitem__(self, key):
719 """Remove an entry and all its annotation."""
720 if key not in self:
721 raise KeyError(
722 f"Entry {key!r} cannot be deleted. It was not found or is invalid"
723 )
724 # Assuming this will automatically cascade to the other tables...
725 sql = "DELETE FROM bioentry WHERE biodatabase_id=%s AND bioentry_id=%s;"
726 self.adaptor.execute(sql, (self.dbid, key))
727
728 def __len__(self):
729 """Return number of records in this namespace (sub database)."""
730 sql = "SELECT COUNT(bioentry_id) FROM bioentry WHERE biodatabase_id=%s;"
731 return int(self.adaptor.execute_and_fetch_col0(sql, (self.dbid,))[0])
732
733 def __contains__(self, value):
734 """Check if a primary (internal) id is this namespace (sub database)."""
735 sql = (
736 "SELECT COUNT(bioentry_id) FROM bioentry "
737 "WHERE biodatabase_id=%s AND bioentry_id=%s;"
738 )
739 # The bioentry_id field is an integer in the schema.
740 # PostgreSQL will throw an error if we use a non integer in the query.
741 try:
742 bioentry_id = int(value)
743 except ValueError:
744 return False
745 return bool(
746 self.adaptor.execute_and_fetch_col0(sql, (self.dbid, bioentry_id))[0]
747 )
748
749 def __iter__(self):
750 """Iterate over ids (which may not be meaningful outside this database)."""
751 # TODO - Iterate over the cursor, much more efficient
752 return iter(self.adaptor.list_bioentry_ids(self.dbid))
753
754 def keys(self):
755 """Iterate over ids (which may not be meaningful outside this database)."""
756 return iter(self)
757
758 def values(self):
759 """Iterate over DBSeqRecord objects in the namespace (sub database)."""
760 for key in self:
761 yield self[key]
762
763 def items(self):
764 """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
765 for key in self:
766 yield key, self[key]
767
768 def lookup(self, **kwargs):
769 """Return a DBSeqRecord using an acceptable identifier.
770
771 Arguments:
772 - kwargs - A single key-value pair where the key is one
773 of primary_id, gi, display_id, name, accession, version
774
775 """
776 if len(kwargs) != 1:
777 raise TypeError("single key/value parameter expected")
778 k, v = list(kwargs.items())[0]
779 if k not in _allowed_lookups:
780 raise TypeError(
781 f"lookup() expects one of {list(_allowed_lookups.keys())!r}, not {k!r}"
782 )
783 lookup_name = _allowed_lookups[k]
784 lookup_func = getattr(self.adaptor, lookup_name)
785 seqid = lookup_func(self.dbid, v)
786 return BioSeq.DBSeqRecord(self.adaptor, seqid)
787
788 def load(self, record_iterator, fetch_NCBI_taxonomy=False):
789 """Load a set of SeqRecords into the BioSQL database.
790
791 record_iterator is either a list of SeqRecord objects, or an
792 Iterator object that returns SeqRecord objects (such as the
793 output from the Bio.SeqIO.parse() function), which will be
794 used to populate the database.
795
796 fetch_NCBI_taxonomy is boolean flag allowing or preventing
797 connection to the taxonomic database on the NCBI server
798 (via Bio.Entrez) to fetch a detailed taxonomy for each
799 SeqRecord.
800
801 Example::
802
803 from Bio import SeqIO
804 count = db.load(SeqIO.parse(open(filename), format))
805
806 Returns the number of records loaded.
807 """
808 db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid, fetch_NCBI_taxonomy)
809 num_records = 0
810 global _POSTGRES_RULES_PRESENT
811 for cur_record in record_iterator:
812 num_records += 1
813 # Hack to work around BioSQL Bug 2839 - If using PostgreSQL and
814 # the RULES are present check for a duplicate record before loading
815 if _POSTGRES_RULES_PRESENT:
816 # Recreate what the Loader's _load_bioentry_table will do:
817 if cur_record.id.count(".") == 1:
818 accession, version = cur_record.id.split(".")
819 try:
820 version = int(version)
821 except ValueError:
822 accession = cur_record.id
823 version = 0
824 else:
825 accession = cur_record.id
826 version = 0
827 gi = cur_record.annotations.get("gi")
828 sql = (
829 "SELECT bioentry_id FROM bioentry "
830 "WHERE (identifier = '%s' AND biodatabase_id = '%s') "
831 "OR (accession = '%s' AND version = '%s' AND biodatabase_id = '%s')"
832 )
833 self.adaptor.execute(
834 sql % (gi, self.dbid, accession, version, self.dbid)
835 )
836 if self.adaptor.cursor.fetchone():
837 raise self.adaptor.conn.IntegrityError(
838 "Duplicate record detected: record has not been inserted"
839 )
840 # End of hack
841 db_loader.load_seqrecord(cur_record)
842 return num_records