Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/BioSeqDatabase.py @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 # Copyright 2002 by Andrew Dalke. All rights reserved. | |
2 # Revisions 2007-2016 copyright by Peter Cock. All rights reserved. | |
3 # Revisions 2009 copyright by Cymon J. Cox. All rights reserved. | |
4 # Revisions 2013-2014 copyright by Tiago Antao. All rights reserved. | |
5 # | |
6 # This file is part of the Biopython distribution and governed by your | |
7 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
8 # Please see the LICENSE file that should have been included as part of this | |
9 # package. | |
10 # | |
11 # Note that BioSQL (including the database schema and scripts) is | |
12 # available and licensed separately. Please consult www.biosql.org | |
13 """Connect with a BioSQL database and load Biopython like objects from it. | |
14 | |
15 This provides interfaces for loading biological objects from a relational | |
16 database, and is compatible with the BioSQL standards. | |
17 """ | |
18 import os | |
19 | |
20 from . import BioSeq | |
21 from . import Loader | |
22 from . import DBUtils | |
23 | |
24 | |
25 _POSTGRES_RULES_PRESENT = False # Hack for BioSQL Bug 2839 | |
26 | |
27 | |
28 def open_database(driver="MySQLdb", **kwargs): | |
29 """Load an existing BioSQL-style database. | |
30 | |
31 This function is the easiest way to retrieve a connection to a | |
32 database, doing something like:: | |
33 | |
34 from BioSQL import BioSeqDatabase | |
35 server = BioSeqDatabase.open_database(user="root", db="minidb") | |
36 | |
37 Arguments: | |
38 - driver - The name of the database driver to use for connecting. The | |
39 driver should implement the python DB API. By default, the MySQLdb | |
40 driver is used. | |
41 - user -the username to connect to the database with. | |
42 - password, passwd - the password to connect with | |
43 - host - the hostname of the database | |
44 - database or db - the name of the database | |
45 | |
46 """ | |
47 if driver == "psycopg": | |
48 raise ValueError( | |
49 "Using BioSQL with psycopg (version one) is no " | |
50 "longer supported. Use psycopg2 instead." | |
51 ) | |
52 | |
53 if os.name == "java": | |
54 from com.ziclix.python.sql import zxJDBC | |
55 | |
56 module = zxJDBC | |
57 if driver in ["MySQLdb"]: | |
58 jdbc_driver = "com.mysql.jdbc.Driver" | |
59 url_pref = "jdbc:mysql://" + kwargs["host"] + "/" | |
60 elif driver in ["psycopg2"]: | |
61 jdbc_driver = "org.postgresql.Driver" | |
62 url_pref = "jdbc:postgresql://" + kwargs["host"] + "/" | |
63 | |
64 else: | |
65 module = __import__(driver, fromlist=["connect"]) | |
66 connect = module.connect | |
67 | |
68 # Different drivers use different keywords... | |
69 kw = kwargs.copy() | |
70 if driver in ["MySQLdb", "mysql.connector"] and os.name != "java": | |
71 if "database" in kw: | |
72 kw["db"] = kw["database"] | |
73 del kw["database"] | |
74 if "password" in kw: | |
75 kw["passwd"] = kw["password"] | |
76 del kw["password"] | |
77 # kw["charset"] = "utf8" | |
78 # kw["use_unicode"] = True | |
79 else: | |
80 # DB-API recommendations | |
81 if "db" in kw: | |
82 kw["database"] = kw["db"] | |
83 del kw["db"] | |
84 if "passwd" in kw: | |
85 kw["password"] = kw["passwd"] | |
86 del kw["passwd"] | |
87 if driver in ["psycopg2", "pgdb"] and not kw.get("database"): | |
88 kw["database"] = "template1" | |
89 # SQLite connect takes the database name as input | |
90 if os.name == "java": | |
91 if driver in ["MySQLdb"]: | |
92 conn = connect( | |
93 url_pref + kw.get("database", "mysql"), | |
94 kw["user"], | |
95 kw["password"], | |
96 jdbc_driver, | |
97 ) | |
98 elif driver in ["psycopg2"]: | |
99 conn = connect( | |
100 url_pref + kw.get("database", "postgresql") + "?stringtype=unspecified", | |
101 kw["user"], | |
102 kw["password"], | |
103 jdbc_driver, | |
104 ) | |
105 elif driver in ["sqlite3"]: | |
106 conn = connect(kw["database"]) | |
107 else: | |
108 conn = connect(**kw) | |
109 | |
110 if os.name == "java": | |
111 server = DBServer(conn, module, driver) | |
112 else: | |
113 server = DBServer(conn, module) | |
114 | |
115 # Sets MySQL to allow double quotes, rather than only backticks | |
116 if driver in ["MySQLdb", "mysql.connector"]: | |
117 server.adaptor.execute("SET sql_mode='ANSI_QUOTES';") | |
118 | |
119 # TODO - Remove the following once BioSQL Bug 2839 is fixed. | |
120 # Test for RULES in PostgreSQL schema, see also Bug 2833. | |
121 if driver in ["psycopg2", "pgdb"]: | |
122 sql = ( | |
123 "SELECT ev_class FROM pg_rewrite WHERE " | |
124 "rulename='rule_bioentry_i1' OR " | |
125 "rulename='rule_bioentry_i2';" | |
126 ) | |
127 if server.adaptor.execute_and_fetchall(sql): | |
128 import warnings | |
129 from Bio import BiopythonWarning | |
130 | |
131 warnings.warn( | |
132 "Your BioSQL PostgreSQL schema includes some rules " | |
133 "currently required for bioperl-db but which may" | |
134 "cause problems loading data using Biopython (see " | |
135 "BioSQL's RedMine Bug 2839 aka GitHub Issue 4 " | |
136 "https://github.com/biosql/biosql/issues/4). " | |
137 "If you do not use BioPerl, please remove these " | |
138 "rules. Biopython should cope with the rules " | |
139 "present, but with a performance penalty when " | |
140 "loading new records.", | |
141 BiopythonWarning, | |
142 ) | |
143 global _POSTGRES_RULES_PRESENT | |
144 _POSTGRES_RULES_PRESENT = True | |
145 | |
146 elif driver == "sqlite3": | |
147 # Tell SQLite that we want to use foreign keys | |
148 # https://www.sqlite.org/foreignkeys.html#fk_enable | |
149 server.adaptor.execute("PRAGMA foreign_keys = ON") | |
150 | |
151 return server | |
152 | |
153 | |
154 class DBServer: | |
155 """Represents a BioSQL database containing namespaces (sub-databases). | |
156 | |
157 This acts like a Python dictionary, giving access to each namespace | |
158 (defined by a row in the biodatabase table) as a BioSeqDatabase object. | |
159 """ | |
160 | |
161 def __init__(self, conn, module, module_name=None): | |
162 """Create a DBServer object. | |
163 | |
164 Arguments: | |
165 - conn - A database connection object | |
166 - module - The module used to create the database connection | |
167 - module_name - Optionally, the name of the module. Default: module.__name__ | |
168 | |
169 Normally you would not want to create a DBServer object yourself. | |
170 Instead use the open_database function, which returns an instance of DBServer. | |
171 """ | |
172 self.module = module | |
173 if module_name is None: | |
174 module_name = module.__name__ | |
175 if module_name == "mysql.connector": | |
176 wrap_cursor = True | |
177 else: | |
178 wrap_cursor = False | |
179 # Get module specific Adaptor or the base (general) Adaptor | |
180 Adapt = _interface_specific_adaptors.get(module_name, Adaptor) | |
181 self.adaptor = Adapt( | |
182 conn, DBUtils.get_dbutils(module_name), wrap_cursor=wrap_cursor | |
183 ) | |
184 self.module_name = module_name | |
185 | |
186 def __repr__(self): | |
187 """Return a short description of the class name and database connection.""" | |
188 return f"{self.__class__.__name__}({self.adaptor.conn!r})" | |
189 | |
190 def __getitem__(self, name): | |
191 """Return a BioSeqDatabase object. | |
192 | |
193 Arguments: | |
194 - name - The name of the BioSeqDatabase | |
195 | |
196 """ | |
197 return BioSeqDatabase(self.adaptor, name) | |
198 | |
199 def __len__(self): | |
200 """Return number of namespaces (sub-databases) in this database.""" | |
201 sql = "SELECT COUNT(name) FROM biodatabase;" | |
202 return int(self.adaptor.execute_and_fetch_col0(sql)[0]) | |
203 | |
204 def __contains__(self, value): | |
205 """Check if a namespace (sub-database) in this database.""" | |
206 sql = "SELECT COUNT(name) FROM biodatabase WHERE name=%s;" | |
207 return bool(self.adaptor.execute_and_fetch_col0(sql, (value,))[0]) | |
208 | |
209 def __iter__(self): | |
210 """Iterate over namespaces (sub-databases) in the database.""" | |
211 # TODO - Iterate over the cursor, much more efficient | |
212 return iter(self.adaptor.list_biodatabase_names()) | |
213 | |
214 def keys(self): | |
215 """Iterate over namespaces (sub-databases) in the database.""" | |
216 return iter(self) | |
217 | |
218 def values(self): | |
219 """Iterate over BioSeqDatabase objects in the database.""" | |
220 for key in self: | |
221 yield self[key] | |
222 | |
223 def items(self): | |
224 """Iterate over (namespace, BioSeqDatabase) in the database.""" | |
225 for key in self: | |
226 yield key, self[key] | |
227 | |
228 def __delitem__(self, name): | |
229 """Remove a namespace and all its entries.""" | |
230 if name not in self: | |
231 raise KeyError(name) | |
232 db_id = self.adaptor.fetch_dbid_by_dbname(name) | |
233 remover = Loader.DatabaseRemover(self.adaptor, db_id) | |
234 remover.remove() | |
235 | |
236 def new_database(self, db_name, authority=None, description=None): | |
237 """Add a new database to the server and return it.""" | |
238 # make the database | |
239 sql = ( | |
240 "INSERT INTO biodatabase (name, authority, description)" | |
241 " VALUES (%s, %s, %s)" | |
242 ) | |
243 self.adaptor.execute(sql, (db_name, authority, description)) | |
244 return BioSeqDatabase(self.adaptor, db_name) | |
245 | |
246 def load_database_sql(self, sql_file): | |
247 """Load a database schema into the given database. | |
248 | |
249 This is used to create tables, etc when a database is first created. | |
250 sql_file should specify the complete path to a file containing | |
251 SQL entries for building the tables. | |
252 """ | |
253 # Not sophisticated enough for PG schema. Is it needed by MySQL? | |
254 # Looks like we need this more complicated way for both. Leaving it | |
255 # the default and removing the simple-minded approach. | |
256 | |
257 # read the file with all comment lines removed | |
258 sql = "" | |
259 with open(sql_file) as sql_handle: | |
260 for line in sql_handle: | |
261 if line.startswith("--"): # don't include comment lines | |
262 pass | |
263 elif line.startswith("#"): # ditto for MySQL comments | |
264 pass | |
265 elif line.strip(): # only include non-blank lines | |
266 sql += line.strip() + " " | |
267 | |
268 # two ways to load the SQL | |
269 # 1. PostgreSQL can load it all at once and actually needs to | |
270 # due to FUNCTION defines at the end of the SQL which mess up | |
271 # the splitting by semicolons | |
272 if self.module_name in ["psycopg2", "pgdb"]: | |
273 self.adaptor.cursor.execute(sql) | |
274 # 2. MySQL needs the database loading split up into single lines of | |
275 # SQL executed one at a time | |
276 elif self.module_name in ["mysql.connector", "MySQLdb", "sqlite3"]: | |
277 sql_parts = sql.split(";") # one line per sql command | |
278 # don't use the last item, it's blank | |
279 for sql_line in sql_parts[:-1]: | |
280 self.adaptor.cursor.execute(sql_line) | |
281 else: | |
282 raise ValueError(f"Module {self.module_name} not supported by the loader.") | |
283 | |
284 def commit(self): | |
285 """Commit the current transaction to the database.""" | |
286 return self.adaptor.commit() | |
287 | |
288 def rollback(self): | |
289 """Roll-back the current transaction.""" | |
290 return self.adaptor.rollback() | |
291 | |
292 def close(self): | |
293 """Close the connection. No further activity possible.""" | |
294 return self.adaptor.close() | |
295 | |
296 | |
297 class _CursorWrapper: | |
298 """A wrapper for mysql.connector resolving bytestring representations.""" | |
299 | |
300 def __init__(self, real_cursor): | |
301 self.real_cursor = real_cursor | |
302 | |
303 def execute(self, operation, params=None, multi=False): | |
304 """Execute a sql statement.""" | |
305 self.real_cursor.execute(operation, params, multi) | |
306 | |
307 def executemany(self, operation, params): | |
308 """Execute many sql statements.""" | |
309 self.real_cursor.executemany(operation, params) | |
310 | |
311 def _convert_tuple(self, tuple_): | |
312 """Decode any bytestrings present in the row (PRIVATE).""" | |
313 tuple_list = list(tuple_) | |
314 for i, elem in enumerate(tuple_list): | |
315 if isinstance(elem, bytes): | |
316 tuple_list[i] = elem.decode("utf-8") | |
317 return tuple(tuple_list) | |
318 | |
319 def _convert_list(self, lst): | |
320 ret_lst = [] | |
321 for tuple_ in lst: | |
322 new_tuple = self._convert_tuple(tuple_) | |
323 ret_lst.append(new_tuple) | |
324 return ret_lst | |
325 | |
326 def fetchall(self): | |
327 rv = self.real_cursor.fetchall() | |
328 return self._convert_list(rv) | |
329 | |
330 def fetchone(self): | |
331 tuple_ = self.real_cursor.fetchone() | |
332 return self._convert_tuple(tuple_) | |
333 | |
334 | |
335 class Adaptor: | |
336 """High level wrapper for a database connection and cursor. | |
337 | |
338 Most database calls in BioSQL are done indirectly though this adaptor | |
339 class. This provides helper methods for fetching data and executing | |
340 sql. | |
341 """ | |
342 | |
343 def __init__(self, conn, dbutils, wrap_cursor=False): | |
344 """Create an Adaptor object. | |
345 | |
346 Arguments: | |
347 - conn - A database connection | |
348 - dbutils - A BioSQL.DBUtils object | |
349 - wrap_cursor - Optional, whether to wrap the cursor object | |
350 | |
351 """ | |
352 self.conn = conn | |
353 if wrap_cursor: | |
354 self.cursor = _CursorWrapper(conn.cursor()) | |
355 else: | |
356 self.cursor = conn.cursor() | |
357 self.dbutils = dbutils | |
358 | |
359 def last_id(self, table): | |
360 """Return the last row id for the selected table.""" | |
361 return self.dbutils.last_id(self.cursor, table) | |
362 | |
363 def autocommit(self, y=True): | |
364 """Set the autocommit mode. True values enable; False value disable.""" | |
365 return self.dbutils.autocommit(self.conn, y) | |
366 | |
367 def commit(self): | |
368 """Commit the current transaction.""" | |
369 return self.conn.commit() | |
370 | |
371 def rollback(self): | |
372 """Roll-back the current transaction.""" | |
373 return self.conn.rollback() | |
374 | |
375 def close(self): | |
376 """Close the connection. No further activity possible.""" | |
377 return self.conn.close() | |
378 | |
379 def fetch_dbid_by_dbname(self, dbname): | |
380 """Return the internal id for the sub-database using its name.""" | |
381 self.execute( | |
382 "select biodatabase_id from biodatabase where name = %s", (dbname,) | |
383 ) | |
384 rv = self.cursor.fetchall() | |
385 if not rv: | |
386 raise KeyError(f"Cannot find biodatabase with name {dbname!r}") | |
387 return rv[0][0] | |
388 | |
389 def fetch_seqid_by_display_id(self, dbid, name): | |
390 """Return the internal id for a sequence using its display id. | |
391 | |
392 Arguments: | |
393 - dbid - the internal id for the sub-database | |
394 - name - the name of the sequence. Corresponds to the | |
395 name column of the bioentry table of the SQL schema | |
396 | |
397 """ | |
398 sql = "select bioentry_id from bioentry where name = %s" | |
399 fields = [name] | |
400 if dbid: | |
401 sql += " and biodatabase_id = %s" | |
402 fields.append(dbid) | |
403 self.execute(sql, fields) | |
404 rv = self.cursor.fetchall() | |
405 if not rv: | |
406 raise IndexError(f"Cannot find display id {name!r}") | |
407 if len(rv) > 1: | |
408 raise IndexError(f"More than one entry with display id {name!r}") | |
409 return rv[0][0] | |
410 | |
411 def fetch_seqid_by_accession(self, dbid, name): | |
412 """Return the internal id for a sequence using its accession. | |
413 | |
414 Arguments: | |
415 - dbid - the internal id for the sub-database | |
416 - name - the accession of the sequence. Corresponds to the | |
417 accession column of the bioentry table of the SQL schema | |
418 | |
419 """ | |
420 sql = "select bioentry_id from bioentry where accession = %s" | |
421 fields = [name] | |
422 if dbid: | |
423 sql += " and biodatabase_id = %s" | |
424 fields.append(dbid) | |
425 self.execute(sql, fields) | |
426 rv = self.cursor.fetchall() | |
427 if not rv: | |
428 raise IndexError(f"Cannot find accession {name!r}") | |
429 if len(rv) > 1: | |
430 raise IndexError(f"More than one entry with accession {name!r}") | |
431 return rv[0][0] | |
432 | |
433 def fetch_seqids_by_accession(self, dbid, name): | |
434 """Return a list internal ids using an accession. | |
435 | |
436 Arguments: | |
437 - dbid - the internal id for the sub-database | |
438 - name - the accession of the sequence. Corresponds to the | |
439 accession column of the bioentry table of the SQL schema | |
440 | |
441 """ | |
442 sql = "select bioentry_id from bioentry where accession = %s" | |
443 fields = [name] | |
444 if dbid: | |
445 sql += " and biodatabase_id = %s" | |
446 fields.append(dbid) | |
447 return self.execute_and_fetch_col0(sql, fields) | |
448 | |
449 def fetch_seqid_by_version(self, dbid, name): | |
450 """Return the internal id for a sequence using its accession and version. | |
451 | |
452 Arguments: | |
453 - dbid - the internal id for the sub-database | |
454 - name - the accession of the sequence containing a version number. | |
455 Must correspond to <accession>.<version> | |
456 | |
457 """ | |
458 acc_version = name.split(".") | |
459 if len(acc_version) > 2: | |
460 raise IndexError(f"Bad version {name!r}") | |
461 acc = acc_version[0] | |
462 if len(acc_version) == 2: | |
463 version = acc_version[1] | |
464 else: | |
465 version = "0" | |
466 sql = "SELECT bioentry_id FROM bioentry WHERE accession = %s AND version = %s" | |
467 fields = [acc, version] | |
468 if dbid: | |
469 sql += " and biodatabase_id = %s" | |
470 fields.append(dbid) | |
471 self.execute(sql, fields) | |
472 rv = self.cursor.fetchall() | |
473 if not rv: | |
474 raise IndexError(f"Cannot find version {name!r}") | |
475 if len(rv) > 1: | |
476 raise IndexError(f"More than one entry with version {name!r}") | |
477 return rv[0][0] | |
478 | |
479 def fetch_seqid_by_identifier(self, dbid, identifier): | |
480 """Return the internal id for a sequence using its identifier. | |
481 | |
482 Arguments: | |
483 - dbid - the internal id for the sub-database | |
484 - identifier - the identifier of the sequence. Corresponds to | |
485 the identifier column of the bioentry table in the SQL schema. | |
486 | |
487 """ | |
488 # YB: was fetch_seqid_by_seqid | |
489 sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s" | |
490 fields = [identifier] | |
491 if dbid: | |
492 sql += " and biodatabase_id = %s" | |
493 fields.append(dbid) | |
494 self.execute(sql, fields) | |
495 rv = self.cursor.fetchall() | |
496 if not rv: | |
497 raise IndexError(f"Cannot find display id {identifier!r}") | |
498 return rv[0][0] | |
499 | |
500 def list_biodatabase_names(self): | |
501 """Return a list of all of the sub-databases.""" | |
502 return self.execute_and_fetch_col0("SELECT name FROM biodatabase") | |
503 | |
504 def list_bioentry_ids(self, dbid): | |
505 """Return a list of internal ids for all of the sequences in a sub-databae. | |
506 | |
507 Arguments: | |
508 - dbid - The internal id for a sub-database | |
509 | |
510 """ | |
511 return self.execute_and_fetch_col0( | |
512 "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s", (dbid,) | |
513 ) | |
514 | |
515 def list_bioentry_display_ids(self, dbid): | |
516 """Return a list of all sequence names in a sub-databae. | |
517 | |
518 Arguments: | |
519 - dbid - The internal id for a sub-database | |
520 | |
521 """ | |
522 return self.execute_and_fetch_col0( | |
523 "SELECT name FROM bioentry WHERE biodatabase_id = %s", (dbid,) | |
524 ) | |
525 | |
526 def list_any_ids(self, sql, args): | |
527 """Return ids given a SQL statement to select for them. | |
528 | |
529 This assumes that the given SQL does a SELECT statement that | |
530 returns a list of items. This parses them out of the 2D list | |
531 they come as and just returns them in a list. | |
532 """ | |
533 return self.execute_and_fetch_col0(sql, args) | |
534 | |
535 def execute_one(self, sql, args=None): | |
536 """Execute sql that returns 1 record, and return the record.""" | |
537 self.execute(sql, args or ()) | |
538 rv = self.cursor.fetchall() | |
539 if len(rv) != 1: | |
540 raise ValueError(f"Expected 1 response, got {len(rv)}.") | |
541 return rv[0] | |
542 | |
543 def execute(self, sql, args=None): | |
544 """Just execute an sql command.""" | |
545 if os.name == "java": | |
546 sql = sql.replace("%s", "?") | |
547 self.dbutils.execute(self.cursor, sql, args) | |
548 | |
549 def executemany(self, sql, args): | |
550 """Execute many sql commands.""" | |
551 if os.name == "java": | |
552 sql = sql.replace("%s", "?") | |
553 self.dbutils.executemany(self.cursor, sql, args) | |
554 | |
555 def get_subseq_as_string(self, seqid, start, end): | |
556 """Return a substring of a sequence. | |
557 | |
558 Arguments: | |
559 - seqid - The internal id for the sequence | |
560 - start - The start position of the sequence; 0-indexed | |
561 - end - The end position of the sequence | |
562 | |
563 """ | |
564 length = end - start | |
565 # XXX Check this on MySQL and PostgreSQL. substr should be general, | |
566 # does it need dbutils? | |
567 # return self.execute_one( | |
568 # """select SUBSTRING(seq FROM %s FOR %s) | |
569 # from biosequence where bioentry_id = %s""", | |
570 # (start+1, length, seqid))[0] | |
571 return self.execute_one( | |
572 "SELECT SUBSTR(seq, %s, %s) FROM biosequence WHERE bioentry_id = %s", | |
573 (start + 1, length, seqid), | |
574 )[0] | |
575 | |
576 def execute_and_fetch_col0(self, sql, args=None): | |
577 """Return a list of values from the first column in the row.""" | |
578 self.execute(sql, args or ()) | |
579 return [field[0] for field in self.cursor.fetchall()] | |
580 | |
581 def execute_and_fetchall(self, sql, args=None): | |
582 """Return a list of tuples of all rows.""" | |
583 self.execute(sql, args or ()) | |
584 return self.cursor.fetchall() | |
585 | |
586 | |
587 class MysqlConnectorAdaptor(Adaptor): | |
588 """A BioSQL Adaptor class with fixes for the MySQL interface. | |
589 | |
590 BioSQL was failing due to returns of bytearray objects from | |
591 the mysql-connector-python database connector. This adaptor | |
592 class scrubs returns of bytearrays and of byte strings converting | |
593 them to string objects instead. This adaptor class was made in | |
594 response to backwards incompatible changes added to | |
595 mysql-connector-python in release 2.0.0 of the package. | |
596 """ | |
597 | |
598 @staticmethod | |
599 def _bytearray_to_str(s): | |
600 """If s is bytes or bytearray, convert to a string (PRIVATE).""" | |
601 if isinstance(s, (bytes, bytearray)): | |
602 return s.decode() | |
603 return s | |
604 | |
605 def execute_one(self, sql, args=None): | |
606 """Execute sql that returns 1 record, and return the record.""" | |
607 out = super().execute_one(sql, args) | |
608 return tuple(self._bytearray_to_str(v) for v in out) | |
609 | |
610 def execute_and_fetch_col0(self, sql, args=None): | |
611 """Return a list of values from the first column in the row.""" | |
612 out = super().execute_and_fetch_col0(sql, args) | |
613 return [self._bytearray_to_str(column) for column in out] | |
614 | |
615 def execute_and_fetchall(self, sql, args=None): | |
616 """Return a list of tuples of all rows.""" | |
617 out = super().execute_and_fetchall(sql, args) | |
618 return [tuple(self._bytearray_to_str(v) for v in o) for o in out] | |
619 | |
620 | |
621 _interface_specific_adaptors = { | |
622 # If SQL interfaces require a specific adaptor, use this to map the adaptor | |
623 "mysql.connector": MysqlConnectorAdaptor, | |
624 "MySQLdb": MysqlConnectorAdaptor, | |
625 } | |
626 | |
627 _allowed_lookups = { | |
628 # Lookup name / function name to get id, function to list all ids | |
629 "primary_id": "fetch_seqid_by_identifier", | |
630 "gi": "fetch_seqid_by_identifier", | |
631 "display_id": "fetch_seqid_by_display_id", | |
632 "name": "fetch_seqid_by_display_id", | |
633 "accession": "fetch_seqid_by_accession", | |
634 "version": "fetch_seqid_by_version", | |
635 } | |
636 | |
637 | |
638 class BioSeqDatabase: | |
639 """Represents a namespace (sub-database) within the BioSQL database. | |
640 | |
641 i.e. One row in the biodatabase table, and all all rows in the bioentry | |
642 table associated with it. | |
643 """ | |
644 | |
645 def __init__(self, adaptor, name): | |
646 """Create a BioDatabase object. | |
647 | |
648 Arguments: | |
649 - adaptor - A BioSQL.Adaptor object | |
650 - name - The name of the sub-database (namespace) | |
651 | |
652 """ | |
653 self.adaptor = adaptor | |
654 self.name = name | |
655 self.dbid = self.adaptor.fetch_dbid_by_dbname(name) | |
656 | |
657 def __repr__(self): | |
658 """Return a short summary of the BioSeqDatabase.""" | |
659 return f"BioSeqDatabase({self.adaptor!r}, {self.name!r})" | |
660 | |
661 def get_Seq_by_id(self, name): | |
662 """Get a DBSeqRecord object by its name. | |
663 | |
664 Example: seq_rec = db.get_Seq_by_id('ROA1_HUMAN') | |
665 | |
666 The name of this method is misleading since it returns a DBSeqRecord | |
667 rather than a Seq object, and presumably was to mirror BioPerl. | |
668 """ | |
669 seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name) | |
670 return BioSeq.DBSeqRecord(self.adaptor, seqid) | |
671 | |
672 def get_Seq_by_acc(self, name): | |
673 """Get a DBSeqRecord object by accession number. | |
674 | |
675 Example: seq_rec = db.get_Seq_by_acc('X77802') | |
676 | |
677 The name of this method is misleading since it returns a DBSeqRecord | |
678 rather than a Seq object, and presumably was to mirror BioPerl. | |
679 """ | |
680 seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name) | |
681 return BioSeq.DBSeqRecord(self.adaptor, seqid) | |
682 | |
683 def get_Seq_by_ver(self, name): | |
684 """Get a DBSeqRecord object by version number. | |
685 | |
686 Example: seq_rec = db.get_Seq_by_ver('X77802.1') | |
687 | |
688 The name of this method is misleading since it returns a DBSeqRecord | |
689 rather than a Seq object, and presumably was to mirror BioPerl. | |
690 """ | |
691 seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name) | |
692 return BioSeq.DBSeqRecord(self.adaptor, seqid) | |
693 | |
694 def get_Seqs_by_acc(self, name): | |
695 """Get a list of DBSeqRecord objects by accession number. | |
696 | |
697 Example: seq_recs = db.get_Seq_by_acc('X77802') | |
698 | |
699 The name of this method is misleading since it returns a list of | |
700 DBSeqRecord objects rather than a list of Seq objects, and presumably | |
701 was to mirror BioPerl. | |
702 """ | |
703 seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name) | |
704 return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids] | |
705 | |
706 def __getitem__(self, key): | |
707 """Return a DBSeqRecord for one of the sequences in the sub-database. | |
708 | |
709 Arguments: | |
710 - key - The internal id for the sequence | |
711 | |
712 """ | |
713 record = BioSeq.DBSeqRecord(self.adaptor, key) | |
714 if record._biodatabase_id != self.dbid: | |
715 raise KeyError(f"Entry {key!r} does exist, but not in current name space") | |
716 return record | |
717 | |
718 def __delitem__(self, key): | |
719 """Remove an entry and all its annotation.""" | |
720 if key not in self: | |
721 raise KeyError( | |
722 f"Entry {key!r} cannot be deleted. It was not found or is invalid" | |
723 ) | |
724 # Assuming this will automatically cascade to the other tables... | |
725 sql = "DELETE FROM bioentry WHERE biodatabase_id=%s AND bioentry_id=%s;" | |
726 self.adaptor.execute(sql, (self.dbid, key)) | |
727 | |
728 def __len__(self): | |
729 """Return number of records in this namespace (sub database).""" | |
730 sql = "SELECT COUNT(bioentry_id) FROM bioentry WHERE biodatabase_id=%s;" | |
731 return int(self.adaptor.execute_and_fetch_col0(sql, (self.dbid,))[0]) | |
732 | |
733 def __contains__(self, value): | |
734 """Check if a primary (internal) id is this namespace (sub database).""" | |
735 sql = ( | |
736 "SELECT COUNT(bioentry_id) FROM bioentry " | |
737 "WHERE biodatabase_id=%s AND bioentry_id=%s;" | |
738 ) | |
739 # The bioentry_id field is an integer in the schema. | |
740 # PostgreSQL will throw an error if we use a non integer in the query. | |
741 try: | |
742 bioentry_id = int(value) | |
743 except ValueError: | |
744 return False | |
745 return bool( | |
746 self.adaptor.execute_and_fetch_col0(sql, (self.dbid, bioentry_id))[0] | |
747 ) | |
748 | |
749 def __iter__(self): | |
750 """Iterate over ids (which may not be meaningful outside this database).""" | |
751 # TODO - Iterate over the cursor, much more efficient | |
752 return iter(self.adaptor.list_bioentry_ids(self.dbid)) | |
753 | |
754 def keys(self): | |
755 """Iterate over ids (which may not be meaningful outside this database).""" | |
756 return iter(self) | |
757 | |
758 def values(self): | |
759 """Iterate over DBSeqRecord objects in the namespace (sub database).""" | |
760 for key in self: | |
761 yield self[key] | |
762 | |
763 def items(self): | |
764 """Iterate over (id, DBSeqRecord) for the namespace (sub database).""" | |
765 for key in self: | |
766 yield key, self[key] | |
767 | |
768 def lookup(self, **kwargs): | |
769 """Return a DBSeqRecord using an acceptable identifier. | |
770 | |
771 Arguments: | |
772 - kwargs - A single key-value pair where the key is one | |
773 of primary_id, gi, display_id, name, accession, version | |
774 | |
775 """ | |
776 if len(kwargs) != 1: | |
777 raise TypeError("single key/value parameter expected") | |
778 k, v = list(kwargs.items())[0] | |
779 if k not in _allowed_lookups: | |
780 raise TypeError( | |
781 f"lookup() expects one of {list(_allowed_lookups.keys())!r}, not {k!r}" | |
782 ) | |
783 lookup_name = _allowed_lookups[k] | |
784 lookup_func = getattr(self.adaptor, lookup_name) | |
785 seqid = lookup_func(self.dbid, v) | |
786 return BioSeq.DBSeqRecord(self.adaptor, seqid) | |
787 | |
788 def load(self, record_iterator, fetch_NCBI_taxonomy=False): | |
789 """Load a set of SeqRecords into the BioSQL database. | |
790 | |
791 record_iterator is either a list of SeqRecord objects, or an | |
792 Iterator object that returns SeqRecord objects (such as the | |
793 output from the Bio.SeqIO.parse() function), which will be | |
794 used to populate the database. | |
795 | |
796 fetch_NCBI_taxonomy is boolean flag allowing or preventing | |
797 connection to the taxonomic database on the NCBI server | |
798 (via Bio.Entrez) to fetch a detailed taxonomy for each | |
799 SeqRecord. | |
800 | |
801 Example:: | |
802 | |
803 from Bio import SeqIO | |
804 count = db.load(SeqIO.parse(open(filename), format)) | |
805 | |
806 Returns the number of records loaded. | |
807 """ | |
808 db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid, fetch_NCBI_taxonomy) | |
809 num_records = 0 | |
810 global _POSTGRES_RULES_PRESENT | |
811 for cur_record in record_iterator: | |
812 num_records += 1 | |
813 # Hack to work around BioSQL Bug 2839 - If using PostgreSQL and | |
814 # the RULES are present check for a duplicate record before loading | |
815 if _POSTGRES_RULES_PRESENT: | |
816 # Recreate what the Loader's _load_bioentry_table will do: | |
817 if cur_record.id.count(".") == 1: | |
818 accession, version = cur_record.id.split(".") | |
819 try: | |
820 version = int(version) | |
821 except ValueError: | |
822 accession = cur_record.id | |
823 version = 0 | |
824 else: | |
825 accession = cur_record.id | |
826 version = 0 | |
827 gi = cur_record.annotations.get("gi") | |
828 sql = ( | |
829 "SELECT bioentry_id FROM bioentry " | |
830 "WHERE (identifier = '%s' AND biodatabase_id = '%s') " | |
831 "OR (accession = '%s' AND version = '%s' AND biodatabase_id = '%s')" | |
832 ) | |
833 self.adaptor.execute( | |
834 sql % (gi, self.dbid, accession, version, self.dbid) | |
835 ) | |
836 if self.adaptor.cursor.fetchone(): | |
837 raise self.adaptor.conn.IntegrityError( | |
838 "Duplicate record detected: record has not been inserted" | |
839 ) | |
840 # End of hack | |
841 db_loader.load_seqrecord(cur_record) | |
842 return num_records |