jpayne@68
|
1 # Copyright 2002 by Andrew Dalke. All rights reserved.
|
jpayne@68
|
2 # Revisions 2007-2016 copyright by Peter Cock. All rights reserved.
|
jpayne@68
|
3 # Revisions 2009 copyright by Cymon J. Cox. All rights reserved.
|
jpayne@68
|
4 # Revisions 2013-2014 copyright by Tiago Antao. All rights reserved.
|
jpayne@68
|
5 #
|
jpayne@68
|
6 # This file is part of the Biopython distribution and governed by your
|
jpayne@68
|
7 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
jpayne@68
|
8 # Please see the LICENSE file that should have been included as part of this
|
jpayne@68
|
9 # package.
|
jpayne@68
|
10 #
|
jpayne@68
|
11 # Note that BioSQL (including the database schema and scripts) is
|
jpayne@68
|
12 # available and licensed separately. Please consult www.biosql.org
|
jpayne@68
|
13 """Connect with a BioSQL database and load Biopython like objects from it.
|
jpayne@68
|
14
|
jpayne@68
|
15 This provides interfaces for loading biological objects from a relational
|
jpayne@68
|
16 database, and is compatible with the BioSQL standards.
|
jpayne@68
|
17 """
|
jpayne@68
|
18 import os
|
jpayne@68
|
19
|
jpayne@68
|
20 from . import BioSeq
|
jpayne@68
|
21 from . import Loader
|
jpayne@68
|
22 from . import DBUtils
|
jpayne@68
|
23
|
jpayne@68
|
24
|
jpayne@68
|
25 _POSTGRES_RULES_PRESENT = False # Hack for BioSQL Bug 2839
|
jpayne@68
|
26
|
jpayne@68
|
27
|
jpayne@68
|
28 def open_database(driver="MySQLdb", **kwargs):
|
jpayne@68
|
29 """Load an existing BioSQL-style database.
|
jpayne@68
|
30
|
jpayne@68
|
31 This function is the easiest way to retrieve a connection to a
|
jpayne@68
|
32 database, doing something like::
|
jpayne@68
|
33
|
jpayne@68
|
34 from BioSQL import BioSeqDatabase
|
jpayne@68
|
35 server = BioSeqDatabase.open_database(user="root", db="minidb")
|
jpayne@68
|
36
|
jpayne@68
|
37 Arguments:
|
jpayne@68
|
38 - driver - The name of the database driver to use for connecting. The
|
jpayne@68
|
39 driver should implement the python DB API. By default, the MySQLdb
|
jpayne@68
|
40 driver is used.
|
jpayne@68
|
41 - user -the username to connect to the database with.
|
jpayne@68
|
42 - password, passwd - the password to connect with
|
jpayne@68
|
43 - host - the hostname of the database
|
jpayne@68
|
44 - database or db - the name of the database
|
jpayne@68
|
45
|
jpayne@68
|
46 """
|
jpayne@68
|
47 if driver == "psycopg":
|
jpayne@68
|
48 raise ValueError(
|
jpayne@68
|
49 "Using BioSQL with psycopg (version one) is no "
|
jpayne@68
|
50 "longer supported. Use psycopg2 instead."
|
jpayne@68
|
51 )
|
jpayne@68
|
52
|
jpayne@68
|
53 if os.name == "java":
|
jpayne@68
|
54 from com.ziclix.python.sql import zxJDBC
|
jpayne@68
|
55
|
jpayne@68
|
56 module = zxJDBC
|
jpayne@68
|
57 if driver in ["MySQLdb"]:
|
jpayne@68
|
58 jdbc_driver = "com.mysql.jdbc.Driver"
|
jpayne@68
|
59 url_pref = "jdbc:mysql://" + kwargs["host"] + "/"
|
jpayne@68
|
60 elif driver in ["psycopg2"]:
|
jpayne@68
|
61 jdbc_driver = "org.postgresql.Driver"
|
jpayne@68
|
62 url_pref = "jdbc:postgresql://" + kwargs["host"] + "/"
|
jpayne@68
|
63
|
jpayne@68
|
64 else:
|
jpayne@68
|
65 module = __import__(driver, fromlist=["connect"])
|
jpayne@68
|
66 connect = module.connect
|
jpayne@68
|
67
|
jpayne@68
|
68 # Different drivers use different keywords...
|
jpayne@68
|
69 kw = kwargs.copy()
|
jpayne@68
|
70 if driver in ["MySQLdb", "mysql.connector"] and os.name != "java":
|
jpayne@68
|
71 if "database" in kw:
|
jpayne@68
|
72 kw["db"] = kw["database"]
|
jpayne@68
|
73 del kw["database"]
|
jpayne@68
|
74 if "password" in kw:
|
jpayne@68
|
75 kw["passwd"] = kw["password"]
|
jpayne@68
|
76 del kw["password"]
|
jpayne@68
|
77 # kw["charset"] = "utf8"
|
jpayne@68
|
78 # kw["use_unicode"] = True
|
jpayne@68
|
79 else:
|
jpayne@68
|
80 # DB-API recommendations
|
jpayne@68
|
81 if "db" in kw:
|
jpayne@68
|
82 kw["database"] = kw["db"]
|
jpayne@68
|
83 del kw["db"]
|
jpayne@68
|
84 if "passwd" in kw:
|
jpayne@68
|
85 kw["password"] = kw["passwd"]
|
jpayne@68
|
86 del kw["passwd"]
|
jpayne@68
|
87 if driver in ["psycopg2", "pgdb"] and not kw.get("database"):
|
jpayne@68
|
88 kw["database"] = "template1"
|
jpayne@68
|
89 # SQLite connect takes the database name as input
|
jpayne@68
|
90 if os.name == "java":
|
jpayne@68
|
91 if driver in ["MySQLdb"]:
|
jpayne@68
|
92 conn = connect(
|
jpayne@68
|
93 url_pref + kw.get("database", "mysql"),
|
jpayne@68
|
94 kw["user"],
|
jpayne@68
|
95 kw["password"],
|
jpayne@68
|
96 jdbc_driver,
|
jpayne@68
|
97 )
|
jpayne@68
|
98 elif driver in ["psycopg2"]:
|
jpayne@68
|
99 conn = connect(
|
jpayne@68
|
100 url_pref + kw.get("database", "postgresql") + "?stringtype=unspecified",
|
jpayne@68
|
101 kw["user"],
|
jpayne@68
|
102 kw["password"],
|
jpayne@68
|
103 jdbc_driver,
|
jpayne@68
|
104 )
|
jpayne@68
|
105 elif driver in ["sqlite3"]:
|
jpayne@68
|
106 conn = connect(kw["database"])
|
jpayne@68
|
107 else:
|
jpayne@68
|
108 conn = connect(**kw)
|
jpayne@68
|
109
|
jpayne@68
|
110 if os.name == "java":
|
jpayne@68
|
111 server = DBServer(conn, module, driver)
|
jpayne@68
|
112 else:
|
jpayne@68
|
113 server = DBServer(conn, module)
|
jpayne@68
|
114
|
jpayne@68
|
115 # Sets MySQL to allow double quotes, rather than only backticks
|
jpayne@68
|
116 if driver in ["MySQLdb", "mysql.connector"]:
|
jpayne@68
|
117 server.adaptor.execute("SET sql_mode='ANSI_QUOTES';")
|
jpayne@68
|
118
|
jpayne@68
|
119 # TODO - Remove the following once BioSQL Bug 2839 is fixed.
|
jpayne@68
|
120 # Test for RULES in PostgreSQL schema, see also Bug 2833.
|
jpayne@68
|
121 if driver in ["psycopg2", "pgdb"]:
|
jpayne@68
|
122 sql = (
|
jpayne@68
|
123 "SELECT ev_class FROM pg_rewrite WHERE "
|
jpayne@68
|
124 "rulename='rule_bioentry_i1' OR "
|
jpayne@68
|
125 "rulename='rule_bioentry_i2';"
|
jpayne@68
|
126 )
|
jpayne@68
|
127 if server.adaptor.execute_and_fetchall(sql):
|
jpayne@68
|
128 import warnings
|
jpayne@68
|
129 from Bio import BiopythonWarning
|
jpayne@68
|
130
|
jpayne@68
|
131 warnings.warn(
|
jpayne@68
|
132 "Your BioSQL PostgreSQL schema includes some rules "
|
jpayne@68
|
133 "currently required for bioperl-db but which may"
|
jpayne@68
|
134 "cause problems loading data using Biopython (see "
|
jpayne@68
|
135 "BioSQL's RedMine Bug 2839 aka GitHub Issue 4 "
|
jpayne@68
|
136 "https://github.com/biosql/biosql/issues/4). "
|
jpayne@68
|
137 "If you do not use BioPerl, please remove these "
|
jpayne@68
|
138 "rules. Biopython should cope with the rules "
|
jpayne@68
|
139 "present, but with a performance penalty when "
|
jpayne@68
|
140 "loading new records.",
|
jpayne@68
|
141 BiopythonWarning,
|
jpayne@68
|
142 )
|
jpayne@68
|
143 global _POSTGRES_RULES_PRESENT
|
jpayne@68
|
144 _POSTGRES_RULES_PRESENT = True
|
jpayne@68
|
145
|
jpayne@68
|
146 elif driver == "sqlite3":
|
jpayne@68
|
147 # Tell SQLite that we want to use foreign keys
|
jpayne@68
|
148 # https://www.sqlite.org/foreignkeys.html#fk_enable
|
jpayne@68
|
149 server.adaptor.execute("PRAGMA foreign_keys = ON")
|
jpayne@68
|
150
|
jpayne@68
|
151 return server
|
jpayne@68
|
152
|
jpayne@68
|
153
|
jpayne@68
|
154 class DBServer:
|
jpayne@68
|
155 """Represents a BioSQL database containing namespaces (sub-databases).
|
jpayne@68
|
156
|
jpayne@68
|
157 This acts like a Python dictionary, giving access to each namespace
|
jpayne@68
|
158 (defined by a row in the biodatabase table) as a BioSeqDatabase object.
|
jpayne@68
|
159 """
|
jpayne@68
|
160
|
jpayne@68
|
161 def __init__(self, conn, module, module_name=None):
|
jpayne@68
|
162 """Create a DBServer object.
|
jpayne@68
|
163
|
jpayne@68
|
164 Arguments:
|
jpayne@68
|
165 - conn - A database connection object
|
jpayne@68
|
166 - module - The module used to create the database connection
|
jpayne@68
|
167 - module_name - Optionally, the name of the module. Default: module.__name__
|
jpayne@68
|
168
|
jpayne@68
|
169 Normally you would not want to create a DBServer object yourself.
|
jpayne@68
|
170 Instead use the open_database function, which returns an instance of DBServer.
|
jpayne@68
|
171 """
|
jpayne@68
|
172 self.module = module
|
jpayne@68
|
173 if module_name is None:
|
jpayne@68
|
174 module_name = module.__name__
|
jpayne@68
|
175 if module_name == "mysql.connector":
|
jpayne@68
|
176 wrap_cursor = True
|
jpayne@68
|
177 else:
|
jpayne@68
|
178 wrap_cursor = False
|
jpayne@68
|
179 # Get module specific Adaptor or the base (general) Adaptor
|
jpayne@68
|
180 Adapt = _interface_specific_adaptors.get(module_name, Adaptor)
|
jpayne@68
|
181 self.adaptor = Adapt(
|
jpayne@68
|
182 conn, DBUtils.get_dbutils(module_name), wrap_cursor=wrap_cursor
|
jpayne@68
|
183 )
|
jpayne@68
|
184 self.module_name = module_name
|
jpayne@68
|
185
|
jpayne@68
|
186 def __repr__(self):
|
jpayne@68
|
187 """Return a short description of the class name and database connection."""
|
jpayne@68
|
188 return f"{self.__class__.__name__}({self.adaptor.conn!r})"
|
jpayne@68
|
189
|
jpayne@68
|
190 def __getitem__(self, name):
|
jpayne@68
|
191 """Return a BioSeqDatabase object.
|
jpayne@68
|
192
|
jpayne@68
|
193 Arguments:
|
jpayne@68
|
194 - name - The name of the BioSeqDatabase
|
jpayne@68
|
195
|
jpayne@68
|
196 """
|
jpayne@68
|
197 return BioSeqDatabase(self.adaptor, name)
|
jpayne@68
|
198
|
jpayne@68
|
199 def __len__(self):
|
jpayne@68
|
200 """Return number of namespaces (sub-databases) in this database."""
|
jpayne@68
|
201 sql = "SELECT COUNT(name) FROM biodatabase;"
|
jpayne@68
|
202 return int(self.adaptor.execute_and_fetch_col0(sql)[0])
|
jpayne@68
|
203
|
jpayne@68
|
204 def __contains__(self, value):
|
jpayne@68
|
205 """Check if a namespace (sub-database) in this database."""
|
jpayne@68
|
206 sql = "SELECT COUNT(name) FROM biodatabase WHERE name=%s;"
|
jpayne@68
|
207 return bool(self.adaptor.execute_and_fetch_col0(sql, (value,))[0])
|
jpayne@68
|
208
|
jpayne@68
|
209 def __iter__(self):
|
jpayne@68
|
210 """Iterate over namespaces (sub-databases) in the database."""
|
jpayne@68
|
211 # TODO - Iterate over the cursor, much more efficient
|
jpayne@68
|
212 return iter(self.adaptor.list_biodatabase_names())
|
jpayne@68
|
213
|
jpayne@68
|
214 def keys(self):
|
jpayne@68
|
215 """Iterate over namespaces (sub-databases) in the database."""
|
jpayne@68
|
216 return iter(self)
|
jpayne@68
|
217
|
jpayne@68
|
218 def values(self):
|
jpayne@68
|
219 """Iterate over BioSeqDatabase objects in the database."""
|
jpayne@68
|
220 for key in self:
|
jpayne@68
|
221 yield self[key]
|
jpayne@68
|
222
|
jpayne@68
|
223 def items(self):
|
jpayne@68
|
224 """Iterate over (namespace, BioSeqDatabase) in the database."""
|
jpayne@68
|
225 for key in self:
|
jpayne@68
|
226 yield key, self[key]
|
jpayne@68
|
227
|
jpayne@68
|
228 def __delitem__(self, name):
|
jpayne@68
|
229 """Remove a namespace and all its entries."""
|
jpayne@68
|
230 if name not in self:
|
jpayne@68
|
231 raise KeyError(name)
|
jpayne@68
|
232 db_id = self.adaptor.fetch_dbid_by_dbname(name)
|
jpayne@68
|
233 remover = Loader.DatabaseRemover(self.adaptor, db_id)
|
jpayne@68
|
234 remover.remove()
|
jpayne@68
|
235
|
jpayne@68
|
236 def new_database(self, db_name, authority=None, description=None):
|
jpayne@68
|
237 """Add a new database to the server and return it."""
|
jpayne@68
|
238 # make the database
|
jpayne@68
|
239 sql = (
|
jpayne@68
|
240 "INSERT INTO biodatabase (name, authority, description)"
|
jpayne@68
|
241 " VALUES (%s, %s, %s)"
|
jpayne@68
|
242 )
|
jpayne@68
|
243 self.adaptor.execute(sql, (db_name, authority, description))
|
jpayne@68
|
244 return BioSeqDatabase(self.adaptor, db_name)
|
jpayne@68
|
245
|
jpayne@68
|
246 def load_database_sql(self, sql_file):
|
jpayne@68
|
247 """Load a database schema into the given database.
|
jpayne@68
|
248
|
jpayne@68
|
249 This is used to create tables, etc when a database is first created.
|
jpayne@68
|
250 sql_file should specify the complete path to a file containing
|
jpayne@68
|
251 SQL entries for building the tables.
|
jpayne@68
|
252 """
|
jpayne@68
|
253 # Not sophisticated enough for PG schema. Is it needed by MySQL?
|
jpayne@68
|
254 # Looks like we need this more complicated way for both. Leaving it
|
jpayne@68
|
255 # the default and removing the simple-minded approach.
|
jpayne@68
|
256
|
jpayne@68
|
257 # read the file with all comment lines removed
|
jpayne@68
|
258 sql = ""
|
jpayne@68
|
259 with open(sql_file) as sql_handle:
|
jpayne@68
|
260 for line in sql_handle:
|
jpayne@68
|
261 if line.startswith("--"): # don't include comment lines
|
jpayne@68
|
262 pass
|
jpayne@68
|
263 elif line.startswith("#"): # ditto for MySQL comments
|
jpayne@68
|
264 pass
|
jpayne@68
|
265 elif line.strip(): # only include non-blank lines
|
jpayne@68
|
266 sql += line.strip() + " "
|
jpayne@68
|
267
|
jpayne@68
|
268 # two ways to load the SQL
|
jpayne@68
|
269 # 1. PostgreSQL can load it all at once and actually needs to
|
jpayne@68
|
270 # due to FUNCTION defines at the end of the SQL which mess up
|
jpayne@68
|
271 # the splitting by semicolons
|
jpayne@68
|
272 if self.module_name in ["psycopg2", "pgdb"]:
|
jpayne@68
|
273 self.adaptor.cursor.execute(sql)
|
jpayne@68
|
274 # 2. MySQL needs the database loading split up into single lines of
|
jpayne@68
|
275 # SQL executed one at a time
|
jpayne@68
|
276 elif self.module_name in ["mysql.connector", "MySQLdb", "sqlite3"]:
|
jpayne@68
|
277 sql_parts = sql.split(";") # one line per sql command
|
jpayne@68
|
278 # don't use the last item, it's blank
|
jpayne@68
|
279 for sql_line in sql_parts[:-1]:
|
jpayne@68
|
280 self.adaptor.cursor.execute(sql_line)
|
jpayne@68
|
281 else:
|
jpayne@68
|
282 raise ValueError(f"Module {self.module_name} not supported by the loader.")
|
jpayne@68
|
283
|
jpayne@68
|
284 def commit(self):
|
jpayne@68
|
285 """Commit the current transaction to the database."""
|
jpayne@68
|
286 return self.adaptor.commit()
|
jpayne@68
|
287
|
jpayne@68
|
288 def rollback(self):
|
jpayne@68
|
289 """Roll-back the current transaction."""
|
jpayne@68
|
290 return self.adaptor.rollback()
|
jpayne@68
|
291
|
jpayne@68
|
292 def close(self):
|
jpayne@68
|
293 """Close the connection. No further activity possible."""
|
jpayne@68
|
294 return self.adaptor.close()
|
jpayne@68
|
295
|
jpayne@68
|
296
|
jpayne@68
|
297 class _CursorWrapper:
|
jpayne@68
|
298 """A wrapper for mysql.connector resolving bytestring representations."""
|
jpayne@68
|
299
|
jpayne@68
|
300 def __init__(self, real_cursor):
|
jpayne@68
|
301 self.real_cursor = real_cursor
|
jpayne@68
|
302
|
jpayne@68
|
303 def execute(self, operation, params=None, multi=False):
|
jpayne@68
|
304 """Execute a sql statement."""
|
jpayne@68
|
305 self.real_cursor.execute(operation, params, multi)
|
jpayne@68
|
306
|
jpayne@68
|
307 def executemany(self, operation, params):
|
jpayne@68
|
308 """Execute many sql statements."""
|
jpayne@68
|
309 self.real_cursor.executemany(operation, params)
|
jpayne@68
|
310
|
jpayne@68
|
311 def _convert_tuple(self, tuple_):
|
jpayne@68
|
312 """Decode any bytestrings present in the row (PRIVATE)."""
|
jpayne@68
|
313 tuple_list = list(tuple_)
|
jpayne@68
|
314 for i, elem in enumerate(tuple_list):
|
jpayne@68
|
315 if isinstance(elem, bytes):
|
jpayne@68
|
316 tuple_list[i] = elem.decode("utf-8")
|
jpayne@68
|
317 return tuple(tuple_list)
|
jpayne@68
|
318
|
jpayne@68
|
319 def _convert_list(self, lst):
|
jpayne@68
|
320 ret_lst = []
|
jpayne@68
|
321 for tuple_ in lst:
|
jpayne@68
|
322 new_tuple = self._convert_tuple(tuple_)
|
jpayne@68
|
323 ret_lst.append(new_tuple)
|
jpayne@68
|
324 return ret_lst
|
jpayne@68
|
325
|
jpayne@68
|
326 def fetchall(self):
|
jpayne@68
|
327 rv = self.real_cursor.fetchall()
|
jpayne@68
|
328 return self._convert_list(rv)
|
jpayne@68
|
329
|
jpayne@68
|
330 def fetchone(self):
|
jpayne@68
|
331 tuple_ = self.real_cursor.fetchone()
|
jpayne@68
|
332 return self._convert_tuple(tuple_)
|
jpayne@68
|
333
|
jpayne@68
|
334
|
jpayne@68
|
335 class Adaptor:
|
jpayne@68
|
336 """High level wrapper for a database connection and cursor.
|
jpayne@68
|
337
|
jpayne@68
|
338 Most database calls in BioSQL are done indirectly though this adaptor
|
jpayne@68
|
339 class. This provides helper methods for fetching data and executing
|
jpayne@68
|
340 sql.
|
jpayne@68
|
341 """
|
jpayne@68
|
342
|
jpayne@68
|
343 def __init__(self, conn, dbutils, wrap_cursor=False):
|
jpayne@68
|
344 """Create an Adaptor object.
|
jpayne@68
|
345
|
jpayne@68
|
346 Arguments:
|
jpayne@68
|
347 - conn - A database connection
|
jpayne@68
|
348 - dbutils - A BioSQL.DBUtils object
|
jpayne@68
|
349 - wrap_cursor - Optional, whether to wrap the cursor object
|
jpayne@68
|
350
|
jpayne@68
|
351 """
|
jpayne@68
|
352 self.conn = conn
|
jpayne@68
|
353 if wrap_cursor:
|
jpayne@68
|
354 self.cursor = _CursorWrapper(conn.cursor())
|
jpayne@68
|
355 else:
|
jpayne@68
|
356 self.cursor = conn.cursor()
|
jpayne@68
|
357 self.dbutils = dbutils
|
jpayne@68
|
358
|
jpayne@68
|
359 def last_id(self, table):
|
jpayne@68
|
360 """Return the last row id for the selected table."""
|
jpayne@68
|
361 return self.dbutils.last_id(self.cursor, table)
|
jpayne@68
|
362
|
jpayne@68
|
363 def autocommit(self, y=True):
|
jpayne@68
|
364 """Set the autocommit mode. True values enable; False value disable."""
|
jpayne@68
|
365 return self.dbutils.autocommit(self.conn, y)
|
jpayne@68
|
366
|
jpayne@68
|
367 def commit(self):
|
jpayne@68
|
368 """Commit the current transaction."""
|
jpayne@68
|
369 return self.conn.commit()
|
jpayne@68
|
370
|
jpayne@68
|
371 def rollback(self):
|
jpayne@68
|
372 """Roll-back the current transaction."""
|
jpayne@68
|
373 return self.conn.rollback()
|
jpayne@68
|
374
|
jpayne@68
|
375 def close(self):
|
jpayne@68
|
376 """Close the connection. No further activity possible."""
|
jpayne@68
|
377 return self.conn.close()
|
jpayne@68
|
378
|
jpayne@68
|
379 def fetch_dbid_by_dbname(self, dbname):
|
jpayne@68
|
380 """Return the internal id for the sub-database using its name."""
|
jpayne@68
|
381 self.execute(
|
jpayne@68
|
382 "select biodatabase_id from biodatabase where name = %s", (dbname,)
|
jpayne@68
|
383 )
|
jpayne@68
|
384 rv = self.cursor.fetchall()
|
jpayne@68
|
385 if not rv:
|
jpayne@68
|
386 raise KeyError(f"Cannot find biodatabase with name {dbname!r}")
|
jpayne@68
|
387 return rv[0][0]
|
jpayne@68
|
388
|
jpayne@68
|
389 def fetch_seqid_by_display_id(self, dbid, name):
|
jpayne@68
|
390 """Return the internal id for a sequence using its display id.
|
jpayne@68
|
391
|
jpayne@68
|
392 Arguments:
|
jpayne@68
|
393 - dbid - the internal id for the sub-database
|
jpayne@68
|
394 - name - the name of the sequence. Corresponds to the
|
jpayne@68
|
395 name column of the bioentry table of the SQL schema
|
jpayne@68
|
396
|
jpayne@68
|
397 """
|
jpayne@68
|
398 sql = "select bioentry_id from bioentry where name = %s"
|
jpayne@68
|
399 fields = [name]
|
jpayne@68
|
400 if dbid:
|
jpayne@68
|
401 sql += " and biodatabase_id = %s"
|
jpayne@68
|
402 fields.append(dbid)
|
jpayne@68
|
403 self.execute(sql, fields)
|
jpayne@68
|
404 rv = self.cursor.fetchall()
|
jpayne@68
|
405 if not rv:
|
jpayne@68
|
406 raise IndexError(f"Cannot find display id {name!r}")
|
jpayne@68
|
407 if len(rv) > 1:
|
jpayne@68
|
408 raise IndexError(f"More than one entry with display id {name!r}")
|
jpayne@68
|
409 return rv[0][0]
|
jpayne@68
|
410
|
jpayne@68
|
411 def fetch_seqid_by_accession(self, dbid, name):
|
jpayne@68
|
412 """Return the internal id for a sequence using its accession.
|
jpayne@68
|
413
|
jpayne@68
|
414 Arguments:
|
jpayne@68
|
415 - dbid - the internal id for the sub-database
|
jpayne@68
|
416 - name - the accession of the sequence. Corresponds to the
|
jpayne@68
|
417 accession column of the bioentry table of the SQL schema
|
jpayne@68
|
418
|
jpayne@68
|
419 """
|
jpayne@68
|
420 sql = "select bioentry_id from bioentry where accession = %s"
|
jpayne@68
|
421 fields = [name]
|
jpayne@68
|
422 if dbid:
|
jpayne@68
|
423 sql += " and biodatabase_id = %s"
|
jpayne@68
|
424 fields.append(dbid)
|
jpayne@68
|
425 self.execute(sql, fields)
|
jpayne@68
|
426 rv = self.cursor.fetchall()
|
jpayne@68
|
427 if not rv:
|
jpayne@68
|
428 raise IndexError(f"Cannot find accession {name!r}")
|
jpayne@68
|
429 if len(rv) > 1:
|
jpayne@68
|
430 raise IndexError(f"More than one entry with accession {name!r}")
|
jpayne@68
|
431 return rv[0][0]
|
jpayne@68
|
432
|
jpayne@68
|
433 def fetch_seqids_by_accession(self, dbid, name):
|
jpayne@68
|
434 """Return a list internal ids using an accession.
|
jpayne@68
|
435
|
jpayne@68
|
436 Arguments:
|
jpayne@68
|
437 - dbid - the internal id for the sub-database
|
jpayne@68
|
438 - name - the accession of the sequence. Corresponds to the
|
jpayne@68
|
439 accession column of the bioentry table of the SQL schema
|
jpayne@68
|
440
|
jpayne@68
|
441 """
|
jpayne@68
|
442 sql = "select bioentry_id from bioentry where accession = %s"
|
jpayne@68
|
443 fields = [name]
|
jpayne@68
|
444 if dbid:
|
jpayne@68
|
445 sql += " and biodatabase_id = %s"
|
jpayne@68
|
446 fields.append(dbid)
|
jpayne@68
|
447 return self.execute_and_fetch_col0(sql, fields)
|
jpayne@68
|
448
|
jpayne@68
|
449 def fetch_seqid_by_version(self, dbid, name):
|
jpayne@68
|
450 """Return the internal id for a sequence using its accession and version.
|
jpayne@68
|
451
|
jpayne@68
|
452 Arguments:
|
jpayne@68
|
453 - dbid - the internal id for the sub-database
|
jpayne@68
|
454 - name - the accession of the sequence containing a version number.
|
jpayne@68
|
455 Must correspond to <accession>.<version>
|
jpayne@68
|
456
|
jpayne@68
|
457 """
|
jpayne@68
|
458 acc_version = name.split(".")
|
jpayne@68
|
459 if len(acc_version) > 2:
|
jpayne@68
|
460 raise IndexError(f"Bad version {name!r}")
|
jpayne@68
|
461 acc = acc_version[0]
|
jpayne@68
|
462 if len(acc_version) == 2:
|
jpayne@68
|
463 version = acc_version[1]
|
jpayne@68
|
464 else:
|
jpayne@68
|
465 version = "0"
|
jpayne@68
|
466 sql = "SELECT bioentry_id FROM bioentry WHERE accession = %s AND version = %s"
|
jpayne@68
|
467 fields = [acc, version]
|
jpayne@68
|
468 if dbid:
|
jpayne@68
|
469 sql += " and biodatabase_id = %s"
|
jpayne@68
|
470 fields.append(dbid)
|
jpayne@68
|
471 self.execute(sql, fields)
|
jpayne@68
|
472 rv = self.cursor.fetchall()
|
jpayne@68
|
473 if not rv:
|
jpayne@68
|
474 raise IndexError(f"Cannot find version {name!r}")
|
jpayne@68
|
475 if len(rv) > 1:
|
jpayne@68
|
476 raise IndexError(f"More than one entry with version {name!r}")
|
jpayne@68
|
477 return rv[0][0]
|
jpayne@68
|
478
|
jpayne@68
|
479 def fetch_seqid_by_identifier(self, dbid, identifier):
|
jpayne@68
|
480 """Return the internal id for a sequence using its identifier.
|
jpayne@68
|
481
|
jpayne@68
|
482 Arguments:
|
jpayne@68
|
483 - dbid - the internal id for the sub-database
|
jpayne@68
|
484 - identifier - the identifier of the sequence. Corresponds to
|
jpayne@68
|
485 the identifier column of the bioentry table in the SQL schema.
|
jpayne@68
|
486
|
jpayne@68
|
487 """
|
jpayne@68
|
488 # YB: was fetch_seqid_by_seqid
|
jpayne@68
|
489 sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
|
jpayne@68
|
490 fields = [identifier]
|
jpayne@68
|
491 if dbid:
|
jpayne@68
|
492 sql += " and biodatabase_id = %s"
|
jpayne@68
|
493 fields.append(dbid)
|
jpayne@68
|
494 self.execute(sql, fields)
|
jpayne@68
|
495 rv = self.cursor.fetchall()
|
jpayne@68
|
496 if not rv:
|
jpayne@68
|
497 raise IndexError(f"Cannot find display id {identifier!r}")
|
jpayne@68
|
498 return rv[0][0]
|
jpayne@68
|
499
|
jpayne@68
|
500 def list_biodatabase_names(self):
|
jpayne@68
|
501 """Return a list of all of the sub-databases."""
|
jpayne@68
|
502 return self.execute_and_fetch_col0("SELECT name FROM biodatabase")
|
jpayne@68
|
503
|
jpayne@68
|
504 def list_bioentry_ids(self, dbid):
|
jpayne@68
|
505 """Return a list of internal ids for all of the sequences in a sub-databae.
|
jpayne@68
|
506
|
jpayne@68
|
507 Arguments:
|
jpayne@68
|
508 - dbid - The internal id for a sub-database
|
jpayne@68
|
509
|
jpayne@68
|
510 """
|
jpayne@68
|
511 return self.execute_and_fetch_col0(
|
jpayne@68
|
512 "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s", (dbid,)
|
jpayne@68
|
513 )
|
jpayne@68
|
514
|
jpayne@68
|
515 def list_bioentry_display_ids(self, dbid):
|
jpayne@68
|
516 """Return a list of all sequence names in a sub-databae.
|
jpayne@68
|
517
|
jpayne@68
|
518 Arguments:
|
jpayne@68
|
519 - dbid - The internal id for a sub-database
|
jpayne@68
|
520
|
jpayne@68
|
521 """
|
jpayne@68
|
522 return self.execute_and_fetch_col0(
|
jpayne@68
|
523 "SELECT name FROM bioentry WHERE biodatabase_id = %s", (dbid,)
|
jpayne@68
|
524 )
|
jpayne@68
|
525
|
jpayne@68
|
526 def list_any_ids(self, sql, args):
|
jpayne@68
|
527 """Return ids given a SQL statement to select for them.
|
jpayne@68
|
528
|
jpayne@68
|
529 This assumes that the given SQL does a SELECT statement that
|
jpayne@68
|
530 returns a list of items. This parses them out of the 2D list
|
jpayne@68
|
531 they come as and just returns them in a list.
|
jpayne@68
|
532 """
|
jpayne@68
|
533 return self.execute_and_fetch_col0(sql, args)
|
jpayne@68
|
534
|
jpayne@68
|
535 def execute_one(self, sql, args=None):
|
jpayne@68
|
536 """Execute sql that returns 1 record, and return the record."""
|
jpayne@68
|
537 self.execute(sql, args or ())
|
jpayne@68
|
538 rv = self.cursor.fetchall()
|
jpayne@68
|
539 if len(rv) != 1:
|
jpayne@68
|
540 raise ValueError(f"Expected 1 response, got {len(rv)}.")
|
jpayne@68
|
541 return rv[0]
|
jpayne@68
|
542
|
jpayne@68
|
543 def execute(self, sql, args=None):
|
jpayne@68
|
544 """Just execute an sql command."""
|
jpayne@68
|
545 if os.name == "java":
|
jpayne@68
|
546 sql = sql.replace("%s", "?")
|
jpayne@68
|
547 self.dbutils.execute(self.cursor, sql, args)
|
jpayne@68
|
548
|
jpayne@68
|
549 def executemany(self, sql, args):
|
jpayne@68
|
550 """Execute many sql commands."""
|
jpayne@68
|
551 if os.name == "java":
|
jpayne@68
|
552 sql = sql.replace("%s", "?")
|
jpayne@68
|
553 self.dbutils.executemany(self.cursor, sql, args)
|
jpayne@68
|
554
|
jpayne@68
|
555 def get_subseq_as_string(self, seqid, start, end):
|
jpayne@68
|
556 """Return a substring of a sequence.
|
jpayne@68
|
557
|
jpayne@68
|
558 Arguments:
|
jpayne@68
|
559 - seqid - The internal id for the sequence
|
jpayne@68
|
560 - start - The start position of the sequence; 0-indexed
|
jpayne@68
|
561 - end - The end position of the sequence
|
jpayne@68
|
562
|
jpayne@68
|
563 """
|
jpayne@68
|
564 length = end - start
|
jpayne@68
|
565 # XXX Check this on MySQL and PostgreSQL. substr should be general,
|
jpayne@68
|
566 # does it need dbutils?
|
jpayne@68
|
567 # return self.execute_one(
|
jpayne@68
|
568 # """select SUBSTRING(seq FROM %s FOR %s)
|
jpayne@68
|
569 # from biosequence where bioentry_id = %s""",
|
jpayne@68
|
570 # (start+1, length, seqid))[0]
|
jpayne@68
|
571 return self.execute_one(
|
jpayne@68
|
572 "SELECT SUBSTR(seq, %s, %s) FROM biosequence WHERE bioentry_id = %s",
|
jpayne@68
|
573 (start + 1, length, seqid),
|
jpayne@68
|
574 )[0]
|
jpayne@68
|
575
|
jpayne@68
|
576 def execute_and_fetch_col0(self, sql, args=None):
|
jpayne@68
|
577 """Return a list of values from the first column in the row."""
|
jpayne@68
|
578 self.execute(sql, args or ())
|
jpayne@68
|
579 return [field[0] for field in self.cursor.fetchall()]
|
jpayne@68
|
580
|
jpayne@68
|
581 def execute_and_fetchall(self, sql, args=None):
|
jpayne@68
|
582 """Return a list of tuples of all rows."""
|
jpayne@68
|
583 self.execute(sql, args or ())
|
jpayne@68
|
584 return self.cursor.fetchall()
|
jpayne@68
|
585
|
jpayne@68
|
586
|
jpayne@68
|
587 class MysqlConnectorAdaptor(Adaptor):
|
jpayne@68
|
588 """A BioSQL Adaptor class with fixes for the MySQL interface.
|
jpayne@68
|
589
|
jpayne@68
|
590 BioSQL was failing due to returns of bytearray objects from
|
jpayne@68
|
591 the mysql-connector-python database connector. This adaptor
|
jpayne@68
|
592 class scrubs returns of bytearrays and of byte strings converting
|
jpayne@68
|
593 them to string objects instead. This adaptor class was made in
|
jpayne@68
|
594 response to backwards incompatible changes added to
|
jpayne@68
|
595 mysql-connector-python in release 2.0.0 of the package.
|
jpayne@68
|
596 """
|
jpayne@68
|
597
|
jpayne@68
|
598 @staticmethod
|
jpayne@68
|
599 def _bytearray_to_str(s):
|
jpayne@68
|
600 """If s is bytes or bytearray, convert to a string (PRIVATE)."""
|
jpayne@68
|
601 if isinstance(s, (bytes, bytearray)):
|
jpayne@68
|
602 return s.decode()
|
jpayne@68
|
603 return s
|
jpayne@68
|
604
|
jpayne@68
|
605 def execute_one(self, sql, args=None):
|
jpayne@68
|
606 """Execute sql that returns 1 record, and return the record."""
|
jpayne@68
|
607 out = super().execute_one(sql, args)
|
jpayne@68
|
608 return tuple(self._bytearray_to_str(v) for v in out)
|
jpayne@68
|
609
|
jpayne@68
|
610 def execute_and_fetch_col0(self, sql, args=None):
|
jpayne@68
|
611 """Return a list of values from the first column in the row."""
|
jpayne@68
|
612 out = super().execute_and_fetch_col0(sql, args)
|
jpayne@68
|
613 return [self._bytearray_to_str(column) for column in out]
|
jpayne@68
|
614
|
jpayne@68
|
615 def execute_and_fetchall(self, sql, args=None):
|
jpayne@68
|
616 """Return a list of tuples of all rows."""
|
jpayne@68
|
617 out = super().execute_and_fetchall(sql, args)
|
jpayne@68
|
618 return [tuple(self._bytearray_to_str(v) for v in o) for o in out]
|
jpayne@68
|
619
|
jpayne@68
|
620
|
jpayne@68
|
621 _interface_specific_adaptors = {
|
jpayne@68
|
622 # If SQL interfaces require a specific adaptor, use this to map the adaptor
|
jpayne@68
|
623 "mysql.connector": MysqlConnectorAdaptor,
|
jpayne@68
|
624 "MySQLdb": MysqlConnectorAdaptor,
|
jpayne@68
|
625 }
|
jpayne@68
|
626
|
jpayne@68
|
627 _allowed_lookups = {
|
jpayne@68
|
628 # Lookup name / function name to get id, function to list all ids
|
jpayne@68
|
629 "primary_id": "fetch_seqid_by_identifier",
|
jpayne@68
|
630 "gi": "fetch_seqid_by_identifier",
|
jpayne@68
|
631 "display_id": "fetch_seqid_by_display_id",
|
jpayne@68
|
632 "name": "fetch_seqid_by_display_id",
|
jpayne@68
|
633 "accession": "fetch_seqid_by_accession",
|
jpayne@68
|
634 "version": "fetch_seqid_by_version",
|
jpayne@68
|
635 }
|
jpayne@68
|
636
|
jpayne@68
|
637
|
jpayne@68
|
638 class BioSeqDatabase:
|
jpayne@68
|
639 """Represents a namespace (sub-database) within the BioSQL database.
|
jpayne@68
|
640
|
jpayne@68
|
641 i.e. One row in the biodatabase table, and all all rows in the bioentry
|
jpayne@68
|
642 table associated with it.
|
jpayne@68
|
643 """
|
jpayne@68
|
644
|
jpayne@68
|
645 def __init__(self, adaptor, name):
|
jpayne@68
|
646 """Create a BioDatabase object.
|
jpayne@68
|
647
|
jpayne@68
|
648 Arguments:
|
jpayne@68
|
649 - adaptor - A BioSQL.Adaptor object
|
jpayne@68
|
650 - name - The name of the sub-database (namespace)
|
jpayne@68
|
651
|
jpayne@68
|
652 """
|
jpayne@68
|
653 self.adaptor = adaptor
|
jpayne@68
|
654 self.name = name
|
jpayne@68
|
655 self.dbid = self.adaptor.fetch_dbid_by_dbname(name)
|
jpayne@68
|
656
|
jpayne@68
|
657 def __repr__(self):
|
jpayne@68
|
658 """Return a short summary of the BioSeqDatabase."""
|
jpayne@68
|
659 return f"BioSeqDatabase({self.adaptor!r}, {self.name!r})"
|
jpayne@68
|
660
|
jpayne@68
|
661 def get_Seq_by_id(self, name):
|
jpayne@68
|
662 """Get a DBSeqRecord object by its name.
|
jpayne@68
|
663
|
jpayne@68
|
664 Example: seq_rec = db.get_Seq_by_id('ROA1_HUMAN')
|
jpayne@68
|
665
|
jpayne@68
|
666 The name of this method is misleading since it returns a DBSeqRecord
|
jpayne@68
|
667 rather than a Seq object, and presumably was to mirror BioPerl.
|
jpayne@68
|
668 """
|
jpayne@68
|
669 seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
|
jpayne@68
|
670 return BioSeq.DBSeqRecord(self.adaptor, seqid)
|
jpayne@68
|
671
|
jpayne@68
|
672 def get_Seq_by_acc(self, name):
|
jpayne@68
|
673 """Get a DBSeqRecord object by accession number.
|
jpayne@68
|
674
|
jpayne@68
|
675 Example: seq_rec = db.get_Seq_by_acc('X77802')
|
jpayne@68
|
676
|
jpayne@68
|
677 The name of this method is misleading since it returns a DBSeqRecord
|
jpayne@68
|
678 rather than a Seq object, and presumably was to mirror BioPerl.
|
jpayne@68
|
679 """
|
jpayne@68
|
680 seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
|
jpayne@68
|
681 return BioSeq.DBSeqRecord(self.adaptor, seqid)
|
jpayne@68
|
682
|
jpayne@68
|
683 def get_Seq_by_ver(self, name):
|
jpayne@68
|
684 """Get a DBSeqRecord object by version number.
|
jpayne@68
|
685
|
jpayne@68
|
686 Example: seq_rec = db.get_Seq_by_ver('X77802.1')
|
jpayne@68
|
687
|
jpayne@68
|
688 The name of this method is misleading since it returns a DBSeqRecord
|
jpayne@68
|
689 rather than a Seq object, and presumably was to mirror BioPerl.
|
jpayne@68
|
690 """
|
jpayne@68
|
691 seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
|
jpayne@68
|
692 return BioSeq.DBSeqRecord(self.adaptor, seqid)
|
jpayne@68
|
693
|
jpayne@68
|
694 def get_Seqs_by_acc(self, name):
|
jpayne@68
|
695 """Get a list of DBSeqRecord objects by accession number.
|
jpayne@68
|
696
|
jpayne@68
|
697 Example: seq_recs = db.get_Seq_by_acc('X77802')
|
jpayne@68
|
698
|
jpayne@68
|
699 The name of this method is misleading since it returns a list of
|
jpayne@68
|
700 DBSeqRecord objects rather than a list of Seq objects, and presumably
|
jpayne@68
|
701 was to mirror BioPerl.
|
jpayne@68
|
702 """
|
jpayne@68
|
703 seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
|
jpayne@68
|
704 return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]
|
jpayne@68
|
705
|
jpayne@68
|
706 def __getitem__(self, key):
|
jpayne@68
|
707 """Return a DBSeqRecord for one of the sequences in the sub-database.
|
jpayne@68
|
708
|
jpayne@68
|
709 Arguments:
|
jpayne@68
|
710 - key - The internal id for the sequence
|
jpayne@68
|
711
|
jpayne@68
|
712 """
|
jpayne@68
|
713 record = BioSeq.DBSeqRecord(self.adaptor, key)
|
jpayne@68
|
714 if record._biodatabase_id != self.dbid:
|
jpayne@68
|
715 raise KeyError(f"Entry {key!r} does exist, but not in current name space")
|
jpayne@68
|
716 return record
|
jpayne@68
|
717
|
jpayne@68
|
718 def __delitem__(self, key):
|
jpayne@68
|
719 """Remove an entry and all its annotation."""
|
jpayne@68
|
720 if key not in self:
|
jpayne@68
|
721 raise KeyError(
|
jpayne@68
|
722 f"Entry {key!r} cannot be deleted. It was not found or is invalid"
|
jpayne@68
|
723 )
|
jpayne@68
|
724 # Assuming this will automatically cascade to the other tables...
|
jpayne@68
|
725 sql = "DELETE FROM bioentry WHERE biodatabase_id=%s AND bioentry_id=%s;"
|
jpayne@68
|
726 self.adaptor.execute(sql, (self.dbid, key))
|
jpayne@68
|
727
|
jpayne@68
|
728 def __len__(self):
|
jpayne@68
|
729 """Return number of records in this namespace (sub database)."""
|
jpayne@68
|
730 sql = "SELECT COUNT(bioentry_id) FROM bioentry WHERE biodatabase_id=%s;"
|
jpayne@68
|
731 return int(self.adaptor.execute_and_fetch_col0(sql, (self.dbid,))[0])
|
jpayne@68
|
732
|
jpayne@68
|
733 def __contains__(self, value):
|
jpayne@68
|
734 """Check if a primary (internal) id is this namespace (sub database)."""
|
jpayne@68
|
735 sql = (
|
jpayne@68
|
736 "SELECT COUNT(bioentry_id) FROM bioentry "
|
jpayne@68
|
737 "WHERE biodatabase_id=%s AND bioentry_id=%s;"
|
jpayne@68
|
738 )
|
jpayne@68
|
739 # The bioentry_id field is an integer in the schema.
|
jpayne@68
|
740 # PostgreSQL will throw an error if we use a non integer in the query.
|
jpayne@68
|
741 try:
|
jpayne@68
|
742 bioentry_id = int(value)
|
jpayne@68
|
743 except ValueError:
|
jpayne@68
|
744 return False
|
jpayne@68
|
745 return bool(
|
jpayne@68
|
746 self.adaptor.execute_and_fetch_col0(sql, (self.dbid, bioentry_id))[0]
|
jpayne@68
|
747 )
|
jpayne@68
|
748
|
jpayne@68
|
749 def __iter__(self):
|
jpayne@68
|
750 """Iterate over ids (which may not be meaningful outside this database)."""
|
jpayne@68
|
751 # TODO - Iterate over the cursor, much more efficient
|
jpayne@68
|
752 return iter(self.adaptor.list_bioentry_ids(self.dbid))
|
jpayne@68
|
753
|
jpayne@68
|
754 def keys(self):
|
jpayne@68
|
755 """Iterate over ids (which may not be meaningful outside this database)."""
|
jpayne@68
|
756 return iter(self)
|
jpayne@68
|
757
|
jpayne@68
|
758 def values(self):
|
jpayne@68
|
759 """Iterate over DBSeqRecord objects in the namespace (sub database)."""
|
jpayne@68
|
760 for key in self:
|
jpayne@68
|
761 yield self[key]
|
jpayne@68
|
762
|
jpayne@68
|
763 def items(self):
|
jpayne@68
|
764 """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
|
jpayne@68
|
765 for key in self:
|
jpayne@68
|
766 yield key, self[key]
|
jpayne@68
|
767
|
jpayne@68
|
768 def lookup(self, **kwargs):
|
jpayne@68
|
769 """Return a DBSeqRecord using an acceptable identifier.
|
jpayne@68
|
770
|
jpayne@68
|
771 Arguments:
|
jpayne@68
|
772 - kwargs - A single key-value pair where the key is one
|
jpayne@68
|
773 of primary_id, gi, display_id, name, accession, version
|
jpayne@68
|
774
|
jpayne@68
|
775 """
|
jpayne@68
|
776 if len(kwargs) != 1:
|
jpayne@68
|
777 raise TypeError("single key/value parameter expected")
|
jpayne@68
|
778 k, v = list(kwargs.items())[0]
|
jpayne@68
|
779 if k not in _allowed_lookups:
|
jpayne@68
|
780 raise TypeError(
|
jpayne@68
|
781 f"lookup() expects one of {list(_allowed_lookups.keys())!r}, not {k!r}"
|
jpayne@68
|
782 )
|
jpayne@68
|
783 lookup_name = _allowed_lookups[k]
|
jpayne@68
|
784 lookup_func = getattr(self.adaptor, lookup_name)
|
jpayne@68
|
785 seqid = lookup_func(self.dbid, v)
|
jpayne@68
|
786 return BioSeq.DBSeqRecord(self.adaptor, seqid)
|
jpayne@68
|
787
|
jpayne@68
|
788 def load(self, record_iterator, fetch_NCBI_taxonomy=False):
|
jpayne@68
|
789 """Load a set of SeqRecords into the BioSQL database.
|
jpayne@68
|
790
|
jpayne@68
|
791 record_iterator is either a list of SeqRecord objects, or an
|
jpayne@68
|
792 Iterator object that returns SeqRecord objects (such as the
|
jpayne@68
|
793 output from the Bio.SeqIO.parse() function), which will be
|
jpayne@68
|
794 used to populate the database.
|
jpayne@68
|
795
|
jpayne@68
|
796 fetch_NCBI_taxonomy is boolean flag allowing or preventing
|
jpayne@68
|
797 connection to the taxonomic database on the NCBI server
|
jpayne@68
|
798 (via Bio.Entrez) to fetch a detailed taxonomy for each
|
jpayne@68
|
799 SeqRecord.
|
jpayne@68
|
800
|
jpayne@68
|
801 Example::
|
jpayne@68
|
802
|
jpayne@68
|
803 from Bio import SeqIO
|
jpayne@68
|
804 count = db.load(SeqIO.parse(open(filename), format))
|
jpayne@68
|
805
|
jpayne@68
|
806 Returns the number of records loaded.
|
jpayne@68
|
807 """
|
jpayne@68
|
808 db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid, fetch_NCBI_taxonomy)
|
jpayne@68
|
809 num_records = 0
|
jpayne@68
|
810 global _POSTGRES_RULES_PRESENT
|
jpayne@68
|
811 for cur_record in record_iterator:
|
jpayne@68
|
812 num_records += 1
|
jpayne@68
|
813 # Hack to work around BioSQL Bug 2839 - If using PostgreSQL and
|
jpayne@68
|
814 # the RULES are present check for a duplicate record before loading
|
jpayne@68
|
815 if _POSTGRES_RULES_PRESENT:
|
jpayne@68
|
816 # Recreate what the Loader's _load_bioentry_table will do:
|
jpayne@68
|
817 if cur_record.id.count(".") == 1:
|
jpayne@68
|
818 accession, version = cur_record.id.split(".")
|
jpayne@68
|
819 try:
|
jpayne@68
|
820 version = int(version)
|
jpayne@68
|
821 except ValueError:
|
jpayne@68
|
822 accession = cur_record.id
|
jpayne@68
|
823 version = 0
|
jpayne@68
|
824 else:
|
jpayne@68
|
825 accession = cur_record.id
|
jpayne@68
|
826 version = 0
|
jpayne@68
|
827 gi = cur_record.annotations.get("gi")
|
jpayne@68
|
828 sql = (
|
jpayne@68
|
829 "SELECT bioentry_id FROM bioentry "
|
jpayne@68
|
830 "WHERE (identifier = '%s' AND biodatabase_id = '%s') "
|
jpayne@68
|
831 "OR (accession = '%s' AND version = '%s' AND biodatabase_id = '%s')"
|
jpayne@68
|
832 )
|
jpayne@68
|
833 self.adaptor.execute(
|
jpayne@68
|
834 sql % (gi, self.dbid, accession, version, self.dbid)
|
jpayne@68
|
835 )
|
jpayne@68
|
836 if self.adaptor.cursor.fetchone():
|
jpayne@68
|
837 raise self.adaptor.conn.IntegrityError(
|
jpayne@68
|
838 "Duplicate record detected: record has not been inserted"
|
jpayne@68
|
839 )
|
jpayne@68
|
840 # End of hack
|
jpayne@68
|
841 db_loader.load_seqrecord(cur_record)
|
jpayne@68
|
842 return num_records
|