# Copyright 2002 by Andrew Dalke.  All rights reserved.
# Revisions 2007-2016 copyright by Peter Cock.  All rights reserved.
# Revisions 2009 copyright by Cymon J. Cox.  All rights reserved.
# Revisions 2013-2014 copyright by Tiago Antao.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
#
# Note that BioSQL (including the database schema and scripts) is
# available and licensed separately.  Please consult www.biosql.org
"""Connect with a BioSQL database and load Biopython like objects from it.

This provides interfaces for loading biological objects from a relational
database, and is compatible with the BioSQL standards.
"""
import os

from . import BioSeq
from . import Loader
from . import DBUtils


_POSTGRES_RULES_PRESENT = False  # Hack for BioSQL Bug 2839


def open_database(driver="MySQLdb", **kwargs):
    """Load an existing BioSQL-style database.

    This function is the easiest way to retrieve a connection to a
    database, doing something like::

        from BioSQL import BioSeqDatabase
        server = BioSeqDatabase.open_database(user="root", db="minidb")

    Arguments:
     - driver - The name of the database driver to use for connecting. The
       driver should implement the python DB API. By default, the MySQLdb
       driver is used.
     - user - the username to connect to the database with.
     - password, passwd - the password to connect with
     - host - the hostname of the database
     - database or db - the name of the database

    Returns a DBServer wrapping the new connection.

    Raises ValueError if the unsupported ``psycopg`` (version one) driver
    is requested.
    """
    global _POSTGRES_RULES_PRESENT

    if driver == "psycopg":
        raise ValueError(
            "Using BioSQL with psycopg (version one) is no "
            "longer supported. Use psycopg2 instead."
        )

    if os.name == "java":
        from com.ziclix.python.sql import zxJDBC

        module = zxJDBC
        if driver in ["MySQLdb"]:
            jdbc_driver = "com.mysql.jdbc.Driver"
            url_pref = "jdbc:mysql://" + kwargs["host"] + "/"
        elif driver in ["psycopg2"]:
            jdbc_driver = "org.postgresql.Driver"
            url_pref = "jdbc:postgresql://" + kwargs["host"] + "/"

    else:
        module = __import__(driver, fromlist=["connect"])
    # Both branches above bind ``module``; take ``connect`` from whichever
    # one was selected so the Jython path has it too.
    connect = module.connect

    # Different drivers use different keywords...
    kw = kwargs.copy()
    if driver in ["MySQLdb", "mysql.connector"] and os.name != "java":
        if "database" in kw:
            kw["db"] = kw.pop("database")
        if "password" in kw:
            kw["passwd"] = kw.pop("password")
        # kw["charset"] = "utf8"
        # kw["use_unicode"] = True
    else:
        # DB-API recommendations
        if "db" in kw:
            kw["database"] = kw.pop("db")
        if "passwd" in kw:
            kw["password"] = kw.pop("passwd")
    if driver in ["psycopg2", "pgdb"] and not kw.get("database"):
        kw["database"] = "template1"
    # SQLite connect takes the database name as input
    if os.name == "java":
        if driver in ["MySQLdb"]:
            conn = connect(
                url_pref + kw.get("database", "mysql"),
                kw["user"],
                kw["password"],
                jdbc_driver,
            )
        elif driver in ["psycopg2"]:
            conn = connect(
                url_pref + kw.get("database", "postgresql") + "?stringtype=unspecified",
                kw["user"],
                kw["password"],
                jdbc_driver,
            )
    elif driver in ["sqlite3"]:
        conn = connect(kw["database"])
    else:
        conn = connect(**kw)

    if os.name == "java":
        server = DBServer(conn, module, driver)
    else:
        server = DBServer(conn, module)

    # Sets MySQL to allow double quotes, rather than only backticks
    if driver in ["MySQLdb", "mysql.connector"]:
        server.adaptor.execute("SET sql_mode='ANSI_QUOTES';")

    # TODO - Remove the following once BioSQL Bug 2839 is fixed.
    # Test for RULES in PostgreSQL schema, see also Bug 2833.
    if driver in ["psycopg2", "pgdb"]:
        sql = (
            "SELECT ev_class FROM pg_rewrite WHERE "
            "rulename='rule_bioentry_i1' OR "
            "rulename='rule_bioentry_i2';"
        )
        if server.adaptor.execute_and_fetchall(sql):
            import warnings
            from Bio import BiopythonWarning

            warnings.warn(
                "Your BioSQL PostgreSQL schema includes some rules "
                "currently required for bioperl-db but which may "
                "cause problems loading data using Biopython (see "
                "BioSQL's RedMine Bug 2839 aka GitHub Issue 4 "
                "https://github.com/biosql/biosql/issues/4). "
                "If you do not use BioPerl, please remove these "
                "rules. Biopython should cope with the rules "
                "present, but with a performance penalty when "
                "loading new records.",
                BiopythonWarning,
            )
            _POSTGRES_RULES_PRESENT = True

    elif driver == "sqlite3":
        # Tell SQLite that we want to use foreign keys
        # https://www.sqlite.org/foreignkeys.html#fk_enable
        server.adaptor.execute("PRAGMA foreign_keys = ON")

    return server


class DBServer:
    """Represents a BioSQL database containing namespaces (sub-databases).

    This acts like a Python dictionary, giving access to each namespace
    (defined by a row in the biodatabase table) as a BioSeqDatabase object.
    """

    def __init__(self, conn, module, module_name=None):
        """Create a DBServer object.

        Arguments:
         - conn - A database connection object
         - module - The module used to create the database connection
         - module_name - Optionally, the name of the module.
           Default: module.__name__

        Normally you would not want to create a DBServer object yourself.
        Instead use the open_database function, which returns an instance
        of DBServer.
        """
        self.module = module
        if module_name is None:
            module_name = module.__name__
        # Only mysql.connector gets its cursor wrapped (see _CursorWrapper).
        wrap_cursor = module_name == "mysql.connector"
        # Get module specific Adaptor or the base (general) Adaptor
        Adapt = _interface_specific_adaptors.get(module_name, Adaptor)
        self.adaptor = Adapt(
            conn, DBUtils.get_dbutils(module_name), wrap_cursor=wrap_cursor
        )
        self.module_name = module_name

    def __repr__(self):
        """Return a short description of the class name and database connection."""
        return f"{self.__class__.__name__}({self.adaptor.conn!r})"

    def __getitem__(self, name):
        """Return a BioSeqDatabase object.

        Arguments:
         - name - The name of the BioSeqDatabase

        """
        return BioSeqDatabase(self.adaptor, name)

    def __len__(self):
        """Return number of namespaces (sub-databases) in this database."""
        sql = "SELECT COUNT(name) FROM biodatabase;"
        return int(self.adaptor.execute_and_fetch_col0(sql)[0])

    def __contains__(self, value):
        """Check if a namespace (sub-database) is in this database."""
        sql = "SELECT COUNT(name) FROM biodatabase WHERE name=%s;"
        return bool(self.adaptor.execute_and_fetch_col0(sql, (value,))[0])

    def __iter__(self):
        """Iterate over namespaces (sub-databases) in the database."""
        # TODO - Iterate over the cursor, much more efficient
        return iter(self.adaptor.list_biodatabase_names())

    def keys(self):
        """Iterate over namespaces (sub-databases) in the database."""
        return iter(self)

    def values(self):
        """Iterate over BioSeqDatabase objects in the database."""
        for key in self:
            yield self[key]

    def items(self):
        """Iterate over (namespace, BioSeqDatabase) in the database."""
        for key in self:
            yield key, self[key]

    def __delitem__(self, name):
        """Remove a namespace and all its entries.

        Raises KeyError if no namespace with that name exists.
        """
        if name not in self:
            raise KeyError(name)
        db_id = self.adaptor.fetch_dbid_by_dbname(name)
        remover = Loader.DatabaseRemover(self.adaptor, db_id)
        remover.remove()

    def new_database(self, db_name, authority=None, description=None):
        """Add a new database to the server and return it."""
        # make the database
        sql = (
            "INSERT INTO biodatabase (name, authority, description)"
            " VALUES (%s, %s, %s)"
        )
        self.adaptor.execute(sql, (db_name, authority, description))
        return BioSeqDatabase(self.adaptor, db_name)

    def load_database_sql(self, sql_file):
        """Load a database schema into the given database.

        This is used to create tables, etc when a database is first created.
        sql_file should specify the complete path to a file containing
        SQL entries for building the tables.

        Raises ValueError if the connection module is not one of the
        supported drivers.
        """
        # Not sophisticated enough for PG schema. Is it needed by MySQL?
        # Looks like we need this more complicated way for both. Leaving it
        # the default and removing the simple-minded approach.

        # Read the file, dropping comment lines ("--", and "#" for MySQL)
        # and blank lines; each kept line is stripped and followed by a space.
        kept_lines = []
        with open(sql_file) as sql_handle:
            for line in sql_handle:
                if line.startswith(("--", "#")):
                    continue
                stripped = line.strip()
                if stripped:
                    kept_lines.append(stripped + " ")
        sql = "".join(kept_lines)

        # two ways to load the SQL
        # 1. PostgreSQL can load it all at once and actually needs to
        # due to FUNCTION defines at the end of the SQL which mess up
        # the splitting by semicolons
        if self.module_name in ["psycopg2", "pgdb"]:
            self.adaptor.cursor.execute(sql)
        # 2. MySQL needs the database loading split up into single lines of
        # SQL executed one at a time
        elif self.module_name in ["mysql.connector", "MySQLdb", "sqlite3"]:
            sql_parts = sql.split(";")  # one line per sql command
            # don't use the last item, it's blank
            for sql_line in sql_parts[:-1]:
                self.adaptor.cursor.execute(sql_line)
        else:
            raise ValueError(f"Module {self.module_name} not supported by the loader.")

    def commit(self):
        """Commit the current transaction to the database."""
        return self.adaptor.commit()

    def rollback(self):
        """Roll-back the current transaction."""
        return self.adaptor.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.adaptor.close()


class _CursorWrapper:
    """A wrapper for mysql.connector resolving bytestring representations."""

    def __init__(self, real_cursor):
        """Store the underlying DB-API cursor to delegate to."""
        self.real_cursor = real_cursor

    def execute(self, operation, params=None, multi=False):
        """Execute a sql statement."""
        self.real_cursor.execute(operation, params, multi)

    def executemany(self, operation, params):
        """Execute many sql statements."""
        self.real_cursor.executemany(operation, params)

    def _convert_tuple(self, tuple_):
        """Decode any bytestrings present in the row (PRIVATE)."""
        return tuple(
            elem.decode("utf-8") if isinstance(elem, bytes) else elem
            for elem in tuple_
        )

    def _convert_list(self, lst):
        """Decode any bytestrings in every row of a list of rows (PRIVATE)."""
        return [self._convert_tuple(tuple_) for tuple_ in lst]

    def fetchall(self):
        """Return all remaining rows with bytestrings decoded."""
        rv = self.real_cursor.fetchall()
        return self._convert_list(rv)

    def fetchone(self):
        """Return the next row with bytestrings decoded, or None if exhausted."""
        tuple_ = self.real_cursor.fetchone()
        if tuple_ is None:
            # DB-API cursors return None when no more rows are available;
            # pass that through instead of trying to iterate over it.
            return None
        return self._convert_tuple(tuple_)


class Adaptor:
    """High level wrapper for a database connection and cursor.

    Most database calls in BioSQL are done indirectly though this adaptor
    class. This provides helper methods for fetching data and executing
    sql.
    """

    def __init__(self, conn, dbutils, wrap_cursor=False):
        """Create an Adaptor object.

        Arguments:
         - conn - A database connection
         - dbutils - A BioSQL.DBUtils object
         - wrap_cursor - Optional, whether to wrap the cursor object

        """
        self.conn = conn
        if wrap_cursor:
            self.cursor = _CursorWrapper(conn.cursor())
        else:
            self.cursor = conn.cursor()
        self.dbutils = dbutils

    def last_id(self, table):
        """Return the last row id for the selected table."""
        return self.dbutils.last_id(self.cursor, table)

    def autocommit(self, y=True):
        """Set the autocommit mode. True values enable; False value disable."""
        return self.dbutils.autocommit(self.conn, y)

    def commit(self):
        """Commit the current transaction."""
        return self.conn.commit()

    def rollback(self):
        """Roll-back the current transaction."""
        return self.conn.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.conn.close()

    def fetch_dbid_by_dbname(self, dbname):
        """Return the internal id for the sub-database using its name.

        Raises KeyError if there is no biodatabase row with that name.
        """
        self.execute(
            "select biodatabase_id from biodatabase where name = %s", (dbname,)
        )
        rv = self.cursor.fetchall()
        if not rv:
            raise KeyError(f"Cannot find biodatabase with name {dbname!r}")
        return rv[0][0]

    def fetch_seqid_by_display_id(self, dbid, name):
        """Return the internal id for a sequence using its display id.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the name of the sequence. Corresponds to the
           name column of the bioentry table of the SQL schema

        Raises IndexError if zero or more than one entry matches.
        """
        sql = "select bioentry_id from bioentry where name = %s"
        fields = [name]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        self.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError(f"Cannot find display id {name!r}")
        if len(rv) > 1:
            raise IndexError(f"More than one entry with display id {name!r}")
        return rv[0][0]

    def fetch_seqid_by_accession(self, dbid, name):
        """Return the internal id for a sequence using its accession.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the accession of the sequence. Corresponds to the
           accession column of the bioentry table of the SQL schema

        Raises IndexError if zero or more than one entry matches.
        """
        sql = "select bioentry_id from bioentry where accession = %s"
        fields = [name]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        self.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError(f"Cannot find accession {name!r}")
        if len(rv) > 1:
            raise IndexError(f"More than one entry with accession {name!r}")
        return rv[0][0]

    def fetch_seqids_by_accession(self, dbid, name):
        """Return a list of internal ids using an accession.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the accession of the sequence. Corresponds to the
           accession column of the bioentry table of the SQL schema

        """
        sql = "select bioentry_id from bioentry where accession = %s"
        fields = [name]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        return self.execute_and_fetch_col0(sql, fields)

    def fetch_seqid_by_version(self, dbid, name):
        """Return the internal id for a sequence using its accession and version.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the accession of the sequence containing a version number.
           Must correspond to <accession>.<version>

        If no ".<version>" suffix is given, version "0" is looked up.

        Raises IndexError if the name contains more than one ".", or if
        zero or more than one entry matches.
        """
        acc_version = name.split(".")
        if len(acc_version) > 2:
            raise IndexError(f"Bad version {name!r}")
        acc = acc_version[0]
        if len(acc_version) == 2:
            version = acc_version[1]
        else:
            version = "0"
        sql = "SELECT bioentry_id FROM bioentry WHERE accession = %s AND version = %s"
        fields = [acc, version]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        self.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError(f"Cannot find version {name!r}")
        if len(rv) > 1:
            raise IndexError(f"More than one entry with version {name!r}")
        return rv[0][0]

    def fetch_seqid_by_identifier(self, dbid, identifier):
        """Return the internal id for a sequence using its identifier.

        Arguments:
         - dbid - the internal id for the sub-database
         - identifier - the identifier of the sequence. Corresponds to
           the identifier column of the bioentry table in the SQL schema.

        Raises IndexError if no entry matches.
        """
        # YB: was fetch_seqid_by_seqid
        sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
        fields = [identifier]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        self.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError(f"Cannot find display id {identifier!r}")
        return rv[0][0]

    def list_biodatabase_names(self):
        """Return a list of all of the sub-databases."""
        return self.execute_and_fetch_col0("SELECT name FROM biodatabase")

    def list_bioentry_ids(self, dbid):
        """Return a list of internal ids for all of the sequences in a sub-database.

        Arguments:
         - dbid - The internal id for a sub-database

        """
        return self.execute_and_fetch_col0(
            "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s", (dbid,)
        )

    def list_bioentry_display_ids(self, dbid):
        """Return a list of all sequence names in a sub-database.

        Arguments:
         - dbid - The internal id for a sub-database

        """
        return self.execute_and_fetch_col0(
            "SELECT name FROM bioentry WHERE biodatabase_id = %s", (dbid,)
        )

    def list_any_ids(self, sql, args):
        """Return ids given a SQL statement to select for them.

        This assumes that the given SQL does a SELECT statement that
        returns a list of items. This parses them out of the 2D list
        they come as and just returns them in a list.
        """
        return self.execute_and_fetch_col0(sql, args)

    def execute_one(self, sql, args=None):
        """Execute sql that returns 1 record, and return the record.

        Raises ValueError if the query does not return exactly one row.
        """
        self.execute(sql, args or ())
        rv = self.cursor.fetchall()
        if len(rv) != 1:
            raise ValueError(f"Expected 1 response, got {len(rv)}.")
        return rv[0]

    def execute(self, sql, args=None):
        """Just execute an sql command."""
        if os.name == "java":
            # zxJDBC uses "?" placeholders rather than "%s"
            sql = sql.replace("%s", "?")
        self.dbutils.execute(self.cursor, sql, args)

    def executemany(self, sql, args):
        """Execute many sql commands."""
        if os.name == "java":
            # zxJDBC uses "?" placeholders rather than "%s"
            sql = sql.replace("%s", "?")
        self.dbutils.executemany(self.cursor, sql, args)

    def get_subseq_as_string(self, seqid, start, end):
        """Return a substring of a sequence.

        Arguments:
         - seqid - The internal id for the sequence
         - start - The start position of the sequence; 0-indexed
         - end - The end position of the sequence

        """
        length = end - start
        # XXX Check this on MySQL and PostgreSQL. substr should be general,
        # does it need dbutils?
        # return self.execute_one(
        #    """select SUBSTRING(seq FROM %s FOR %s)
        #             from biosequence where bioentry_id = %s""",
        #    (start+1, length, seqid))[0]
        # SQL SUBSTR is 1-indexed, hence start + 1.
        return self.execute_one(
            "SELECT SUBSTR(seq, %s, %s) FROM biosequence WHERE bioentry_id = %s",
            (start + 1, length, seqid),
        )[0]

    def execute_and_fetch_col0(self, sql, args=None):
        """Return a list of values from the first column in the row."""
        self.execute(sql, args or ())
        return [field[0] for field in self.cursor.fetchall()]

    def execute_and_fetchall(self, sql, args=None):
        """Return a list of tuples of all rows."""
        self.execute(sql, args or ())
        return self.cursor.fetchall()


class MysqlConnectorAdaptor(Adaptor):
    """A BioSQL Adaptor class with fixes for the MySQL interface.

    BioSQL was failing due to returns of bytearray objects from
    the mysql-connector-python database connector. This adaptor
    class scrubs returns of bytearrays and of byte strings converting
    them to string objects instead. This adaptor class was made in
    response to backwards incompatible changes added to
    mysql-connector-python in release 2.0.0 of the package.
    """

    @staticmethod
    def _bytearray_to_str(s):
        """If s is bytes or bytearray, convert to a string (PRIVATE)."""
        if isinstance(s, (bytes, bytearray)):
            return s.decode()
        return s

    def execute_one(self, sql, args=None):
        """Execute sql that returns 1 record, and return the record."""
        out = super().execute_one(sql, args)
        return tuple(self._bytearray_to_str(v) for v in out)

    def execute_and_fetch_col0(self, sql, args=None):
        """Return a list of values from the first column in the row."""
        out = super().execute_and_fetch_col0(sql, args)
        return [self._bytearray_to_str(column) for column in out]

    def execute_and_fetchall(self, sql, args=None):
        """Return a list of tuples of all rows."""
        out = super().execute_and_fetchall(sql, args)
        return [tuple(self._bytearray_to_str(v) for v in o) for o in out]


_interface_specific_adaptors = {
    # If SQL interfaces require a specific adaptor, use this to map the adaptor
    "mysql.connector": MysqlConnectorAdaptor,
    "MySQLdb": MysqlConnectorAdaptor,
}

_allowed_lookups = {
    # Lookup name / function name to get id, function to list all ids
    "primary_id": "fetch_seqid_by_identifier",
    "gi": "fetch_seqid_by_identifier",
    "display_id": "fetch_seqid_by_display_id",
    "name": "fetch_seqid_by_display_id",
    "accession": "fetch_seqid_by_accession",
    "version": "fetch_seqid_by_version",
}


class BioSeqDatabase:
    """Represents a namespace (sub-database) within the BioSQL database.

    i.e. One row in the biodatabase table, and all rows in the bioentry
    table associated with it.
    """

    def __init__(self, adaptor, name):
        """Create a BioDatabase object.

        Arguments:
         - adaptor - A BioSQL.Adaptor object
         - name - The name of the sub-database (namespace)

        Raises KeyError (from the adaptor) if the namespace does not exist.
        """
        self.adaptor = adaptor
        self.name = name
        self.dbid = self.adaptor.fetch_dbid_by_dbname(name)

    def __repr__(self):
        """Return a short summary of the BioSeqDatabase."""
        return f"BioSeqDatabase({self.adaptor!r}, {self.name!r})"

    def get_Seq_by_id(self, name):
        """Get a DBSeqRecord object by its name.

        Example: seq_rec = db.get_Seq_by_id('ROA1_HUMAN')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a Seq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_acc(self, name):
        """Get a DBSeqRecord object by accession number.

        Example: seq_rec = db.get_Seq_by_acc('X77802')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a Seq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_ver(self, name):
        """Get a DBSeqRecord object by version number.

        Example: seq_rec = db.get_Seq_by_ver('X77802.1')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a Seq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seqs_by_acc(self, name):
        """Get a list of DBSeqRecord objects by accession number.

        Example: seq_recs = db.get_Seqs_by_acc('X77802')

        The name of this method is misleading since it returns a list of
        DBSeqRecord objects rather than a list of Seq objects, and presumably
        was to mirror BioPerl.
        """
        seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
        return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]

    def __getitem__(self, key):
        """Return a DBSeqRecord for one of the sequences in the sub-database.

        Arguments:
         - key - The internal id for the sequence

        Raises KeyError if the entry exists but belongs to another namespace.
        """
        record = BioSeq.DBSeqRecord(self.adaptor, key)
        if record._biodatabase_id != self.dbid:
            raise KeyError(f"Entry {key!r} does exist, but not in current name space")
        return record

    def __delitem__(self, key):
        """Remove an entry and all its annotation.

        Raises KeyError if the key is not an entry of this namespace.
        """
        if key not in self:
            raise KeyError(
                f"Entry {key!r} cannot be deleted. It was not found or is invalid"
            )
        # Assuming this will automatically cascade to the other tables...
        sql = "DELETE FROM bioentry WHERE biodatabase_id=%s AND bioentry_id=%s;"
        self.adaptor.execute(sql, (self.dbid, key))

    def __len__(self):
        """Return number of records in this namespace (sub database)."""
        sql = "SELECT COUNT(bioentry_id) FROM bioentry WHERE biodatabase_id=%s;"
        return int(self.adaptor.execute_and_fetch_col0(sql, (self.dbid,))[0])

    def __contains__(self, value):
        """Check if a primary (internal) id is in this namespace (sub database)."""
        sql = (
            "SELECT COUNT(bioentry_id) FROM bioentry "
            "WHERE biodatabase_id=%s AND bioentry_id=%s;"
        )
        # The bioentry_id field is an integer in the schema.
        # PostgreSQL will throw an error if we use a non integer in the query.
        try:
            bioentry_id = int(value)
        except (TypeError, ValueError):
            # Anything that cannot be an integer id (e.g. None or a
            # non-numeric string) is simply not a member.
            return False
        return bool(
            self.adaptor.execute_and_fetch_col0(sql, (self.dbid, bioentry_id))[0]
        )

    def __iter__(self):
        """Iterate over ids (which may not be meaningful outside this database)."""
        # TODO - Iterate over the cursor, much more efficient
        return iter(self.adaptor.list_bioentry_ids(self.dbid))

    def keys(self):
        """Iterate over ids (which may not be meaningful outside this database)."""
        return iter(self)

    def values(self):
        """Iterate over DBSeqRecord objects in the namespace (sub database)."""
        for key in self:
            yield self[key]

    def items(self):
        """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
        for key in self:
            yield key, self[key]

    def lookup(self, **kwargs):
        """Return a DBSeqRecord using an acceptable identifier.

        Arguments:
         - kwargs - A single key-value pair where the key is one
           of primary_id, gi, display_id, name, accession, version

        Raises TypeError unless exactly one supported keyword is given.
        """
        if len(kwargs) != 1:
            raise TypeError("single key/value parameter expected")
        ((k, v),) = kwargs.items()
        if k not in _allowed_lookups:
            raise TypeError(
                f"lookup() expects one of {list(_allowed_lookups.keys())!r}, not {k!r}"
            )
        lookup_name = _allowed_lookups[k]
        lookup_func = getattr(self.adaptor, lookup_name)
        seqid = lookup_func(self.dbid, v)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def load(self, record_iterator, fetch_NCBI_taxonomy=False):
        """Load a set of SeqRecords into the BioSQL database.

        record_iterator is either a list of SeqRecord objects, or an
        Iterator object that returns SeqRecord objects (such as the
        output from the Bio.SeqIO.parse() function), which will be
        used to populate the database.

        fetch_NCBI_taxonomy is boolean flag allowing or preventing
        connection to the taxonomic database on the NCBI server
        (via Bio.Entrez) to fetch a detailed taxonomy for each
        SeqRecord.

        Example::

            from Bio import SeqIO
            count = db.load(SeqIO.parse(open(filename), format))

        Returns the number of records loaded.
        """
        db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid, fetch_NCBI_taxonomy)
        num_records = 0
        for cur_record in record_iterator:
            num_records += 1
            # Hack to work around BioSQL Bug 2839 - If using PostgreSQL and
            # the RULES are present check for a duplicate record before loading
            if _POSTGRES_RULES_PRESENT:
                # Recreate what the Loader's _load_bioentry_table will do:
                if cur_record.id.count(".") == 1:
                    accession, version = cur_record.id.split(".")
                    try:
                        version = int(version)
                    except ValueError:
                        accession = cur_record.id
                        version = 0
                else:
                    accession = cur_record.id
                    version = 0
                gi = cur_record.annotations.get("gi")
                # Use bound parameters rather than string interpolation so
                # that record ids containing quotes cannot break (or inject
                # into) the statement.
                sql = (
                    "SELECT bioentry_id FROM bioentry "
                    "WHERE (identifier = %s AND biodatabase_id = %s) "
                    "OR (accession = %s AND version = %s AND biodatabase_id = %s)"
                )
                self.adaptor.execute(
                    sql, (gi, self.dbid, accession, version, self.dbid)
                )
                if self.adaptor.cursor.fetchone():
                    raise self.adaptor.conn.IntegrityError(
                        "Duplicate record detected: record has not been inserted"
                    )
            # End of hack
            db_loader.load_seqrecord(cur_record)
        return num_records