jpayne@68: # Copyright 2002 by Andrew Dalke.  All rights reserved.
jpayne@68: # Revisions 2007-2016 copyright by Peter Cock.  All rights reserved.
jpayne@68: # Revisions 2009 copyright by Cymon J. Cox.  All rights reserved.
jpayne@68: # Revisions 2013-2014 copyright by Tiago Antao.  All rights reserved.
jpayne@68: #
jpayne@68: # This file is part of the Biopython distribution and governed by your
jpayne@68: # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@68: # Please see the LICENSE file that should have been included as part of this
jpayne@68: # package.
jpayne@68: #
jpayne@68: # Note that BioSQL (including the database schema and scripts) is
jpayne@68: # available and licensed separately.  Please consult www.biosql.org
jpayne@68: """Connect with a BioSQL database and load Biopython like objects from it.
jpayne@68: 
jpayne@68: This provides interfaces for loading biological objects from a relational
jpayne@68: database, and is compatible with the BioSQL standards.
jpayne@68: """
jpayne@68: import os
jpayne@68: 
jpayne@68: from . import BioSeq
jpayne@68: from . import Loader
jpayne@68: from . import DBUtils
jpayne@68: 
jpayne@68: 
# Hack for BioSQL Bug 2839: open_database() flips this to True when it finds
# the rule_bioentry_i1 / rule_bioentry_i2 rules in a PostgreSQL schema.
_POSTGRES_RULES_PRESENT = False
jpayne@68: 
jpayne@68: 
def open_database(driver="MySQLdb", **kwargs):
    """Load an existing BioSQL-style database.

    This function is the easiest way to retrieve a connection to a
    database, doing something like::

        from BioSQL import BioSeqDatabase
        server = BioSeqDatabase.open_database(user="root", db="minidb")

    Arguments:
     - driver - The name of the database driver to use for connecting. The
       driver should implement the python DB API. By default, the MySQLdb
       driver is used.
     - user - the username to connect to the database with.
     - password, passwd - the password to connect with
     - host - the hostname of the database
     - database or db - the name of the database

    Returns a DBServer wrapping the new connection.

    Raises ValueError for the no longer supported "psycopg" (version one)
    driver, and under Jython for any driver other than "MySQLdb" or
    "psycopg2" (the only two with a JDBC mapping here).
    """
    global _POSTGRES_RULES_PRESENT

    if driver == "psycopg":
        raise ValueError(
            "Using BioSQL with psycopg (version one) is no "
            "longer supported. Use psycopg2 instead."
        )

    on_jython = os.name == "java"
    if on_jython:
        from com.ziclix.python.sql import zxJDBC

        module = zxJDBC
        if driver == "MySQLdb":
            jdbc_driver = "com.mysql.jdbc.Driver"
            url_pref = "jdbc:mysql://" + kwargs["host"] + "/"
        elif driver == "psycopg2":
            jdbc_driver = "org.postgresql.Driver"
            url_pref = "jdbc:postgresql://" + kwargs["host"] + "/"
        else:
            # Fail here with a clear message instead of hitting an unbound
            # local variable further down.
            raise ValueError(f"Driver {driver!r} is not supported under Jython.")
    else:
        module = __import__(driver, fromlist=["connect"])
    connect = module.connect

    # Different drivers use different keywords...
    kw = kwargs.copy()
    if driver in ["MySQLdb", "mysql.connector"] and not on_jython:
        if "database" in kw:
            kw["db"] = kw.pop("database")
        if "password" in kw:
            kw["passwd"] = kw.pop("password")
    else:
        # DB-API recommendations
        if "db" in kw:
            kw["database"] = kw.pop("db")
        if "passwd" in kw:
            kw["password"] = kw.pop("passwd")
    if driver in ["psycopg2", "pgdb"] and not kw.get("database"):
        kw["database"] = "template1"

    if on_jython:
        if driver == "MySQLdb":
            url = url_pref + kw.get("database", "mysql")
        else:  # psycopg2 - every other driver was rejected above
            url = (
                url_pref
                + kw.get("database", "postgresql")
                + "?stringtype=unspecified"
            )
        conn = connect(url, kw["user"], kw["password"], jdbc_driver)
    elif driver == "sqlite3":
        # SQLite connect takes the database name as input
        conn = connect(kw["database"])
    else:
        conn = connect(**kw)

    if on_jython:
        # zxJDBC's module name says nothing about the backend, so pass the
        # driver name along explicitly.
        server = DBServer(conn, module, driver)
    else:
        server = DBServer(conn, module)

    # Sets MySQL to allow double quotes, rather than only backticks
    if driver in ["MySQLdb", "mysql.connector"]:
        server.adaptor.execute("SET sql_mode='ANSI_QUOTES';")

    # TODO - Remove the following once BioSQL Bug 2839 is fixed.
    # Test for RULES in PostgreSQL schema, see also Bug 2833.
    if driver in ["psycopg2", "pgdb"]:
        sql = (
            "SELECT ev_class FROM pg_rewrite WHERE "
            "rulename='rule_bioentry_i1' OR "
            "rulename='rule_bioentry_i2';"
        )
        if server.adaptor.execute_and_fetchall(sql):
            import warnings
            from Bio import BiopythonWarning

            warnings.warn(
                "Your BioSQL PostgreSQL schema includes some rules "
                "currently required for bioperl-db but which may "
                "cause problems loading data using Biopython (see "
                "BioSQL's RedMine Bug 2839 aka GitHub Issue 4 "
                "https://github.com/biosql/biosql/issues/4). "
                "If you do not use BioPerl, please remove these "
                "rules. Biopython should cope with the rules "
                "present, but with a performance penalty when "
                "loading new records.",
                BiopythonWarning,
            )
            _POSTGRES_RULES_PRESENT = True

    elif driver == "sqlite3":
        # Tell SQLite that we want to use foreign keys
        # https://www.sqlite.org/foreignkeys.html#fk_enable
        server.adaptor.execute("PRAGMA foreign_keys = ON")

    return server
jpayne@68: 
jpayne@68: 
class DBServer:
    """Represents a BioSQL database containing namespaces (sub-databases).

    This acts like a Python dictionary, giving access to each namespace
    (defined by a row in the biodatabase table) as a BioSeqDatabase object.
    """

    def __init__(self, conn, module, module_name=None):
        """Create a DBServer object.

        Arguments:
         - conn - A database connection object
         - module - The module used to create the database connection
         - module_name - Optionally, the name of the module. Default: module.__name__

        Normally you would not want to create a DBServer object yourself.
        Instead use the open_database function, which returns an instance of DBServer.
        """
        self.module = module
        if module_name is None:
            module_name = module.__name__
        # Only mysql.connector has its cursor wrapped.
        wrap_cursor = module_name == "mysql.connector"
        # Get module specific Adaptor or the base (general) Adaptor
        adaptor_class = _interface_specific_adaptors.get(module_name, Adaptor)
        self.adaptor = adaptor_class(
            conn, DBUtils.get_dbutils(module_name), wrap_cursor=wrap_cursor
        )
        self.module_name = module_name

    def __repr__(self):
        """Return a short description of the class name and database connection."""
        return f"{self.__class__.__name__}({self.adaptor.conn!r})"

    def __getitem__(self, name):
        """Return a BioSeqDatabase object.

        Arguments:
            - name - The name of the BioSeqDatabase

        """
        return BioSeqDatabase(self.adaptor, name)

    def __len__(self):
        """Return number of namespaces (sub-databases) in this database."""
        sql = "SELECT COUNT(name) FROM biodatabase;"
        return int(self.adaptor.execute_and_fetch_col0(sql)[0])

    def __contains__(self, value):
        """Check if a namespace (sub-database) is in this database."""
        sql = "SELECT COUNT(name) FROM biodatabase WHERE name=%s;"
        return bool(self.adaptor.execute_and_fetch_col0(sql, (value,))[0])

    def __iter__(self):
        """Iterate over namespaces (sub-databases) in the database."""
        # TODO - Iterate over the cursor, much more efficient
        return iter(self.adaptor.list_biodatabase_names())

    def keys(self):
        """Iterate over namespaces (sub-databases) in the database."""
        return iter(self)

    def values(self):
        """Iterate over BioSeqDatabase objects in the database."""
        for key in self:
            yield self[key]

    def items(self):
        """Iterate over (namespace, BioSeqDatabase) in the database."""
        for key in self:
            yield key, self[key]

    def __delitem__(self, name):
        """Remove a namespace and all its entries.

        Raises KeyError if there is no namespace called name.
        """
        if name not in self:
            raise KeyError(name)
        db_id = self.adaptor.fetch_dbid_by_dbname(name)
        remover = Loader.DatabaseRemover(self.adaptor, db_id)
        remover.remove()

    def new_database(self, db_name, authority=None, description=None):
        """Add a new database to the server and return it."""
        # make the database
        sql = (
            "INSERT INTO biodatabase (name, authority, description)"
            " VALUES (%s, %s, %s)"
        )
        self.adaptor.execute(sql, (db_name, authority, description))
        return BioSeqDatabase(self.adaptor, db_name)

    def load_database_sql(self, sql_file):
        """Load a database schema into the given database.

        This is used to create tables, etc when a database is first created.
        sql_file should specify the complete path to a file containing
        SQL entries for building the tables.

        Lines starting with "--" or "#" and blank lines are discarded. For
        psycopg2/pgdb the remaining SQL is executed in one call; for
        mysql.connector, MySQLdb and sqlite3 it is split on ";" and each
        non-blank statement is executed separately.

        Raises ValueError if the connection module is none of the above.
        """
        # read the file with all comment lines removed
        kept_lines = []
        with open(sql_file) as sql_handle:
            for line in sql_handle:
                # "--" is the SQL comment marker, "#" the MySQL one
                if line.startswith(("--", "#")):
                    continue
                stripped = line.strip()
                if stripped:  # only include non-blank lines
                    kept_lines.append(stripped)
        sql = " ".join(kept_lines)

        # two ways to load the SQL
        # 1. PostgreSQL can load it all at once and actually needs to
        # due to FUNCTION defines at the end of the SQL which mess up
        # the splitting by semicolons
        if self.module_name in ["psycopg2", "pgdb"]:
            self.adaptor.cursor.execute(sql)
        # 2. MySQL needs the database loading split up into single lines of
        # SQL executed one at a time
        elif self.module_name in ["mysql.connector", "MySQLdb", "sqlite3"]:
            for statement in sql.split(";"):
                statement = statement.strip()
                # Skip blank fragments (normally just the piece after the
                # final ";") but keep a last statement that lacks a ";".
                if statement:
                    self.adaptor.cursor.execute(statement)
        else:
            raise ValueError(f"Module {self.module_name} not supported by the loader.")

    def commit(self):
        """Commit the current transaction to the database."""
        return self.adaptor.commit()

    def rollback(self):
        """Roll-back the current transaction."""
        return self.adaptor.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.adaptor.close()
jpayne@68: 
jpayne@68: 
jpayne@68: class _CursorWrapper:
jpayne@68:     """A wrapper for mysql.connector resolving bytestring representations."""
jpayne@68: 
jpayne@68:     def __init__(self, real_cursor):
jpayne@68:         self.real_cursor = real_cursor
jpayne@68: 
jpayne@68:     def execute(self, operation, params=None, multi=False):
jpayne@68:         """Execute a sql statement."""
jpayne@68:         self.real_cursor.execute(operation, params, multi)
jpayne@68: 
jpayne@68:     def executemany(self, operation, params):
jpayne@68:         """Execute many sql statements."""
jpayne@68:         self.real_cursor.executemany(operation, params)
jpayne@68: 
jpayne@68:     def _convert_tuple(self, tuple_):
jpayne@68:         """Decode any bytestrings present in the row (PRIVATE)."""
jpayne@68:         tuple_list = list(tuple_)
jpayne@68:         for i, elem in enumerate(tuple_list):
jpayne@68:             if isinstance(elem, bytes):
jpayne@68:                 tuple_list[i] = elem.decode("utf-8")
jpayne@68:         return tuple(tuple_list)
jpayne@68: 
jpayne@68:     def _convert_list(self, lst):
jpayne@68:         ret_lst = []
jpayne@68:         for tuple_ in lst:
jpayne@68:             new_tuple = self._convert_tuple(tuple_)
jpayne@68:             ret_lst.append(new_tuple)
jpayne@68:         return ret_lst
jpayne@68: 
jpayne@68:     def fetchall(self):
jpayne@68:         rv = self.real_cursor.fetchall()
jpayne@68:         return self._convert_list(rv)
jpayne@68: 
jpayne@68:     def fetchone(self):
jpayne@68:         tuple_ = self.real_cursor.fetchone()
jpayne@68:         return self._convert_tuple(tuple_)
jpayne@68: 
jpayne@68: 
class Adaptor:
    """High level wrapper for a database connection and cursor.

    Most database calls in BioSQL are done indirectly though this adaptor
    class. This provides helper methods for fetching data and executing
    sql.
    """

    def __init__(self, conn, dbutils, wrap_cursor=False):
        """Create an Adaptor object.

        Arguments:
         - conn - A database connection
         - dbutils - A BioSQL.DBUtils object
         - wrap_cursor - Optional, whether to wrap the cursor object

        """
        self.conn = conn
        if wrap_cursor:
            self.cursor = _CursorWrapper(conn.cursor())
        else:
            self.cursor = conn.cursor()
        self.dbutils = dbutils

    def last_id(self, table):
        """Return the last row id for the selected table."""
        return self.dbutils.last_id(self.cursor, table)

    def autocommit(self, y=True):
        """Set the autocommit mode. True values enable; False value disable."""
        return self.dbutils.autocommit(self.conn, y)

    def commit(self):
        """Commit the current transaction."""
        return self.conn.commit()

    def rollback(self):
        """Roll-back the current transaction."""
        return self.conn.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.conn.close()

    @staticmethod
    def _restrict_to_db(sql, fields, dbid):
        """Add the sub-database filter to a bioentry query (PRIVATE).

        Returns the (sql, fields) pair to execute. Both are returned
        unchanged when dbid is false, i.e. no sub-database was given.
        """
        if dbid:
            return sql + " and biodatabase_id = %s", [*fields, dbid]
        return sql, fields

    def _fetch_bioentry_rows(self, sql, fields, dbid):
        """Run a bioentry query, filtered by dbid if given, and return all rows (PRIVATE)."""
        sql, fields = self._restrict_to_db(sql, fields, dbid)
        self.execute(sql, fields)
        return self.cursor.fetchall()

    @staticmethod
    def _single_id(rows, label, value):
        """Return the id from the only row, or raise IndexError (PRIVATE).

        label and value are only used to build the error message.
        """
        if not rows:
            raise IndexError(f"Cannot find {label} {value!r}")
        if len(rows) > 1:
            raise IndexError(f"More than one entry with {label} {value!r}")
        return rows[0][0]

    def fetch_dbid_by_dbname(self, dbname):
        """Return the internal id for the sub-database using its name.

        Raises KeyError if there is no sub-database with that name.
        """
        self.execute(
            "select biodatabase_id from biodatabase where name = %s", (dbname,)
        )
        rv = self.cursor.fetchall()
        if not rv:
            raise KeyError(f"Cannot find biodatabase with name {dbname!r}")
        return rv[0][0]

    def fetch_seqid_by_display_id(self, dbid, name):
        """Return the internal id for a sequence using its display id.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the name of the sequence. Corresponds to the
           name column of the bioentry table of the SQL schema

        Raises IndexError unless exactly one entry matches.
        """
        rv = self._fetch_bioentry_rows(
            "select bioentry_id from bioentry where name = %s", [name], dbid
        )
        return self._single_id(rv, "display id", name)

    def fetch_seqid_by_accession(self, dbid, name):
        """Return the internal id for a sequence using its accession.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the accession of the sequence. Corresponds to the
           accession column of the bioentry table of the SQL schema

        Raises IndexError unless exactly one entry matches.
        """
        rv = self._fetch_bioentry_rows(
            "select bioentry_id from bioentry where accession = %s", [name], dbid
        )
        return self._single_id(rv, "accession", name)

    def fetch_seqids_by_accession(self, dbid, name):
        """Return a list of internal ids using an accession.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the accession of the sequence. Corresponds to the
           accession column of the bioentry table of the SQL schema

        """
        sql, fields = self._restrict_to_db(
            "select bioentry_id from bioentry where accession = %s", [name], dbid
        )
        return self.execute_and_fetch_col0(sql, fields)

    def fetch_seqid_by_version(self, dbid, name):
        """Return the internal id for a sequence using its accession and version.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the accession of the sequence containing a version number.
           Must correspond to <accession>.<version>

        A name without a "." is looked up with version "0".

        Raises IndexError if name contains more than one ".", or unless
        exactly one entry matches.
        """
        acc_version = name.split(".")
        if len(acc_version) > 2:
            raise IndexError(f"Bad version {name!r}")
        acc = acc_version[0]
        version = acc_version[1] if len(acc_version) == 2 else "0"
        rv = self._fetch_bioentry_rows(
            "SELECT bioentry_id FROM bioentry WHERE accession = %s AND version = %s",
            [acc, version],
            dbid,
        )
        return self._single_id(rv, "version", name)

    def fetch_seqid_by_identifier(self, dbid, identifier):
        """Return the internal id for a sequence using its identifier.

        Arguments:
         - dbid - the internal id for the sub-database
         - identifier - the identifier of the sequence. Corresponds to
           the identifier column of the bioentry table in the SQL schema.

        Raises IndexError if no entry matches. If several entries match,
        the id of the first row returned is used.
        """
        # YB: was fetch_seqid_by_seqid
        rv = self._fetch_bioentry_rows(
            "SELECT bioentry_id FROM bioentry WHERE identifier = %s",
            [identifier],
            dbid,
        )
        if not rv:
            raise IndexError(f"Cannot find identifier {identifier!r}")
        return rv[0][0]

    def list_biodatabase_names(self):
        """Return a list of all of the sub-databases."""
        return self.execute_and_fetch_col0("SELECT name FROM biodatabase")

    def list_bioentry_ids(self, dbid):
        """Return a list of internal ids for all of the sequences in a sub-database.

        Arguments:
         - dbid - The internal id for a sub-database

        """
        return self.execute_and_fetch_col0(
            "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s", (dbid,)
        )

    def list_bioentry_display_ids(self, dbid):
        """Return a list of all sequence names in a sub-database.

        Arguments:
         - dbid - The internal id for a sub-database

        """
        return self.execute_and_fetch_col0(
            "SELECT name FROM bioentry WHERE biodatabase_id = %s", (dbid,)
        )

    def list_any_ids(self, sql, args):
        """Return ids given a SQL statement to select for them.

        This assumes that the given SQL does a SELECT statement that
        returns a list of items. This parses them out of the 2D list
        they come as and just returns them in a list.
        """
        return self.execute_and_fetch_col0(sql, args)

    def execute_one(self, sql, args=None):
        """Execute sql that returns 1 record, and return the record.

        Raises ValueError if the query returns any other number of records.
        """
        self.execute(sql, args or ())
        rv = self.cursor.fetchall()
        if len(rv) != 1:
            raise ValueError(f"Expected 1 response, got {len(rv)}.")
        return rv[0]

    def execute(self, sql, args=None):
        """Just execute an sql command."""
        if os.name == "java":
            # Under Jython the "?" placeholder is used instead of "%s"
            sql = sql.replace("%s", "?")
        self.dbutils.execute(self.cursor, sql, args)

    def executemany(self, sql, args):
        """Execute many sql commands."""
        if os.name == "java":
            sql = sql.replace("%s", "?")
        self.dbutils.executemany(self.cursor, sql, args)

    def get_subseq_as_string(self, seqid, start, end):
        """Return a substring of a sequence.

        Arguments:
         - seqid - The internal id for the sequence
         - start - The start position of the sequence; 0-indexed
         - end - The end position of the sequence

        """
        length = end - start
        # XXX Check this on MySQL and PostgreSQL. substr should be general,
        # does it need dbutils?
        # SUBSTR counts from one, hence start + 1.
        return self.execute_one(
            "SELECT SUBSTR(seq, %s, %s) FROM biosequence WHERE bioentry_id = %s",
            (start + 1, length, seqid),
        )[0]

    def execute_and_fetch_col0(self, sql, args=None):
        """Return a list of values from the first column in the row."""
        self.execute(sql, args or ())
        return [field[0] for field in self.cursor.fetchall()]

    def execute_and_fetchall(self, sql, args=None):
        """Return a list of tuples of all rows."""
        self.execute(sql, args or ())
        return self.cursor.fetchall()
jpayne@68: 
jpayne@68: 
class MysqlConnectorAdaptor(Adaptor):
    """A BioSQL Adaptor class with fixes for the MySQL interface.

    The mysql-connector-python database connector started returning
    bytearray objects (a backwards incompatible change made in release
    2.0.0 of that package), which broke BioSQL. This adaptor post-processes
    query results, turning any bytearray or byte string value into a
    string object.
    """

    @staticmethod
    def _bytearray_to_str(s):
        """Decode s if it is bytes or bytearray, else return it as is (PRIVATE)."""
        return s.decode() if isinstance(s, (bytes, bytearray)) else s

    def execute_one(self, sql, args=None):
        """Execute sql that returns 1 record, and return the decoded record."""
        record = super().execute_one(sql, args)
        return tuple(map(self._bytearray_to_str, record))

    def execute_and_fetch_col0(self, sql, args=None):
        """Return a list of decoded values from the first column of each row."""
        first_column = super().execute_and_fetch_col0(sql, args)
        return list(map(self._bytearray_to_str, first_column))

    def execute_and_fetchall(self, sql, args=None):
        """Return a list of tuples of all rows, with the values decoded."""
        rows = super().execute_and_fetchall(sql, args)
        return [tuple(map(self._bytearray_to_str, row)) for row in rows]
jpayne@68: 
jpayne@68: 
# Maps a database module name to the Adaptor subclass that DBServer should
# use for it. Module names not listed here fall back to the general Adaptor.
_interface_specific_adaptors = {
    # If SQL interfaces require a specific adaptor, use this to map the adaptor
    "mysql.connector": MysqlConnectorAdaptor,
    "MySQLdb": MysqlConnectorAdaptor,
}
jpayne@68: 
# Maps a lookup keyword to the name of the Adaptor method that resolves a
# value of that kind to an internal bioentry id.
# NOTE(review): the code that consumes this table is outside this view.
_allowed_lookups = {
    # Lookup name / function name to get id, function to list all ids
    "primary_id": "fetch_seqid_by_identifier",
    "gi": "fetch_seqid_by_identifier",
    "display_id": "fetch_seqid_by_display_id",
    "name": "fetch_seqid_by_display_id",
    "accession": "fetch_seqid_by_accession",
    "version": "fetch_seqid_by_version",
}
jpayne@68: 
jpayne@68: 
jpayne@68: class BioSeqDatabase:
jpayne@68:     """Represents a namespace (sub-database) within the BioSQL database.
jpayne@68: 
    i.e. One row in the biodatabase table, and all rows in the bioentry
    table associated with it.
jpayne@68:     """
jpayne@68: 
jpayne@68:     def __init__(self, adaptor, name):
jpayne@68:         """Create a BioDatabase object.
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - adaptor - A BioSQL.Adaptor object
jpayne@68:          - name - The name of the sub-database (namespace)
jpayne@68: 
jpayne@68:         """
jpayne@68:         self.adaptor = adaptor
jpayne@68:         self.name = name
jpayne@68:         self.dbid = self.adaptor.fetch_dbid_by_dbname(name)
jpayne@68: 
jpayne@68:     def __repr__(self):
jpayne@68:         """Return a short summary of the BioSeqDatabase."""
jpayne@68:         return f"BioSeqDatabase({self.adaptor!r}, {self.name!r})"
jpayne@68: 
jpayne@68:     def get_Seq_by_id(self, name):
jpayne@68:         """Get a DBSeqRecord object by its name.
jpayne@68: 
jpayne@68:         Example: seq_rec = db.get_Seq_by_id('ROA1_HUMAN')
jpayne@68: 
jpayne@68:         The name of this method is misleading since it returns a DBSeqRecord
jpayne@68:         rather than a Seq object, and presumably was to mirror BioPerl.
jpayne@68:         """
jpayne@68:         seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
jpayne@68:         return BioSeq.DBSeqRecord(self.adaptor, seqid)
jpayne@68: 
jpayne@68:     def get_Seq_by_acc(self, name):
jpayne@68:         """Get a DBSeqRecord object by accession number.
jpayne@68: 
jpayne@68:         Example: seq_rec = db.get_Seq_by_acc('X77802')
jpayne@68: 
jpayne@68:         The name of this method is misleading since it returns a DBSeqRecord
jpayne@68:         rather than a Seq object, and presumably was to mirror BioPerl.
jpayne@68:         """
jpayne@68:         seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
jpayne@68:         return BioSeq.DBSeqRecord(self.adaptor, seqid)
jpayne@68: 
jpayne@68:     def get_Seq_by_ver(self, name):
jpayne@68:         """Get a DBSeqRecord object by version number.
jpayne@68: 
jpayne@68:         Example: seq_rec = db.get_Seq_by_ver('X77802.1')
jpayne@68: 
jpayne@68:         The name of this method is misleading since it returns a DBSeqRecord
jpayne@68:         rather than a Seq object, and presumably was to mirror BioPerl.
jpayne@68:         """
jpayne@68:         seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
jpayne@68:         return BioSeq.DBSeqRecord(self.adaptor, seqid)
jpayne@68: 
jpayne@68:     def get_Seqs_by_acc(self, name):
jpayne@68:         """Get a list of DBSeqRecord objects by accession number.
jpayne@68: 
jpayne@68:         Example: seq_recs = db.get_Seq_by_acc('X77802')
jpayne@68: 
jpayne@68:         The name of this method is misleading since it returns a list of
jpayne@68:         DBSeqRecord objects rather than a list of Seq objects, and presumably
jpayne@68:         was to mirror BioPerl.
jpayne@68:         """
jpayne@68:         seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
jpayne@68:         return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]
jpayne@68: 
jpayne@68:     def __getitem__(self, key):
jpayne@68:         """Return a DBSeqRecord for one of the sequences in the sub-database.
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - key - The internal id for the sequence
jpayne@68: 
jpayne@68:         """
jpayne@68:         record = BioSeq.DBSeqRecord(self.adaptor, key)
jpayne@68:         if record._biodatabase_id != self.dbid:
jpayne@68:             raise KeyError(f"Entry {key!r} does exist, but not in current name space")
jpayne@68:         return record
jpayne@68: 
jpayne@68:     def __delitem__(self, key):
jpayne@68:         """Remove an entry and all its annotation."""
jpayne@68:         if key not in self:
jpayne@68:             raise KeyError(
jpayne@68:                 f"Entry {key!r} cannot be deleted. It was not found or is invalid"
jpayne@68:             )
jpayne@68:         # Assuming this will automatically cascade to the other tables...
jpayne@68:         sql = "DELETE FROM bioentry WHERE biodatabase_id=%s AND bioentry_id=%s;"
jpayne@68:         self.adaptor.execute(sql, (self.dbid, key))
jpayne@68: 
jpayne@68:     def __len__(self):
jpayne@68:         """Return number of records in this namespace (sub database)."""
jpayne@68:         sql = "SELECT COUNT(bioentry_id) FROM bioentry WHERE biodatabase_id=%s;"
jpayne@68:         return int(self.adaptor.execute_and_fetch_col0(sql, (self.dbid,))[0])
jpayne@68: 
jpayne@68:     def __contains__(self, value):
jpayne@68:         """Check if a primary (internal) id is this namespace (sub database)."""
jpayne@68:         sql = (
jpayne@68:             "SELECT COUNT(bioentry_id) FROM bioentry "
jpayne@68:             "WHERE biodatabase_id=%s AND bioentry_id=%s;"
jpayne@68:         )
jpayne@68:         # The bioentry_id field is an integer in the schema.
jpayne@68:         # PostgreSQL will throw an error if we use a non integer in the query.
jpayne@68:         try:
jpayne@68:             bioentry_id = int(value)
jpayne@68:         except ValueError:
jpayne@68:             return False
jpayne@68:         return bool(
jpayne@68:             self.adaptor.execute_and_fetch_col0(sql, (self.dbid, bioentry_id))[0]
jpayne@68:         )
jpayne@68: 
jpayne@68:     def __iter__(self):
jpayne@68:         """Iterate over ids (which may not be meaningful outside this database)."""
jpayne@68:         # TODO - Iterate over the cursor, much more efficient
jpayne@68:         return iter(self.adaptor.list_bioentry_ids(self.dbid))
jpayne@68: 
jpayne@68:     def keys(self):
jpayne@68:         """Iterate over ids (which may not be meaningful outside this database)."""
jpayne@68:         return iter(self)
jpayne@68: 
jpayne@68:     def values(self):
jpayne@68:         """Iterate over DBSeqRecord objects in the namespace (sub database)."""
jpayne@68:         for key in self:
jpayne@68:             yield self[key]
jpayne@68: 
jpayne@68:     def items(self):
jpayne@68:         """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
jpayne@68:         for key in self:
jpayne@68:             yield key, self[key]
jpayne@68: 
jpayne@68:     def lookup(self, **kwargs):
jpayne@68:         """Return a DBSeqRecord using an acceptable identifier.
jpayne@68: 
jpayne@68:         Arguments:
jpayne@68:          - kwargs - A single key-value pair where the key is one
jpayne@68:            of primary_id, gi, display_id, name, accession, version
jpayne@68: 
jpayne@68:         """
jpayne@68:         if len(kwargs) != 1:
jpayne@68:             raise TypeError("single key/value parameter expected")
jpayne@68:         k, v = list(kwargs.items())[0]
jpayne@68:         if k not in _allowed_lookups:
jpayne@68:             raise TypeError(
jpayne@68:                 f"lookup() expects one of {list(_allowed_lookups.keys())!r}, not {k!r}"
jpayne@68:             )
jpayne@68:         lookup_name = _allowed_lookups[k]
jpayne@68:         lookup_func = getattr(self.adaptor, lookup_name)
jpayne@68:         seqid = lookup_func(self.dbid, v)
jpayne@68:         return BioSeq.DBSeqRecord(self.adaptor, seqid)
jpayne@68: 
jpayne@68:     def load(self, record_iterator, fetch_NCBI_taxonomy=False):
jpayne@68:         """Load a set of SeqRecords into the BioSQL database.
jpayne@68: 
jpayne@68:         record_iterator is either a list of SeqRecord objects, or an
jpayne@68:         Iterator object that returns SeqRecord objects (such as the
jpayne@68:         output from the Bio.SeqIO.parse() function), which will be
jpayne@68:         used to populate the database.
jpayne@68: 
jpayne@68:         fetch_NCBI_taxonomy is boolean flag allowing or preventing
jpayne@68:         connection to the taxonomic database on the NCBI server
jpayne@68:         (via Bio.Entrez) to fetch a detailed taxonomy for each
jpayne@68:         SeqRecord.
jpayne@68: 
jpayne@68:         Example::
jpayne@68: 
jpayne@68:             from Bio import SeqIO
jpayne@68:             count = db.load(SeqIO.parse(open(filename), format))
jpayne@68: 
jpayne@68:         Returns the number of records loaded.
jpayne@68:         """
jpayne@68:         db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid, fetch_NCBI_taxonomy)
jpayne@68:         num_records = 0
jpayne@68:         global _POSTGRES_RULES_PRESENT
jpayne@68:         for cur_record in record_iterator:
jpayne@68:             num_records += 1
jpayne@68:             # Hack to work around BioSQL Bug 2839 - If using PostgreSQL and
jpayne@68:             # the RULES are present check for a duplicate record before loading
jpayne@68:             if _POSTGRES_RULES_PRESENT:
jpayne@68:                 # Recreate what the Loader's _load_bioentry_table will do:
jpayne@68:                 if cur_record.id.count(".") == 1:
jpayne@68:                     accession, version = cur_record.id.split(".")
jpayne@68:                     try:
jpayne@68:                         version = int(version)
jpayne@68:                     except ValueError:
jpayne@68:                         accession = cur_record.id
jpayne@68:                         version = 0
jpayne@68:                 else:
jpayne@68:                     accession = cur_record.id
jpayne@68:                     version = 0
jpayne@68:                 gi = cur_record.annotations.get("gi")
jpayne@68:                 sql = (
jpayne@68:                     "SELECT bioentry_id FROM bioentry "
jpayne@68:                     "WHERE (identifier = '%s' AND biodatabase_id = '%s') "
jpayne@68:                     "OR (accession = '%s' AND version = '%s' AND biodatabase_id = '%s')"
jpayne@68:                 )
jpayne@68:                 self.adaptor.execute(
jpayne@68:                     sql % (gi, self.dbid, accession, version, self.dbid)
jpayne@68:                 )
jpayne@68:                 if self.adaptor.cursor.fetchone():
jpayne@68:                     raise self.adaptor.conn.IntegrityError(
jpayne@68:                         "Duplicate record detected: record has not been inserted"
jpayne@68:                     )
jpayne@68:             # End of hack
jpayne@68:             db_loader.load_seqrecord(cur_record)
jpayne@68:         return num_records