csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/BioSeqDatabase.py annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/BioSeqDatabase.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

rev	line source
jpayne@69	1 # Copyright 2002 by Andrew Dalke. All rights reserved.
jpayne@69	2 # Revisions 2007-2016 copyright by Peter Cock. All rights reserved.
jpayne@69	3 # Revisions 2009 copyright by Cymon J. Cox. All rights reserved.
jpayne@69	4 # Revisions 2013-2014 copyright by Tiago Antao. All rights reserved.
jpayne@69	5 #
jpayne@69	6 # This file is part of the Biopython distribution and governed by your
jpayne@69	7 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@69	8 # Please see the LICENSE file that should have been included as part of this
jpayne@69	9 # package.
jpayne@69	10 #
jpayne@69	11 # Note that BioSQL (including the database schema and scripts) is
jpayne@69	12 # available and licensed separately. Please consult www.biosql.org
jpayne@69	13 """Connect with a BioSQL database and load Biopython like objects from it.
jpayne@69	14
jpayne@69	15 This provides interfaces for loading biological objects from a relational
jpayne@69	16 database, and is compatible with the BioSQL standards.
jpayne@69	17 """
jpayne@69	18 import os
jpayne@69	19
jpayne@69	20 from . import BioSeq
jpayne@69	21 from . import Loader
jpayne@69	22 from . import DBUtils
jpayne@69	23
jpayne@69	24
jpayne@69	25 _POSTGRES_RULES_PRESENT = False # Hack for BioSQL Bug 2839
jpayne@69	26
jpayne@69	27
jpayne@69	28 def open_database(driver="MySQLdb", **kwargs):
jpayne@69	29 """Load an existing BioSQL-style database.
jpayne@69	30
jpayne@69	31 This function is the easiest way to retrieve a connection to a
jpayne@69	32 database, doing something like::
jpayne@69	33
jpayne@69	34 from BioSQL import BioSeqDatabase
jpayne@69	35 server = BioSeqDatabase.open_database(user="root", db="minidb")
jpayne@69	36
jpayne@69	37 Arguments:
jpayne@69	38 - driver - The name of the database driver to use for connecting. The
jpayne@69	39 driver should implement the python DB API. By default, the MySQLdb
jpayne@69	40 driver is used.
jpayne@69	41 - user -the username to connect to the database with.
jpayne@69	42 - password, passwd - the password to connect with
jpayne@69	43 - host - the hostname of the database
jpayne@69	44 - database or db - the name of the database
jpayne@69	45
jpayne@69	46 """
jpayne@69	47 if driver == "psycopg":
jpayne@69	48 raise ValueError(
jpayne@69	49 "Using BioSQL with psycopg (version one) is no "
jpayne@69	50 "longer supported. Use psycopg2 instead."
jpayne@69	51 )
jpayne@69	52
jpayne@69	53 if os.name == "java":
jpayne@69	54 from com.ziclix.python.sql import zxJDBC
jpayne@69	55
jpayne@69	56 module = zxJDBC
jpayne@69	57 if driver in ["MySQLdb"]:
jpayne@69	58 jdbc_driver = "com.mysql.jdbc.Driver"
jpayne@69	59 url_pref = "jdbc:mysql://" + kwargs["host"] + "/"
jpayne@69	60 elif driver in ["psycopg2"]:
jpayne@69	61 jdbc_driver = "org.postgresql.Driver"
jpayne@69	62 url_pref = "jdbc:postgresql://" + kwargs["host"] + "/"
jpayne@69	63
jpayne@69	64 else:
jpayne@69	65 module = __import__(driver, fromlist=["connect"])
jpayne@69	66 connect = module.connect
jpayne@69	67
jpayne@69	68 # Different drivers use different keywords...
jpayne@69	69 kw = kwargs.copy()
jpayne@69	70 if driver in ["MySQLdb", "mysql.connector"] and os.name != "java":
jpayne@69	71 if "database" in kw:
jpayne@69	72 kw["db"] = kw["database"]
jpayne@69	73 del kw["database"]
jpayne@69	74 if "password" in kw:
jpayne@69	75 kw["passwd"] = kw["password"]
jpayne@69	76 del kw["password"]
jpayne@69	77 # kw["charset"] = "utf8"
jpayne@69	78 # kw["use_unicode"] = True
jpayne@69	79 else:
jpayne@69	80 # DB-API recommendations
jpayne@69	81 if "db" in kw:
jpayne@69	82 kw["database"] = kw["db"]
jpayne@69	83 del kw["db"]
jpayne@69	84 if "passwd" in kw:
jpayne@69	85 kw["password"] = kw["passwd"]
jpayne@69	86 del kw["passwd"]
jpayne@69	87 if driver in ["psycopg2", "pgdb"] and not kw.get("database"):
jpayne@69	88 kw["database"] = "template1"
jpayne@69	89 # SQLite connect takes the database name as input
jpayne@69	90 if os.name == "java":
jpayne@69	91 if driver in ["MySQLdb"]:
jpayne@69	92 conn = connect(
jpayne@69	93 url_pref + kw.get("database", "mysql"),
jpayne@69	94 kw["user"],
jpayne@69	95 kw["password"],
jpayne@69	96 jdbc_driver,
jpayne@69	97 )
jpayne@69	98 elif driver in ["psycopg2"]:
jpayne@69	99 conn = connect(
jpayne@69	100 url_pref + kw.get("database", "postgresql") + "?stringtype=unspecified",
jpayne@69	101 kw["user"],
jpayne@69	102 kw["password"],
jpayne@69	103 jdbc_driver,
jpayne@69	104 )
jpayne@69	105 elif driver in ["sqlite3"]:
jpayne@69	106 conn = connect(kw["database"])
jpayne@69	107 else:
jpayne@69	108 conn = connect(**kw)
jpayne@69	109
jpayne@69	110 if os.name == "java":
jpayne@69	111 server = DBServer(conn, module, driver)
jpayne@69	112 else:
jpayne@69	113 server = DBServer(conn, module)
jpayne@69	114
jpayne@69	115 # Sets MySQL to allow double quotes, rather than only backticks
jpayne@69	116 if driver in ["MySQLdb", "mysql.connector"]:
jpayne@69	117 server.adaptor.execute("SET sql_mode='ANSI_QUOTES';")
jpayne@69	118
jpayne@69	119 # TODO - Remove the following once BioSQL Bug 2839 is fixed.
jpayne@69	120 # Test for RULES in PostgreSQL schema, see also Bug 2833.
jpayne@69	121 if driver in ["psycopg2", "pgdb"]:
jpayne@69	122 sql = (
jpayne@69	123 "SELECT ev_class FROM pg_rewrite WHERE "
jpayne@69	124 "rulename='rule_bioentry_i1' OR "
jpayne@69	125 "rulename='rule_bioentry_i2';"
jpayne@69	126 )
jpayne@69	127 if server.adaptor.execute_and_fetchall(sql):
jpayne@69	128 import warnings
jpayne@69	129 from Bio import BiopythonWarning
jpayne@69	130
jpayne@69	131 warnings.warn(
jpayne@69	132 "Your BioSQL PostgreSQL schema includes some rules "
jpayne@69	133 "currently required for bioperl-db but which may"
jpayne@69	134 "cause problems loading data using Biopython (see "
jpayne@69	135 "BioSQL's RedMine Bug 2839 aka GitHub Issue 4 "
jpayne@69	136 "https://github.com/biosql/biosql/issues/4). "
jpayne@69	137 "If you do not use BioPerl, please remove these "
jpayne@69	138 "rules. Biopython should cope with the rules "
jpayne@69	139 "present, but with a performance penalty when "
jpayne@69	140 "loading new records.",
jpayne@69	141 BiopythonWarning,
jpayne@69	142 )
jpayne@69	143 global _POSTGRES_RULES_PRESENT
jpayne@69	144 _POSTGRES_RULES_PRESENT = True
jpayne@69	145
jpayne@69	146 elif driver == "sqlite3":
jpayne@69	147 # Tell SQLite that we want to use foreign keys
jpayne@69	148 # https://www.sqlite.org/foreignkeys.html#fk_enable
jpayne@69	149 server.adaptor.execute("PRAGMA foreign_keys = ON")
jpayne@69	150
jpayne@69	151 return server
jpayne@69	152
jpayne@69	153
jpayne@69	154 class DBServer:
jpayne@69	155 """Represents a BioSQL database containing namespaces (sub-databases).
jpayne@69	156
jpayne@69	157 This acts like a Python dictionary, giving access to each namespace
jpayne@69	158 (defined by a row in the biodatabase table) as a BioSeqDatabase object.
jpayne@69	159 """
jpayne@69	160
jpayne@69	161 def __init__(self, conn, module, module_name=None):
jpayne@69	162 """Create a DBServer object.
jpayne@69	163
jpayne@69	164 Arguments:
jpayne@69	165 - conn - A database connection object
jpayne@69	166 - module - The module used to create the database connection
jpayne@69	167 - module_name - Optionally, the name of the module. Default: module.__name__
jpayne@69	168
jpayne@69	169 Normally you would not want to create a DBServer object yourself.
jpayne@69	170 Instead use the open_database function, which returns an instance of DBServer.
jpayne@69	171 """
jpayne@69	172 self.module = module
jpayne@69	173 if module_name is None:
jpayne@69	174 module_name = module.__name__
jpayne@69	175 if module_name == "mysql.connector":
jpayne@69	176 wrap_cursor = True
jpayne@69	177 else:
jpayne@69	178 wrap_cursor = False
jpayne@69	179 # Get module specific Adaptor or the base (general) Adaptor
jpayne@69	180 Adapt = _interface_specific_adaptors.get(module_name, Adaptor)
jpayne@69	181 self.adaptor = Adapt(
jpayne@69	182 conn, DBUtils.get_dbutils(module_name), wrap_cursor=wrap_cursor
jpayne@69	183 )
jpayne@69	184 self.module_name = module_name
jpayne@69	185
jpayne@69	186 def __repr__(self):
jpayne@69	187 """Return a short description of the class name and database connection."""
jpayne@69	188 return f"{self.__class__.__name__}({self.adaptor.conn!r})"
jpayne@69	189
jpayne@69	190 def __getitem__(self, name):
jpayne@69	191 """Return a BioSeqDatabase object.
jpayne@69	192
jpayne@69	193 Arguments:
jpayne@69	194 - name - The name of the BioSeqDatabase
jpayne@69	195
jpayne@69	196 """
jpayne@69	197 return BioSeqDatabase(self.adaptor, name)
jpayne@69	198
jpayne@69	199 def __len__(self):
jpayne@69	200 """Return number of namespaces (sub-databases) in this database."""
jpayne@69	201 sql = "SELECT COUNT(name) FROM biodatabase;"
jpayne@69	202 return int(self.adaptor.execute_and_fetch_col0(sql)[0])
jpayne@69	203
jpayne@69	204 def __contains__(self, value):
jpayne@69	205 """Check if a namespace (sub-database) in this database."""
jpayne@69	206 sql = "SELECT COUNT(name) FROM biodatabase WHERE name=%s;"
jpayne@69	207 return bool(self.adaptor.execute_and_fetch_col0(sql, (value,))[0])
jpayne@69	208
jpayne@69	209 def __iter__(self):
jpayne@69	210 """Iterate over namespaces (sub-databases) in the database."""
jpayne@69	211 # TODO - Iterate over the cursor, much more efficient
jpayne@69	212 return iter(self.adaptor.list_biodatabase_names())
jpayne@69	213
jpayne@69	214 def keys(self):
jpayne@69	215 """Iterate over namespaces (sub-databases) in the database."""
jpayne@69	216 return iter(self)
jpayne@69	217
jpayne@69	218 def values(self):
jpayne@69	219 """Iterate over BioSeqDatabase objects in the database."""
jpayne@69	220 for key in self:
jpayne@69	221 yield self[key]
jpayne@69	222
jpayne@69	223 def items(self):
jpayne@69	224 """Iterate over (namespace, BioSeqDatabase) in the database."""
jpayne@69	225 for key in self:
jpayne@69	226 yield key, self[key]
jpayne@69	227
jpayne@69	228 def __delitem__(self, name):
jpayne@69	229 """Remove a namespace and all its entries."""
jpayne@69	230 if name not in self:
jpayne@69	231 raise KeyError(name)
jpayne@69	232 db_id = self.adaptor.fetch_dbid_by_dbname(name)
jpayne@69	233 remover = Loader.DatabaseRemover(self.adaptor, db_id)
jpayne@69	234 remover.remove()
jpayne@69	235
jpayne@69	236 def new_database(self, db_name, authority=None, description=None):
jpayne@69	237 """Add a new database to the server and return it."""
jpayne@69	238 # make the database
jpayne@69	239 sql = (
jpayne@69	240 "INSERT INTO biodatabase (name, authority, description)"
jpayne@69	241 " VALUES (%s, %s, %s)"
jpayne@69	242 )
jpayne@69	243 self.adaptor.execute(sql, (db_name, authority, description))
jpayne@69	244 return BioSeqDatabase(self.adaptor, db_name)
jpayne@69	245
jpayne@69	246 def load_database_sql(self, sql_file):
jpayne@69	247 """Load a database schema into the given database.
jpayne@69	248
jpayne@69	249 This is used to create tables, etc when a database is first created.
jpayne@69	250 sql_file should specify the complete path to a file containing
jpayne@69	251 SQL entries for building the tables.
jpayne@69	252 """
jpayne@69	253 # Not sophisticated enough for PG schema. Is it needed by MySQL?
jpayne@69	254 # Looks like we need this more complicated way for both. Leaving it
jpayne@69	255 # the default and removing the simple-minded approach.
jpayne@69	256
jpayne@69	257 # read the file with all comment lines removed
jpayne@69	258 sql = ""
jpayne@69	259 with open(sql_file) as sql_handle:
jpayne@69	260 for line in sql_handle:
jpayne@69	261 if line.startswith("--"): # don't include comment lines
jpayne@69	262 pass
jpayne@69	263 elif line.startswith("#"): # ditto for MySQL comments
jpayne@69	264 pass
jpayne@69	265 elif line.strip(): # only include non-blank lines
jpayne@69	266 sql += line.strip() + " "
jpayne@69	267
jpayne@69	268 # two ways to load the SQL
jpayne@69	269 # 1. PostgreSQL can load it all at once and actually needs to
jpayne@69	270 # due to FUNCTION defines at the end of the SQL which mess up
jpayne@69	271 # the splitting by semicolons
jpayne@69	272 if self.module_name in ["psycopg2", "pgdb"]:
jpayne@69	273 self.adaptor.cursor.execute(sql)
jpayne@69	274 # 2. MySQL needs the database loading split up into single lines of
jpayne@69	275 # SQL executed one at a time
jpayne@69	276 elif self.module_name in ["mysql.connector", "MySQLdb", "sqlite3"]:
jpayne@69	277 sql_parts = sql.split(";") # one line per sql command
jpayne@69	278 # don't use the last item, it's blank
jpayne@69	279 for sql_line in sql_parts[:-1]:
jpayne@69	280 self.adaptor.cursor.execute(sql_line)
jpayne@69	281 else:
jpayne@69	282 raise ValueError(f"Module {self.module_name} not supported by the loader.")
jpayne@69	283
jpayne@69	284 def commit(self):
jpayne@69	285 """Commit the current transaction to the database."""
jpayne@69	286 return self.adaptor.commit()
jpayne@69	287
jpayne@69	288 def rollback(self):
jpayne@69	289 """Roll-back the current transaction."""
jpayne@69	290 return self.adaptor.rollback()
jpayne@69	291
jpayne@69	292 def close(self):
jpayne@69	293 """Close the connection. No further activity possible."""
jpayne@69	294 return self.adaptor.close()
jpayne@69	295
jpayne@69	296
jpayne@69	297 class _CursorWrapper:
jpayne@69	298 """A wrapper for mysql.connector resolving bytestring representations."""
jpayne@69	299
jpayne@69	300 def __init__(self, real_cursor):
jpayne@69	301 self.real_cursor = real_cursor
jpayne@69	302
jpayne@69	303 def execute(self, operation, params=None, multi=False):
jpayne@69	304 """Execute a sql statement."""
jpayne@69	305 self.real_cursor.execute(operation, params, multi)
jpayne@69	306
jpayne@69	307 def executemany(self, operation, params):
jpayne@69	308 """Execute many sql statements."""
jpayne@69	309 self.real_cursor.executemany(operation, params)
jpayne@69	310
jpayne@69	311 def _convert_tuple(self, tuple_):
jpayne@69	312 """Decode any bytestrings present in the row (PRIVATE)."""
jpayne@69	313 tuple_list = list(tuple_)
jpayne@69	314 for i, elem in enumerate(tuple_list):
jpayne@69	315 if isinstance(elem, bytes):
jpayne@69	316 tuple_list[i] = elem.decode("utf-8")
jpayne@69	317 return tuple(tuple_list)
jpayne@69	318
jpayne@69	319 def _convert_list(self, lst):
jpayne@69	320 ret_lst = []
jpayne@69	321 for tuple_ in lst:
jpayne@69	322 new_tuple = self._convert_tuple(tuple_)
jpayne@69	323 ret_lst.append(new_tuple)
jpayne@69	324 return ret_lst
jpayne@69	325
jpayne@69	326 def fetchall(self):
jpayne@69	327 rv = self.real_cursor.fetchall()
jpayne@69	328 return self._convert_list(rv)
jpayne@69	329
jpayne@69	330 def fetchone(self):
jpayne@69	331 tuple_ = self.real_cursor.fetchone()
jpayne@69	332 return self._convert_tuple(tuple_)
jpayne@69	333
jpayne@69	334
jpayne@69	335 class Adaptor:
jpayne@69	336 """High level wrapper for a database connection and cursor.
jpayne@69	337
jpayne@69	338 Most database calls in BioSQL are done indirectly though this adaptor
jpayne@69	339 class. This provides helper methods for fetching data and executing
jpayne@69	340 sql.
jpayne@69	341 """
jpayne@69	342
jpayne@69	343 def __init__(self, conn, dbutils, wrap_cursor=False):
jpayne@69	344 """Create an Adaptor object.
jpayne@69	345
jpayne@69	346 Arguments:
jpayne@69	347 - conn - A database connection
jpayne@69	348 - dbutils - A BioSQL.DBUtils object
jpayne@69	349 - wrap_cursor - Optional, whether to wrap the cursor object
jpayne@69	350
jpayne@69	351 """
jpayne@69	352 self.conn = conn
jpayne@69	353 if wrap_cursor:
jpayne@69	354 self.cursor = _CursorWrapper(conn.cursor())
jpayne@69	355 else:
jpayne@69	356 self.cursor = conn.cursor()
jpayne@69	357 self.dbutils = dbutils
jpayne@69	358
jpayne@69	359 def last_id(self, table):
jpayne@69	360 """Return the last row id for the selected table."""
jpayne@69	361 return self.dbutils.last_id(self.cursor, table)
jpayne@69	362
jpayne@69	363 def autocommit(self, y=True):
jpayne@69	364 """Set the autocommit mode. True values enable; False value disable."""
jpayne@69	365 return self.dbutils.autocommit(self.conn, y)
jpayne@69	366
jpayne@69	367 def commit(self):
jpayne@69	368 """Commit the current transaction."""
jpayne@69	369 return self.conn.commit()
jpayne@69	370
jpayne@69	371 def rollback(self):
jpayne@69	372 """Roll-back the current transaction."""
jpayne@69	373 return self.conn.rollback()
jpayne@69	374
jpayne@69	375 def close(self):
jpayne@69	376 """Close the connection. No further activity possible."""
jpayne@69	377 return self.conn.close()
jpayne@69	378
jpayne@69	379 def fetch_dbid_by_dbname(self, dbname):
jpayne@69	380 """Return the internal id for the sub-database using its name."""
jpayne@69	381 self.execute(
jpayne@69	382 "select biodatabase_id from biodatabase where name = %s", (dbname,)
jpayne@69	383 )
jpayne@69	384 rv = self.cursor.fetchall()
jpayne@69	385 if not rv:
jpayne@69	386 raise KeyError(f"Cannot find biodatabase with name {dbname!r}")
jpayne@69	387 return rv[0][0]
jpayne@69	388
jpayne@69	389 def fetch_seqid_by_display_id(self, dbid, name):
jpayne@69	390 """Return the internal id for a sequence using its display id.
jpayne@69	391
jpayne@69	392 Arguments:
jpayne@69	393 - dbid - the internal id for the sub-database
jpayne@69	394 - name - the name of the sequence. Corresponds to the
jpayne@69	395 name column of the bioentry table of the SQL schema
jpayne@69	396
jpayne@69	397 """
jpayne@69	398 sql = "select bioentry_id from bioentry where name = %s"
jpayne@69	399 fields = [name]
jpayne@69	400 if dbid:
jpayne@69	401 sql += " and biodatabase_id = %s"
jpayne@69	402 fields.append(dbid)
jpayne@69	403 self.execute(sql, fields)
jpayne@69	404 rv = self.cursor.fetchall()
jpayne@69	405 if not rv:
jpayne@69	406 raise IndexError(f"Cannot find display id {name!r}")
jpayne@69	407 if len(rv) > 1:
jpayne@69	408 raise IndexError(f"More than one entry with display id {name!r}")
jpayne@69	409 return rv[0][0]
jpayne@69	410
jpayne@69	411 def fetch_seqid_by_accession(self, dbid, name):
jpayne@69	412 """Return the internal id for a sequence using its accession.
jpayne@69	413
jpayne@69	414 Arguments:
jpayne@69	415 - dbid - the internal id for the sub-database
jpayne@69	416 - name - the accession of the sequence. Corresponds to the
jpayne@69	417 accession column of the bioentry table of the SQL schema
jpayne@69	418
jpayne@69	419 """
jpayne@69	420 sql = "select bioentry_id from bioentry where accession = %s"
jpayne@69	421 fields = [name]
jpayne@69	422 if dbid:
jpayne@69	423 sql += " and biodatabase_id = %s"
jpayne@69	424 fields.append(dbid)
jpayne@69	425 self.execute(sql, fields)
jpayne@69	426 rv = self.cursor.fetchall()
jpayne@69	427 if not rv:
jpayne@69	428 raise IndexError(f"Cannot find accession {name!r}")
jpayne@69	429 if len(rv) > 1:
jpayne@69	430 raise IndexError(f"More than one entry with accession {name!r}")
jpayne@69	431 return rv[0][0]
jpayne@69	432
jpayne@69	433 def fetch_seqids_by_accession(self, dbid, name):
jpayne@69	434 """Return a list internal ids using an accession.
jpayne@69	435
jpayne@69	436 Arguments:
jpayne@69	437 - dbid - the internal id for the sub-database
jpayne@69	438 - name - the accession of the sequence. Corresponds to the
jpayne@69	439 accession column of the bioentry table of the SQL schema
jpayne@69	440
jpayne@69	441 """
jpayne@69	442 sql = "select bioentry_id from bioentry where accession = %s"
jpayne@69	443 fields = [name]
jpayne@69	444 if dbid:
jpayne@69	445 sql += " and biodatabase_id = %s"
jpayne@69	446 fields.append(dbid)
jpayne@69	447 return self.execute_and_fetch_col0(sql, fields)
jpayne@69	448
jpayne@69	449 def fetch_seqid_by_version(self, dbid, name):
jpayne@69	450 """Return the internal id for a sequence using its accession and version.
jpayne@69	451
jpayne@69	452 Arguments:
jpayne@69	453 - dbid - the internal id for the sub-database
jpayne@69	454 - name - the accession of the sequence containing a version number.
jpayne@69	455 Must correspond to <accession>.<version>
jpayne@69	456
jpayne@69	457 """
jpayne@69	458 acc_version = name.split(".")
jpayne@69	459 if len(acc_version) > 2:
jpayne@69	460 raise IndexError(f"Bad version {name!r}")
jpayne@69	461 acc = acc_version[0]
jpayne@69	462 if len(acc_version) == 2:
jpayne@69	463 version = acc_version[1]
jpayne@69	464 else:
jpayne@69	465 version = "0"
jpayne@69	466 sql = "SELECT bioentry_id FROM bioentry WHERE accession = %s AND version = %s"
jpayne@69	467 fields = [acc, version]
jpayne@69	468 if dbid:
jpayne@69	469 sql += " and biodatabase_id = %s"
jpayne@69	470 fields.append(dbid)
jpayne@69	471 self.execute(sql, fields)
jpayne@69	472 rv = self.cursor.fetchall()
jpayne@69	473 if not rv:
jpayne@69	474 raise IndexError(f"Cannot find version {name!r}")
jpayne@69	475 if len(rv) > 1:
jpayne@69	476 raise IndexError(f"More than one entry with version {name!r}")
jpayne@69	477 return rv[0][0]
jpayne@69	478
jpayne@69	479 def fetch_seqid_by_identifier(self, dbid, identifier):
jpayne@69	480 """Return the internal id for a sequence using its identifier.
jpayne@69	481
jpayne@69	482 Arguments:
jpayne@69	483 - dbid - the internal id for the sub-database
jpayne@69	484 - identifier - the identifier of the sequence. Corresponds to
jpayne@69	485 the identifier column of the bioentry table in the SQL schema.
jpayne@69	486
jpayne@69	487 """
jpayne@69	488 # YB: was fetch_seqid_by_seqid
jpayne@69	489 sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
jpayne@69	490 fields = [identifier]
jpayne@69	491 if dbid:
jpayne@69	492 sql += " and biodatabase_id = %s"
jpayne@69	493 fields.append(dbid)
jpayne@69	494 self.execute(sql, fields)
jpayne@69	495 rv = self.cursor.fetchall()
jpayne@69	496 if not rv:
jpayne@69	497 raise IndexError(f"Cannot find display id {identifier!r}")
jpayne@69	498 return rv[0][0]
jpayne@69	499
jpayne@69	500 def list_biodatabase_names(self):
jpayne@69	501 """Return a list of all of the sub-databases."""
jpayne@69	502 return self.execute_and_fetch_col0("SELECT name FROM biodatabase")
jpayne@69	503
jpayne@69	504 def list_bioentry_ids(self, dbid):
jpayne@69	505 """Return a list of internal ids for all of the sequences in a sub-databae.
jpayne@69	506
jpayne@69	507 Arguments:
jpayne@69	508 - dbid - The internal id for a sub-database
jpayne@69	509
jpayne@69	510 """
jpayne@69	511 return self.execute_and_fetch_col0(
jpayne@69	512 "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s", (dbid,)
jpayne@69	513 )
jpayne@69	514
jpayne@69	515 def list_bioentry_display_ids(self, dbid):
jpayne@69	516 """Return a list of all sequence names in a sub-databae.
jpayne@69	517
jpayne@69	518 Arguments:
jpayne@69	519 - dbid - The internal id for a sub-database
jpayne@69	520
jpayne@69	521 """
jpayne@69	522 return self.execute_and_fetch_col0(
jpayne@69	523 "SELECT name FROM bioentry WHERE biodatabase_id = %s", (dbid,)
jpayne@69	524 )
jpayne@69	525
jpayne@69	526 def list_any_ids(self, sql, args):
jpayne@69	527 """Return ids given a SQL statement to select for them.
jpayne@69	528
jpayne@69	529 This assumes that the given SQL does a SELECT statement that
jpayne@69	530 returns a list of items. This parses them out of the 2D list
jpayne@69	531 they come as and just returns them in a list.
jpayne@69	532 """
jpayne@69	533 return self.execute_and_fetch_col0(sql, args)
jpayne@69	534
jpayne@69	535 def execute_one(self, sql, args=None):
jpayne@69	536 """Execute sql that returns 1 record, and return the record."""
jpayne@69	537 self.execute(sql, args or ())
jpayne@69	538 rv = self.cursor.fetchall()
jpayne@69	539 if len(rv) != 1:
jpayne@69	540 raise ValueError(f"Expected 1 response, got {len(rv)}.")
jpayne@69	541 return rv[0]
jpayne@69	542
jpayne@69	543 def execute(self, sql, args=None):
jpayne@69	544 """Just execute an sql command."""
jpayne@69	545 if os.name == "java":
jpayne@69	546 sql = sql.replace("%s", "?")
jpayne@69	547 self.dbutils.execute(self.cursor, sql, args)
jpayne@69	548
jpayne@69	549 def executemany(self, sql, args):
jpayne@69	550 """Execute many sql commands."""
jpayne@69	551 if os.name == "java":
jpayne@69	552 sql = sql.replace("%s", "?")
jpayne@69	553 self.dbutils.executemany(self.cursor, sql, args)
jpayne@69	554
jpayne@69	555 def get_subseq_as_string(self, seqid, start, end):
jpayne@69	556 """Return a substring of a sequence.
jpayne@69	557
jpayne@69	558 Arguments:
jpayne@69	559 - seqid - The internal id for the sequence
jpayne@69	560 - start - The start position of the sequence; 0-indexed
jpayne@69	561 - end - The end position of the sequence
jpayne@69	562
jpayne@69	563 """
jpayne@69	564 length = end - start
jpayne@69	565 # XXX Check this on MySQL and PostgreSQL. substr should be general,
jpayne@69	566 # does it need dbutils?
jpayne@69	567 # return self.execute_one(
jpayne@69	568 # """select SUBSTRING(seq FROM %s FOR %s)
jpayne@69	569 # from biosequence where bioentry_id = %s""",
jpayne@69	570 # (start+1, length, seqid))[0]
jpayne@69	571 return self.execute_one(
jpayne@69	572 "SELECT SUBSTR(seq, %s, %s) FROM biosequence WHERE bioentry_id = %s",
jpayne@69	573 (start + 1, length, seqid),
jpayne@69	574 )[0]
jpayne@69	575
jpayne@69	576 def execute_and_fetch_col0(self, sql, args=None):
jpayne@69	577 """Return a list of values from the first column in the row."""
jpayne@69	578 self.execute(sql, args or ())
jpayne@69	579 return [field[0] for field in self.cursor.fetchall()]
jpayne@69	580
jpayne@69	581 def execute_and_fetchall(self, sql, args=None):
jpayne@69	582 """Return a list of tuples of all rows."""
jpayne@69	583 self.execute(sql, args or ())
jpayne@69	584 return self.cursor.fetchall()
jpayne@69	585
jpayne@69	586
jpayne@69	587 class MysqlConnectorAdaptor(Adaptor):
jpayne@69	588 """A BioSQL Adaptor class with fixes for the MySQL interface.
jpayne@69	589
jpayne@69	590 BioSQL was failing due to returns of bytearray objects from
jpayne@69	591 the mysql-connector-python database connector. This adaptor
jpayne@69	592 class scrubs returns of bytearrays and of byte strings converting
jpayne@69	593 them to string objects instead. This adaptor class was made in
jpayne@69	594 response to backwards incompatible changes added to
jpayne@69	595 mysql-connector-python in release 2.0.0 of the package.
jpayne@69	596 """
jpayne@69	597
jpayne@69	598 @staticmethod
jpayne@69	599 def _bytearray_to_str(s):
jpayne@69	600 """If s is bytes or bytearray, convert to a string (PRIVATE)."""
jpayne@69	601 if isinstance(s, (bytes, bytearray)):
jpayne@69	602 return s.decode()
jpayne@69	603 return s
jpayne@69	604
jpayne@69	605 def execute_one(self, sql, args=None):
jpayne@69	606 """Execute sql that returns 1 record, and return the record."""
jpayne@69	607 out = super().execute_one(sql, args)
jpayne@69	608 return tuple(self._bytearray_to_str(v) for v in out)
jpayne@69	609
jpayne@69	610 def execute_and_fetch_col0(self, sql, args=None):
jpayne@69	611 """Return a list of values from the first column in the row."""
jpayne@69	612 out = super().execute_and_fetch_col0(sql, args)
jpayne@69	613 return [self._bytearray_to_str(column) for column in out]
jpayne@69	614
jpayne@69	615 def execute_and_fetchall(self, sql, args=None):
jpayne@69	616 """Return a list of tuples of all rows."""
jpayne@69	617 out = super().execute_and_fetchall(sql, args)
jpayne@69	618 return [tuple(self._bytearray_to_str(v) for v in o) for o in out]
jpayne@69	619
jpayne@69	620
jpayne@69	621 _interface_specific_adaptors = {
jpayne@69	622 # If SQL interfaces require a specific adaptor, use this to map the adaptor
jpayne@69	623 "mysql.connector": MysqlConnectorAdaptor,
jpayne@69	624 "MySQLdb": MysqlConnectorAdaptor,
jpayne@69	625 }
jpayne@69	626
jpayne@69	627 _allowed_lookups = {
jpayne@69	628 # Lookup name / function name to get id, function to list all ids
jpayne@69	629 "primary_id": "fetch_seqid_by_identifier",
jpayne@69	630 "gi": "fetch_seqid_by_identifier",
jpayne@69	631 "display_id": "fetch_seqid_by_display_id",
jpayne@69	632 "name": "fetch_seqid_by_display_id",
jpayne@69	633 "accession": "fetch_seqid_by_accession",
jpayne@69	634 "version": "fetch_seqid_by_version",
jpayne@69	635 }
jpayne@69	636
jpayne@69	637
jpayne@69	638 class BioSeqDatabase:
jpayne@69	639 """Represents a namespace (sub-database) within the BioSQL database.
jpayne@69	640
jpayne@69	641 i.e. One row in the biodatabase table, and all all rows in the bioentry
jpayne@69	642 table associated with it.
jpayne@69	643 """
jpayne@69	644
jpayne@69	645 def __init__(self, adaptor, name):
jpayne@69	646 """Create a BioDatabase object.
jpayne@69	647
jpayne@69	648 Arguments:
jpayne@69	649 - adaptor - A BioSQL.Adaptor object
jpayne@69	650 - name - The name of the sub-database (namespace)
jpayne@69	651
jpayne@69	652 """
jpayne@69	653 self.adaptor = adaptor
jpayne@69	654 self.name = name
jpayne@69	655 self.dbid = self.adaptor.fetch_dbid_by_dbname(name)
jpayne@69	656
jpayne@69	657 def __repr__(self):
jpayne@69	658 """Return a short summary of the BioSeqDatabase."""
jpayne@69	659 return f"BioSeqDatabase({self.adaptor!r}, {self.name!r})"
jpayne@69	660
jpayne@69	661 def get_Seq_by_id(self, name):
jpayne@69	662 """Get a DBSeqRecord object by its name.
jpayne@69	663
jpayne@69	664 Example: seq_rec = db.get_Seq_by_id('ROA1_HUMAN')
jpayne@69	665
jpayne@69	666 The name of this method is misleading since it returns a DBSeqRecord
jpayne@69	667 rather than a Seq object, and presumably was to mirror BioPerl.
jpayne@69	668 """
jpayne@69	669 seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
jpayne@69	670 return BioSeq.DBSeqRecord(self.adaptor, seqid)
jpayne@69	671
jpayne@69	672 def get_Seq_by_acc(self, name):
jpayne@69	673 """Get a DBSeqRecord object by accession number.
jpayne@69	674
jpayne@69	675 Example: seq_rec = db.get_Seq_by_acc('X77802')
jpayne@69	676
jpayne@69	677 The name of this method is misleading since it returns a DBSeqRecord
jpayne@69	678 rather than a Seq object, and presumably was to mirror BioPerl.
jpayne@69	679 """
jpayne@69	680 seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
jpayne@69	681 return BioSeq.DBSeqRecord(self.adaptor, seqid)
jpayne@69	682
jpayne@69	683 def get_Seq_by_ver(self, name):
jpayne@69	684 """Get a DBSeqRecord object by version number.
jpayne@69	685
jpayne@69	686 Example: seq_rec = db.get_Seq_by_ver('X77802.1')
jpayne@69	687
jpayne@69	688 The name of this method is misleading since it returns a DBSeqRecord
jpayne@69	689 rather than a Seq object, and presumably was to mirror BioPerl.
jpayne@69	690 """
jpayne@69	691 seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
jpayne@69	692 return BioSeq.DBSeqRecord(self.adaptor, seqid)
jpayne@69	693
jpayne@69	694 def get_Seqs_by_acc(self, name):
jpayne@69	695 """Get a list of DBSeqRecord objects by accession number.
jpayne@69	696
jpayne@69	697 Example: seq_recs = db.get_Seq_by_acc('X77802')
jpayne@69	698
jpayne@69	699 The name of this method is misleading since it returns a list of
jpayne@69	700 DBSeqRecord objects rather than a list of Seq objects, and presumably
jpayne@69	701 was to mirror BioPerl.
jpayne@69	702 """
jpayne@69	703 seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
jpayne@69	704 return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]
jpayne@69	705
jpayne@69	706 def __getitem__(self, key):
jpayne@69	707 """Return a DBSeqRecord for one of the sequences in the sub-database.
jpayne@69	708
jpayne@69	709 Arguments:
jpayne@69	710 - key - The internal id for the sequence
jpayne@69	711
jpayne@69	712 """
jpayne@69	713 record = BioSeq.DBSeqRecord(self.adaptor, key)
jpayne@69	714 if record._biodatabase_id != self.dbid:
jpayne@69	715 raise KeyError(f"Entry {key!r} does exist, but not in current name space")
jpayne@69	716 return record
jpayne@69	717
jpayne@69	718 def __delitem__(self, key):
jpayne@69	719 """Remove an entry and all its annotation."""
jpayne@69	720 if key not in self:
jpayne@69	721 raise KeyError(
jpayne@69	722 f"Entry {key!r} cannot be deleted. It was not found or is invalid"
jpayne@69	723 )
jpayne@69	724 # Assuming this will automatically cascade to the other tables...
jpayne@69	725 sql = "DELETE FROM bioentry WHERE biodatabase_id=%s AND bioentry_id=%s;"
jpayne@69	726 self.adaptor.execute(sql, (self.dbid, key))
jpayne@69	727
jpayne@69	728 def __len__(self):
jpayne@69	729 """Return number of records in this namespace (sub database)."""
jpayne@69	730 sql = "SELECT COUNT(bioentry_id) FROM bioentry WHERE biodatabase_id=%s;"
jpayne@69	731 return int(self.adaptor.execute_and_fetch_col0(sql, (self.dbid,))[0])
jpayne@69	732
jpayne@69	733 def __contains__(self, value):
jpayne@69	734 """Check if a primary (internal) id is this namespace (sub database)."""
jpayne@69	735 sql = (
jpayne@69	736 "SELECT COUNT(bioentry_id) FROM bioentry "
jpayne@69	737 "WHERE biodatabase_id=%s AND bioentry_id=%s;"
jpayne@69	738 )
jpayne@69	739 # The bioentry_id field is an integer in the schema.
jpayne@69	740 # PostgreSQL will throw an error if we use a non integer in the query.
jpayne@69	741 try:
jpayne@69	742 bioentry_id = int(value)
jpayne@69	743 except ValueError:
jpayne@69	744 return False
jpayne@69	745 return bool(
jpayne@69	746 self.adaptor.execute_and_fetch_col0(sql, (self.dbid, bioentry_id))[0]
jpayne@69	747 )
jpayne@69	748
jpayne@69	749 def __iter__(self):
jpayne@69	750 """Iterate over ids (which may not be meaningful outside this database)."""
jpayne@69	751 # TODO - Iterate over the cursor, much more efficient
jpayne@69	752 return iter(self.adaptor.list_bioentry_ids(self.dbid))
jpayne@69	753
jpayne@69	754 def keys(self):
jpayne@69	755 """Iterate over ids (which may not be meaningful outside this database)."""
jpayne@69	756 return iter(self)
jpayne@69	757
jpayne@69	758 def values(self):
jpayne@69	759 """Iterate over DBSeqRecord objects in the namespace (sub database)."""
jpayne@69	760 for key in self:
jpayne@69	761 yield self[key]
jpayne@69	762
jpayne@69	763 def items(self):
jpayne@69	764 """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
jpayne@69	765 for key in self:
jpayne@69	766 yield key, self[key]
jpayne@69	767
jpayne@69	768 def lookup(self, **kwargs):
jpayne@69	769 """Return a DBSeqRecord using an acceptable identifier.
jpayne@69	770
jpayne@69	771 Arguments:
jpayne@69	772 - kwargs - A single key-value pair where the key is one
jpayne@69	773 of primary_id, gi, display_id, name, accession, version
jpayne@69	774
jpayne@69	775 """
jpayne@69	776 if len(kwargs) != 1:
jpayne@69	777 raise TypeError("single key/value parameter expected")
jpayne@69	778 k, v = list(kwargs.items())[0]
jpayne@69	779 if k not in _allowed_lookups:
jpayne@69	780 raise TypeError(
jpayne@69	781 f"lookup() expects one of {list(_allowed_lookups.keys())!r}, not {k!r}"
jpayne@69	782 )
jpayne@69	783 lookup_name = _allowed_lookups[k]
jpayne@69	784 lookup_func = getattr(self.adaptor, lookup_name)
jpayne@69	785 seqid = lookup_func(self.dbid, v)
jpayne@69	786 return BioSeq.DBSeqRecord(self.adaptor, seqid)
jpayne@69	787
jpayne@69	788 def load(self, record_iterator, fetch_NCBI_taxonomy=False):
jpayne@69	789 """Load a set of SeqRecords into the BioSQL database.
jpayne@69	790
jpayne@69	791 record_iterator is either a list of SeqRecord objects, or an
jpayne@69	792 Iterator object that returns SeqRecord objects (such as the
jpayne@69	793 output from the Bio.SeqIO.parse() function), which will be
jpayne@69	794 used to populate the database.
jpayne@69	795
jpayne@69	796 fetch_NCBI_taxonomy is boolean flag allowing or preventing
jpayne@69	797 connection to the taxonomic database on the NCBI server
jpayne@69	798 (via Bio.Entrez) to fetch a detailed taxonomy for each
jpayne@69	799 SeqRecord.
jpayne@69	800
jpayne@69	801 Example::
jpayne@69	802
jpayne@69	803 from Bio import SeqIO
jpayne@69	804 count = db.load(SeqIO.parse(open(filename), format))
jpayne@69	805
jpayne@69	806 Returns the number of records loaded.
jpayne@69	807 """
jpayne@69	808 db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid, fetch_NCBI_taxonomy)
jpayne@69	809 num_records = 0
jpayne@69	810 global _POSTGRES_RULES_PRESENT
jpayne@69	811 for cur_record in record_iterator:
jpayne@69	812 num_records += 1
jpayne@69	813 # Hack to work around BioSQL Bug 2839 - If using PostgreSQL and
jpayne@69	814 # the RULES are present check for a duplicate record before loading
jpayne@69	815 if _POSTGRES_RULES_PRESENT:
jpayne@69	816 # Recreate what the Loader's _load_bioentry_table will do:
jpayne@69	817 if cur_record.id.count(".") == 1:
jpayne@69	818 accession, version = cur_record.id.split(".")
jpayne@69	819 try:
jpayne@69	820 version = int(version)
jpayne@69	821 except ValueError:
jpayne@69	822 accession = cur_record.id
jpayne@69	823 version = 0
jpayne@69	824 else:
jpayne@69	825 accession = cur_record.id
jpayne@69	826 version = 0
jpayne@69	827 gi = cur_record.annotations.get("gi")
jpayne@69	828 sql = (
jpayne@69	829 "SELECT bioentry_id FROM bioentry "
jpayne@69	830 "WHERE (identifier = '%s' AND biodatabase_id = '%s') "
jpayne@69	831 "OR (accession = '%s' AND version = '%s' AND biodatabase_id = '%s')"
jpayne@69	832 )
jpayne@69	833 self.adaptor.execute(
jpayne@69	834 sql % (gi, self.dbid, accession, version, self.dbid)
jpayne@69	835 )
jpayne@69	836 if self.adaptor.cursor.fetchone():
jpayne@69	837 raise self.adaptor.conn.IntegrityError(
jpayne@69	838 "Duplicate record detected: record has not been inserted"
jpayne@69	839 )
jpayne@69	840 # End of hack
jpayne@69	841 db_loader.load_seqrecord(cur_record)
jpayne@69	842 return num_records

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/BioSeqDatabase.py @ 69:33d812a61356