csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/Loader.py annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/Loader.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

rev	line source
jpayne@69	1 # Copyright 2002 by Andrew Dalke. All rights reserved.
jpayne@69	2 # Revisions 2007-2016 copyright by Peter Cock. All rights reserved.
jpayne@69	3 # Revisions 2008 copyright by Cymon J. Cox. All rights reserved.
jpayne@69	4 #
jpayne@69	5 # This file is part of the Biopython distribution and governed by your
jpayne@69	6 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@69	7 # Please see the LICENSE file that should have been included as part of this
jpayne@69	8 # package.
jpayne@69	9 #
jpayne@69	10 # Note that BioSQL (including the database schema and scripts) is
jpayne@69	11 # available and licensed separately. Please consult www.biosql.org
jpayne@69	12
jpayne@69	13 """Load biopython objects into a BioSQL database for persistent storage.
jpayne@69	14
jpayne@69	15 This code makes it possible to store biopython objects in a relational
jpayne@69	16 database and then retrieve them back. You shouldn't use any of the
jpayne@69	17 classes in this module directly. Rather, call the load() method on
jpayne@69	18 a database object.
jpayne@69	19 """
jpayne@69	20 # standard modules
jpayne@69	21 from time import gmtime, strftime
jpayne@69	22
jpayne@69	23 # biopython
jpayne@69	24 from Bio.SeqUtils.CheckSum import crc64
jpayne@69	25 from Bio import Entrez
jpayne@69	26 from Bio.Seq import UndefinedSequenceError
jpayne@69	27 from Bio.SeqFeature import UnknownPosition
jpayne@69	28
jpayne@69	29
jpayne@69	30 class DatabaseLoader:
jpayne@69	31 """Object used to load SeqRecord objects into a BioSQL database."""
jpayne@69	32
jpayne@69	33 def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
jpayne@69	34 """Initialize with connection information for the database.
jpayne@69	35
jpayne@69	36 Creating a DatabaseLoader object is normally handled via the
jpayne@69	37 BioSeqDatabase DBServer object, for example::
jpayne@69	38
jpayne@69	39 from BioSQL import BioSeqDatabase
jpayne@69	40 server = BioSeqDatabase.open_database(driver="MySQLdb",
jpayne@69	41 user="gbrowse",
jpayne@69	42 passwd="biosql",
jpayne@69	43 host="localhost",
jpayne@69	44 db="test_biosql")
jpayne@69	45 try:
jpayne@69	46 db = server["test"]
jpayne@69	47 except KeyError:
jpayne@69	48 db = server.new_database("test",
jpayne@69	49 description="For testing GBrowse")
jpayne@69	50
jpayne@69	51 """
jpayne@69	52 self.adaptor = adaptor
jpayne@69	53 self.dbid = dbid
jpayne@69	54 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
jpayne@69	55
jpayne@69	56 def load_seqrecord(self, record):
jpayne@69	57 """Load a Biopython SeqRecord into the database."""
jpayne@69	58 bioentry_id = self._load_bioentry_table(record)
jpayne@69	59 self._load_bioentry_date(record, bioentry_id)
jpayne@69	60 self._load_biosequence(record, bioentry_id)
jpayne@69	61 self._load_comment(record, bioentry_id)
jpayne@69	62 self._load_dbxrefs(record, bioentry_id)
jpayne@69	63 references = record.annotations.get("references", ())
jpayne@69	64 for reference, rank in zip(references, list(range(len(references)))):
jpayne@69	65 self._load_reference(reference, rank, bioentry_id)
jpayne@69	66 self._load_annotations(record, bioentry_id)
jpayne@69	67 for seq_feature_num in range(len(record.features)):
jpayne@69	68 seq_feature = record.features[seq_feature_num]
jpayne@69	69 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
jpayne@69	70
jpayne@69	71 def _get_ontology_id(self, name, definition=None):
jpayne@69	72 """Return identifier for the named ontology (PRIVATE).
jpayne@69	73
jpayne@69	74 This looks through the onotology table for a the given entry name.
jpayne@69	75 If it is not found, a row is added for this ontology (using the
jpayne@69	76 definition if supplied). In either case, the id corresponding to
jpayne@69	77 the provided name is returned, so that you can reference it in
jpayne@69	78 another table.
jpayne@69	79 """
jpayne@69	80 oids = self.adaptor.execute_and_fetch_col0(
jpayne@69	81 "SELECT ontology_id FROM ontology WHERE name = %s", (name,)
jpayne@69	82 )
jpayne@69	83 if oids:
jpayne@69	84 return oids[0]
jpayne@69	85 self.adaptor.execute(
jpayne@69	86 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", (name, definition)
jpayne@69	87 )
jpayne@69	88 return self.adaptor.last_id("ontology")
jpayne@69	89
jpayne@69	90 def _get_term_id(self, name, ontology_id=None, definition=None, identifier=None):
jpayne@69	91 """Get the id that corresponds to a term (PRIVATE).
jpayne@69	92
jpayne@69	93 This looks through the term table for a the given term. If it
jpayne@69	94 is not found, a new id corresponding to this term is created.
jpayne@69	95 In either case, the id corresponding to that term is returned, so
jpayne@69	96 that you can reference it in another table.
jpayne@69	97
jpayne@69	98 The ontology_id should be used to disambiguate the term.
jpayne@69	99 """
jpayne@69	100 # try to get the term id
jpayne@69	101 sql = "SELECT term_id FROM term WHERE name = %s"
jpayne@69	102 fields = [name]
jpayne@69	103 if ontology_id:
jpayne@69	104 sql += " AND ontology_id = %s"
jpayne@69	105 fields.append(ontology_id)
jpayne@69	106 id_results = self.adaptor.execute_and_fetchall(sql, fields)
jpayne@69	107 # something is wrong
jpayne@69	108 if len(id_results) > 1:
jpayne@69	109 raise ValueError(f"Multiple term ids for {name}: {id_results!r}")
jpayne@69	110 elif len(id_results) == 1:
jpayne@69	111 return id_results[0][0]
jpayne@69	112 else:
jpayne@69	113 sql = (
jpayne@69	114 "INSERT INTO term (name, definition,"
jpayne@69	115 " identifier, ontology_id)"
jpayne@69	116 " VALUES (%s, %s, %s, %s)"
jpayne@69	117 )
jpayne@69	118 self.adaptor.execute(sql, (name, definition, identifier, ontology_id))
jpayne@69	119 return self.adaptor.last_id("term")
jpayne@69	120
jpayne@69	121 def _add_dbxref(self, dbname, accession, version):
jpayne@69	122 """Insert a dbxref and return its id (PRIVATE)."""
jpayne@69	123 self.adaptor.execute(
jpayne@69	124 "INSERT INTO dbxref(dbname, accession, version) VALUES (%s, %s, %s)",
jpayne@69	125 (dbname, accession, version),
jpayne@69	126 )
jpayne@69	127 return self.adaptor.last_id("dbxref")
jpayne@69	128
jpayne@69	129 def _get_taxon_id(self, record):
jpayne@69	130 """Get the taxon id for this record (PRIVATE).
jpayne@69	131
jpayne@69	132 Arguments:
jpayne@69	133 - record - a SeqRecord object
jpayne@69	134
jpayne@69	135 This searches the taxon/taxon_name tables using the
jpayne@69	136 NCBI taxon ID, scientific name and common name to find
jpayne@69	137 the matching taxon table entry's id.
jpayne@69	138
jpayne@69	139 If the species isn't in the taxon table, and we have at
jpayne@69	140 least the NCBI taxon ID, scientific name or common name,
jpayne@69	141 at least a minimal stub entry is created in the table.
jpayne@69	142
jpayne@69	143 Returns the taxon id (database key for the taxon table,
jpayne@69	144 not an NCBI taxon ID), or None if the taxonomy information
jpayne@69	145 is missing.
jpayne@69	146
jpayne@69	147 See also the BioSQL script load_ncbi_taxonomy.pl which
jpayne@69	148 will populate and update the taxon/taxon_name tables
jpayne@69	149 with the latest information from the NCBI.
jpayne@69	150 """
jpayne@69	151 # To find the NCBI taxid, first check for a top level annotation
jpayne@69	152 ncbi_taxon_id = None
jpayne@69	153 if "ncbi_taxid" in record.annotations:
jpayne@69	154 # Could be a list of IDs.
jpayne@69	155 if isinstance(record.annotations["ncbi_taxid"], list):
jpayne@69	156 if len(record.annotations["ncbi_taxid"]) == 1:
jpayne@69	157 ncbi_taxon_id = record.annotations["ncbi_taxid"][0]
jpayne@69	158 else:
jpayne@69	159 ncbi_taxon_id = record.annotations["ncbi_taxid"]
jpayne@69	160 if not ncbi_taxon_id:
jpayne@69	161 # Secondly, look for a source feature
jpayne@69	162 for f in record.features:
jpayne@69	163 if f.type == "source":
jpayne@69	164 quals = getattr(f, "qualifiers", {})
jpayne@69	165 if "db_xref" in quals:
jpayne@69	166 for db_xref in f.qualifiers["db_xref"]:
jpayne@69	167 if db_xref.startswith("taxon:"):
jpayne@69	168 ncbi_taxon_id = int(db_xref[6:])
jpayne@69	169 break
jpayne@69	170 if ncbi_taxon_id:
jpayne@69	171 break
jpayne@69	172
jpayne@69	173 try:
jpayne@69	174 scientific_name = record.annotations["organism"][:255]
jpayne@69	175 except KeyError:
jpayne@69	176 scientific_name = None
jpayne@69	177 try:
jpayne@69	178 common_name = record.annotations["source"][:255]
jpayne@69	179 except KeyError:
jpayne@69	180 common_name = None
jpayne@69	181 # Note: The maximum length for taxon names in the schema is 255.
jpayne@69	182 # Cropping it now should help in getting a match when searching,
jpayne@69	183 # and avoids an error if we try and add these to the database.
jpayne@69	184
jpayne@69	185 if ncbi_taxon_id:
jpayne@69	186 # Good, we have the NCBI taxon to go on - this is unambiguous :)
jpayne@69	187 # Note that the scientific name and common name will only be
jpayne@69	188 # used if we have to record a stub entry.
jpayne@69	189 return self._get_taxon_id_from_ncbi_taxon_id(
jpayne@69	190 ncbi_taxon_id, scientific_name, common_name
jpayne@69	191 )
jpayne@69	192
jpayne@69	193 if not common_name and not scientific_name:
jpayne@69	194 # Nothing to go on... and there is no point adding
jpayne@69	195 # a new entry to the database. We'll just leave this
jpayne@69	196 # sequence's taxon as a NULL in the database.
jpayne@69	197 return None
jpayne@69	198
jpayne@69	199 # Next, we'll try to find a match based on the species name
jpayne@69	200 # (stored in GenBank files as the organism and/or the source).
jpayne@69	201 if scientific_name:
jpayne@69	202 taxa = self.adaptor.execute_and_fetch_col0(
jpayne@69	203 "SELECT taxon_id FROM taxon_name"
jpayne@69	204 " WHERE name_class = 'scientific name' AND name = %s",
jpayne@69	205 (scientific_name,),
jpayne@69	206 )
jpayne@69	207 if taxa:
jpayne@69	208 # Good, mapped the scientific name to a taxon table entry
jpayne@69	209 return taxa[0]
jpayne@69	210
jpayne@69	211 # Last chance...
jpayne@69	212 if common_name:
jpayne@69	213 taxa = self.adaptor.execute_and_fetch_col0(
jpayne@69	214 "SELECT DISTINCT taxon_id FROM taxon_name WHERE name = %s",
jpayne@69	215 (common_name,),
jpayne@69	216 )
jpayne@69	217 # Its natural that several distinct taxa will have the same common
jpayne@69	218 # name - in which case we can't resolve the taxon uniquely.
jpayne@69	219 if len(taxa) > 1:
jpayne@69	220 raise ValueError(
jpayne@69	221 "Taxa: %d species have name %r" % (len(taxa), common_name)
jpayne@69	222 )
jpayne@69	223 if taxa:
jpayne@69	224 # Good, mapped the common name to a taxon table entry
jpayne@69	225 return taxa[0]
jpayne@69	226
jpayne@69	227 # At this point, as far as we can tell, this species isn't
jpayne@69	228 # in the taxon table already. So we'll have to add it.
jpayne@69	229 # We don't have an NCBI taxonomy ID, so if we do record just
jpayne@69	230 # a stub entry, there is no simple way to fix this later.
jpayne@69	231 #
jpayne@69	232 # TODO - Should we try searching the NCBI taxonomy using the
jpayne@69	233 # species name?
jpayne@69	234 #
jpayne@69	235 # OK, let's try inserting the species.
jpayne@69	236 # Chances are we don't have enough information ...
jpayne@69	237 # Furthermore, it won't be in the hierarchy.
jpayne@69	238
jpayne@69	239 lineage = []
jpayne@69	240 for c in record.annotations.get("taxonomy", []):
jpayne@69	241 lineage.append([None, None, c])
jpayne@69	242 if lineage:
jpayne@69	243 lineage[-1][1] = "genus"
jpayne@69	244 lineage.append([None, "species", record.annotations["organism"]])
jpayne@69	245 # XXX do we have them?
jpayne@69	246 if "subspecies" in record.annotations:
jpayne@69	247 lineage.append([None, "subspecies", record.annotations["subspecies"]])
jpayne@69	248 if "variant" in record.annotations:
jpayne@69	249 lineage.append([None, "varietas", record.annotations["variant"]])
jpayne@69	250 lineage[-1][0] = ncbi_taxon_id
jpayne@69	251
jpayne@69	252 left_value = self.adaptor.execute_one("SELECT MAX(left_value) FROM taxon")[0]
jpayne@69	253 if not left_value:
jpayne@69	254 left_value = 0
jpayne@69	255 left_value += 1
jpayne@69	256
jpayne@69	257 # XXX -- Brad: Fixing this for now in an ugly way because
jpayne@69	258 # I am getting overlaps for right_values. I need to dig into this
jpayne@69	259 # more to actually understand how it works. I'm not sure it is
jpayne@69	260 # actually working right anyhow.
jpayne@69	261 right_start_value = self.adaptor.execute_one(
jpayne@69	262 "SELECT MAX(right_value) FROM taxon"
jpayne@69	263 )[0]
jpayne@69	264 if not right_start_value:
jpayne@69	265 right_start_value = 0
jpayne@69	266 right_value = right_start_value + 2 * len(lineage) - 1
jpayne@69	267
jpayne@69	268 parent_taxon_id = None
jpayne@69	269 for taxon in lineage:
jpayne@69	270 self.adaptor.execute(
jpayne@69	271 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"
jpayne@69	272 " left_value, right_value)"
jpayne@69	273 " VALUES (%s, %s, %s, %s, %s)",
jpayne@69	274 (parent_taxon_id, taxon[0], taxon[1], left_value, right_value),
jpayne@69	275 )
jpayne@69	276 taxon_id = self.adaptor.last_id("taxon")
jpayne@69	277 self.adaptor.execute(
jpayne@69	278 "INSERT INTO taxon_name(taxon_id, name, name_class)"
jpayne@69	279 "VALUES (%s, %s, 'scientific name')",
jpayne@69	280 (taxon_id, taxon[2][:255]),
jpayne@69	281 )
jpayne@69	282 # Note the name field is limited to 255, some SwissProt files
jpayne@69	283 # have a multi-species name which can be longer. So truncate this.
jpayne@69	284 left_value += 1
jpayne@69	285 right_value -= 1
jpayne@69	286 parent_taxon_id = taxon_id
jpayne@69	287 if common_name:
jpayne@69	288 self.adaptor.execute(
jpayne@69	289 "INSERT INTO taxon_name(taxon_id, name, name_class)"
jpayne@69	290 "VALUES (%s, %s, 'common name')",
jpayne@69	291 (taxon_id, common_name),
jpayne@69	292 )
jpayne@69	293
jpayne@69	294 return taxon_id
jpayne@69	295
jpayne@69	296 def _fix_name_class(self, entrez_name):
jpayne@69	297 """Map Entrez name terms to those used in taxdump (PRIVATE).
jpayne@69	298
jpayne@69	299 We need to make this conversion to match the taxon_name.name_class
jpayne@69	300 values used by the BioSQL load_ncbi_taxonomy.pl script.
jpayne@69	301
jpayne@69	302 e.g.::
jpayne@69	303
jpayne@69	304 "ScientificName" -> "scientific name",
jpayne@69	305 "EquivalentName" -> "equivalent name",
jpayne@69	306 "Synonym" -> "synonym",
jpayne@69	307
jpayne@69	308 """
jpayne@69	309 # Add any special cases here:
jpayne@69	310 #
jpayne@69	311 # known = {}
jpayne@69	312 # try:
jpayne@69	313 # return known[entrez_name]
jpayne@69	314 # except KeyError:
jpayne@69	315 # pass
jpayne@69	316
jpayne@69	317 # Try automatically by adding spaces before each capital
jpayne@69	318 def add_space(letter):
jpayne@69	319 """Add a space before a capital letter."""
jpayne@69	320 if letter.isupper():
jpayne@69	321 return " " + letter.lower()
jpayne@69	322 else:
jpayne@69	323 return letter
jpayne@69	324
jpayne@69	325 answer = "".join(add_space(letter) for letter in entrez_name).strip()
jpayne@69	326 if answer != answer.lower():
jpayne@69	327 raise ValueError(
jpayne@69	328 f"Expected processed entrez_name, '{answer}' to only have lower case letters."
jpayne@69	329 )
jpayne@69	330 return answer
jpayne@69	331
jpayne@69	332 def _update_left_right_taxon_values(self, left_value):
jpayne@69	333 """Update the left and right taxon values in the table (PRIVATE)."""
jpayne@69	334 if not left_value:
jpayne@69	335 return
jpayne@69	336 # Due to the UNIQUE constraint on the left and right values in the taxon
jpayne@69	337 # table we cannot simply update them through an SQL statement as we risk
jpayne@69	338 # colliding values. Instead we must select all of the rows that we want to
jpayne@69	339 # update, modify the values in python and then update the rows
jpayne@69	340 # self.adaptor.execute("UPDATE taxon SET right_value = right_value + 2 "
jpayne@69	341 # "WHERE right_value >= %s", (left_value,))
jpayne@69	342 # self.adaptor.execute("UPDATE taxon SET left_value = left_value + 2 "
jpayne@69	343 # "WHERE left_value > %s", (left_value,))
jpayne@69	344
jpayne@69	345 rows = self.adaptor.execute_and_fetchall(
jpayne@69	346 "SELECT left_value, right_value, taxon_id FROM taxon "
jpayne@69	347 "WHERE right_value >= %s or left_value > %s",
jpayne@69	348 (left_value, left_value),
jpayne@69	349 )
jpayne@69	350
jpayne@69	351 right_rows = []
jpayne@69	352 left_rows = []
jpayne@69	353 for row in rows:
jpayne@69	354 new_right = row[1]
jpayne@69	355 new_left = row[0]
jpayne@69	356 if new_right >= left_value:
jpayne@69	357 new_right += 2
jpayne@69	358
jpayne@69	359 if new_left > left_value:
jpayne@69	360 new_left += 2
jpayne@69	361 right_rows.append((new_right, row[2]))
jpayne@69	362 left_rows.append((new_left, row[2]))
jpayne@69	363
jpayne@69	364 # sort the rows based on the value from largest to smallest
jpayne@69	365 # should ensure no overlaps
jpayne@69	366 right_rows = sorted(right_rows, key=lambda x: x[0], reverse=True)
jpayne@69	367 left_rows = sorted(left_rows, key=lambda x: x[0], reverse=True)
jpayne@69	368
jpayne@69	369 self.adaptor.executemany(
jpayne@69	370 "UPDATE taxon SET left_value = %s WHERE taxon_id = %s", left_rows
jpayne@69	371 )
jpayne@69	372 self.adaptor.executemany(
jpayne@69	373 "UPDATE taxon SET right_value = %s WHERE taxon_id = %s", right_rows
jpayne@69	374 )
jpayne@69	375
jpayne@69	376 def _get_taxon_id_from_ncbi_taxon_id(
jpayne@69	377 self, ncbi_taxon_id, scientific_name=None, common_name=None
jpayne@69	378 ):
jpayne@69	379 """Get the taxon id for record from NCBI taxon ID (PRIVATE).
jpayne@69	380
jpayne@69	381 Arguments:
jpayne@69	382 - ncbi_taxon_id - string containing an NCBI taxon id
jpayne@69	383 - scientific_name - string, used if a stub entry is recorded
jpayne@69	384 - common_name - string, used if a stub entry is recorded
jpayne@69	385
jpayne@69	386 This searches the taxon table using ONLY the NCBI taxon ID
jpayne@69	387 to find the matching taxon table entry's ID (database key).
jpayne@69	388
jpayne@69	389 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy
jpayne@69	390 flag is true, Biopython will attempt to go online using Bio.Entrez
jpayne@69	391 to fetch the official NCBI lineage, recursing up the tree until an
jpayne@69	392 existing entry is found in the database or the full lineage has been
jpayne@69	393 fetched.
jpayne@69	394
jpayne@69	395 Otherwise the NCBI taxon ID, scientific name and common name are
jpayne@69	396 recorded as a minimal stub entry in the taxon and taxon_name tables.
jpayne@69	397 Any partial information about the lineage from the SeqRecord is NOT
jpayne@69	398 recorded. This should mean that (re)running the BioSQL script
jpayne@69	399 load_ncbi_taxonomy.pl can fill in the taxonomy lineage.
jpayne@69	400
jpayne@69	401 Returns the taxon id (database key for the taxon table, not
jpayne@69	402 an NCBI taxon ID).
jpayne@69	403 """
jpayne@69	404 if not ncbi_taxon_id:
jpayne@69	405 raise ValueError("Expected a non-empty value for ncbi_taxon_id.")
jpayne@69	406
jpayne@69	407 taxon_id = self.adaptor.execute_and_fetch_col0(
jpayne@69	408 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", (int(ncbi_taxon_id),)
jpayne@69	409 )
jpayne@69	410 if taxon_id:
jpayne@69	411 # Good, we have mapped the NCBI taxid to a taxon table entry
jpayne@69	412 return taxon_id[0]
jpayne@69	413
jpayne@69	414 # At this point, as far as we can tell, this species isn't
jpayne@69	415 # in the taxon table already. So we'll have to add it.
jpayne@69	416
jpayne@69	417 parent_taxon_id = None
jpayne@69	418 rank = "species"
jpayne@69	419 genetic_code = None
jpayne@69	420 mito_genetic_code = None
jpayne@69	421 parent_left_value = None
jpayne@69	422 parent_right_value = None
jpayne@69	423 left_value = None
jpayne@69	424 right_value = None
jpayne@69	425 species_names = []
jpayne@69	426 if scientific_name:
jpayne@69	427 species_names.append(("scientific name", scientific_name))
jpayne@69	428 if common_name:
jpayne@69	429 species_names.append(("common name", common_name))
jpayne@69	430
jpayne@69	431 if self.fetch_NCBI_taxonomy:
jpayne@69	432 # Go online to get the parent taxon ID!
jpayne@69	433 handle = Entrez.efetch(db="taxonomy", id=ncbi_taxon_id, retmode="XML")
jpayne@69	434 taxonomic_record = Entrez.read(handle)
jpayne@69	435 if len(taxonomic_record) == 1:
jpayne@69	436 if taxonomic_record[0]["TaxId"] != str(ncbi_taxon_id):
jpayne@69	437 raise ValueError(
jpayne@69	438 f"ncbi_taxon_id different from parent taxon id. {ncbi_taxon_id} versus {taxonomic_record[0]['TaxId']}"
jpayne@69	439 )
jpayne@69	440
jpayne@69	441 (
jpayne@69	442 parent_taxon_id,
jpayne@69	443 parent_left_value,
jpayne@69	444 parent_right_value,
jpayne@69	445 ) = self._get_taxon_id_from_ncbi_lineage(
jpayne@69	446 taxonomic_record[0]["LineageEx"]
jpayne@69	447 )
jpayne@69	448
jpayne@69	449 left_value = parent_right_value
jpayne@69	450 right_value = parent_right_value + 1
jpayne@69	451
jpayne@69	452 rank = str(taxonomic_record[0]["Rank"])
jpayne@69	453
jpayne@69	454 genetic_code = int(taxonomic_record[0]["GeneticCode"]["GCId"])
jpayne@69	455
jpayne@69	456 mito_genetic_code = int(taxonomic_record[0]["MitoGeneticCode"]["MGCId"])
jpayne@69	457
jpayne@69	458 species_names = [
jpayne@69	459 ("scientific name", str(taxonomic_record[0]["ScientificName"]))
jpayne@69	460 ]
jpayne@69	461 try:
jpayne@69	462 for name_class, names in taxonomic_record[0]["OtherNames"].items():
jpayne@69	463 name_class = self._fix_name_class(name_class)
jpayne@69	464 if not isinstance(names, list):
jpayne@69	465 # The Entrez parser seems to return single entry
jpayne@69	466 # lists as just a string which is annoying.
jpayne@69	467 names = [names]
jpayne@69	468 for name in names:
jpayne@69	469 # Want to ignore complex things like ClassCDE
jpayne@69	470 # entries
jpayne@69	471 if isinstance(name, str):
jpayne@69	472 species_names.append((name_class, name))
jpayne@69	473 except KeyError:
jpayne@69	474 # OtherNames isn't always present,
jpayne@69	475 # e.g. NCBI taxon 41205, Bromheadia finlaysoniana
jpayne@69	476 pass
jpayne@69	477 else:
jpayne@69	478 pass
jpayne@69	479 # If we are not allowed to go online, we will record the bare minimum;
jpayne@69	480 # as long as the NCBI taxon id is present, then (re)running
jpayne@69	481 # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage
jpayne@69	482 # (and update the species names).
jpayne@69	483 #
jpayne@69	484 # I am NOT going to try and record the lineage, even if it
jpayne@69	485 # is in the record annotation as a list of names, as we won't
jpayne@69	486 # know the NCBI taxon IDs for these parent nodes.
jpayne@69	487
jpayne@69	488 self._update_left_right_taxon_values(left_value)
jpayne@69	489
jpayne@69	490 self.adaptor.execute(
jpayne@69	491 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"
jpayne@69	492 " genetic_code, mito_genetic_code, left_value, right_value)"
jpayne@69	493 " VALUES (%s, %s, %s, %s, %s, %s, %s)",
jpayne@69	494 (
jpayne@69	495 parent_taxon_id,
jpayne@69	496 ncbi_taxon_id,
jpayne@69	497 rank,
jpayne@69	498 genetic_code,
jpayne@69	499 mito_genetic_code,
jpayne@69	500 left_value,
jpayne@69	501 right_value,
jpayne@69	502 ),
jpayne@69	503 )
jpayne@69	504
jpayne@69	505 taxon_id = self.adaptor.last_id("taxon")
jpayne@69	506
jpayne@69	507 # Record the scientific name, common name, etc
jpayne@69	508 for name_class, name in species_names:
jpayne@69	509 self.adaptor.execute(
jpayne@69	510 "INSERT INTO taxon_name(taxon_id, name, name_class)"
jpayne@69	511 " VALUES (%s, %s, %s)",
jpayne@69	512 (taxon_id, name[:255], name_class),
jpayne@69	513 )
jpayne@69	514 return taxon_id
jpayne@69	515
jpayne@69	516 def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
jpayne@69	517 """Recursive method to get taxon ID from NCBI lineage (PRIVATE).
jpayne@69	518
jpayne@69	519 Arguments:
jpayne@69	520 - taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez
jpayne@69	521
jpayne@69	522 First dictionary in list is the taxonomy root, highest would be
jpayne@69	523 the species. Each dictionary includes:
jpayne@69	524
jpayne@69	525 - TaxID (string, NCBI taxon id)
jpayne@69	526 - Rank (string, e.g. "species", "genus", ..., "phylum", ...)
jpayne@69	527 - ScientificName (string)
jpayne@69	528
jpayne@69	529 (and that is all at the time of writing)
jpayne@69	530
jpayne@69	531 This method will record all the lineage given, returning the taxon id
jpayne@69	532 (database key, not NCBI taxon id) of the final entry (the species).
jpayne@69	533 """
jpayne@69	534 ncbi_taxon_id = int(taxonomic_lineage[-1]["TaxId"])
jpayne@69	535 left_value = None
jpayne@69	536 right_value = None
jpayne@69	537 parent_left_value = None
jpayne@69	538 parent_right_value = None
jpayne@69	539 # Is this in the database already? Check the taxon table...
jpayne@69	540 rows = self.adaptor.execute_and_fetchall(
jpayne@69	541 "SELECT taxon_id, left_value, right_value FROM taxon"
jpayne@69	542 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id
jpayne@69	543 )
jpayne@69	544 if rows:
jpayne@69	545 # we could verify that the Scientific Name etc in the database
jpayne@69	546 # is the same and update it or print a warning if not...
jpayne@69	547 if len(rows) != 1:
jpayne@69	548 raise ValueError(f"Expected 1 reponse, got {len(rows)}")
jpayne@69	549 return rows[0]
jpayne@69	550
jpayne@69	551 # We have to record this.
jpayne@69	552 if len(taxonomic_lineage) > 1:
jpayne@69	553 # Use recursion to find out the taxon id (database key) of the
jpayne@69	554 # parent.
jpayne@69	555 (
jpayne@69	556 parent_taxon_id,
jpayne@69	557 parent_left_value,
jpayne@69	558 parent_right_value,
jpayne@69	559 ) = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1])
jpayne@69	560 left_value = parent_right_value
jpayne@69	561 right_value = parent_right_value + 1
jpayne@69	562 if not isinstance(parent_taxon_id, int):
jpayne@69	563 raise ValueError(
jpayne@69	564 f"Expected parent_taxon_id to be an int, got {parent_taxon_id}"
jpayne@69	565 )
jpayne@69	566 else:
jpayne@69	567 # we have reached the top of the lineage but no current taxonomy
jpayne@69	568 # id has been found
jpayne@69	569 parent_taxon_id = None
jpayne@69	570 left_value = self.adaptor.execute_one("SELECT MAX(left_value) FROM taxon")[
jpayne@69	571 0
jpayne@69	572 ]
jpayne@69	573 if not left_value:
jpayne@69	574 left_value = 0
jpayne@69	575
jpayne@69	576 right_value = left_value + 1
jpayne@69	577
jpayne@69	578 self._update_left_right_taxon_values(left_value)
jpayne@69	579
jpayne@69	580 # INSERT new taxon
jpayne@69	581 rank = str(taxonomic_lineage[-1].get("Rank"))
jpayne@69	582 self.adaptor.execute(
jpayne@69	583 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank, "
jpayne@69	584 "left_value, right_value) VALUES (%s, %s, %s, %s, %s)",
jpayne@69	585 (ncbi_taxon_id, parent_taxon_id, rank, left_value, right_value),
jpayne@69	586 )
jpayne@69	587
jpayne@69	588 taxon_id = self.adaptor.last_id("taxon")
jpayne@69	589 # assert isinstance(taxon_id, int), repr(taxon_id)
jpayne@69	590 # ... and its name in taxon_name
jpayne@69	591 scientific_name = taxonomic_lineage[-1].get("ScientificName")
jpayne@69	592 if scientific_name:
jpayne@69	593 self.adaptor.execute(
jpayne@69	594 "INSERT INTO taxon_name(taxon_id, name, name_class) "
jpayne@69	595 "VALUES (%s, %s, 'scientific name')",
jpayne@69	596 (taxon_id, scientific_name[:255]),
jpayne@69	597 )
jpayne@69	598 return taxon_id, left_value, right_value
jpayne@69	599
jpayne@69	600 def _load_bioentry_table(self, record):
jpayne@69	601 """Fill the bioentry table with sequence information (PRIVATE).
jpayne@69	602
jpayne@69	603 Arguments:
jpayne@69	604 - record - SeqRecord object to add to the database.
jpayne@69	605
jpayne@69	606 """
jpayne@69	607 # get the pertinent info and insert it
jpayne@69	608
jpayne@69	609 if record.id.count(".") == 1: # try to get a version from the id
jpayne@69	610 # This assumes the string is something like "XXXXXXXX.123"
jpayne@69	611 accession, version = record.id.split(".")
jpayne@69	612 try:
jpayne@69	613 version = int(version)
jpayne@69	614 except ValueError:
jpayne@69	615 accession = record.id
jpayne@69	616 version = 0
jpayne@69	617 else: # otherwise just use a version of 0
jpayne@69	618 accession = record.id
jpayne@69	619 version = 0
jpayne@69	620
jpayne@69	621 if (
jpayne@69	622 "accessions" in record.annotations
jpayne@69	623 and isinstance(record.annotations["accessions"], list)
jpayne@69	624 and record.annotations["accessions"]
jpayne@69	625 ):
jpayne@69	626 # Take the first accession (one if there is more than one)
jpayne@69	627 accession = record.annotations["accessions"][0]
jpayne@69	628
jpayne@69	629 # Find the taxon id (this is not just the NCBI Taxon ID)
jpayne@69	630 # NOTE - If the species isn't defined in the taxon table,
jpayne@69	631 # a new minimal entry is created.
jpayne@69	632 taxon_id = self._get_taxon_id(record)
jpayne@69	633
jpayne@69	634 if "gi" in record.annotations:
jpayne@69	635 identifier = record.annotations["gi"]
jpayne@69	636 else:
jpayne@69	637 identifier = record.id
jpayne@69	638
jpayne@69	639 # Allow description and division to default to NULL as in BioPerl.
jpayne@69	640 description = getattr(record, "description", None)
jpayne@69	641 division = record.annotations.get("data_file_division")
jpayne@69	642
jpayne@69	643 sql = """
jpayne@69	644 INSERT INTO bioentry (
jpayne@69	645 biodatabase_id,
jpayne@69	646 taxon_id,
jpayne@69	647 name,
jpayne@69	648 accession,
jpayne@69	649 identifier,
jpayne@69	650 division,
jpayne@69	651 description,
jpayne@69	652 version)
jpayne@69	653 VALUES (
jpayne@69	654 %s,
jpayne@69	655 %s,
jpayne@69	656 %s,
jpayne@69	657 %s,
jpayne@69	658 %s,
jpayne@69	659 %s,
jpayne@69	660 %s,
jpayne@69	661 %s)"""
jpayne@69	662 # print(self.dbid, taxon_id, record.name, accession, identifier, \
jpayne@69	663 # division, description, version)
jpayne@69	664 self.adaptor.execute(
jpayne@69	665 sql,
jpayne@69	666 (
jpayne@69	667 self.dbid,
jpayne@69	668 taxon_id,
jpayne@69	669 record.name,
jpayne@69	670 accession,
jpayne@69	671 identifier,
jpayne@69	672 division,
jpayne@69	673 description,
jpayne@69	674 version,
jpayne@69	675 ),
jpayne@69	676 )
jpayne@69	677 # now retrieve the id for the bioentry
jpayne@69	678 return self.adaptor.last_id("bioentry")
jpayne@69	679
jpayne@69	680 def _load_bioentry_date(self, record, bioentry_id):
jpayne@69	681 """Add the effective date of the entry into the database (PRIVATE).
jpayne@69	682
jpayne@69	683 record - a SeqRecord object with an annotated date
jpayne@69	684 bioentry_id - corresponding database identifier
jpayne@69	685 """
jpayne@69	686 # dates are GenBank style, like:
jpayne@69	687 # 14-SEP-2000
jpayne@69	688 date = record.annotations.get("date", strftime("%d-%b-%Y", gmtime()).upper())
jpayne@69	689 if isinstance(date, list):
jpayne@69	690 date = date[0]
jpayne@69	691 annotation_tags_id = self._get_ontology_id("Annotation Tags")
jpayne@69	692 date_id = self._get_term_id("date_changed", annotation_tags_id)
jpayne@69	693 sql = (
jpayne@69	694 "INSERT INTO bioentry_qualifier_value"
jpayne@69	695 ' (bioentry_id, term_id, value, "rank")'
jpayne@69	696 " VALUES (%s, %s, %s, 1)"
jpayne@69	697 )
jpayne@69	698 self.adaptor.execute(sql, (bioentry_id, date_id, date))
jpayne@69	699
jpayne@69	700 def _load_biosequence(self, record, bioentry_id):
jpayne@69	701 """Record SeqRecord's sequence and alphabet in DB (PRIVATE).
jpayne@69	702
jpayne@69	703 Arguments:
jpayne@69	704 - record - a SeqRecord object with a seq property
jpayne@69	705 - bioentry_id - corresponding database identifier
jpayne@69	706
jpayne@69	707 """
jpayne@69	708 if record.seq is None:
jpayne@69	709 # The biosequence table entry is optional, so if we haven't
jpayne@69	710 # got a sequence, we don't need to write to the table.
jpayne@69	711 return
jpayne@69	712
jpayne@69	713 molecule_type = record.annotations.get("molecule_type", "")
jpayne@69	714 if "DNA" in molecule_type:
jpayne@69	715 alphabet = "dna"
jpayne@69	716 elif "RNA" in molecule_type:
jpayne@69	717 alphabet = "rna"
jpayne@69	718 elif "protein" in molecule_type:
jpayne@69	719 alphabet = "protein"
jpayne@69	720 else:
jpayne@69	721 alphabet = "unknown"
jpayne@69	722
jpayne@69	723 try:
jpayne@69	724 seq_str = str(record.seq)
jpayne@69	725 except UndefinedSequenceError:
jpayne@69	726 seq_str = None
jpayne@69	727
jpayne@69	728 sql = (
jpayne@69	729 "INSERT INTO biosequence (bioentry_id, version, "
jpayne@69	730 "length, seq, alphabet) "
jpayne@69	731 "VALUES (%s, 0, %s, %s, %s)"
jpayne@69	732 )
jpayne@69	733 self.adaptor.execute(sql, (bioentry_id, len(record.seq), seq_str, alphabet))
jpayne@69	734
jpayne@69	735 def _load_comment(self, record, bioentry_id):
jpayne@69	736 """Record a SeqRecord's annotated comment in the database (PRIVATE).
jpayne@69	737
jpayne@69	738 Arguments:
jpayne@69	739 - record - a SeqRecord object with an annotated comment
jpayne@69	740 - bioentry_id - corresponding database identifier
jpayne@69	741
jpayne@69	742 """
jpayne@69	743 comments = record.annotations.get("comment")
jpayne@69	744 if not comments:
jpayne@69	745 return
jpayne@69	746 if not isinstance(comments, list):
jpayne@69	747 # It should be a string then...
jpayne@69	748 comments = [comments]
jpayne@69	749
jpayne@69	750 for index, comment in enumerate(comments):
jpayne@69	751 comment = comment.replace("\n", " ")
jpayne@69	752 # TODO - Store each line as a separate entry? This would preserve
jpayne@69	753 # the newlines, but we should check BioPerl etc to be consistent.
jpayne@69	754 sql = (
jpayne@69	755 'INSERT INTO comment (bioentry_id, comment_text, "rank")'
jpayne@69	756 " VALUES (%s, %s, %s)"
jpayne@69	757 )
jpayne@69	758 self.adaptor.execute(sql, (bioentry_id, comment, index + 1))
jpayne@69	759
jpayne@69	760 def _load_annotations(self, record, bioentry_id):
jpayne@69	761 """Record a SeqRecord's misc annotations in the database (PRIVATE).
jpayne@69	762
jpayne@69	763 The annotation strings are recorded in the bioentry_qualifier_value
jpayne@69	764 table, except for special cases like the reference, comment and
jpayne@69	765 taxonomy which are handled with their own tables.
jpayne@69	766
jpayne@69	767 Arguments:
jpayne@69	768 - record - a SeqRecord object with an annotations dictionary
jpayne@69	769 - bioentry_id - corresponding database identifier
jpayne@69	770
jpayne@69	771 """
jpayne@69	772 mono_sql = (
jpayne@69	773 "INSERT INTO bioentry_qualifier_value"
jpayne@69	774 "(bioentry_id, term_id, value)"
jpayne@69	775 " VALUES (%s, %s, %s)"
jpayne@69	776 )
jpayne@69	777 many_sql = (
jpayne@69	778 "INSERT INTO bioentry_qualifier_value"
jpayne@69	779 '(bioentry_id, term_id, value, "rank")'
jpayne@69	780 " VALUES (%s, %s, %s, %s)"
jpayne@69	781 )
jpayne@69	782 tag_ontology_id = self._get_ontology_id("Annotation Tags")
jpayne@69	783 for key, value in record.annotations.items():
jpayne@69	784 if key in ["molecule_type", "references", "comment", "ncbi_taxid", "date"]:
jpayne@69	785 # Handled separately
jpayne@69	786 continue
jpayne@69	787 term_id = self._get_term_id(key, ontology_id=tag_ontology_id)
jpayne@69	788 if isinstance(value, (list, tuple)):
jpayne@69	789 rank = 0
jpayne@69	790 for entry in value:
jpayne@69	791 if isinstance(entry, (str, int)):
jpayne@69	792 # Easy case
jpayne@69	793 rank += 1
jpayne@69	794 self.adaptor.execute(
jpayne@69	795 many_sql, (bioentry_id, term_id, str(entry), rank)
jpayne@69	796 )
jpayne@69	797 else:
jpayne@69	798 pass
jpayne@69	799 elif isinstance(value, (str, int)):
jpayne@69	800 # Have a simple single entry, leave rank as the DB default
jpayne@69	801 self.adaptor.execute(mono_sql, (bioentry_id, term_id, str(value)))
jpayne@69	802 else:
jpayne@69	803 pass
jpayne@69	804 # print("Ignoring annotation '%s' entry of type '%s'" \
jpayne@69	805 # % (key, type(value)))
jpayne@69	806
jpayne@69	807 def _load_reference(self, reference, rank, bioentry_id):
jpayne@69	808 """Record SeqRecord's annotated references in the database (PRIVATE).
jpayne@69	809
jpayne@69	810 Arguments:
jpayne@69	811 - record - a SeqRecord object with annotated references
jpayne@69	812 - bioentry_id - corresponding database identifier
jpayne@69	813
jpayne@69	814 """
jpayne@69	815 refs = None
jpayne@69	816 if reference.medline_id:
jpayne@69	817 refs = self.adaptor.execute_and_fetch_col0(
jpayne@69	818 "SELECT reference_id"
jpayne@69	819 " FROM reference JOIN dbxref USING (dbxref_id)"
jpayne@69	820 " WHERE dbname = 'MEDLINE' AND accession = %s",
jpayne@69	821 (reference.medline_id,),
jpayne@69	822 )
jpayne@69	823 if not refs and reference.pubmed_id:
jpayne@69	824 refs = self.adaptor.execute_and_fetch_col0(
jpayne@69	825 "SELECT reference_id"
jpayne@69	826 " FROM reference JOIN dbxref USING (dbxref_id)"
jpayne@69	827 " WHERE dbname = 'PUBMED' AND accession = %s",
jpayne@69	828 (reference.pubmed_id,),
jpayne@69	829 )
jpayne@69	830 if not refs:
jpayne@69	831 s = []
jpayne@69	832 for f in reference.authors, reference.title, reference.journal:
jpayne@69	833 s.append(f or "<undef>")
jpayne@69	834 crc = crc64("".join(s))
jpayne@69	835 refs = self.adaptor.execute_and_fetch_col0(
jpayne@69	836 "SELECT reference_id FROM reference WHERE crc = %s", (crc,)
jpayne@69	837 )
jpayne@69	838 if not refs:
jpayne@69	839 if reference.medline_id:
jpayne@69	840 dbxref_id = self._add_dbxref("MEDLINE", reference.medline_id, 0)
jpayne@69	841 elif reference.pubmed_id:
jpayne@69	842 dbxref_id = self._add_dbxref("PUBMED", reference.pubmed_id, 0)
jpayne@69	843 else:
jpayne@69	844 dbxref_id = None
jpayne@69	845 authors = reference.authors or None
jpayne@69	846 title = reference.title or None
jpayne@69	847 # The location/journal field cannot be Null, so default
jpayne@69	848 # to an empty string rather than None:
jpayne@69	849 journal = reference.journal or ""
jpayne@69	850 self.adaptor.execute(
jpayne@69	851 "INSERT INTO reference (dbxref_id, location,"
jpayne@69	852 " title, authors, crc)"
jpayne@69	853 " VALUES (%s, %s, %s, %s, %s)",
jpayne@69	854 (dbxref_id, journal, title, authors, crc),
jpayne@69	855 )
jpayne@69	856 reference_id = self.adaptor.last_id("reference")
jpayne@69	857 else:
jpayne@69	858 reference_id = refs[0]
jpayne@69	859
jpayne@69	860 if reference.location:
jpayne@69	861 start = 1 + int(str(reference.location[0].start))
jpayne@69	862 end = int(str(reference.location[0].end))
jpayne@69	863 else:
jpayne@69	864 start = None
jpayne@69	865 end = None
jpayne@69	866
jpayne@69	867 sql = (
jpayne@69	868 "INSERT INTO bioentry_reference (bioentry_id, reference_id,"
jpayne@69	869 ' start_pos, end_pos, "rank") VALUES (%s, %s, %s, %s, %s)'
jpayne@69	870 )
jpayne@69	871 self.adaptor.execute(sql, (bioentry_id, reference_id, start, end, rank + 1))
jpayne@69	872
jpayne@69	873 def _load_seqfeature(self, feature, feature_rank, bioentry_id):
jpayne@69	874 """Load a biopython SeqFeature into the database (PRIVATE)."""
jpayne@69	875 # records loaded from a gff file using BCBio.GFF will contain value
jpayne@69	876 # of 2nd column of the gff as a feature qualifier. The BioSQL wiki
jpayne@69	877 # suggests that the source should not go in with the other feature
jpayne@69	878 # mappings but instead be put in the term table
jpayne@69	879 # (http://www.biosql.org/wiki/Annotation_Mapping)
jpayne@69	880 try:
jpayne@69	881 source = feature.qualifiers["source"]
jpayne@69	882 if isinstance(source, list):
jpayne@69	883 source = source[0]
jpayne@69	884 seqfeature_id = self._load_seqfeature_basic(
jpayne@69	885 feature.type, feature_rank, bioentry_id, source=source
jpayne@69	886 )
jpayne@69	887 except KeyError:
jpayne@69	888 seqfeature_id = self._load_seqfeature_basic(
jpayne@69	889 feature.type, feature_rank, bioentry_id
jpayne@69	890 )
jpayne@69	891
jpayne@69	892 self._load_seqfeature_locations(feature, seqfeature_id)
jpayne@69	893 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
jpayne@69	894
jpayne@69	895 def _load_seqfeature_basic(
jpayne@69	896 self, feature_type, feature_rank, bioentry_id, source="EMBL/GenBank/SwissProt"
jpayne@69	897 ):
jpayne@69	898 """Load the first tables of a seqfeature and returns the id (PRIVATE).
jpayne@69	899
jpayne@69	900 This loads the "key" of the seqfeature (ie. CDS, gene) and
jpayne@69	901 the basic seqfeature table itself.
jpayne@69	902 """
jpayne@69	903 ontology_id = self._get_ontology_id("SeqFeature Keys")
jpayne@69	904 seqfeature_key_id = self._get_term_id(feature_type, ontology_id=ontology_id)
jpayne@69	905 source_cat_id = self._get_ontology_id("SeqFeature Sources")
jpayne@69	906 source_term_id = self._get_term_id(source, ontology_id=source_cat_id)
jpayne@69	907
jpayne@69	908 sql = (
jpayne@69	909 "INSERT INTO seqfeature (bioentry_id, type_term_id, "
jpayne@69	910 'source_term_id, "rank") VALUES (%s, %s, %s, %s)'
jpayne@69	911 )
jpayne@69	912 self.adaptor.execute(
jpayne@69	913 sql, (bioentry_id, seqfeature_key_id, source_term_id, feature_rank + 1)
jpayne@69	914 )
jpayne@69	915 return self.adaptor.last_id("seqfeature")
jpayne@69	916
jpayne@69	917 def _load_seqfeature_locations(self, feature, seqfeature_id):
jpayne@69	918 """Load all of the locations for a SeqFeature into tables (PRIVATE).
jpayne@69	919
jpayne@69	920 This adds the locations related to the SeqFeature into the
jpayne@69	921 seqfeature_location table. Fuzzies are not handled right now.
jpayne@69	922 For a simple location, ie (1..2), we have a single table row
jpayne@69	923 with seq_start = 1, seq_end = 2, location_rank = 1.
jpayne@69	924
jpayne@69	925 For split locations, ie (1..2, 3..4, 5..6) we would have three
jpayne@69	926 row tables with::
jpayne@69	927
jpayne@69	928 start = 1, end = 2, rank = 1
jpayne@69	929 start = 3, end = 4, rank = 2
jpayne@69	930 start = 5, end = 6, rank = 3
jpayne@69	931
jpayne@69	932 """
jpayne@69	933 # TODO - Record an ontology for the locations (using location.term_id)
jpayne@69	934 # which for now as in BioPerl we leave defaulting to NULL.
jpayne@69	935 try:
jpayne@69	936 if feature.location.operator != "join":
jpayne@69	937 # e.g. order locations... we don't record "order" so it
jpayne@69	938 # will become a "join" on reloading. What does BioPerl do?
jpayne@69	939 import warnings
jpayne@69	940 from Bio import BiopythonWarning
jpayne@69	941
jpayne@69	942 warnings.warn(
jpayne@69	943 "%s location operators are not fully supported"
jpayne@69	944 % feature.location_operator,
jpayne@69	945 BiopythonWarning,
jpayne@69	946 )
jpayne@69	947 except AttributeError:
jpayne@69	948 pass
jpayne@69	949 # This will be a list of length one for a SimpleLocation:
jpayne@69	950 parts = feature.location.parts
jpayne@69	951 if parts and {loc.strand for loc in parts} == {-1}:
jpayne@69	952 # To mimic prior behaviour of Biopython+BioSQL, reverse order
jpayne@69	953 parts = parts[::-1]
jpayne@69	954 # TODO - Check what BioPerl does; see also BioSeq.py code
jpayne@69	955 for rank, loc in enumerate(parts):
jpayne@69	956 self._insert_location(loc, rank + 1, seqfeature_id)
jpayne@69	957
jpayne@69	958 def _insert_location(self, location, rank, seqfeature_id):
jpayne@69	959 """Add SeqFeature location to seqfeature_location table (PRIVATE).
jpayne@69	960
jpayne@69	961 TODO - Add location operator to location_qualifier_value?
jpayne@69	962 """
jpayne@69	963 # convert biopython locations to the 1-based location system
jpayne@69	964 # used in bioSQL
jpayne@69	965 # XXX This could also handle fuzzies
jpayne@69	966
jpayne@69	967 try:
jpayne@69	968 start = int(location.start) + 1
jpayne@69	969 except TypeError:
jpayne@69	970 # Handle SwissProt unknown position (?)
jpayne@69	971 if isinstance(location.start, UnknownPosition):
jpayne@69	972 start = None
jpayne@69	973 else:
jpayne@69	974 raise
jpayne@69	975
jpayne@69	976 try:
jpayne@69	977 end = int(location.end)
jpayne@69	978 except TypeError:
jpayne@69	979 # Handle SwissProt unknown position (?)
jpayne@69	980 if isinstance(location.end, UnknownPosition):
jpayne@69	981 end = None
jpayne@69	982 else:
jpayne@69	983 raise
jpayne@69	984
jpayne@69	985 # Biopython uses None when we don't know strand information but
jpayne@69	986 # BioSQL requires something (non null) and sets this as zero
jpayne@69	987 # So we'll use the strand or 0 if Biopython spits out None
jpayne@69	988 strand = location.strand or 0
jpayne@69	989
jpayne@69	990 # TODO - Record an ontology term for the location (location.term_id)
jpayne@69	991 # which for now like BioPerl we'll leave as NULL.
jpayne@69	992 # This might allow us to record "between" positions properly, but I
jpayne@69	993 # don't really see how it could work for before/after fuzzy positions
jpayne@69	994 loc_term_id = None
jpayne@69	995
jpayne@69	996 if location.ref:
jpayne@69	997 # sub_feature remote locations when they are in the same db as the
jpayne@69	998 # current record do not have a value for ref_db, which SeqFeature
jpayne@69	999 # object stores as None. BioSQL schema requires a varchar and is
jpayne@69	1000 # not NULL
jpayne@69	1001 dbxref_id = self._get_dbxref_id(location.ref_db or "", location.ref)
jpayne@69	1002 else:
jpayne@69	1003 dbxref_id = None
jpayne@69	1004
jpayne@69	1005 sql = (
jpayne@69	1006 "INSERT INTO location (seqfeature_id, dbxref_id, term_id,"
jpayne@69	1007 'start_pos, end_pos, strand, "rank") '
jpayne@69	1008 "VALUES (%s, %s, %s, %s, %s, %s, %s)"
jpayne@69	1009 )
jpayne@69	1010 self.adaptor.execute(
jpayne@69	1011 sql, (seqfeature_id, dbxref_id, loc_term_id, start, end, strand, rank)
jpayne@69	1012 )
jpayne@69	1013
jpayne@69	1014 """
jpayne@69	1015 # See Bug 2677
jpayne@69	1016 # TODO - Record the location_operator (e.g. "join" or "order")
jpayne@69	1017 # using the location_qualifier_value table (which we and BioPerl
jpayne@69	1018 # have historically left empty).
jpayne@69	1019 # Note this will need an ontology term for the location qualifier
jpayne@69	1020 # (location_qualifier_value.term_id) for which oddly the schema
jpayne@69	1021 # does not allow NULL.
jpayne@69	1022 if feature.location_operator:
jpayne@69	1023 #e.g. "join" (common),
jpayne@69	1024 #or "order" (see Tests/GenBank/protein_refseq2.gb)
jpayne@69	1025 location_id = self.adaptor.last_id('location')
jpayne@69	1026 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1
jpayne@69	1027 sql = ("INSERT INTO location_qualifier_value"
jpayne@69	1028 "(location_id, term_id, value) "
jpayne@69	1029 "VALUES (%s, %s, %s)")
jpayne@69	1030 self.adaptor.execute(sql, (location_id, loc_qual_term_id,
jpayne@69	1031 feature.location_operator))
jpayne@69	1032 """
jpayne@69	1033
jpayne@69	1034 def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
jpayne@69	1035 """Insert feature's (key, value) pair qualifiers (PRIVATE).
jpayne@69	1036
jpayne@69	1037 Qualifiers should be a dictionary of the form::
jpayne@69	1038
jpayne@69	1039 {key : [value1, value2]}
jpayne@69	1040
jpayne@69	1041 """
jpayne@69	1042 tag_ontology_id = self._get_ontology_id("Annotation Tags")
jpayne@69	1043 for qualifier_key in qualifiers:
jpayne@69	1044 # Treat db_xref qualifiers differently to sequence annotation
jpayne@69	1045 # qualifiers by populating the seqfeature_dbxref and dbxref
jpayne@69	1046 # tables. Other qualifiers go into the seqfeature_qualifier_value
jpayne@69	1047 # and (if new) term tables.
jpayne@69	1048 if qualifier_key != "db_xref":
jpayne@69	1049 qualifier_key_id = self._get_term_id(
jpayne@69	1050 qualifier_key, ontology_id=tag_ontology_id
jpayne@69	1051 )
jpayne@69	1052 # now add all of the values to their table
jpayne@69	1053 entries = qualifiers[qualifier_key]
jpayne@69	1054 if not isinstance(entries, list):
jpayne@69	1055 # Could be a plain string, or an int or a float.
jpayne@69	1056 # However, we exect a list of strings here.
jpayne@69	1057 entries = [entries]
jpayne@69	1058 for qual_value_rank in range(len(entries)):
jpayne@69	1059 qualifier_value = entries[qual_value_rank]
jpayne@69	1060 sql = (
jpayne@69	1061 "INSERT INTO seqfeature_qualifier_value "
jpayne@69	1062 ' (seqfeature_id, term_id, "rank", value) VALUES'
jpayne@69	1063 " (%s, %s, %s, %s)"
jpayne@69	1064 )
jpayne@69	1065 self.adaptor.execute(
jpayne@69	1066 sql,
jpayne@69	1067 (
jpayne@69	1068 seqfeature_id,
jpayne@69	1069 qualifier_key_id,
jpayne@69	1070 qual_value_rank + 1,
jpayne@69	1071 qualifier_value,
jpayne@69	1072 ),
jpayne@69	1073 )
jpayne@69	1074 else:
jpayne@69	1075 # The dbxref_id qualifier/value sets go into the dbxref table
jpayne@69	1076 # as dbname, accession, version tuples, with dbxref.dbxref_id
jpayne@69	1077 # being automatically assigned, and into the seqfeature_dbxref
jpayne@69	1078 # table as seqfeature_id, dbxref_id, and rank tuples
jpayne@69	1079 self._load_seqfeature_dbxref(qualifiers[qualifier_key], seqfeature_id)
jpayne@69	1080
jpayne@69	1081 def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
jpayne@69	1082 """Add SeqFeature's DB cross-references to the database (PRIVATE).
jpayne@69	1083
jpayne@69	1084 Arguments:
jpayne@69	1085 - dbxrefs - List, dbxref data from the source file in the
jpayne@69	1086 format <database>:<accession>
jpayne@69	1087 - seqfeature_id - Int, the identifier for the seqfeature in the
jpayne@69	1088 seqfeature table
jpayne@69	1089
jpayne@69	1090 Insert dbxref qualifier data for a seqfeature into the
jpayne@69	1091 seqfeature_dbxref and, if required, dbxref tables.
jpayne@69	1092 The dbxref_id qualifier/value sets go into the dbxref table
jpayne@69	1093 as dbname, accession, version tuples, with dbxref.dbxref_id
jpayne@69	1094 being automatically assigned, and into the seqfeature_dbxref
jpayne@69	1095 table as seqfeature_id, dbxref_id, and rank tuples.
jpayne@69	1096 """
jpayne@69	1097 # NOTE - In older versions of Biopython, we would map the GenBank
jpayne@69	1098 # db_xref "name", for example "GI" to "GeneIndex", and give a warning
jpayne@69	1099 # for any unknown terms. This was a long term maintenance problem,
jpayne@69	1100 # and differed from BioPerl and BioJava's implementation. See bug 2405
jpayne@69	1101 for rank, value in enumerate(dbxrefs):
jpayne@69	1102 # Split the DB:accession format string at colons. We have to
jpayne@69	1103 # account for multiple-line and multiple-accession entries
jpayne@69	1104 try:
jpayne@69	1105 dbxref_data = value.replace(" ", "").replace("\n", "").split(":")
jpayne@69	1106 db = dbxref_data[0]
jpayne@69	1107 accessions = dbxref_data[1:]
jpayne@69	1108 except Exception:
jpayne@69	1109 raise ValueError(f"Parsing of db_xref failed: '{value}'") from None
jpayne@69	1110 # Loop over all the grabbed accessions, and attempt to fill the
jpayne@69	1111 # table
jpayne@69	1112 for accession in accessions:
jpayne@69	1113 # Get the dbxref_id value for the dbxref data
jpayne@69	1114 dbxref_id = self._get_dbxref_id(db, accession)
jpayne@69	1115 # Insert the seqfeature_dbxref data
jpayne@69	1116 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank + 1)
jpayne@69	1117
jpayne@69	1118 def _get_dbxref_id(self, db, accession):
jpayne@69	1119 """Get DB cross-reference for accession (PRIVATE).
jpayne@69	1120
jpayne@69	1121 Arguments:
jpayne@69	1122 - db - String, the name of the external database containing
jpayne@69	1123 the accession number
jpayne@69	1124 - accession - String, the accession of the dbxref data
jpayne@69	1125
jpayne@69	1126 Finds and returns the dbxref_id for the passed data. The method
jpayne@69	1127 attempts to find an existing record first, and inserts the data
jpayne@69	1128 if there is no record.
jpayne@69	1129 """
jpayne@69	1130 # Check for an existing record
jpayne@69	1131 sql = "SELECT dbxref_id FROM dbxref WHERE dbname = %s AND accession = %s"
jpayne@69	1132 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession))
jpayne@69	1133 # If there was a record, return the dbxref_id, else create the
jpayne@69	1134 # record and return the created dbxref_id
jpayne@69	1135 if dbxref_id:
jpayne@69	1136 return dbxref_id[0]
jpayne@69	1137 return self._add_dbxref(db, accession, 0)
jpayne@69	1138
jpayne@69	1139 def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
jpayne@69	1140 """Get DB cross-reference, creating it if needed (PRIVATE).
jpayne@69	1141
jpayne@69	1142 Check for a pre-existing seqfeature_dbxref entry with the passed
jpayne@69	1143 seqfeature_id and dbxref_id. If one does not exist, insert new
jpayne@69	1144 data.
jpayne@69	1145 """
jpayne@69	1146 # Check for an existing record
jpayne@69	1147 sql = (
jpayne@69	1148 "SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref "
jpayne@69	1149 "WHERE seqfeature_id = %s AND dbxref_id = %s"
jpayne@69	1150 )
jpayne@69	1151 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, dbxref_id))
jpayne@69	1152 # If there was a record, return without executing anything, else create
jpayne@69	1153 # the record and return
jpayne@69	1154 if result:
jpayne@69	1155 return result
jpayne@69	1156 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
jpayne@69	1157
jpayne@69	1158 def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
jpayne@69	1159 """Add DB cross-reference (PRIVATE).
jpayne@69	1160
jpayne@69	1161 Insert a seqfeature_dbxref row and return the seqfeature_id and
jpayne@69	1162 dbxref_id
jpayne@69	1163 """
jpayne@69	1164 sql = (
jpayne@69	1165 "INSERT INTO seqfeature_dbxref "
jpayne@69	1166 '(seqfeature_id, dbxref_id, "rank") VALUES'
jpayne@69	1167 "(%s, %s, %s)"
jpayne@69	1168 )
jpayne@69	1169 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank))
jpayne@69	1170 return (seqfeature_id, dbxref_id)
jpayne@69	1171
jpayne@69	1172 def _load_dbxrefs(self, record, bioentry_id):
jpayne@69	1173 """Load any sequence level cross references into the database (PRIVATE).
jpayne@69	1174
jpayne@69	1175 See table bioentry_dbxref.
jpayne@69	1176 """
jpayne@69	1177 for rank, value in enumerate(record.dbxrefs):
jpayne@69	1178 # Split the DB:accession string at first colon.
jpayne@69	1179 # We have to cope with things like:
jpayne@69	1180 # "MGD:MGI:892" (db="MGD", accession="MGI:892")
jpayne@69	1181 # "GO:GO:123" (db="GO", accession="GO:123")
jpayne@69	1182 #
jpayne@69	1183 # Annoyingly I have seen the NCBI use both the style
jpayne@69	1184 # "GO:GO:123" and "GO:123" in different vintages.
jpayne@69	1185 newline_escape_count = value.count("\n")
jpayne@69	1186 if newline_escape_count != 0:
jpayne@69	1187 raise ValueError(
jpayne@69	1188 "Expected a single line in value, got {newline_escape_count}"
jpayne@69	1189 )
jpayne@69	1190 try:
jpayne@69	1191 db, accession = value.split(":", 1)
jpayne@69	1192 db = db.strip()
jpayne@69	1193 accession = accession.strip()
jpayne@69	1194 except Exception:
jpayne@69	1195 raise ValueError(f"Parsing of dbxrefs list failed: '{value}'") from None
jpayne@69	1196 # Get the dbxref_id value for the dbxref data
jpayne@69	1197 dbxref_id = self._get_dbxref_id(db, accession)
jpayne@69	1198 # Insert the bioentry_dbxref data
jpayne@69	1199 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank + 1)
jpayne@69	1200
jpayne@69	1201 def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
jpayne@69	1202 """Get pre-existing db-xref, or create and return it (PRIVATE).
jpayne@69	1203
jpayne@69	1204 Check for a pre-existing bioentry_dbxref entry with the passed
jpayne@69	1205 seqfeature_id and dbxref_id. If one does not exist, insert new
jpayne@69	1206 data
jpayne@69	1207 """
jpayne@69	1208 # Check for an existing record
jpayne@69	1209 sql = (
jpayne@69	1210 "SELECT bioentry_id, dbxref_id FROM bioentry_dbxref "
jpayne@69	1211 "WHERE bioentry_id = %s AND dbxref_id = %s"
jpayne@69	1212 )
jpayne@69	1213 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, dbxref_id))
jpayne@69	1214 # If there was a record, return without executing anything, else create
jpayne@69	1215 # the record and return
jpayne@69	1216 if result:
jpayne@69	1217 return result
jpayne@69	1218 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
jpayne@69	1219
jpayne@69	1220 def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
jpayne@69	1221 """Insert a bioentry_dbxref row (PRIVATE).
jpayne@69	1222
jpayne@69	1223 Returns the seqfeature_id and dbxref_id (PRIVATE).
jpayne@69	1224 """
jpayne@69	1225 sql = (
jpayne@69	1226 "INSERT INTO bioentry_dbxref "
jpayne@69	1227 '(bioentry_id,dbxref_id,"rank") VALUES '
jpayne@69	1228 "(%s, %s, %s)"
jpayne@69	1229 )
jpayne@69	1230 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank))
jpayne@69	1231 return (bioentry_id, dbxref_id)
jpayne@69	1232
jpayne@69	1233
jpayne@69	1234 class DatabaseRemover:
jpayne@69	1235 """Complement the Loader functionality by fully removing a database.
jpayne@69	1236
jpayne@69	1237 This probably isn't really useful for normal purposes, since you
jpayne@69	1238 can just do a::
jpayne@69	1239
jpayne@69	1240 DROP DATABASE db_name
jpayne@69	1241
jpayne@69	1242 and then recreate the database. But, it's really useful for testing
jpayne@69	1243 purposes.
jpayne@69	1244 """
jpayne@69	1245
jpayne@69	1246 def __init__(self, adaptor, dbid):
jpayne@69	1247 """Initialize with a database id and adaptor connection."""
jpayne@69	1248 self.adaptor = adaptor
jpayne@69	1249 self.dbid = dbid
jpayne@69	1250
jpayne@69	1251 def remove(self):
jpayne@69	1252 """Remove everything related to the given database id."""
jpayne@69	1253 sql = "DELETE FROM bioentry WHERE biodatabase_id = %s"
jpayne@69	1254 self.adaptor.execute(sql, (self.dbid,))
jpayne@69	1255 sql = "DELETE FROM biodatabase WHERE biodatabase_id = %s"
jpayne@69	1256 self.adaptor.execute(sql, (self.dbid,))

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/Loader.py @ 69:33d812a61356