csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/Loader.py comparison

comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/Loader.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

comparison

equal deleted inserted replaced

-:0e9998148a16
+:33d812a61356
+# Copyright 2002 by Andrew Dalke.  All rights reserved.
+# Revisions 2007-2016 copyright by Peter Cock.  All rights reserved.
+# Revisions 2008 copyright by Cymon J. Cox.  All rights reserved.
+#
+# This file is part of the Biopython distribution and governed by your
+# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
+# Please see the LICENSE file that should have been included as part of this
+# package.
+#
+# Note that BioSQL (including the database schema and scripts) is
+# available and licensed separately.  Please consult www.biosql.org
+"""Load biopython objects into a BioSQL database for persistent storage.
+This code makes it possible to store biopython objects in a relational
+database and then retrieve them back. You shouldn't use any of the
+classes in this module directly. Rather, call the load() method on
+a database object.
+"""
+# standard modules
+from time import gmtime, strftime
+# biopython
+from Bio.SeqUtils.CheckSum import crc64
+from Bio import Entrez
+from Bio.Seq import UndefinedSequenceError
+from Bio.SeqFeature import UnknownPosition
+class DatabaseLoader:
+"""Object used to load SeqRecord objects into a BioSQL database."""
+def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
+"""Initialize with connection information for the database.
+Creating a DatabaseLoader object is normally handled via the
+BioSeqDatabase DBServer object, for example::
+from BioSQL import BioSeqDatabase
+server = BioSeqDatabase.open_database(driver="MySQLdb",
+user="gbrowse",
+passwd="biosql",
+host="localhost",
+db="test_biosql")
+try:
+db = server["test"]
+except KeyError:
+db = server.new_database("test",
+description="For testing GBrowse")
+"""
+self.adaptor = adaptor
+self.dbid = dbid
+self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
def load_seqrecord(self, record):
    """Load a Biopython SeqRecord into the database.

    Arguments:
     - record - the SeqRecord object to store.

    The bioentry row is written first; its id is then handed to each of
    the helpers that store the date, sequence, comments, dbxrefs,
    references, annotations and features, in that order.

    References and features are numbered from zero here; the helpers
    convert that to the one-based rank stored in the database.
    """
    bioentry_id = self._load_bioentry_table(record)
    self._load_bioentry_date(record, bioentry_id)
    self._load_biosequence(record, bioentry_id)
    self._load_comment(record, bioentry_id)
    self._load_dbxrefs(record, bioentry_id)
    # enumerate() copes with any iterable of references, not only those
    # supporting len(), and avoids building a throwaway index list.
    references = record.annotations.get("references", ())
    for rank, reference in enumerate(references):
        self._load_reference(reference, rank, bioentry_id)
    self._load_annotations(record, bioentry_id)
    for feature_rank, seq_feature in enumerate(record.features):
        self._load_seqfeature(seq_feature, feature_rank, bioentry_id)
+def _get_ontology_id(self, name, definition=None):
+"""Return identifier for the named ontology (PRIVATE).
+This looks through the onotology table for a the given entry name.
+If it is not found, a row is added for this ontology (using the
+definition if supplied).  In either case, the id corresponding to
+the provided name is returned, so that you can reference it in
+another table.
+"""
+oids = self.adaptor.execute_and_fetch_col0(
+"SELECT ontology_id FROM ontology WHERE name = %s", (name,)
+)
+if oids:
+return oids[0]
+self.adaptor.execute(
+"INSERT INTO ontology(name, definition) VALUES (%s, %s)", (name, definition)
+)
+return self.adaptor.last_id("ontology")
+def _get_term_id(self, name, ontology_id=None, definition=None, identifier=None):
+"""Get the id that corresponds to a term (PRIVATE).
+This looks through the term table for a the given term. If it
+is not found, a new id corresponding to this term is created.
+In either case, the id corresponding to that term is returned, so
+that you can reference it in another table.
+The ontology_id should be used to disambiguate the term.
+"""
+# try to get the term id
+sql = "SELECT term_id FROM term WHERE name = %s"
+fields = [name]
+if ontology_id:
+sql += " AND ontology_id = %s"
+fields.append(ontology_id)
+id_results = self.adaptor.execute_and_fetchall(sql, fields)
+# something is wrong
+if len(id_results) > 1:
+raise ValueError(f"Multiple term ids for {name}: {id_results!r}")
+elif len(id_results) == 1:
+return id_results[0][0]
+else:
+sql = (
+"INSERT INTO term (name, definition,"
+" identifier, ontology_id)"
+" VALUES (%s, %s, %s, %s)"
+)
+self.adaptor.execute(sql, (name, definition, identifier, ontology_id))
+return self.adaptor.last_id("term")
+def _add_dbxref(self, dbname, accession, version):
+"""Insert a dbxref and return its id (PRIVATE)."""
+self.adaptor.execute(
+"INSERT INTO dbxref(dbname, accession, version) VALUES (%s, %s, %s)",
+(dbname, accession, version),
+)
+return self.adaptor.last_id("dbxref")
+def _get_taxon_id(self, record):
+"""Get the taxon id for this record (PRIVATE).
+Arguments:
+- record - a SeqRecord object
+This searches the taxon/taxon_name tables using the
+NCBI taxon ID, scientific name and common name to find
+the matching taxon table entry's id.
+If the species isn't in the taxon table, and we have at
+least the NCBI taxon ID, scientific name or common name,
+at least a minimal stub entry is created in the table.
+Returns the taxon id (database key for the taxon table,
+not an NCBI taxon ID), or None if the taxonomy information
+is missing.
+See also the BioSQL script load_ncbi_taxonomy.pl which
+will populate and update the taxon/taxon_name tables
+with the latest information from the NCBI.
+"""
+# To find the NCBI taxid, first check for a top level annotation
+ncbi_taxon_id = None
+if "ncbi_taxid" in record.annotations:
+# Could be a list of IDs.
+if isinstance(record.annotations["ncbi_taxid"], list):
+if len(record.annotations["ncbi_taxid"]) == 1:
+ncbi_taxon_id = record.annotations["ncbi_taxid"][0]
+else:
+ncbi_taxon_id = record.annotations["ncbi_taxid"]
+if not ncbi_taxon_id:
+# Secondly, look for a source feature
+for f in record.features:
+if f.type == "source":
+quals = getattr(f, "qualifiers", {})
+if "db_xref" in quals:
+for db_xref in f.qualifiers["db_xref"]:
+if db_xref.startswith("taxon:"):
+ncbi_taxon_id = int(db_xref[6:])
+break
+if ncbi_taxon_id:
+break
+try:
+scientific_name = record.annotations["organism"][:255]
+except KeyError:
+scientific_name = None
+try:
+common_name = record.annotations["source"][:255]
+except KeyError:
+common_name = None
+# Note: The maximum length for taxon names in the schema is 255.
+# Cropping it now should help in getting a match when searching,
+# and avoids an error if we try and add these to the database.
+if ncbi_taxon_id:
+# Good, we have the NCBI taxon to go on - this is unambiguous :)
+# Note that the scientific name and common name will only be
+# used if we have to record a stub entry.
+return self._get_taxon_id_from_ncbi_taxon_id(
+ncbi_taxon_id, scientific_name, common_name
+)
+if not common_name and not scientific_name:
+# Nothing to go on... and there is no point adding
+# a new entry to the database.  We'll just leave this
+# sequence's taxon as a NULL in the database.
+return None
+# Next, we'll try to find a match based on the species name
+# (stored in GenBank files as the organism and/or the source).
+if scientific_name:
+taxa = self.adaptor.execute_and_fetch_col0(
+"SELECT taxon_id FROM taxon_name"
+" WHERE name_class = 'scientific name' AND name = %s",
+(scientific_name,),
+)
+if taxa:
+# Good, mapped the scientific name to a taxon table entry
+return taxa[0]
+# Last chance...
+if common_name:
+taxa = self.adaptor.execute_and_fetch_col0(
+"SELECT DISTINCT taxon_id FROM taxon_name WHERE name = %s",
+(common_name,),
+)
+# Its natural that several distinct taxa will have the same common
+# name - in which case we can't resolve the taxon uniquely.
+if len(taxa) > 1:
+raise ValueError(
+"Taxa: %d species have name %r" % (len(taxa), common_name)
+)
+if taxa:
+# Good, mapped the common name to a taxon table entry
+return taxa[0]
+# At this point, as far as we can tell, this species isn't
+# in the taxon table already.  So we'll have to add it.
+# We don't have an NCBI taxonomy ID, so if we do record just
+# a stub entry, there is no simple way to fix this later.
+#
+# TODO - Should we try searching the NCBI taxonomy using the
+# species name?
+#
+# OK, let's try inserting the species.
+# Chances are we don't have enough information ...
+# Furthermore, it won't be in the hierarchy.
+lineage = []
+for c in record.annotations.get("taxonomy", []):
+lineage.append([None, None, c])
+if lineage:
+lineage[-1][1] = "genus"
+lineage.append([None, "species", record.annotations["organism"]])
+# XXX do we have them?
+if "subspecies" in record.annotations:
+lineage.append([None, "subspecies", record.annotations["subspecies"]])
+if "variant" in record.annotations:
+lineage.append([None, "varietas", record.annotations["variant"]])
+lineage[-1][0] = ncbi_taxon_id
+left_value = self.adaptor.execute_one("SELECT MAX(left_value) FROM taxon")[0]
+if not left_value:
+left_value = 0
+left_value += 1
+# XXX -- Brad: Fixing this for now in an ugly way because
+# I am getting overlaps for right_values. I need to dig into this
+# more to actually understand how it works. I'm not sure it is
+# actually working right anyhow.
+right_start_value = self.adaptor.execute_one(
+"SELECT MAX(right_value) FROM taxon"
+)[0]
+if not right_start_value:
+right_start_value = 0
+right_value = right_start_value + 2 * len(lineage) - 1
+parent_taxon_id = None
+for taxon in lineage:
+self.adaptor.execute(
+"INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"
+" left_value, right_value)"
+" VALUES (%s, %s, %s, %s, %s)",
+(parent_taxon_id, taxon[0], taxon[1], left_value, right_value),
+)
+taxon_id = self.adaptor.last_id("taxon")
+self.adaptor.execute(
+"INSERT INTO taxon_name(taxon_id, name, name_class)"
+"VALUES (%s, %s, 'scientific name')",
+(taxon_id, taxon[2][:255]),
+)
+# Note the name field is limited to 255, some SwissProt files
+# have a multi-species name which can be longer.  So truncate this.
+left_value += 1
+right_value -= 1
+parent_taxon_id = taxon_id
+if common_name:
+self.adaptor.execute(
+"INSERT INTO taxon_name(taxon_id, name, name_class)"
+"VALUES (%s, %s, 'common name')",
+(taxon_id, common_name),
+)
+return taxon_id
+def _fix_name_class(self, entrez_name):
+"""Map Entrez name terms to those used in taxdump (PRIVATE).
+We need to make this conversion to match the taxon_name.name_class
+values used by the BioSQL load_ncbi_taxonomy.pl script.
+e.g.::
+"ScientificName" -> "scientific name",
+"EquivalentName" -> "equivalent name",
+"Synonym" -> "synonym",
+"""
+# Add any special cases here:
+#
+# known = {}
+# try:
+#     return known[entrez_name]
+# except KeyError:
+#     pass
+# Try automatically by adding spaces before each capital
+def add_space(letter):
+"""Add a space before a capital letter."""
+if letter.isupper():
+return " " + letter.lower()
+else:
+return letter
+answer = "".join(add_space(letter) for letter in entrez_name).strip()
+if answer != answer.lower():
+raise ValueError(
+f"Expected processed entrez_name, '{answer}' to only have lower case letters."
+)
+return answer
+def _update_left_right_taxon_values(self, left_value):
+"""Update the left and right taxon values in the table (PRIVATE)."""
+if not left_value:
+return
+# Due to the UNIQUE constraint on the left and right values in the taxon
+# table we cannot simply update them through an SQL statement as we risk
+# colliding values. Instead we must select all of the rows that we want to
+# update, modify the values in python and then update the rows
+# self.adaptor.execute("UPDATE taxon SET right_value = right_value + 2 "
+#                      "WHERE right_value >= %s", (left_value,))
+# self.adaptor.execute("UPDATE taxon SET left_value = left_value + 2 "
+#                      "WHERE left_value > %s", (left_value,))
+rows = self.adaptor.execute_and_fetchall(
+"SELECT left_value, right_value, taxon_id FROM taxon "
+"WHERE right_value >= %s or left_value > %s",
+(left_value, left_value),
+)
+right_rows = []
+left_rows = []
+for row in rows:
+new_right = row[1]
+new_left = row[0]
+if new_right >= left_value:
+new_right += 2
+if new_left > left_value:
+new_left += 2
+right_rows.append((new_right, row[2]))
+left_rows.append((new_left, row[2]))
+# sort the rows based on the value from largest to smallest
+# should ensure no overlaps
+right_rows = sorted(right_rows, key=lambda x: x[0], reverse=True)
+left_rows = sorted(left_rows, key=lambda x: x[0], reverse=True)
+self.adaptor.executemany(
+"UPDATE taxon SET left_value = %s WHERE taxon_id = %s", left_rows
+)
+self.adaptor.executemany(
+"UPDATE taxon SET right_value = %s WHERE taxon_id = %s", right_rows
+)
+def _get_taxon_id_from_ncbi_taxon_id(
+self, ncbi_taxon_id, scientific_name=None, common_name=None
+):
+"""Get the taxon id for record from NCBI taxon ID (PRIVATE).
+Arguments:
+- ncbi_taxon_id - string containing an NCBI taxon id
+- scientific_name - string, used if a stub entry is recorded
+- common_name - string, used if a stub entry is recorded
+This searches the taxon table using ONLY the NCBI taxon ID
+to find the matching taxon table entry's ID (database key).
+If the species isn't in the taxon table, and the fetch_NCBI_taxonomy
+flag is true, Biopython will attempt to go online using Bio.Entrez
+to fetch the official NCBI lineage, recursing up the tree until an
+existing entry is found in the database or the full lineage has been
+fetched.
+Otherwise the NCBI taxon ID, scientific name and common name are
+recorded as a minimal stub entry in the taxon and taxon_name tables.
+Any partial information about the lineage from the SeqRecord is NOT
+recorded.  This should mean that (re)running the BioSQL script
+load_ncbi_taxonomy.pl can fill in the taxonomy lineage.
+Returns the taxon id (database key for the taxon table, not
+an NCBI taxon ID).
+"""
+if not ncbi_taxon_id:
+raise ValueError("Expected a non-empty value for ncbi_taxon_id.")
+taxon_id = self.adaptor.execute_and_fetch_col0(
+"SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", (int(ncbi_taxon_id),)
+)
+if taxon_id:
+# Good, we have mapped the NCBI taxid to a taxon table entry
+return taxon_id[0]
+# At this point, as far as we can tell, this species isn't
+# in the taxon table already.  So we'll have to add it.
+parent_taxon_id = None
+rank = "species"
+genetic_code = None
+mito_genetic_code = None
+parent_left_value = None
+parent_right_value = None
+left_value = None
+right_value = None
+species_names = []
+if scientific_name:
+species_names.append(("scientific name", scientific_name))
+if common_name:
+species_names.append(("common name", common_name))
+if self.fetch_NCBI_taxonomy:
+# Go online to get the parent taxon ID!
+handle = Entrez.efetch(db="taxonomy", id=ncbi_taxon_id, retmode="XML")
+taxonomic_record = Entrez.read(handle)
+if len(taxonomic_record) == 1:
+if taxonomic_record[0]["TaxId"] != str(ncbi_taxon_id):
+raise ValueError(
+f"ncbi_taxon_id different from parent taxon id. {ncbi_taxon_id} versus {taxonomic_record[0]['TaxId']}"
+)
+(
+parent_taxon_id,
+parent_left_value,
+parent_right_value,
+) = self._get_taxon_id_from_ncbi_lineage(
+taxonomic_record[0]["LineageEx"]
+)
+left_value = parent_right_value
+right_value = parent_right_value + 1
+rank = str(taxonomic_record[0]["Rank"])
+genetic_code = int(taxonomic_record[0]["GeneticCode"]["GCId"])
+mito_genetic_code = int(taxonomic_record[0]["MitoGeneticCode"]["MGCId"])
+species_names = [
+("scientific name", str(taxonomic_record[0]["ScientificName"]))
+]
+try:
+for name_class, names in taxonomic_record[0]["OtherNames"].items():
+name_class = self._fix_name_class(name_class)
+if not isinstance(names, list):
+# The Entrez parser seems to return single entry
+# lists as just a string which is annoying.
+names = [names]
+for name in names:
+# Want to ignore complex things like ClassCDE
+# entries
+if isinstance(name, str):
+species_names.append((name_class, name))
+except KeyError:
+# OtherNames isn't always present,
+# e.g. NCBI taxon 41205, Bromheadia finlaysoniana
+pass
+else:
+pass
+# If we are not allowed to go online, we will record the bare minimum;
+# as long as the NCBI taxon id is present, then (re)running
+# load_ncbi_taxonomy.pl should fill in the taxonomomy lineage
+# (and update the species names).
+#
+# I am NOT going to try and record the lineage, even if it
+# is in the record annotation as a list of names, as we won't
+# know the NCBI taxon IDs for these parent nodes.
+self._update_left_right_taxon_values(left_value)
+self.adaptor.execute(
+"INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"
+" genetic_code, mito_genetic_code, left_value, right_value)"
+" VALUES (%s, %s, %s, %s, %s, %s, %s)",
+(
+parent_taxon_id,
+ncbi_taxon_id,
+rank,
+genetic_code,
+mito_genetic_code,
+left_value,
+right_value,
+),
+)
+taxon_id = self.adaptor.last_id("taxon")
+# Record the scientific name, common name, etc
+for name_class, name in species_names:
+self.adaptor.execute(
+"INSERT INTO taxon_name(taxon_id, name, name_class)"
+" VALUES (%s, %s, %s)",
+(taxon_id, name[:255], name_class),
+)
+return taxon_id
+def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
+"""Recursive method to get taxon ID from NCBI lineage (PRIVATE).
+Arguments:
+- taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez
+First dictionary in list is the taxonomy root, highest would be
+the species. Each dictionary includes:
+- TaxID (string, NCBI taxon id)
+- Rank (string, e.g. "species", "genus", ..., "phylum", ...)
+- ScientificName (string)
+(and that is all at the time of writing)
+This method will record all the lineage given, returning the taxon id
+(database key, not NCBI taxon id) of the final entry (the species).
+"""
+ncbi_taxon_id = int(taxonomic_lineage[-1]["TaxId"])
+left_value = None
+right_value = None
+parent_left_value = None
+parent_right_value = None
+# Is this in the database already?  Check the taxon table...
+rows = self.adaptor.execute_and_fetchall(
+"SELECT taxon_id, left_value, right_value FROM taxon"
+" WHERE ncbi_taxon_id=%s" % ncbi_taxon_id
+)
+if rows:
+# we could verify that the Scientific Name etc in the database
+# is the same and update it or print a warning if not...
+if len(rows) != 1:
+raise ValueError(f"Expected 1 reponse, got {len(rows)}")
+return rows[0]
+# We have to record this.
+if len(taxonomic_lineage) > 1:
+# Use recursion to find out the taxon id (database key) of the
+# parent.
+(
+parent_taxon_id,
+parent_left_value,
+parent_right_value,
+) = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1])
+left_value = parent_right_value
+right_value = parent_right_value + 1
+if not isinstance(parent_taxon_id, int):
+raise ValueError(
+f"Expected parent_taxon_id to be an int, got {parent_taxon_id}"
+)
+else:
+# we have reached the top of the lineage but no current taxonomy
+# id has been found
+parent_taxon_id = None
+left_value = self.adaptor.execute_one("SELECT MAX(left_value) FROM taxon")[
+0
+]
+if not left_value:
+left_value = 0
+right_value = left_value + 1
+self._update_left_right_taxon_values(left_value)
+# INSERT new taxon
+rank = str(taxonomic_lineage[-1].get("Rank"))
+self.adaptor.execute(
+"INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank, "
+"left_value, right_value) VALUES (%s, %s, %s, %s, %s)",
+(ncbi_taxon_id, parent_taxon_id, rank, left_value, right_value),
+)
+taxon_id = self.adaptor.last_id("taxon")
+# assert isinstance(taxon_id, int), repr(taxon_id)
+# ... and its name in taxon_name
+scientific_name = taxonomic_lineage[-1].get("ScientificName")
+if scientific_name:
+self.adaptor.execute(
+"INSERT INTO taxon_name(taxon_id, name, name_class) "
+"VALUES (%s, %s, 'scientific name')",
+(taxon_id, scientific_name[:255]),
+)
+return taxon_id, left_value, right_value
+def _load_bioentry_table(self, record):
+"""Fill the bioentry table with sequence information (PRIVATE).
+Arguments:
+- record - SeqRecord object to add to the database.
+"""
+# get the pertinent info and insert it
+if record.id.count(".") == 1:  # try to get a version from the id
+# This assumes the string is something like "XXXXXXXX.123"
+accession, version = record.id.split(".")
+try:
+version = int(version)
+except ValueError:
+accession = record.id
+version = 0
+else:  # otherwise just use a version of 0
+accession = record.id
+version = 0
+if (
+"accessions" in record.annotations
+and isinstance(record.annotations["accessions"], list)
+and record.annotations["accessions"]
+):
+# Take the first accession (one if there is more than one)
+accession = record.annotations["accessions"][0]
+# Find the taxon id (this is not just the NCBI Taxon ID)
+# NOTE - If the species isn't defined in the taxon table,
+# a new minimal entry is created.
+taxon_id = self._get_taxon_id(record)
+if "gi" in record.annotations:
+identifier = record.annotations["gi"]
+else:
+identifier = record.id
+# Allow description and division to default to NULL as in BioPerl.
+description = getattr(record, "description", None)
+division = record.annotations.get("data_file_division")
+sql = """
+INSERT INTO bioentry (
+biodatabase_id,
+taxon_id,
+name,
+accession,
+identifier,
+division,
+description,
+version)
+VALUES (
+%s,
+%s,
+%s,
+%s,
+%s,
+%s,
+%s,
+%s)"""
+# print(self.dbid, taxon_id, record.name, accession, identifier, \
+#        division, description, version)
+self.adaptor.execute(
+sql,
+(
+self.dbid,
+taxon_id,
+record.name,
+accession,
+identifier,
+division,
+description,
+version,
+),
+)
+# now retrieve the id for the bioentry
+return self.adaptor.last_id("bioentry")
+def _load_bioentry_date(self, record, bioentry_id):
+"""Add the effective date of the entry into the database (PRIVATE).
+record - a SeqRecord object with an annotated date
+bioentry_id - corresponding database identifier
+"""
+# dates are GenBank style, like:
+# 14-SEP-2000
+date = record.annotations.get("date", strftime("%d-%b-%Y", gmtime()).upper())
+if isinstance(date, list):
+date = date[0]
+annotation_tags_id = self._get_ontology_id("Annotation Tags")
+date_id = self._get_term_id("date_changed", annotation_tags_id)
+sql = (
+"INSERT INTO bioentry_qualifier_value"
+' (bioentry_id, term_id, value, "rank")'
+" VALUES (%s, %s, %s, 1)"
+)
+self.adaptor.execute(sql, (bioentry_id, date_id, date))
+def _load_biosequence(self, record, bioentry_id):
+"""Record SeqRecord's sequence and alphabet in DB (PRIVATE).
+Arguments:
+- record - a SeqRecord object with a seq property
+- bioentry_id - corresponding database identifier
+"""
+if record.seq is None:
+# The biosequence table entry is optional, so if we haven't
+# got a sequence, we don't need to write to the table.
+return
+molecule_type = record.annotations.get("molecule_type", "")
+if "DNA" in molecule_type:
+alphabet = "dna"
+elif "RNA" in molecule_type:
+alphabet = "rna"
+elif "protein" in molecule_type:
+alphabet = "protein"
+else:
+alphabet = "unknown"
+try:
+seq_str = str(record.seq)
+except UndefinedSequenceError:
+seq_str = None
+sql = (
+"INSERT INTO biosequence (bioentry_id, version, "
+"length, seq, alphabet) "
+"VALUES (%s, 0, %s, %s, %s)"
+)
+self.adaptor.execute(sql, (bioentry_id, len(record.seq), seq_str, alphabet))
+def _load_comment(self, record, bioentry_id):
+"""Record a SeqRecord's annotated comment in the database (PRIVATE).
+Arguments:
+- record - a SeqRecord object with an annotated comment
+- bioentry_id - corresponding database identifier
+"""
+comments = record.annotations.get("comment")
+if not comments:
+return
+if not isinstance(comments, list):
+# It should be a string then...
+comments = [comments]
+for index, comment in enumerate(comments):
+comment = comment.replace("\n", " ")
+# TODO - Store each line as a separate entry?  This would preserve
+# the newlines, but we should check BioPerl etc to be consistent.
+sql = (
+'INSERT INTO comment (bioentry_id, comment_text, "rank")'
+" VALUES (%s, %s, %s)"
+)
+self.adaptor.execute(sql, (bioentry_id, comment, index + 1))
+def _load_annotations(self, record, bioentry_id):
+"""Record a SeqRecord's misc annotations in the database (PRIVATE).
+The annotation strings are recorded in the bioentry_qualifier_value
+table, except for special cases like the reference, comment and
+taxonomy which are handled with their own tables.
+Arguments:
+- record - a SeqRecord object with an annotations dictionary
+- bioentry_id - corresponding database identifier
+"""
+mono_sql = (
+"INSERT INTO bioentry_qualifier_value"
+"(bioentry_id, term_id, value)"
+" VALUES (%s, %s, %s)"
+)
+many_sql = (
+"INSERT INTO bioentry_qualifier_value"
+'(bioentry_id, term_id, value, "rank")'
+" VALUES (%s, %s, %s, %s)"
+)
+tag_ontology_id = self._get_ontology_id("Annotation Tags")
+for key, value in record.annotations.items():
+if key in ["molecule_type", "references", "comment", "ncbi_taxid", "date"]:
+# Handled separately
+continue
+term_id = self._get_term_id(key, ontology_id=tag_ontology_id)
+if isinstance(value, (list, tuple)):
+rank = 0
+for entry in value:
+if isinstance(entry, (str, int)):
+# Easy case
+rank += 1
+self.adaptor.execute(
+many_sql, (bioentry_id, term_id, str(entry), rank)
+)
+else:
+pass
+elif isinstance(value, (str, int)):
+# Have a simple single entry, leave rank as the DB default
+self.adaptor.execute(mono_sql, (bioentry_id, term_id, str(value)))
+else:
+pass
+# print("Ignoring annotation '%s' entry of type '%s'" \
+#      % (key, type(value)))
+def _load_reference(self, reference, rank, bioentry_id):
+"""Record SeqRecord's annotated references in the database (PRIVATE).
+Arguments:
+- record - a SeqRecord object with annotated references
+- bioentry_id - corresponding database identifier
+"""
+refs = None
+if reference.medline_id:
+refs = self.adaptor.execute_and_fetch_col0(
+"SELECT reference_id"
+" FROM reference JOIN dbxref USING (dbxref_id)"
+" WHERE dbname = 'MEDLINE' AND accession = %s",
+(reference.medline_id,),
+)
+if not refs and reference.pubmed_id:
+refs = self.adaptor.execute_and_fetch_col0(
+"SELECT reference_id"
+" FROM reference JOIN dbxref USING (dbxref_id)"
+" WHERE dbname = 'PUBMED' AND accession = %s",
+(reference.pubmed_id,),
+)
+if not refs:
+s = []
+for f in reference.authors, reference.title, reference.journal:
+s.append(f or "<undef>")
+crc = crc64("".join(s))
+refs = self.adaptor.execute_and_fetch_col0(
+"SELECT reference_id FROM reference WHERE crc = %s", (crc,)
+)
+if not refs:
+if reference.medline_id:
+dbxref_id = self._add_dbxref("MEDLINE", reference.medline_id, 0)
+elif reference.pubmed_id:
+dbxref_id = self._add_dbxref("PUBMED", reference.pubmed_id, 0)
+else:
+dbxref_id = None
+authors = reference.authors or None
+title = reference.title or None
+# The location/journal field cannot be Null, so default
+# to an empty string rather than None:
+journal = reference.journal or ""
+self.adaptor.execute(
+"INSERT INTO reference (dbxref_id, location,"
+" title, authors, crc)"
+" VALUES (%s, %s, %s, %s, %s)",
+(dbxref_id, journal, title, authors, crc),
+)
+reference_id = self.adaptor.last_id("reference")
+else:
+reference_id = refs[0]
+if reference.location:
+start = 1 + int(str(reference.location[0].start))
+end = int(str(reference.location[0].end))
+else:
+start = None
+end = None
+sql = (
+"INSERT INTO bioentry_reference (bioentry_id, reference_id,"
+' start_pos, end_pos, "rank") VALUES (%s, %s, %s, %s, %s)'
+)
+self.adaptor.execute(sql, (bioentry_id, reference_id, start, end, rank + 1))
+def _load_seqfeature(self, feature, feature_rank, bioentry_id):
+"""Load a biopython SeqFeature into the database (PRIVATE)."""
+# records loaded from a gff file using BCBio.GFF will contain value
+# of 2nd column of the gff as a feature qualifier. The BioSQL wiki
+# suggests that the source should not go in with the other feature
+# mappings but instead be put in the term table
+# (http://www.biosql.org/wiki/Annotation_Mapping)
+try:
+source = feature.qualifiers["source"]
+if isinstance(source, list):
+source = source[0]
+seqfeature_id = self._load_seqfeature_basic(
+feature.type, feature_rank, bioentry_id, source=source
+)
+except KeyError:
+seqfeature_id = self._load_seqfeature_basic(
+feature.type, feature_rank, bioentry_id
+)
+self._load_seqfeature_locations(feature, seqfeature_id)
+self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
+def _load_seqfeature_basic(
+self, feature_type, feature_rank, bioentry_id, source="EMBL/GenBank/SwissProt"
+):
+"""Load the first tables of a seqfeature and returns the id (PRIVATE).
+This loads the "key" of the seqfeature (ie. CDS, gene) and
+the basic seqfeature table itself.
+"""
+ontology_id = self._get_ontology_id("SeqFeature Keys")
+seqfeature_key_id = self._get_term_id(feature_type, ontology_id=ontology_id)
+source_cat_id = self._get_ontology_id("SeqFeature Sources")
+source_term_id = self._get_term_id(source, ontology_id=source_cat_id)
+sql = (
+"INSERT INTO seqfeature (bioentry_id, type_term_id, "
+'source_term_id, "rank") VALUES (%s, %s, %s, %s)'
+)
+self.adaptor.execute(
+sql, (bioentry_id, seqfeature_key_id, source_term_id, feature_rank + 1)
+)
+return self.adaptor.last_id("seqfeature")
+def _load_seqfeature_locations(self, feature, seqfeature_id):
+"""Load all of the locations for a SeqFeature into tables (PRIVATE).
+This adds the locations related to the SeqFeature into the
+seqfeature_location table. Fuzzies are not handled right now.
+For a simple location, ie (1..2), we have a single table row
+with seq_start = 1, seq_end = 2, location_rank = 1.
+For split locations, ie (1..2, 3..4, 5..6) we would have three
+row tables with::
+start = 1, end = 2, rank = 1
+start = 3, end = 4, rank = 2
+start = 5, end = 6, rank = 3
+"""
+# TODO - Record an ontology for the locations (using location.term_id)
+# which for now as in BioPerl we leave defaulting to NULL.
+try:
+if feature.location.operator != "join":
+# e.g. order locations... we don't record "order" so it
+# will become a "join" on reloading. What does BioPerl do?
+import warnings
+from Bio import BiopythonWarning
+warnings.warn(
+"%s location operators are not fully supported"
+% feature.location_operator,
+BiopythonWarning,
+)
+except AttributeError:
+pass
+# This will be a list of length one for a SimpleLocation:
+parts = feature.location.parts
+if parts and {loc.strand for loc in parts} == {-1}:
+# To mimic prior behaviour of Biopython+BioSQL, reverse order
+parts = parts[::-1]
+# TODO - Check what BioPerl does; see also BioSeq.py code
+for rank, loc in enumerate(parts):
+self._insert_location(loc, rank + 1, seqfeature_id)
+def _insert_location(self, location, rank, seqfeature_id):
+"""Add SeqFeature location to seqfeature_location table (PRIVATE).
+TODO - Add location operator to location_qualifier_value?
+"""
+# convert biopython locations to the 1-based location system
+# used in bioSQL
+# XXX This could also handle fuzzies
+try:
+start = int(location.start) + 1
+except TypeError:
+# Handle SwissProt unknown position (?)
+if isinstance(location.start, UnknownPosition):
+start = None
+else:
+raise
+try:
+end = int(location.end)
+except TypeError:
+# Handle SwissProt unknown position (?)
+if isinstance(location.end, UnknownPosition):
+end = None
+else:
+raise
+# Biopython uses None when we don't know strand information but
+# BioSQL requires something (non null) and sets this as zero
+# So we'll use the strand or 0 if Biopython spits out None
+strand = location.strand or 0
+# TODO - Record an ontology term for the location (location.term_id)
+# which for now like BioPerl we'll leave as NULL.
+# This might allow us to record "between" positions properly, but I
+# don't really see how it could work for before/after fuzzy positions
+loc_term_id = None
+if location.ref:
+# sub_feature remote locations when they are in the same db as the
+# current record do not have a value for ref_db, which SeqFeature
+# object stores as None. BioSQL schema requires a varchar and is
+# not NULL
+dbxref_id = self._get_dbxref_id(location.ref_db or "", location.ref)
+else:
+dbxref_id = None
+sql = (
+"INSERT INTO location (seqfeature_id, dbxref_id, term_id,"
+'start_pos, end_pos, strand, "rank") '
+"VALUES (%s, %s, %s, %s, %s, %s, %s)"
+)
+self.adaptor.execute(
+sql, (seqfeature_id, dbxref_id, loc_term_id, start, end, strand, rank)
+)
+"""
+# See Bug 2677
+# TODO - Record the location_operator (e.g. "join" or "order")
+# using the location_qualifier_value table (which we and BioPerl
+# have historically left empty).
+# Note this will need an ontology term for the location qualifier
+# (location_qualifier_value.term_id) for which oddly the schema
+# does not allow NULL.
+if feature.location_operator:
+#e.g. "join" (common),
+#or "order" (see Tests/GenBank/protein_refseq2.gb)
+location_id = self.adaptor.last_id('location')
+loc_qual_term_id = None # Not allowed in BioSQL v1.0.1
+sql = ("INSERT INTO location_qualifier_value"
+"(location_id, term_id, value) "
+"VALUES (%s, %s, %s)")
+self.adaptor.execute(sql, (location_id, loc_qual_term_id,
+feature.location_operator))
+"""
+def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
+"""Insert feature's (key, value) pair qualifiers (PRIVATE).
+Qualifiers should be a dictionary of the form::
+{key : [value1, value2]}
+"""
+tag_ontology_id = self._get_ontology_id("Annotation Tags")
+for qualifier_key in qualifiers:
+# Treat db_xref qualifiers differently to sequence annotation
+# qualifiers by populating the seqfeature_dbxref and dbxref
+# tables.  Other qualifiers go into the seqfeature_qualifier_value
+# and (if new) term tables.
+if qualifier_key != "db_xref":
+qualifier_key_id = self._get_term_id(
+qualifier_key, ontology_id=tag_ontology_id
+)
+# now add all of the values to their table
+entries = qualifiers[qualifier_key]
+if not isinstance(entries, list):
+# Could be a plain string, or an int or a float.
+# However, we exect a list of strings here.
+entries = [entries]
+for qual_value_rank in range(len(entries)):
+qualifier_value = entries[qual_value_rank]
+sql = (
+"INSERT INTO seqfeature_qualifier_value "
+' (seqfeature_id, term_id, "rank", value) VALUES'
+" (%s, %s, %s, %s)"
+)
+self.adaptor.execute(
+sql,
+(
+seqfeature_id,
+qualifier_key_id,
+qual_value_rank + 1,
+qualifier_value,
+),
+)
+else:
+# The dbxref_id qualifier/value sets go into the dbxref table
+# as dbname, accession, version tuples, with dbxref.dbxref_id
+# being automatically assigned, and into the seqfeature_dbxref
+# table as seqfeature_id, dbxref_id, and rank tuples
+self._load_seqfeature_dbxref(qualifiers[qualifier_key], seqfeature_id)
+def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
+"""Add SeqFeature's DB cross-references to the database (PRIVATE).
+Arguments:
+- dbxrefs - List, dbxref data from the source file in the
+format <database>:<accession>
+- seqfeature_id - Int, the identifier for the seqfeature in the
+seqfeature table
+Insert dbxref qualifier data for a seqfeature into the
+seqfeature_dbxref and, if required, dbxref tables.
+The dbxref_id qualifier/value sets go into the dbxref table
+as dbname, accession, version tuples, with dbxref.dbxref_id
+being automatically assigned, and into the seqfeature_dbxref
+table as seqfeature_id, dbxref_id, and rank tuples.
+"""
+# NOTE - In older versions of Biopython, we would map the GenBank
+# db_xref "name", for example "GI" to "GeneIndex", and give a warning
+# for any unknown terms.  This was a long term maintenance problem,
+# and differed from BioPerl and BioJava's implementation.  See bug 2405
+for rank, value in enumerate(dbxrefs):
+# Split the DB:accession format string at colons.  We have to
+# account for multiple-line and multiple-accession entries
+try:
+dbxref_data = value.replace(" ", "").replace("\n", "").split(":")
+db = dbxref_data[0]
+accessions = dbxref_data[1:]
+except Exception:
+raise ValueError(f"Parsing of db_xref failed: '{value}'") from None
+# Loop over all the grabbed accessions, and attempt to fill the
+# table
+for accession in accessions:
+# Get the dbxref_id value for the dbxref data
+dbxref_id = self._get_dbxref_id(db, accession)
+# Insert the seqfeature_dbxref data
+self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank + 1)
+def _get_dbxref_id(self, db, accession):
+"""Get DB cross-reference for accession (PRIVATE).
+Arguments:
+- db - String, the name of the external database containing
+the accession number
+- accession - String, the accession of the dbxref data
+Finds and returns the dbxref_id for the passed data.  The method
+attempts to find an existing record first, and inserts the data
+if there is no record.
+"""
+# Check for an existing record
+sql = "SELECT dbxref_id FROM dbxref WHERE dbname = %s AND accession = %s"
+dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession))
+# If there was a record, return the dbxref_id, else create the
+# record and return the created dbxref_id
+if dbxref_id:
+return dbxref_id[0]
+return self._add_dbxref(db, accession, 0)
+def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
+"""Get DB cross-reference, creating it if needed (PRIVATE).
+Check for a pre-existing seqfeature_dbxref entry with the passed
+seqfeature_id and dbxref_id.  If one does not exist, insert new
+data.
+"""
+# Check for an existing record
+sql = (
+"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref "
+"WHERE seqfeature_id = %s AND dbxref_id = %s"
+)
+result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, dbxref_id))
+# If there was a record, return without executing anything, else create
+# the record and return
+if result:
+return result
+return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
+def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
+"""Add DB cross-reference (PRIVATE).
+Insert a seqfeature_dbxref row and return the seqfeature_id and
+dbxref_id
+"""
+sql = (
+"INSERT INTO seqfeature_dbxref "
+'(seqfeature_id, dbxref_id, "rank") VALUES'
+"(%s, %s, %s)"
+)
+self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank))
+return (seqfeature_id, dbxref_id)
+def _load_dbxrefs(self, record, bioentry_id):
+"""Load any sequence level cross references into the database (PRIVATE).
+See table bioentry_dbxref.
+"""
+for rank, value in enumerate(record.dbxrefs):
+# Split the DB:accession string at first colon.
+# We have to cope with things like:
+# "MGD:MGI:892" (db="MGD", accession="MGI:892")
+# "GO:GO:123" (db="GO", accession="GO:123")
+#
+# Annoyingly I have seen the NCBI use both the style
+# "GO:GO:123" and "GO:123" in different vintages.
+newline_escape_count = value.count("\n")
+if newline_escape_count != 0:
+raise ValueError(
+"Expected a single line in value, got {newline_escape_count}"
+)
+try:
+db, accession = value.split(":", 1)
+db = db.strip()
+accession = accession.strip()
+except Exception:
+raise ValueError(f"Parsing of dbxrefs list failed: '{value}'") from None
+# Get the dbxref_id value for the dbxref data
+dbxref_id = self._get_dbxref_id(db, accession)
+# Insert the bioentry_dbxref  data
+self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank + 1)
+def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
+"""Get pre-existing db-xref, or create and return it (PRIVATE).
+Check for a pre-existing bioentry_dbxref entry with the passed
+seqfeature_id and dbxref_id.  If one does not exist, insert new
+data
+"""
+# Check for an existing record
+sql = (
+"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref "
+"WHERE bioentry_id = %s AND dbxref_id = %s"
+)
+result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, dbxref_id))
+# If there was a record, return without executing anything, else create
+# the record and return
+if result:
+return result
+return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
+def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
+"""Insert a bioentry_dbxref row (PRIVATE).
+Returns the seqfeature_id and dbxref_id (PRIVATE).
+"""
+sql = (
+"INSERT INTO bioentry_dbxref "
+'(bioentry_id,dbxref_id,"rank") VALUES '
+"(%s, %s, %s)"
+)
+self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank))
+return (bioentry_id, dbxref_id)
class DatabaseRemover:
    """Counterpart to the Loader that removes a whole database's contents.

    For normal use this is rarely needed, because you can simply do a::

        DROP DATABASE db_name

    and then recreate the database.  It is, however, really useful for
    testing purposes.
    """

    def __init__(self, adaptor, dbid):
        """Store the adaptor connection and the database id to remove."""
        self.dbid = dbid
        self.adaptor = adaptor

    def remove(self):
        """Remove everything related to the given database id."""
        # The bioentry rows are deleted before the biodatabase row itself
        statements = (
            "DELETE FROM bioentry WHERE biodatabase_id = %s",
            "DELETE FROM biodatabase WHERE biodatabase_id = %s",
        )
        for statement in statements:
            self.adaptor.execute(statement, (self.dbid,))

Mercurial > repos > rliterman > csp2

comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/Loader.py @ 69:33d812a61356