jpayne@68: # Copyright 2002 by Andrew Dalke.  All rights reserved.
jpayne@68: # Revisions 2007-2016 copyright by Peter Cock.  All rights reserved.
jpayne@68: # Revisions 2008-2009 copyright by Cymon J. Cox.  All rights reserved.
jpayne@68: #
jpayne@68: # This file is part of the Biopython distribution and governed by your
jpayne@68: # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@68: # Please see the LICENSE file that should have been included as part of this
jpayne@68: # package.
jpayne@68: #
jpayne@68: # Note that BioSQL (including the database schema and scripts) is
jpayne@68: # available and licensed separately.  Please consult www.biosql.org
jpayne@68: """Implementations of Biopython-like Seq objects on top of BioSQL.
jpayne@68: 
jpayne@68: This allows retrieval of items stored in a BioSQL database using
jpayne@68: a biopython-like SeqRecord and Seq interface.
jpayne@68: 
jpayne@68: Note: Currently we do not support recording per-letter-annotations
jpayne@68: (like quality scores) in BioSQL.
jpayne@68: """
jpayne@68: 
jpayne@68: from typing import List, Optional
jpayne@68: 
jpayne@68: from Bio.Seq import Seq, SequenceDataAbstractBaseClass
jpayne@68: from Bio.SeqRecord import SeqRecord, _RestrictedDict
jpayne@68: from Bio import SeqFeature
jpayne@68: 
jpayne@68: 
class _BioSQLSequenceData(SequenceDataAbstractBaseClass):
    """Retrieves sequence data from a BioSQL database (PRIVATE).

    An instance is a lazy window onto the sequence of one bioentry: it
    records the bioentry key, the adaptor used to query the database, the
    offset of the window within the stored sequence and the window length.
    Letters are only fetched from the database when they are requested
    through ``__getitem__``.
    """

    __slots__ = ("primary_id", "adaptor", "_length", "start")

    def __init__(self, primary_id, adaptor, start=0, length=0):
        """Create a new _BioSQLSequenceData object referring to a BioSQL entry.

        Arguments:
         - primary_id - the bioentry key passed on to the adaptor
         - adaptor - object providing ``get_subseq_as_string``
         - start - offset of this window within the stored sequence
         - length - number of letters covered by this window

        You wouldn't normally create a _BioSQLSequenceData object yourself,
        this is done for you when retrieving a DBSeqRecord object from the
        database, which creates a Seq object using a _BioSQLSequenceData
        instance as the data provider.
        """
        self.primary_id = primary_id
        self.adaptor = adaptor
        self._length = length
        self.start = start
        super().__init__()

    def __len__(self):
        """Return the length of the sequence."""
        return self._length

    def __getitem__(self, key):
        """Return a subsequence as a bytes or a _BioSQLSequenceData object.

        - An integer key returns the letter at that position as an integer
          (consistent with bytes); out-of-range keys raise IndexError.
        - A slice selecting nothing returns ``b""``.
        - A step-1 slice covering the whole window returns the letters as
          bytes; any other step-1 slice returns a new lazy
          _BioSQLSequenceData for the narrower window.
        - A slice with any other step (including negative steps) fetches
          the letters spanned by the slice and returns the strided bytes.
        """
        if not isinstance(key, slice):
            # Return a single letter as an integer (consistent with bytes)
            i = key
            if i < 0:
                i += self._length
                if i < 0:
                    raise IndexError(key)
            elif i >= self._length:
                raise IndexError(key)
            c = self.adaptor.get_subseq_as_string(
                self.primary_id, self.start + i, self.start + i + 1
            )
            return ord(c)

        start, end, step = key.indices(self._length)
        size = len(range(start, end, step))
        if size == 0:
            return b""

        if step == 1:
            if start == 0 and size == self._length:
                # Return the full sequence as bytes
                sequence = self.adaptor.get_subseq_as_string(
                    self.primary_id, self.start, self.start + self._length
                )
                return sequence.encode("ASCII")
            # Return a _BioSQLSequenceData with the start and end adjusted
            return _BioSQLSequenceData(
                self.primary_id, self.adaptor, self.start + start, size
            )

        # Will have to extract the sequence because of the stride.
        # For a negative step slice.indices() gives start > end, so asking
        # the database for (start, end) would request a negative-length
        # substring.  Instead fetch the span between the lowest and highest
        # selected index; striding that span then yields exactly the
        # selected letters (a negative stride starts from its last letter,
        # which is the first selected index).
        last = start + (size - 1) * step
        if step > 0:
            low, high = start, last
        else:
            low, high = last, start
        full = self.adaptor.get_subseq_as_string(
            self.primary_id, self.start + low, self.start + high + 1
        )
        return full[::step].encode("ASCII")
jpayne@68: 
jpayne@68: 
def _retrieve_seq_len(adaptor, primary_id):
    """Return the recorded length of a bioentry's sequence (PRIVATE).

    Returns None when the biosequence table has no row for ``primary_id``,
    otherwise the ``length`` column converted to int.  Raises ValueError
    if more than one row comes back.
    """
    # The database schema ensures there will be only one matching row
    rows = adaptor.execute_and_fetchall(
        "SELECT length FROM biosequence WHERE bioentry_id = %s", (primary_id,)
    )
    if not rows:
        return None
    if len(rows) == 1:
        [recorded_length] = rows[0]
        return int(recorded_length)
    raise ValueError(f"Expected 1 response, got {len(rows)}.")
jpayne@68: 
jpayne@68: 
def _retrieve_seq(adaptor, primary_id):
    """Build the Seq object for a bioentry (PRIVATE).

    Returns None when the biosequence table has no row for ``primary_id``.
    When sequence letters are stored, returns a Seq backed by a lazy
    _BioSQLSequenceData; when only the length is stored (undefined
    sequence), returns ``Seq(None, length=...)``.  Raises ValueError when
    the row count or the stored values are inconsistent.
    """
    # The database schema ensures there will be only one matching
    # row in the table.

    # If an undefined sequence was recorded, seq will be NULL,
    # but length will be populated.  This means length(seq)
    # will return None.
    rows = adaptor.execute_and_fetchall(
        "SELECT alphabet, length, length(seq) FROM biosequence WHERE bioentry_id = %s",
        (primary_id,),
    )
    if not rows:
        return
    if len(rows) != 1:
        raise ValueError(f"Expected 1 response, got {len(rows)}.")
    _, given_length, length = rows[0]

    try:
        length = int(length)
        given_length = int(given_length)
    except TypeError:
        # int(None) lands here: the only acceptable cause is a NULL seq.
        if length is not None:
            raise ValueError(f"Expected 'length' to be 'None', got {length}.")
        rows = adaptor.execute_and_fetchall(
            "SELECT alphabet, length, seq FROM biosequence WHERE bioentry_id = %s",
            (primary_id,),
        )
        if len(rows) != 1:
            raise ValueError(f"Expected 1 response, got {len(rows)}.")
        _, given_length, seq = rows[0]
        if seq:
            raise ValueError(f"Expected 'seq' to have a falsy value, got {seq}.")
        length = int(given_length)
        sequence_stored = False
    else:
        # Both values are integers; they must describe the same sequence.
        if length != given_length:
            raise ValueError(
                f"'length' differs from sequence length, {given_length}, {length}"
            )
        sequence_stored = True

    if not sequence_stored:
        return Seq(None, length=length)
    return Seq(_BioSQLSequenceData(primary_id, adaptor, start=0, length=length))
jpayne@68: 
jpayne@68: 
def _retrieve_dbxrefs(adaptor, primary_id):
    """Retrieve the database cross references for the sequence (PRIVATE).

    Returns a list of ``"dbname:accession"`` strings in rank order; the
    accession gets a ``.version`` suffix when the version is truthy and
    not the string ``"0"``.
    """
    rows = adaptor.execute_and_fetchall(
        "SELECT dbname, accession, version"
        " FROM bioentry_dbxref join dbxref using (dbxref_id)"
        " WHERE bioentry_id = %s"
        ' ORDER BY "rank"',
        (primary_id,),
    )
    return [
        f"{dbname}:{accession}.{version}"
        if version and version != "0"
        else f"{dbname}:{accession}"
        for dbname, accession, version in rows
    ]
jpayne@68: 
jpayne@68: 
def _retrieve_features(adaptor, primary_id):
    """Load the features of a bioentry as a list of SeqFeature objects (PRIVATE).

    For each seqfeature row (in rank order) this issues further queries
    for its qualifiers, its db_xref cross references, its locations and
    any remote-reference information attached to those locations, then
    assembles a SeqFeature.  A feature with one location gets a
    SimpleLocation, a feature with several gets a CompoundLocation with
    operator "join", and a feature with none keeps its default location.

    Raises ValueError if a stored strand is not 0, +1, -1 or NULL.
    """
    feature_rows = adaptor.execute_and_fetchall(
        'SELECT seqfeature_id, type.name, "rank"'
        " FROM seqfeature join term type on (type_term_id = type.term_id)"
        " WHERE bioentry_id = %s"
        ' ORDER BY "rank"',
        (primary_id,),
    )
    features = []
    for seqfeature_id, seqfeature_type, _seqfeature_rank in feature_rows:
        qualifiers = {}
        # Get qualifiers [except for db_xref which is stored separately]
        qualifier_rows = adaptor.execute_and_fetchall(
            "SELECT name, value"
            " FROM seqfeature_qualifier_value  join term using (term_id)"
            " WHERE seqfeature_id = %s"
            ' ORDER BY "rank"',
            (seqfeature_id,),
        )
        for term_name, term_value in qualifier_rows:
            qualifiers.setdefault(term_name, []).append(term_value)
        # Get db_xrefs [special case of qualifiers]
        xref_rows = adaptor.execute_and_fetchall(
            "SELECT dbxref.dbname, dbxref.accession"
            " FROM dbxref join seqfeature_dbxref using (dbxref_id)"
            " WHERE seqfeature_dbxref.seqfeature_id = %s"
            ' ORDER BY "rank"',
            (seqfeature_id,),
        )
        for xref_db, xref_accession in xref_rows:
            qualifiers.setdefault("db_xref", []).append(f"{xref_db}:{xref_accession}")

        # Get locations
        location_rows = adaptor.execute_and_fetchall(
            "SELECT location_id, start_pos, end_pos, strand"
            " FROM location"
            " WHERE seqfeature_id = %s"
            ' ORDER BY "rank"',
            (seqfeature_id,),
        )
        # Convert to Python standard form, and strand = 0 to strand = None.
        # re: comment in Loader.py:
        # Biopython uses None when we don't know strand information but
        # BioSQL requires something (non null) and sets this as zero
        # So we'll use the strand or 0 if Biopython spits out None
        locations = []
        for location_id, start, end, strand in location_rows:
            if start:
                start -= 1
            strand = None if strand == 0 else strand
            if strand not in (+1, -1, None):
                raise ValueError(
                    "Invalid strand %s found in database for "
                    "seqfeature_id %s" % (strand, seqfeature_id)
                )
            if start is not None and end is not None and end < start:
                import warnings
                from Bio import BiopythonWarning

                warnings.warn(
                    "Inverted location start/end (%i and %i) for "
                    "seqfeature_id %s" % (start, end, seqfeature_id),
                    BiopythonWarning,
                )

            # For SwissProt unknown positions (?)
            if start is None:
                start = SeqFeature.UnknownPosition()
            if end is None:
                end = SeqFeature.UnknownPosition()

            locations.append((location_id, start, end, strand))

        # Get possible remote reference information
        remote_rows = adaptor.execute_and_fetchall(
            "SELECT location_id, dbname, accession, version"
            " FROM location join dbxref using (dbxref_id)"
            " WHERE seqfeature_id = %s",
            (seqfeature_id,),
        )
        remote_refs = {}
        for location_id, dbname, accession, version in remote_rows:
            reference = (
                f"{accession}.{version}" if version and version != "0" else accession
            )
            # subfeature remote location db_ref are stored as a empty string
            # when not present
            remote_refs[location_id] = (None if dbname == "" else dbname, reference)

        feature = SeqFeature.SeqFeature(type=seqfeature_type)
        # Store the key as a private property
        feature._seqfeature_id = seqfeature_id
        feature.qualifiers = qualifiers
        if len(locations) == 1:
            location_id, start, end, strand = locations[0]
            # See Bug 2677, we currently don't record the location_operator
            # For consistency with older versions Biopython, default to "".
            # NOTE: this is assigned before feature.location, as in the
            # original ordering.
            feature.location_operator = _retrieve_location_qualifier_value(
                adaptor, location_id
            )
            dbname, version = remote_refs.get(location_id, (None, None))
            feature.location = SeqFeature.SimpleLocation(start, end)
            feature.location.strand = strand
            feature.location.ref_db = dbname
            feature.location.ref = version
        elif len(locations) > 1:
            parts = []
            for location_id, start, end, strand in locations:
                dbname, version = remote_refs.get(location_id, (None, None))
                parts.append(
                    SeqFeature.SimpleLocation(
                        start, end, strand=strand, ref=version, ref_db=dbname
                    )
                )
            # Locations are typically in biological in order (see negative
            # strands below), but because of remote locations for
            # sub-features they are not necessarily in numerical order:
            if {part.strand for part in parts} == {-1}:
                # Evil hack time for backwards compatibility
                # TODO - Check if BioPerl and (old) Biopython did the same,
                # we may have an existing incompatibility lurking here...
                parts.reverse()
            # TODO - See Bug 2677 - we don't yet record location operator,
            # so for consistency with older versions of Biopython default
            # to assuming its a join.
            feature.location = SeqFeature.CompoundLocation(parts, "join")
        features.append(feature)
    return features
jpayne@68: 
jpayne@68: 
def _retrieve_location_qualifier_value(adaptor, location_id):
    """Return the first qualifier value stored for a location (PRIVATE).

    Falls back to the empty string when the location has no entry in
    location_qualifier_value.
    """
    matches = adaptor.execute_and_fetch_col0(
        "SELECT value FROM location_qualifier_value WHERE location_id = %s",
        (location_id,),
    )
    try:
        first = matches[0]
    except IndexError:
        first = ""
    return first
jpayne@68: 
jpayne@68: 
def _retrieve_annotations(adaptor, primary_id, taxon_id):
    """Collect all annotations of a bioentry into one dictionary (PRIVATE).

    The pieces are merged in a fixed order (alphabet, qualifier values,
    references, taxon, comments), so a later piece overrides an earlier
    one if they share a key.
    """
    pieces = (
        _retrieve_alphabet(adaptor, primary_id),
        _retrieve_qualifier_value(adaptor, primary_id),
        _retrieve_reference(adaptor, primary_id),
        _retrieve_taxon(adaptor, primary_id, taxon_id),
        _retrieve_comment(adaptor, primary_id),
    )
    merged = {}
    for piece in pieces:
        merged.update(piece)
    return merged
jpayne@68: 
jpayne@68: 
def _retrieve_alphabet(adaptor, primary_id):
    """Return the molecule type annotation for a bioentry (PRIVATE).

    Maps the biosequence ``alphabet`` column ("dna", "rna" or "protein")
    to ``{"molecule_type": ...}``; any other value gives an empty dict.
    Raises ValueError unless exactly one single-column row is returned.
    """
    rows = adaptor.execute_and_fetchall(
        "SELECT alphabet FROM biosequence WHERE bioentry_id = %s", (primary_id,)
    )
    if len(rows) != 1:
        raise ValueError(f"Expected 1 response, got {len(rows)}.")
    row = rows[0]
    if len(row) != 1:
        raise ValueError(f"Expected 1 alphabet in response, got {len(row)}.")
    molecule_types = {"dna": "DNA", "rna": "RNA", "protein": "protein"}
    molecule_type = molecule_types.get(row[0])
    if molecule_type is None:
        return {}
    return {"molecule_type": molecule_type}
jpayne@68: 
jpayne@68: 
def _retrieve_qualifier_value(adaptor, primary_id):
    """Return the bioentry qualifier values grouped by name (PRIVATE).

    Values are collected into lists in rank order.  Three stored term
    names are renamed on the way out: "keyword" to "keywords",
    "date_changed" to "date" and "secondary_accession" to "accessions".
    """
    rows = adaptor.execute_and_fetchall(
        "SELECT name, value"
        " FROM bioentry_qualifier_value JOIN term USING (term_id)"
        " WHERE bioentry_id = %s"
        ' ORDER BY "rank"',
        (primary_id,),
    )
    renamed = {
        "keyword": "keywords",
        # See handling of "date" in Loader.py
        "date_changed": "date",
        "secondary_accession": "accessions",
    }
    grouped = {}
    for term_name, value in rows:
        grouped.setdefault(renamed.get(term_name, term_name), []).append(value)
    return grouped
jpayne@68: 
jpayne@68: 
def _retrieve_reference(adaptor, primary_id):
    """Return the literature references of a bioentry (PRIVATE).

    Gives ``{"references": [Reference, ...]}`` in rank order, or an empty
    dict when the bioentry has no references.
    """
    # XXX dbxref_qualifier_value

    rows = adaptor.execute_and_fetchall(
        "SELECT start_pos, end_pos, "
        " location, title, authors,"
        " dbname, accession"
        " FROM bioentry_reference"
        " JOIN reference USING (reference_id)"
        " LEFT JOIN dbxref USING (dbxref_id)"
        " WHERE bioentry_id = %s"
        ' ORDER BY "rank"',
        (primary_id,),
    )
    collected = []
    for start, end, journal, title, authors, dbname, accession in rows:
        entry = SeqFeature.Reference()
        # If the start/end are missing, reference.location is an empty list
        if not (start is None and end is None):
            if start is not None:
                start -= 1  # python counting
            entry.location = [SeqFeature.SimpleLocation(start, end)]
        # Don't replace the default "" with None.
        if authors:
            entry.authors = authors
        if title:
            entry.title = title
        entry.journal = journal
        if dbname == "PUBMED":
            entry.pubmed_id = accession
        elif dbname == "MEDLINE":
            entry.medline_id = accession
        collected.append(entry)
    if not collected:
        return {}
    return {"references": collected}
jpayne@68: 
jpayne@68: 
def _retrieve_taxon(adaptor, primary_id, taxon_id):
    """Return the taxon-derived annotations for a bioentry (PRIVATE).

    The result may contain "source" (first GenBank common name),
    "organism" (first scientific name), "ncbi_taxid" and "taxonomy" (the
    lineage of scientific names, outermost first); each key is only
    present when the database supplies a value.  ``primary_id`` is
    accepted for signature consistency but not used.
    """
    annotations = {}

    common_names = adaptor.execute_and_fetch_col0(
        "SELECT name FROM taxon_name WHERE taxon_id = %s"
        " AND name_class = 'genbank common name'",
        (taxon_id,),
    )
    if common_names:
        annotations["source"] = common_names[0]

    scientific_names = adaptor.execute_and_fetch_col0(
        "SELECT name FROM taxon_name WHERE taxon_id = %s"
        " AND name_class = 'scientific name'",
        (taxon_id,),
    )
    if scientific_names:
        annotations["organism"] = scientific_names[0]

    ncbi_taxids = adaptor.execute_and_fetch_col0(
        "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)
    )
    if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0":
        annotations["ncbi_taxid"] = ncbi_taxids[0]

    # Old code used the left/right values in the taxon table to get the
    # taxonomy lineage in one SQL command.  This was actually very slow,
    # and would fail if the (optional) left/right values were missing.
    #
    # The following code is based on a contribution from Eric Gibert, and
    # relies on the taxon table's parent_taxon_id field only (ignoring the
    # optional left/right values).  This means that it has to make a
    # separate SQL query for each entry in the lineage, but it does still
    # appear to be *much* faster.  See Bug 2494.
    lineage = []  # collected leaf-first, reversed once at the end
    current_id = taxon_id
    while current_id:
        name, _node_rank, parent_id = adaptor.execute_one(
            "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id"
            " FROM taxon, taxon_name"
            " WHERE taxon.taxon_id=taxon_name.taxon_id"
            " AND taxon_name.name_class='scientific name'"
            " AND taxon.taxon_id = %s",
            (current_id,),
        )
        if current_id == parent_id:
            # If the taxon table has been populated by the BioSQL script
            # load_ncbi_taxonomy.pl this is how top parent nodes are stored.
            # Personally, I would have used a NULL parent_taxon_id here.
            break

        lineage.append(name)
        current_id = parent_id

    if lineage:
        lineage.reverse()
        annotations["taxonomy"] = lineage
    return annotations
jpayne@68: 
jpayne@68: 
def _retrieve_comment(adaptor, primary_id):
    """Return the comments of a bioentry (PRIVATE).

    Gives ``{"comment": [text, ...]}`` in rank order, or an empty dict
    when the bioentry has no comments.
    """
    rows = adaptor.execute_and_fetchall(
        'SELECT comment_text FROM comment WHERE bioentry_id=%s ORDER BY "rank"',
        (primary_id,),
    )
    texts = [row[0] for row in rows]
    # Don't want to add an empty list...
    if not texts:
        return {}
    return {"comment": texts}
jpayne@68: 
jpayne@68: 
class DBSeqRecord(SeqRecord):
    """BioSQL equivalent of the Biopython SeqRecord object.

    Only the bioentry row and the sequence length are read when the
    object is created.  The sequence, database cross references, features
    and annotations are each fetched on first access and then cached on
    the instance; assigning to or deleting one of those properties
    replaces or drops the cached value.
    """

    def __init__(self, adaptor, primary_id):
        """Create a DBSeqRecord object.

        Arguments:
         - adaptor - A BioSQL.BioSeqDatabase.Adaptor object
         - primary_id - An internal integer ID used by BioSQL

        You wouldn't normally create a DBSeqRecord object yourself,
        this is done for you when using a BioSeqDatabase object
        """
        self._adaptor = adaptor
        self._primary_id = primary_id

        (
            biodatabase_id,
            taxon_id,
            name,
            accession,
            version,
            identifier,
            division,
            description,
        ) = self._adaptor.execute_one(
            "SELECT biodatabase_id, taxon_id, name, accession, version,"
            " identifier, division, description"
            " FROM bioentry"
            " WHERE bioentry_id = %s",
            (self._primary_id,),
        )
        self._biodatabase_id = biodatabase_id
        self._taxon_id = taxon_id
        self.name = name
        self._identifier = identifier
        self._division = division
        self.description = description
        # The record id carries the version unless it is missing or "0".
        self.id = f"{accession}.{version}" if version and version != "0" else accession
        # We don't yet record any per-letter-annotations in the
        # BioSQL database, but we should set this property up
        # for completeness (and the __str__ method).
        # We do NOT want to load the sequence from the DB here!
        self._per_letter_annotations = _RestrictedDict(
            length=_retrieve_seq_len(adaptor, primary_id)
        )

    @property
    def seq(self):
        """Seq object"""
        # Fetched from the database on first access only.
        if not hasattr(self, "_seq"):
            self._seq = _retrieve_seq(self._adaptor, self._primary_id)
        return self._seq

    @seq.setter
    def seq(self, seq):
        # TODO - Check consistent with self._per_letter_annotations
        self._seq = seq

    @seq.deleter
    def seq(self):
        del self._seq

    @property
    def dbxrefs(self) -> List[str]:
        """Database cross references."""
        if not hasattr(self, "_dbxrefs"):
            self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id)
        return self._dbxrefs

    @dbxrefs.setter
    def dbxrefs(self, value: List[str]) -> None:
        self._dbxrefs = value

    @dbxrefs.deleter
    def dbxrefs(self) -> None:
        del self._dbxrefs

    @property
    def features(self):
        """Features"""
        # Fetched from the database on first access only.
        if not hasattr(self, "_features"):
            self._features = _retrieve_features(self._adaptor, self._primary_id)
        return self._features

    @features.setter
    def features(self, features):
        self._features = features

    @features.deleter
    def features(self):
        del self._features

    @property
    def annotations(self) -> SeqRecord._AnnotationsDict:
        """Annotations."""
        if not hasattr(self, "_annotations"):
            loaded = _retrieve_annotations(
                self._adaptor, self._primary_id, self._taxon_id
            )
            self._annotations = loaded
            # The bioentry row itself supplies two further annotations.
            if self._identifier:
                loaded["gi"] = self._identifier
            if self._division:
                loaded["data_file_division"] = self._division
        return self._annotations

    @annotations.setter
    def annotations(self, value: Optional[SeqRecord._AnnotationsDict]) -> None:
        # Any falsy value (None, empty mapping) resets to a fresh dict.
        self._annotations = value if value else {}

    @annotations.deleter
    def annotations(self) -> None:
        del self._annotations