csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/BioSeq.py annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/BioSeq.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children

rev	line source
jpayne@68	1 # Copyright 2002 by Andrew Dalke. All rights reserved.
jpayne@68	2 # Revisions 2007-2016 copyright by Peter Cock. All rights reserved.
jpayne@68	3 # Revisions 2008-2009 copyright by Cymon J. Cox. All rights reserved.
jpayne@68	4 #
jpayne@68	5 # This file is part of the Biopython distribution and governed by your
jpayne@68	6 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
jpayne@68	7 # Please see the LICENSE file that should have been included as part of this
jpayne@68	8 # package.
jpayne@68	9 #
jpayne@68	10 # Note that BioSQL (including the database schema and scripts) is
jpayne@68	11 # available and licensed separately. Please consult www.biosql.org
jpayne@68	12 """Implementations of Biopython-like Seq objects on top of BioSQL.
jpayne@68	13
jpayne@68	14 This allows retrieval of items stored in a BioSQL database using
jpayne@68	15 a biopython-like SeqRecord and Seq interface.
jpayne@68	16
jpayne@68	17 Note: Currently we do not support recording per-letter-annotations
jpayne@68	18 (like quality scores) in BioSQL.
jpayne@68	19 """
jpayne@68	20
jpayne@68	21 from typing import List, Optional
jpayne@68	22
jpayne@68	23 from Bio.Seq import Seq, SequenceDataAbstractBaseClass
jpayne@68	24 from Bio.SeqRecord import SeqRecord, _RestrictedDict
jpayne@68	25 from Bio import SeqFeature
jpayne@68	26
jpayne@68	27
jpayne@68	28 class _BioSQLSequenceData(SequenceDataAbstractBaseClass):
jpayne@68	29 """Retrieves sequence data from a BioSQL database (PRIVATE)."""
jpayne@68	30
jpayne@68	31 __slots__ = ("primary_id", "adaptor", "_length", "start")
jpayne@68	32
jpayne@68	33 def __init__(self, primary_id, adaptor, start=0, length=0):
jpayne@68	34 """Create a new _BioSQLSequenceData object referring to a BioSQL entry.
jpayne@68	35
jpayne@68	36 You wouldn't normally create a _BioSQLSequenceData object yourself,
jpayne@68	37 this is done for you when retrieving a DBSeqRecord object from the
jpayne@68	38 database, which creates a Seq object using a _BioSQLSequenceData
jpayne@68	39 instance as the data provider.
jpayne@68	40 """
jpayne@68	41 self.primary_id = primary_id
jpayne@68	42 self.adaptor = adaptor
jpayne@68	43 self._length = length
jpayne@68	44 self.start = start
jpayne@68	45 super().__init__()
jpayne@68	46
jpayne@68	47 def __len__(self):
jpayne@68	48 """Return the length of the sequence."""
jpayne@68	49 return self._length
jpayne@68	50
jpayne@68	51 def __getitem__(self, key):
jpayne@68	52 """Return a subsequence as a bytes or a _BioSQLSequenceData object."""
jpayne@68	53 if isinstance(key, slice):
jpayne@68	54 start, end, step = key.indices(self._length)
jpayne@68	55 size = len(range(start, end, step))
jpayne@68	56 if size == 0:
jpayne@68	57 return b""
jpayne@68	58 else:
jpayne@68	59 # Return a single letter as an integer (consistent with bytes)
jpayne@68	60 i = key
jpayne@68	61 if i < 0:
jpayne@68	62 i += self._length
jpayne@68	63 if i < 0:
jpayne@68	64 raise IndexError(key)
jpayne@68	65 elif i >= self._length:
jpayne@68	66 raise IndexError(key)
jpayne@68	67 c = self.adaptor.get_subseq_as_string(
jpayne@68	68 self.primary_id, self.start + i, self.start + i + 1
jpayne@68	69 )
jpayne@68	70 return ord(c)
jpayne@68	71
jpayne@68	72 if step == 1:
jpayne@68	73 if start == 0 and size == self._length:
jpayne@68	74 # Return the full sequence as bytes
jpayne@68	75 sequence = self.adaptor.get_subseq_as_string(
jpayne@68	76 self.primary_id, self.start, self.start + self._length
jpayne@68	77 )
jpayne@68	78 return sequence.encode("ASCII")
jpayne@68	79 else:
jpayne@68	80 # Return a _BioSQLSequenceData with the start and end adjusted
jpayne@68	81 return _BioSQLSequenceData(
jpayne@68	82 self.primary_id, self.adaptor, self.start + start, size
jpayne@68	83 )
jpayne@68	84 else:
jpayne@68	85 # Will have to extract the sequence because of the stride
jpayne@68	86 full = self.adaptor.get_subseq_as_string(
jpayne@68	87 self.primary_id, self.start + start, self.start + end
jpayne@68	88 )
jpayne@68	89 return full[::step].encode("ASCII")
jpayne@68	90
jpayne@68	91
jpayne@68	92 def _retrieve_seq_len(adaptor, primary_id):
jpayne@68	93 # The database schema ensures there will be only one matching row
jpayne@68	94 seqs = adaptor.execute_and_fetchall(
jpayne@68	95 "SELECT length FROM biosequence WHERE bioentry_id = %s", (primary_id,)
jpayne@68	96 )
jpayne@68	97 if not seqs:
jpayne@68	98 return None
jpayne@68	99 if len(seqs) != 1:
jpayne@68	100 raise ValueError(f"Expected 1 response, got {len(seqs)}.")
jpayne@68	101 (given_length,) = seqs[0]
jpayne@68	102 return int(given_length)
jpayne@68	103
jpayne@68	104
jpayne@68	105 def _retrieve_seq(adaptor, primary_id):
jpayne@68	106 # The database schema ensures there will be only one matching
jpayne@68	107 # row in the table.
jpayne@68	108
jpayne@68	109 # If an undefined sequence was recorded, seq will be NULL,
jpayne@68	110 # but length will be populated. This means length(seq)
jpayne@68	111 # will return None.
jpayne@68	112 seqs = adaptor.execute_and_fetchall(
jpayne@68	113 "SELECT alphabet, length, length(seq) FROM biosequence WHERE bioentry_id = %s",
jpayne@68	114 (primary_id,),
jpayne@68	115 )
jpayne@68	116 if not seqs:
jpayne@68	117 return
jpayne@68	118 if len(seqs) != 1:
jpayne@68	119 raise ValueError(f"Expected 1 response, got {len(seqs)}.")
jpayne@68	120 moltype, given_length, length = seqs[0]
jpayne@68	121
jpayne@68	122 try:
jpayne@68	123 length = int(length)
jpayne@68	124 given_length = int(given_length)
jpayne@68	125 if length != given_length:
jpayne@68	126 raise ValueError(
jpayne@68	127 f"'length' differs from sequence length, {given_length}, {length}"
jpayne@68	128 )
jpayne@68	129 have_seq = True
jpayne@68	130 except TypeError:
jpayne@68	131 if length is not None:
jpayne@68	132 raise ValueError(f"Expected 'length' to be 'None', got {length}.")
jpayne@68	133 seqs = adaptor.execute_and_fetchall(
jpayne@68	134 "SELECT alphabet, length, seq FROM biosequence WHERE bioentry_id = %s",
jpayne@68	135 (primary_id,),
jpayne@68	136 )
jpayne@68	137 if len(seqs) != 1:
jpayne@68	138 raise ValueError(f"Expected 1 response, got {len(seqs)}.")
jpayne@68	139 moltype, given_length, seq = seqs[0]
jpayne@68	140 if seq:
jpayne@68	141 raise ValueError(f"Expected 'seq' to have a falsy value, got {seq}.")
jpayne@68	142 length = int(given_length)
jpayne@68	143 have_seq = False
jpayne@68	144 del seq
jpayne@68	145 del given_length
jpayne@68	146
jpayne@68	147 if have_seq:
jpayne@68	148 data = _BioSQLSequenceData(primary_id, adaptor, start=0, length=length)
jpayne@68	149 return Seq(data)
jpayne@68	150 else:
jpayne@68	151 return Seq(None, length=length)
jpayne@68	152
jpayne@68	153
jpayne@68	154 def _retrieve_dbxrefs(adaptor, primary_id):
jpayne@68	155 """Retrieve the database cross references for the sequence (PRIVATE)."""
jpayne@68	156 _dbxrefs = []
jpayne@68	157 dbxrefs = adaptor.execute_and_fetchall(
jpayne@68	158 "SELECT dbname, accession, version"
jpayne@68	159 " FROM bioentry_dbxref join dbxref using (dbxref_id)"
jpayne@68	160 " WHERE bioentry_id = %s"
jpayne@68	161 ' ORDER BY "rank"',
jpayne@68	162 (primary_id,),
jpayne@68	163 )
jpayne@68	164 for dbname, accession, version in dbxrefs:
jpayne@68	165 if version and version != "0":
jpayne@68	166 v = f"{accession}.{version}"
jpayne@68	167 else:
jpayne@68	168 v = accession
jpayne@68	169 _dbxrefs.append(f"{dbname}:{v}")
jpayne@68	170 return _dbxrefs
jpayne@68	171
jpayne@68	172
jpayne@68	173 def _retrieve_features(adaptor, primary_id):
jpayne@68	174 sql = (
jpayne@68	175 'SELECT seqfeature_id, type.name, "rank"'
jpayne@68	176 " FROM seqfeature join term type on (type_term_id = type.term_id)"
jpayne@68	177 " WHERE bioentry_id = %s"
jpayne@68	178 ' ORDER BY "rank"'
jpayne@68	179 )
jpayne@68	180 results = adaptor.execute_and_fetchall(sql, (primary_id,))
jpayne@68	181 seq_feature_list = []
jpayne@68	182 for seqfeature_id, seqfeature_type, seqfeature_rank in results:
jpayne@68	183 # Get qualifiers [except for db_xref which is stored separately]
jpayne@68	184 qvs = adaptor.execute_and_fetchall(
jpayne@68	185 "SELECT name, value"
jpayne@68	186 " FROM seqfeature_qualifier_value join term using (term_id)"
jpayne@68	187 " WHERE seqfeature_id = %s"
jpayne@68	188 ' ORDER BY "rank"',
jpayne@68	189 (seqfeature_id,),
jpayne@68	190 )
jpayne@68	191 qualifiers = {}
jpayne@68	192 for qv_name, qv_value in qvs:
jpayne@68	193 qualifiers.setdefault(qv_name, []).append(qv_value)
jpayne@68	194 # Get db_xrefs [special case of qualifiers]
jpayne@68	195 qvs = adaptor.execute_and_fetchall(
jpayne@68	196 "SELECT dbxref.dbname, dbxref.accession"
jpayne@68	197 " FROM dbxref join seqfeature_dbxref using (dbxref_id)"
jpayne@68	198 " WHERE seqfeature_dbxref.seqfeature_id = %s"
jpayne@68	199 ' ORDER BY "rank"',
jpayne@68	200 (seqfeature_id,),
jpayne@68	201 )
jpayne@68	202 for qv_name, qv_value in qvs:
jpayne@68	203 value = f"{qv_name}:{qv_value}"
jpayne@68	204 qualifiers.setdefault("db_xref", []).append(value)
jpayne@68	205 # Get locations
jpayne@68	206 results = adaptor.execute_and_fetchall(
jpayne@68	207 "SELECT location_id, start_pos, end_pos, strand"
jpayne@68	208 " FROM location"
jpayne@68	209 " WHERE seqfeature_id = %s"
jpayne@68	210 ' ORDER BY "rank"',
jpayne@68	211 (seqfeature_id,),
jpayne@68	212 )
jpayne@68	213 locations = []
jpayne@68	214 # convert to Python standard form
jpayne@68	215 # Convert strand = 0 to strand = None
jpayne@68	216 # re: comment in Loader.py:
jpayne@68	217 # Biopython uses None when we don't know strand information but
jpayne@68	218 # BioSQL requires something (non null) and sets this as zero
jpayne@68	219 # So we'll use the strand or 0 if Biopython spits out None
jpayne@68	220 for location_id, start, end, strand in results:
jpayne@68	221 if start:
jpayne@68	222 start -= 1
jpayne@68	223 if strand == 0:
jpayne@68	224 strand = None
jpayne@68	225 if strand not in (+1, -1, None):
jpayne@68	226 raise ValueError(
jpayne@68	227 "Invalid strand %s found in database for "
jpayne@68	228 "seqfeature_id %s" % (strand, seqfeature_id)
jpayne@68	229 )
jpayne@68	230 if start is not None and end is not None and end < start:
jpayne@68	231 import warnings
jpayne@68	232 from Bio import BiopythonWarning
jpayne@68	233
jpayne@68	234 warnings.warn(
jpayne@68	235 "Inverted location start/end (%i and %i) for "
jpayne@68	236 "seqfeature_id %s" % (start, end, seqfeature_id),
jpayne@68	237 BiopythonWarning,
jpayne@68	238 )
jpayne@68	239
jpayne@68	240 # For SwissProt unknown positions (?)
jpayne@68	241 if start is None:
jpayne@68	242 start = SeqFeature.UnknownPosition()
jpayne@68	243 if end is None:
jpayne@68	244 end = SeqFeature.UnknownPosition()
jpayne@68	245
jpayne@68	246 locations.append((location_id, start, end, strand))
jpayne@68	247 # Get possible remote reference information
jpayne@68	248 remote_results = adaptor.execute_and_fetchall(
jpayne@68	249 "SELECT location_id, dbname, accession, version"
jpayne@68	250 " FROM location join dbxref using (dbxref_id)"
jpayne@68	251 " WHERE seqfeature_id = %s",
jpayne@68	252 (seqfeature_id,),
jpayne@68	253 )
jpayne@68	254 lookup = {}
jpayne@68	255 for location_id, dbname, accession, version in remote_results:
jpayne@68	256 if version and version != "0":
jpayne@68	257 v = f"{accession}.{version}"
jpayne@68	258 else:
jpayne@68	259 v = accession
jpayne@68	260 # subfeature remote location db_ref are stored as a empty string
jpayne@68	261 # when not present
jpayne@68	262 if dbname == "":
jpayne@68	263 dbname = None
jpayne@68	264 lookup[location_id] = (dbname, v)
jpayne@68	265
jpayne@68	266 feature = SeqFeature.SeqFeature(type=seqfeature_type)
jpayne@68	267 # Store the key as a private property
jpayne@68	268 feature._seqfeature_id = seqfeature_id
jpayne@68	269 feature.qualifiers = qualifiers
jpayne@68	270 if len(locations) == 0:
jpayne@68	271 pass
jpayne@68	272 elif len(locations) == 1:
jpayne@68	273 location_id, start, end, strand = locations[0]
jpayne@68	274 # See Bug 2677, we currently don't record the location_operator
jpayne@68	275 # For consistency with older versions Biopython, default to "".
jpayne@68	276 feature.location_operator = _retrieve_location_qualifier_value(
jpayne@68	277 adaptor, location_id
jpayne@68	278 )
jpayne@68	279 dbname, version = lookup.get(location_id, (None, None))
jpayne@68	280 feature.location = SeqFeature.SimpleLocation(start, end)
jpayne@68	281 feature.location.strand = strand
jpayne@68	282 feature.location.ref_db = dbname
jpayne@68	283 feature.location.ref = version
jpayne@68	284 else:
jpayne@68	285 locs = []
jpayne@68	286 for location in locations:
jpayne@68	287 location_id, start, end, strand = location
jpayne@68	288 dbname, version = lookup.get(location_id, (None, None))
jpayne@68	289 locs.append(
jpayne@68	290 SeqFeature.SimpleLocation(
jpayne@68	291 start, end, strand=strand, ref=version, ref_db=dbname
jpayne@68	292 )
jpayne@68	293 )
jpayne@68	294 # Locations are typically in biological in order (see negative
jpayne@68	295 # strands below), but because of remote locations for
jpayne@68	296 # sub-features they are not necessarily in numerical order:
jpayne@68	297 strands = {_.strand for _ in locs}
jpayne@68	298 if len(strands) == 1 and -1 in strands:
jpayne@68	299 # Evil hack time for backwards compatibility
jpayne@68	300 # TODO - Check if BioPerl and (old) Biopython did the same,
jpayne@68	301 # we may have an existing incompatibility lurking here...
jpayne@68	302 locs = locs[::-1]
jpayne@68	303 feature.location = SeqFeature.CompoundLocation(locs, "join")
jpayne@68	304 # TODO - See Bug 2677 - we don't yet record location operator,
jpayne@68	305 # so for consistency with older versions of Biopython default
jpayne@68	306 # to assuming its a join.
jpayne@68	307 seq_feature_list.append(feature)
jpayne@68	308 return seq_feature_list
jpayne@68	309
jpayne@68	310
jpayne@68	311 def _retrieve_location_qualifier_value(adaptor, location_id):
jpayne@68	312 value = adaptor.execute_and_fetch_col0(
jpayne@68	313 "SELECT value FROM location_qualifier_value WHERE location_id = %s",
jpayne@68	314 (location_id,),
jpayne@68	315 )
jpayne@68	316 try:
jpayne@68	317 return value[0]
jpayne@68	318 except IndexError:
jpayne@68	319 return ""
jpayne@68	320
jpayne@68	321
jpayne@68	322 def _retrieve_annotations(adaptor, primary_id, taxon_id):
jpayne@68	323 annotations = {}
jpayne@68	324 annotations.update(_retrieve_alphabet(adaptor, primary_id))
jpayne@68	325 annotations.update(_retrieve_qualifier_value(adaptor, primary_id))
jpayne@68	326 annotations.update(_retrieve_reference(adaptor, primary_id))
jpayne@68	327 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id))
jpayne@68	328 annotations.update(_retrieve_comment(adaptor, primary_id))
jpayne@68	329 return annotations
jpayne@68	330
jpayne@68	331
jpayne@68	332 def _retrieve_alphabet(adaptor, primary_id):
jpayne@68	333 results = adaptor.execute_and_fetchall(
jpayne@68	334 "SELECT alphabet FROM biosequence WHERE bioentry_id = %s", (primary_id,)
jpayne@68	335 )
jpayne@68	336 if len(results) != 1:
jpayne@68	337 raise ValueError(f"Expected 1 response, got {len(results)}.")
jpayne@68	338 alphabets = results[0]
jpayne@68	339 if len(alphabets) != 1:
jpayne@68	340 raise ValueError(f"Expected 1 alphabet in response, got {len(alphabets)}.")
jpayne@68	341 alphabet = alphabets[0]
jpayne@68	342 if alphabet == "dna":
jpayne@68	343 molecule_type = "DNA"
jpayne@68	344 elif alphabet == "rna":
jpayne@68	345 molecule_type = "RNA"
jpayne@68	346 elif alphabet == "protein":
jpayne@68	347 molecule_type = "protein"
jpayne@68	348 else:
jpayne@68	349 molecule_type = None
jpayne@68	350 if molecule_type is not None:
jpayne@68	351 return {"molecule_type": molecule_type}
jpayne@68	352 else:
jpayne@68	353 return {}
jpayne@68	354
jpayne@68	355
jpayne@68	356 def _retrieve_qualifier_value(adaptor, primary_id):
jpayne@68	357 qvs = adaptor.execute_and_fetchall(
jpayne@68	358 "SELECT name, value"
jpayne@68	359 " FROM bioentry_qualifier_value JOIN term USING (term_id)"
jpayne@68	360 " WHERE bioentry_id = %s"
jpayne@68	361 ' ORDER BY "rank"',
jpayne@68	362 (primary_id,),
jpayne@68	363 )
jpayne@68	364 qualifiers = {}
jpayne@68	365 for name, value in qvs:
jpayne@68	366 if name == "keyword":
jpayne@68	367 name = "keywords"
jpayne@68	368 # See handling of "date" in Loader.py
jpayne@68	369 elif name == "date_changed":
jpayne@68	370 name = "date"
jpayne@68	371 elif name == "secondary_accession":
jpayne@68	372 name = "accessions"
jpayne@68	373 qualifiers.setdefault(name, []).append(value)
jpayne@68	374 return qualifiers
jpayne@68	375
jpayne@68	376
jpayne@68	377 def _retrieve_reference(adaptor, primary_id):
jpayne@68	378 # XXX dbxref_qualifier_value
jpayne@68	379
jpayne@68	380 refs = adaptor.execute_and_fetchall(
jpayne@68	381 "SELECT start_pos, end_pos, "
jpayne@68	382 " location, title, authors,"
jpayne@68	383 " dbname, accession"
jpayne@68	384 " FROM bioentry_reference"
jpayne@68	385 " JOIN reference USING (reference_id)"
jpayne@68	386 " LEFT JOIN dbxref USING (dbxref_id)"
jpayne@68	387 " WHERE bioentry_id = %s"
jpayne@68	388 ' ORDER BY "rank"',
jpayne@68	389 (primary_id,),
jpayne@68	390 )
jpayne@68	391 references = []
jpayne@68	392 for start, end, location, title, authors, dbname, accession in refs:
jpayne@68	393 reference = SeqFeature.Reference()
jpayne@68	394 # If the start/end are missing, reference.location is an empty list
jpayne@68	395 if (start is not None) or (end is not None):
jpayne@68	396 if start is not None:
jpayne@68	397 start -= 1 # python counting
jpayne@68	398 reference.location = [SeqFeature.SimpleLocation(start, end)]
jpayne@68	399 # Don't replace the default "" with None.
jpayne@68	400 if authors:
jpayne@68	401 reference.authors = authors
jpayne@68	402 if title:
jpayne@68	403 reference.title = title
jpayne@68	404 reference.journal = location
jpayne@68	405 if dbname == "PUBMED":
jpayne@68	406 reference.pubmed_id = accession
jpayne@68	407 elif dbname == "MEDLINE":
jpayne@68	408 reference.medline_id = accession
jpayne@68	409 references.append(reference)
jpayne@68	410 if references:
jpayne@68	411 return {"references": references}
jpayne@68	412 else:
jpayne@68	413 return {}
jpayne@68	414
jpayne@68	415
jpayne@68	416 def _retrieve_taxon(adaptor, primary_id, taxon_id):
jpayne@68	417 a = {}
jpayne@68	418 common_names = adaptor.execute_and_fetch_col0(
jpayne@68	419 "SELECT name FROM taxon_name WHERE taxon_id = %s"
jpayne@68	420 " AND name_class = 'genbank common name'",
jpayne@68	421 (taxon_id,),
jpayne@68	422 )
jpayne@68	423 if common_names:
jpayne@68	424 a["source"] = common_names[0]
jpayne@68	425 scientific_names = adaptor.execute_and_fetch_col0(
jpayne@68	426 "SELECT name FROM taxon_name WHERE taxon_id = %s"
jpayne@68	427 " AND name_class = 'scientific name'",
jpayne@68	428 (taxon_id,),
jpayne@68	429 )
jpayne@68	430 if scientific_names:
jpayne@68	431 a["organism"] = scientific_names[0]
jpayne@68	432 ncbi_taxids = adaptor.execute_and_fetch_col0(
jpayne@68	433 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)
jpayne@68	434 )
jpayne@68	435 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0":
jpayne@68	436 a["ncbi_taxid"] = ncbi_taxids[0]
jpayne@68	437
jpayne@68	438 # Old code used the left/right values in the taxon table to get the
jpayne@68	439 # taxonomy lineage in one SQL command. This was actually very slow,
jpayne@68	440 # and would fail if the (optional) left/right values were missing.
jpayne@68	441 #
jpayne@68	442 # The following code is based on a contribution from Eric Gibert, and
jpayne@68	443 # relies on the taxon table's parent_taxon_id field only (ignoring the
jpayne@68	444 # optional left/right values). This means that it has to make a
jpayne@68	445 # separate SQL query for each entry in the lineage, but it does still
jpayne@68	446 # appear to be much faster. See Bug 2494.
jpayne@68	447 taxonomy = []
jpayne@68	448 while taxon_id:
jpayne@68	449 name, rank, parent_taxon_id = adaptor.execute_one(
jpayne@68	450 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id"
jpayne@68	451 " FROM taxon, taxon_name"
jpayne@68	452 " WHERE taxon.taxon_id=taxon_name.taxon_id"
jpayne@68	453 " AND taxon_name.name_class='scientific name'"
jpayne@68	454 " AND taxon.taxon_id = %s",
jpayne@68	455 (taxon_id,),
jpayne@68	456 )
jpayne@68	457 if taxon_id == parent_taxon_id:
jpayne@68	458 # If the taxon table has been populated by the BioSQL script
jpayne@68	459 # load_ncbi_taxonomy.pl this is how top parent nodes are stored.
jpayne@68	460 # Personally, I would have used a NULL parent_taxon_id here.
jpayne@68	461 break
jpayne@68	462
jpayne@68	463 taxonomy.insert(0, name)
jpayne@68	464 taxon_id = parent_taxon_id
jpayne@68	465
jpayne@68	466 if taxonomy:
jpayne@68	467 a["taxonomy"] = taxonomy
jpayne@68	468 return a
jpayne@68	469
jpayne@68	470
jpayne@68	471 def _retrieve_comment(adaptor, primary_id):
jpayne@68	472 qvs = adaptor.execute_and_fetchall(
jpayne@68	473 'SELECT comment_text FROM comment WHERE bioentry_id=%s ORDER BY "rank"',
jpayne@68	474 (primary_id,),
jpayne@68	475 )
jpayne@68	476 comments = [comm[0] for comm in qvs]
jpayne@68	477 # Don't want to add an empty list...
jpayne@68	478 if comments:
jpayne@68	479 return {"comment": comments}
jpayne@68	480 else:
jpayne@68	481 return {}
jpayne@68	482
jpayne@68	483
jpayne@68	484 class DBSeqRecord(SeqRecord):
jpayne@68	485 """BioSQL equivalent of the Biopython SeqRecord object."""
jpayne@68	486
jpayne@68	487 def __init__(self, adaptor, primary_id):
jpayne@68	488 """Create a DBSeqRecord object.
jpayne@68	489
jpayne@68	490 Arguments:
jpayne@68	491 - adaptor - A BioSQL.BioSeqDatabase.Adaptor object
jpayne@68	492 - primary_id - An internal integer ID used by BioSQL
jpayne@68	493
jpayne@68	494 You wouldn't normally create a DBSeqRecord object yourself,
jpayne@68	495 this is done for you when using a BioSeqDatabase object
jpayne@68	496 """
jpayne@68	497 self._adaptor = adaptor
jpayne@68	498 self._primary_id = primary_id
jpayne@68	499
jpayne@68	500 (
jpayne@68	501 self._biodatabase_id,
jpayne@68	502 self._taxon_id,
jpayne@68	503 self.name,
jpayne@68	504 accession,
jpayne@68	505 version,
jpayne@68	506 self._identifier,
jpayne@68	507 self._division,
jpayne@68	508 self.description,
jpayne@68	509 ) = self._adaptor.execute_one(
jpayne@68	510 "SELECT biodatabase_id, taxon_id, name, accession, version,"
jpayne@68	511 " identifier, division, description"
jpayne@68	512 " FROM bioentry"
jpayne@68	513 " WHERE bioentry_id = %s",
jpayne@68	514 (self._primary_id,),
jpayne@68	515 )
jpayne@68	516 if version and version != "0":
jpayne@68	517 self.id = f"{accession}.{version}"
jpayne@68	518 else:
jpayne@68	519 self.id = accession
jpayne@68	520 # We don't yet record any per-letter-annotations in the
jpayne@68	521 # BioSQL database, but we should set this property up
jpayne@68	522 # for completeness (and the __str__ method).
jpayne@68	523 # We do NOT want to load the sequence from the DB here!
jpayne@68	524 length = _retrieve_seq_len(adaptor, primary_id)
jpayne@68	525 self._per_letter_annotations = _RestrictedDict(length=length)
jpayne@68	526
jpayne@68	527 def __get_seq(self):
jpayne@68	528 if not hasattr(self, "_seq"):
jpayne@68	529 self._seq = _retrieve_seq(self._adaptor, self._primary_id)
jpayne@68	530 return self._seq
jpayne@68	531
jpayne@68	532 def __set_seq(self, seq):
jpayne@68	533 # TODO - Check consistent with self._per_letter_annotations
jpayne@68	534 self._seq = seq
jpayne@68	535
jpayne@68	536 def __del_seq(self):
jpayne@68	537 del self._seq
jpayne@68	538
jpayne@68	539 seq = property(__get_seq, __set_seq, __del_seq, "Seq object")
jpayne@68	540
jpayne@68	541 @property
jpayne@68	542 def dbxrefs(self) -> List[str]:
jpayne@68	543 """Database cross references."""
jpayne@68	544 if not hasattr(self, "_dbxrefs"):
jpayne@68	545 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id)
jpayne@68	546 return self._dbxrefs
jpayne@68	547
jpayne@68	548 @dbxrefs.setter
jpayne@68	549 def dbxrefs(self, value: List[str]) -> None:
jpayne@68	550 self._dbxrefs = value
jpayne@68	551
jpayne@68	552 @dbxrefs.deleter
jpayne@68	553 def dbxrefs(self) -> None:
jpayne@68	554 del self._dbxrefs
jpayne@68	555
jpayne@68	556 def __get_features(self):
jpayne@68	557 if not hasattr(self, "_features"):
jpayne@68	558 self._features = _retrieve_features(self._adaptor, self._primary_id)
jpayne@68	559 return self._features
jpayne@68	560
jpayne@68	561 def __set_features(self, features):
jpayne@68	562 self._features = features
jpayne@68	563
jpayne@68	564 def __del_features(self):
jpayne@68	565 del self._features
jpayne@68	566
jpayne@68	567 features = property(__get_features, __set_features, __del_features, "Features")
jpayne@68	568
jpayne@68	569 @property
jpayne@68	570 def annotations(self) -> SeqRecord._AnnotationsDict:
jpayne@68	571 """Annotations."""
jpayne@68	572 if not hasattr(self, "_annotations"):
jpayne@68	573 self._annotations = _retrieve_annotations(
jpayne@68	574 self._adaptor, self._primary_id, self._taxon_id
jpayne@68	575 )
jpayne@68	576 if self._identifier:
jpayne@68	577 self._annotations["gi"] = self._identifier
jpayne@68	578 if self._division:
jpayne@68	579 self._annotations["data_file_division"] = self._division
jpayne@68	580 return self._annotations
jpayne@68	581
jpayne@68	582 @annotations.setter
jpayne@68	583 def annotations(self, value: Optional[SeqRecord._AnnotationsDict]) -> None:
jpayne@68	584 if value:
jpayne@68	585 self._annotations = value
jpayne@68	586 else:
jpayne@68	587 self._annotations = {}
jpayne@68	588
jpayne@68	589 @annotations.deleter
jpayne@68	590 def annotations(self) -> None:
jpayne@68	591 del self._annotations

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/BioSQL/BioSeq.py @ 68:5028fdace37b