jpayne@69
|
1 # Copyright 2002 by Andrew Dalke. All rights reserved.
|
jpayne@69
|
2 # Revisions 2007-2016 copyright by Peter Cock. All rights reserved.
|
jpayne@69
|
3 # Revisions 2008 copyright by Cymon J. Cox. All rights reserved.
|
jpayne@69
|
4 #
|
jpayne@69
|
5 # This file is part of the Biopython distribution and governed by your
|
jpayne@69
|
6 # choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
jpayne@69
|
7 # Please see the LICENSE file that should have been included as part of this
|
jpayne@69
|
8 # package.
|
jpayne@69
|
9 #
|
jpayne@69
|
10 # Note that BioSQL (including the database schema and scripts) is
|
jpayne@69
|
11 # available and licensed separately. Please consult www.biosql.org
|
jpayne@69
|
12
|
jpayne@69
|
13 """Load biopython objects into a BioSQL database for persistent storage.
|
jpayne@69
|
14
|
jpayne@69
|
15 This code makes it possible to store biopython objects in a relational
|
jpayne@69
|
16 database and then retrieve them back. You shouldn't use any of the
|
jpayne@69
|
17 classes in this module directly. Rather, call the load() method on
|
jpayne@69
|
18 a database object.
|
jpayne@69
|
19 """
|
jpayne@69
|
20 # standard modules
|
jpayne@69
|
21 from time import gmtime, strftime
|
jpayne@69
|
22
|
jpayne@69
|
23 # biopython
|
jpayne@69
|
24 from Bio.SeqUtils.CheckSum import crc64
|
jpayne@69
|
25 from Bio import Entrez
|
jpayne@69
|
26 from Bio.Seq import UndefinedSequenceError
|
jpayne@69
|
27 from Bio.SeqFeature import UnknownPosition
|
jpayne@69
|
28
|
jpayne@69
|
29
|
jpayne@69
|
30 class DatabaseLoader:
|
jpayne@69
|
31 """Object used to load SeqRecord objects into a BioSQL database."""
|
jpayne@69
|
32
|
jpayne@69
|
33 def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
|
jpayne@69
|
34 """Initialize with connection information for the database.
|
jpayne@69
|
35
|
jpayne@69
|
36 Creating a DatabaseLoader object is normally handled via the
|
jpayne@69
|
37 BioSeqDatabase DBServer object, for example::
|
jpayne@69
|
38
|
jpayne@69
|
39 from BioSQL import BioSeqDatabase
|
jpayne@69
|
40 server = BioSeqDatabase.open_database(driver="MySQLdb",
|
jpayne@69
|
41 user="gbrowse",
|
jpayne@69
|
42 passwd="biosql",
|
jpayne@69
|
43 host="localhost",
|
jpayne@69
|
44 db="test_biosql")
|
jpayne@69
|
45 try:
|
jpayne@69
|
46 db = server["test"]
|
jpayne@69
|
47 except KeyError:
|
jpayne@69
|
48 db = server.new_database("test",
|
jpayne@69
|
49 description="For testing GBrowse")
|
jpayne@69
|
50
|
jpayne@69
|
51 """
|
jpayne@69
|
52 self.adaptor = adaptor
|
jpayne@69
|
53 self.dbid = dbid
|
jpayne@69
|
54 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
|
jpayne@69
|
55
|
jpayne@69
|
56 def load_seqrecord(self, record):
|
jpayne@69
|
57 """Load a Biopython SeqRecord into the database."""
|
jpayne@69
|
58 bioentry_id = self._load_bioentry_table(record)
|
jpayne@69
|
59 self._load_bioentry_date(record, bioentry_id)
|
jpayne@69
|
60 self._load_biosequence(record, bioentry_id)
|
jpayne@69
|
61 self._load_comment(record, bioentry_id)
|
jpayne@69
|
62 self._load_dbxrefs(record, bioentry_id)
|
jpayne@69
|
63 references = record.annotations.get("references", ())
|
jpayne@69
|
64 for reference, rank in zip(references, list(range(len(references)))):
|
jpayne@69
|
65 self._load_reference(reference, rank, bioentry_id)
|
jpayne@69
|
66 self._load_annotations(record, bioentry_id)
|
jpayne@69
|
67 for seq_feature_num in range(len(record.features)):
|
jpayne@69
|
68 seq_feature = record.features[seq_feature_num]
|
jpayne@69
|
69 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
|
jpayne@69
|
70
|
jpayne@69
|
71 def _get_ontology_id(self, name, definition=None):
|
jpayne@69
|
72 """Return identifier for the named ontology (PRIVATE).
|
jpayne@69
|
73
|
jpayne@69
|
74 This looks through the onotology table for a the given entry name.
|
jpayne@69
|
75 If it is not found, a row is added for this ontology (using the
|
jpayne@69
|
76 definition if supplied). In either case, the id corresponding to
|
jpayne@69
|
77 the provided name is returned, so that you can reference it in
|
jpayne@69
|
78 another table.
|
jpayne@69
|
79 """
|
jpayne@69
|
80 oids = self.adaptor.execute_and_fetch_col0(
|
jpayne@69
|
81 "SELECT ontology_id FROM ontology WHERE name = %s", (name,)
|
jpayne@69
|
82 )
|
jpayne@69
|
83 if oids:
|
jpayne@69
|
84 return oids[0]
|
jpayne@69
|
85 self.adaptor.execute(
|
jpayne@69
|
86 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", (name, definition)
|
jpayne@69
|
87 )
|
jpayne@69
|
88 return self.adaptor.last_id("ontology")
|
jpayne@69
|
89
|
jpayne@69
|
90 def _get_term_id(self, name, ontology_id=None, definition=None, identifier=None):
|
jpayne@69
|
91 """Get the id that corresponds to a term (PRIVATE).
|
jpayne@69
|
92
|
jpayne@69
|
93 This looks through the term table for a the given term. If it
|
jpayne@69
|
94 is not found, a new id corresponding to this term is created.
|
jpayne@69
|
95 In either case, the id corresponding to that term is returned, so
|
jpayne@69
|
96 that you can reference it in another table.
|
jpayne@69
|
97
|
jpayne@69
|
98 The ontology_id should be used to disambiguate the term.
|
jpayne@69
|
99 """
|
jpayne@69
|
100 # try to get the term id
|
jpayne@69
|
101 sql = "SELECT term_id FROM term WHERE name = %s"
|
jpayne@69
|
102 fields = [name]
|
jpayne@69
|
103 if ontology_id:
|
jpayne@69
|
104 sql += " AND ontology_id = %s"
|
jpayne@69
|
105 fields.append(ontology_id)
|
jpayne@69
|
106 id_results = self.adaptor.execute_and_fetchall(sql, fields)
|
jpayne@69
|
107 # something is wrong
|
jpayne@69
|
108 if len(id_results) > 1:
|
jpayne@69
|
109 raise ValueError(f"Multiple term ids for {name}: {id_results!r}")
|
jpayne@69
|
110 elif len(id_results) == 1:
|
jpayne@69
|
111 return id_results[0][0]
|
jpayne@69
|
112 else:
|
jpayne@69
|
113 sql = (
|
jpayne@69
|
114 "INSERT INTO term (name, definition,"
|
jpayne@69
|
115 " identifier, ontology_id)"
|
jpayne@69
|
116 " VALUES (%s, %s, %s, %s)"
|
jpayne@69
|
117 )
|
jpayne@69
|
118 self.adaptor.execute(sql, (name, definition, identifier, ontology_id))
|
jpayne@69
|
119 return self.adaptor.last_id("term")
|
jpayne@69
|
120
|
jpayne@69
|
121 def _add_dbxref(self, dbname, accession, version):
|
jpayne@69
|
122 """Insert a dbxref and return its id (PRIVATE)."""
|
jpayne@69
|
123 self.adaptor.execute(
|
jpayne@69
|
124 "INSERT INTO dbxref(dbname, accession, version) VALUES (%s, %s, %s)",
|
jpayne@69
|
125 (dbname, accession, version),
|
jpayne@69
|
126 )
|
jpayne@69
|
127 return self.adaptor.last_id("dbxref")
|
jpayne@69
|
128
|
jpayne@69
|
129 def _get_taxon_id(self, record):
|
jpayne@69
|
130 """Get the taxon id for this record (PRIVATE).
|
jpayne@69
|
131
|
jpayne@69
|
132 Arguments:
|
jpayne@69
|
133 - record - a SeqRecord object
|
jpayne@69
|
134
|
jpayne@69
|
135 This searches the taxon/taxon_name tables using the
|
jpayne@69
|
136 NCBI taxon ID, scientific name and common name to find
|
jpayne@69
|
137 the matching taxon table entry's id.
|
jpayne@69
|
138
|
jpayne@69
|
139 If the species isn't in the taxon table, and we have at
|
jpayne@69
|
140 least the NCBI taxon ID, scientific name or common name,
|
jpayne@69
|
141 at least a minimal stub entry is created in the table.
|
jpayne@69
|
142
|
jpayne@69
|
143 Returns the taxon id (database key for the taxon table,
|
jpayne@69
|
144 not an NCBI taxon ID), or None if the taxonomy information
|
jpayne@69
|
145 is missing.
|
jpayne@69
|
146
|
jpayne@69
|
147 See also the BioSQL script load_ncbi_taxonomy.pl which
|
jpayne@69
|
148 will populate and update the taxon/taxon_name tables
|
jpayne@69
|
149 with the latest information from the NCBI.
|
jpayne@69
|
150 """
|
jpayne@69
|
151 # To find the NCBI taxid, first check for a top level annotation
|
jpayne@69
|
152 ncbi_taxon_id = None
|
jpayne@69
|
153 if "ncbi_taxid" in record.annotations:
|
jpayne@69
|
154 # Could be a list of IDs.
|
jpayne@69
|
155 if isinstance(record.annotations["ncbi_taxid"], list):
|
jpayne@69
|
156 if len(record.annotations["ncbi_taxid"]) == 1:
|
jpayne@69
|
157 ncbi_taxon_id = record.annotations["ncbi_taxid"][0]
|
jpayne@69
|
158 else:
|
jpayne@69
|
159 ncbi_taxon_id = record.annotations["ncbi_taxid"]
|
jpayne@69
|
160 if not ncbi_taxon_id:
|
jpayne@69
|
161 # Secondly, look for a source feature
|
jpayne@69
|
162 for f in record.features:
|
jpayne@69
|
163 if f.type == "source":
|
jpayne@69
|
164 quals = getattr(f, "qualifiers", {})
|
jpayne@69
|
165 if "db_xref" in quals:
|
jpayne@69
|
166 for db_xref in f.qualifiers["db_xref"]:
|
jpayne@69
|
167 if db_xref.startswith("taxon:"):
|
jpayne@69
|
168 ncbi_taxon_id = int(db_xref[6:])
|
jpayne@69
|
169 break
|
jpayne@69
|
170 if ncbi_taxon_id:
|
jpayne@69
|
171 break
|
jpayne@69
|
172
|
jpayne@69
|
173 try:
|
jpayne@69
|
174 scientific_name = record.annotations["organism"][:255]
|
jpayne@69
|
175 except KeyError:
|
jpayne@69
|
176 scientific_name = None
|
jpayne@69
|
177 try:
|
jpayne@69
|
178 common_name = record.annotations["source"][:255]
|
jpayne@69
|
179 except KeyError:
|
jpayne@69
|
180 common_name = None
|
jpayne@69
|
181 # Note: The maximum length for taxon names in the schema is 255.
|
jpayne@69
|
182 # Cropping it now should help in getting a match when searching,
|
jpayne@69
|
183 # and avoids an error if we try and add these to the database.
|
jpayne@69
|
184
|
jpayne@69
|
185 if ncbi_taxon_id:
|
jpayne@69
|
186 # Good, we have the NCBI taxon to go on - this is unambiguous :)
|
jpayne@69
|
187 # Note that the scientific name and common name will only be
|
jpayne@69
|
188 # used if we have to record a stub entry.
|
jpayne@69
|
189 return self._get_taxon_id_from_ncbi_taxon_id(
|
jpayne@69
|
190 ncbi_taxon_id, scientific_name, common_name
|
jpayne@69
|
191 )
|
jpayne@69
|
192
|
jpayne@69
|
193 if not common_name and not scientific_name:
|
jpayne@69
|
194 # Nothing to go on... and there is no point adding
|
jpayne@69
|
195 # a new entry to the database. We'll just leave this
|
jpayne@69
|
196 # sequence's taxon as a NULL in the database.
|
jpayne@69
|
197 return None
|
jpayne@69
|
198
|
jpayne@69
|
199 # Next, we'll try to find a match based on the species name
|
jpayne@69
|
200 # (stored in GenBank files as the organism and/or the source).
|
jpayne@69
|
201 if scientific_name:
|
jpayne@69
|
202 taxa = self.adaptor.execute_and_fetch_col0(
|
jpayne@69
|
203 "SELECT taxon_id FROM taxon_name"
|
jpayne@69
|
204 " WHERE name_class = 'scientific name' AND name = %s",
|
jpayne@69
|
205 (scientific_name,),
|
jpayne@69
|
206 )
|
jpayne@69
|
207 if taxa:
|
jpayne@69
|
208 # Good, mapped the scientific name to a taxon table entry
|
jpayne@69
|
209 return taxa[0]
|
jpayne@69
|
210
|
jpayne@69
|
211 # Last chance...
|
jpayne@69
|
212 if common_name:
|
jpayne@69
|
213 taxa = self.adaptor.execute_and_fetch_col0(
|
jpayne@69
|
214 "SELECT DISTINCT taxon_id FROM taxon_name WHERE name = %s",
|
jpayne@69
|
215 (common_name,),
|
jpayne@69
|
216 )
|
jpayne@69
|
217 # Its natural that several distinct taxa will have the same common
|
jpayne@69
|
218 # name - in which case we can't resolve the taxon uniquely.
|
jpayne@69
|
219 if len(taxa) > 1:
|
jpayne@69
|
220 raise ValueError(
|
jpayne@69
|
221 "Taxa: %d species have name %r" % (len(taxa), common_name)
|
jpayne@69
|
222 )
|
jpayne@69
|
223 if taxa:
|
jpayne@69
|
224 # Good, mapped the common name to a taxon table entry
|
jpayne@69
|
225 return taxa[0]
|
jpayne@69
|
226
|
jpayne@69
|
227 # At this point, as far as we can tell, this species isn't
|
jpayne@69
|
228 # in the taxon table already. So we'll have to add it.
|
jpayne@69
|
229 # We don't have an NCBI taxonomy ID, so if we do record just
|
jpayne@69
|
230 # a stub entry, there is no simple way to fix this later.
|
jpayne@69
|
231 #
|
jpayne@69
|
232 # TODO - Should we try searching the NCBI taxonomy using the
|
jpayne@69
|
233 # species name?
|
jpayne@69
|
234 #
|
jpayne@69
|
235 # OK, let's try inserting the species.
|
jpayne@69
|
236 # Chances are we don't have enough information ...
|
jpayne@69
|
237 # Furthermore, it won't be in the hierarchy.
|
jpayne@69
|
238
|
jpayne@69
|
239 lineage = []
|
jpayne@69
|
240 for c in record.annotations.get("taxonomy", []):
|
jpayne@69
|
241 lineage.append([None, None, c])
|
jpayne@69
|
242 if lineage:
|
jpayne@69
|
243 lineage[-1][1] = "genus"
|
jpayne@69
|
244 lineage.append([None, "species", record.annotations["organism"]])
|
jpayne@69
|
245 # XXX do we have them?
|
jpayne@69
|
246 if "subspecies" in record.annotations:
|
jpayne@69
|
247 lineage.append([None, "subspecies", record.annotations["subspecies"]])
|
jpayne@69
|
248 if "variant" in record.annotations:
|
jpayne@69
|
249 lineage.append([None, "varietas", record.annotations["variant"]])
|
jpayne@69
|
250 lineage[-1][0] = ncbi_taxon_id
|
jpayne@69
|
251
|
jpayne@69
|
252 left_value = self.adaptor.execute_one("SELECT MAX(left_value) FROM taxon")[0]
|
jpayne@69
|
253 if not left_value:
|
jpayne@69
|
254 left_value = 0
|
jpayne@69
|
255 left_value += 1
|
jpayne@69
|
256
|
jpayne@69
|
257 # XXX -- Brad: Fixing this for now in an ugly way because
|
jpayne@69
|
258 # I am getting overlaps for right_values. I need to dig into this
|
jpayne@69
|
259 # more to actually understand how it works. I'm not sure it is
|
jpayne@69
|
260 # actually working right anyhow.
|
jpayne@69
|
261 right_start_value = self.adaptor.execute_one(
|
jpayne@69
|
262 "SELECT MAX(right_value) FROM taxon"
|
jpayne@69
|
263 )[0]
|
jpayne@69
|
264 if not right_start_value:
|
jpayne@69
|
265 right_start_value = 0
|
jpayne@69
|
266 right_value = right_start_value + 2 * len(lineage) - 1
|
jpayne@69
|
267
|
jpayne@69
|
268 parent_taxon_id = None
|
jpayne@69
|
269 for taxon in lineage:
|
jpayne@69
|
270 self.adaptor.execute(
|
jpayne@69
|
271 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"
|
jpayne@69
|
272 " left_value, right_value)"
|
jpayne@69
|
273 " VALUES (%s, %s, %s, %s, %s)",
|
jpayne@69
|
274 (parent_taxon_id, taxon[0], taxon[1], left_value, right_value),
|
jpayne@69
|
275 )
|
jpayne@69
|
276 taxon_id = self.adaptor.last_id("taxon")
|
jpayne@69
|
277 self.adaptor.execute(
|
jpayne@69
|
278 "INSERT INTO taxon_name(taxon_id, name, name_class)"
|
jpayne@69
|
279 "VALUES (%s, %s, 'scientific name')",
|
jpayne@69
|
280 (taxon_id, taxon[2][:255]),
|
jpayne@69
|
281 )
|
jpayne@69
|
282 # Note the name field is limited to 255, some SwissProt files
|
jpayne@69
|
283 # have a multi-species name which can be longer. So truncate this.
|
jpayne@69
|
284 left_value += 1
|
jpayne@69
|
285 right_value -= 1
|
jpayne@69
|
286 parent_taxon_id = taxon_id
|
jpayne@69
|
287 if common_name:
|
jpayne@69
|
288 self.adaptor.execute(
|
jpayne@69
|
289 "INSERT INTO taxon_name(taxon_id, name, name_class)"
|
jpayne@69
|
290 "VALUES (%s, %s, 'common name')",
|
jpayne@69
|
291 (taxon_id, common_name),
|
jpayne@69
|
292 )
|
jpayne@69
|
293
|
jpayne@69
|
294 return taxon_id
|
jpayne@69
|
295
|
jpayne@69
|
296 def _fix_name_class(self, entrez_name):
|
jpayne@69
|
297 """Map Entrez name terms to those used in taxdump (PRIVATE).
|
jpayne@69
|
298
|
jpayne@69
|
299 We need to make this conversion to match the taxon_name.name_class
|
jpayne@69
|
300 values used by the BioSQL load_ncbi_taxonomy.pl script.
|
jpayne@69
|
301
|
jpayne@69
|
302 e.g.::
|
jpayne@69
|
303
|
jpayne@69
|
304 "ScientificName" -> "scientific name",
|
jpayne@69
|
305 "EquivalentName" -> "equivalent name",
|
jpayne@69
|
306 "Synonym" -> "synonym",
|
jpayne@69
|
307
|
jpayne@69
|
308 """
|
jpayne@69
|
309 # Add any special cases here:
|
jpayne@69
|
310 #
|
jpayne@69
|
311 # known = {}
|
jpayne@69
|
312 # try:
|
jpayne@69
|
313 # return known[entrez_name]
|
jpayne@69
|
314 # except KeyError:
|
jpayne@69
|
315 # pass
|
jpayne@69
|
316
|
jpayne@69
|
317 # Try automatically by adding spaces before each capital
|
jpayne@69
|
318 def add_space(letter):
|
jpayne@69
|
319 """Add a space before a capital letter."""
|
jpayne@69
|
320 if letter.isupper():
|
jpayne@69
|
321 return " " + letter.lower()
|
jpayne@69
|
322 else:
|
jpayne@69
|
323 return letter
|
jpayne@69
|
324
|
jpayne@69
|
325 answer = "".join(add_space(letter) for letter in entrez_name).strip()
|
jpayne@69
|
326 if answer != answer.lower():
|
jpayne@69
|
327 raise ValueError(
|
jpayne@69
|
328 f"Expected processed entrez_name, '{answer}' to only have lower case letters."
|
jpayne@69
|
329 )
|
jpayne@69
|
330 return answer
|
jpayne@69
|
331
|
jpayne@69
|
332 def _update_left_right_taxon_values(self, left_value):
|
jpayne@69
|
333 """Update the left and right taxon values in the table (PRIVATE)."""
|
jpayne@69
|
334 if not left_value:
|
jpayne@69
|
335 return
|
jpayne@69
|
336 # Due to the UNIQUE constraint on the left and right values in the taxon
|
jpayne@69
|
337 # table we cannot simply update them through an SQL statement as we risk
|
jpayne@69
|
338 # colliding values. Instead we must select all of the rows that we want to
|
jpayne@69
|
339 # update, modify the values in python and then update the rows
|
jpayne@69
|
340 # self.adaptor.execute("UPDATE taxon SET right_value = right_value + 2 "
|
jpayne@69
|
341 # "WHERE right_value >= %s", (left_value,))
|
jpayne@69
|
342 # self.adaptor.execute("UPDATE taxon SET left_value = left_value + 2 "
|
jpayne@69
|
343 # "WHERE left_value > %s", (left_value,))
|
jpayne@69
|
344
|
jpayne@69
|
345 rows = self.adaptor.execute_and_fetchall(
|
jpayne@69
|
346 "SELECT left_value, right_value, taxon_id FROM taxon "
|
jpayne@69
|
347 "WHERE right_value >= %s or left_value > %s",
|
jpayne@69
|
348 (left_value, left_value),
|
jpayne@69
|
349 )
|
jpayne@69
|
350
|
jpayne@69
|
351 right_rows = []
|
jpayne@69
|
352 left_rows = []
|
jpayne@69
|
353 for row in rows:
|
jpayne@69
|
354 new_right = row[1]
|
jpayne@69
|
355 new_left = row[0]
|
jpayne@69
|
356 if new_right >= left_value:
|
jpayne@69
|
357 new_right += 2
|
jpayne@69
|
358
|
jpayne@69
|
359 if new_left > left_value:
|
jpayne@69
|
360 new_left += 2
|
jpayne@69
|
361 right_rows.append((new_right, row[2]))
|
jpayne@69
|
362 left_rows.append((new_left, row[2]))
|
jpayne@69
|
363
|
jpayne@69
|
364 # sort the rows based on the value from largest to smallest
|
jpayne@69
|
365 # should ensure no overlaps
|
jpayne@69
|
366 right_rows = sorted(right_rows, key=lambda x: x[0], reverse=True)
|
jpayne@69
|
367 left_rows = sorted(left_rows, key=lambda x: x[0], reverse=True)
|
jpayne@69
|
368
|
jpayne@69
|
369 self.adaptor.executemany(
|
jpayne@69
|
370 "UPDATE taxon SET left_value = %s WHERE taxon_id = %s", left_rows
|
jpayne@69
|
371 )
|
jpayne@69
|
372 self.adaptor.executemany(
|
jpayne@69
|
373 "UPDATE taxon SET right_value = %s WHERE taxon_id = %s", right_rows
|
jpayne@69
|
374 )
|
jpayne@69
|
375
|
jpayne@69
|
376 def _get_taxon_id_from_ncbi_taxon_id(
|
jpayne@69
|
377 self, ncbi_taxon_id, scientific_name=None, common_name=None
|
jpayne@69
|
378 ):
|
jpayne@69
|
379 """Get the taxon id for record from NCBI taxon ID (PRIVATE).
|
jpayne@69
|
380
|
jpayne@69
|
381 Arguments:
|
jpayne@69
|
382 - ncbi_taxon_id - string containing an NCBI taxon id
|
jpayne@69
|
383 - scientific_name - string, used if a stub entry is recorded
|
jpayne@69
|
384 - common_name - string, used if a stub entry is recorded
|
jpayne@69
|
385
|
jpayne@69
|
386 This searches the taxon table using ONLY the NCBI taxon ID
|
jpayne@69
|
387 to find the matching taxon table entry's ID (database key).
|
jpayne@69
|
388
|
jpayne@69
|
389 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy
|
jpayne@69
|
390 flag is true, Biopython will attempt to go online using Bio.Entrez
|
jpayne@69
|
391 to fetch the official NCBI lineage, recursing up the tree until an
|
jpayne@69
|
392 existing entry is found in the database or the full lineage has been
|
jpayne@69
|
393 fetched.
|
jpayne@69
|
394
|
jpayne@69
|
395 Otherwise the NCBI taxon ID, scientific name and common name are
|
jpayne@69
|
396 recorded as a minimal stub entry in the taxon and taxon_name tables.
|
jpayne@69
|
397 Any partial information about the lineage from the SeqRecord is NOT
|
jpayne@69
|
398 recorded. This should mean that (re)running the BioSQL script
|
jpayne@69
|
399 load_ncbi_taxonomy.pl can fill in the taxonomy lineage.
|
jpayne@69
|
400
|
jpayne@69
|
401 Returns the taxon id (database key for the taxon table, not
|
jpayne@69
|
402 an NCBI taxon ID).
|
jpayne@69
|
403 """
|
jpayne@69
|
404 if not ncbi_taxon_id:
|
jpayne@69
|
405 raise ValueError("Expected a non-empty value for ncbi_taxon_id.")
|
jpayne@69
|
406
|
jpayne@69
|
407 taxon_id = self.adaptor.execute_and_fetch_col0(
|
jpayne@69
|
408 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", (int(ncbi_taxon_id),)
|
jpayne@69
|
409 )
|
jpayne@69
|
410 if taxon_id:
|
jpayne@69
|
411 # Good, we have mapped the NCBI taxid to a taxon table entry
|
jpayne@69
|
412 return taxon_id[0]
|
jpayne@69
|
413
|
jpayne@69
|
414 # At this point, as far as we can tell, this species isn't
|
jpayne@69
|
415 # in the taxon table already. So we'll have to add it.
|
jpayne@69
|
416
|
jpayne@69
|
417 parent_taxon_id = None
|
jpayne@69
|
418 rank = "species"
|
jpayne@69
|
419 genetic_code = None
|
jpayne@69
|
420 mito_genetic_code = None
|
jpayne@69
|
421 parent_left_value = None
|
jpayne@69
|
422 parent_right_value = None
|
jpayne@69
|
423 left_value = None
|
jpayne@69
|
424 right_value = None
|
jpayne@69
|
425 species_names = []
|
jpayne@69
|
426 if scientific_name:
|
jpayne@69
|
427 species_names.append(("scientific name", scientific_name))
|
jpayne@69
|
428 if common_name:
|
jpayne@69
|
429 species_names.append(("common name", common_name))
|
jpayne@69
|
430
|
jpayne@69
|
431 if self.fetch_NCBI_taxonomy:
|
jpayne@69
|
432 # Go online to get the parent taxon ID!
|
jpayne@69
|
433 handle = Entrez.efetch(db="taxonomy", id=ncbi_taxon_id, retmode="XML")
|
jpayne@69
|
434 taxonomic_record = Entrez.read(handle)
|
jpayne@69
|
435 if len(taxonomic_record) == 1:
|
jpayne@69
|
436 if taxonomic_record[0]["TaxId"] != str(ncbi_taxon_id):
|
jpayne@69
|
437 raise ValueError(
|
jpayne@69
|
438 f"ncbi_taxon_id different from parent taxon id. {ncbi_taxon_id} versus {taxonomic_record[0]['TaxId']}"
|
jpayne@69
|
439 )
|
jpayne@69
|
440
|
jpayne@69
|
441 (
|
jpayne@69
|
442 parent_taxon_id,
|
jpayne@69
|
443 parent_left_value,
|
jpayne@69
|
444 parent_right_value,
|
jpayne@69
|
445 ) = self._get_taxon_id_from_ncbi_lineage(
|
jpayne@69
|
446 taxonomic_record[0]["LineageEx"]
|
jpayne@69
|
447 )
|
jpayne@69
|
448
|
jpayne@69
|
449 left_value = parent_right_value
|
jpayne@69
|
450 right_value = parent_right_value + 1
|
jpayne@69
|
451
|
jpayne@69
|
452 rank = str(taxonomic_record[0]["Rank"])
|
jpayne@69
|
453
|
jpayne@69
|
454 genetic_code = int(taxonomic_record[0]["GeneticCode"]["GCId"])
|
jpayne@69
|
455
|
jpayne@69
|
456 mito_genetic_code = int(taxonomic_record[0]["MitoGeneticCode"]["MGCId"])
|
jpayne@69
|
457
|
jpayne@69
|
458 species_names = [
|
jpayne@69
|
459 ("scientific name", str(taxonomic_record[0]["ScientificName"]))
|
jpayne@69
|
460 ]
|
jpayne@69
|
461 try:
|
jpayne@69
|
462 for name_class, names in taxonomic_record[0]["OtherNames"].items():
|
jpayne@69
|
463 name_class = self._fix_name_class(name_class)
|
jpayne@69
|
464 if not isinstance(names, list):
|
jpayne@69
|
465 # The Entrez parser seems to return single entry
|
jpayne@69
|
466 # lists as just a string which is annoying.
|
jpayne@69
|
467 names = [names]
|
jpayne@69
|
468 for name in names:
|
jpayne@69
|
469 # Want to ignore complex things like ClassCDE
|
jpayne@69
|
470 # entries
|
jpayne@69
|
471 if isinstance(name, str):
|
jpayne@69
|
472 species_names.append((name_class, name))
|
jpayne@69
|
473 except KeyError:
|
jpayne@69
|
474 # OtherNames isn't always present,
|
jpayne@69
|
475 # e.g. NCBI taxon 41205, Bromheadia finlaysoniana
|
jpayne@69
|
476 pass
|
jpayne@69
|
477 else:
|
jpayne@69
|
478 pass
|
jpayne@69
|
479 # If we are not allowed to go online, we will record the bare minimum;
|
jpayne@69
|
480 # as long as the NCBI taxon id is present, then (re)running
|
jpayne@69
|
481 # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage
|
jpayne@69
|
482 # (and update the species names).
|
jpayne@69
|
483 #
|
jpayne@69
|
484 # I am NOT going to try and record the lineage, even if it
|
jpayne@69
|
485 # is in the record annotation as a list of names, as we won't
|
jpayne@69
|
486 # know the NCBI taxon IDs for these parent nodes.
|
jpayne@69
|
487
|
jpayne@69
|
488 self._update_left_right_taxon_values(left_value)
|
jpayne@69
|
489
|
jpayne@69
|
490 self.adaptor.execute(
|
jpayne@69
|
491 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"
|
jpayne@69
|
492 " genetic_code, mito_genetic_code, left_value, right_value)"
|
jpayne@69
|
493 " VALUES (%s, %s, %s, %s, %s, %s, %s)",
|
jpayne@69
|
494 (
|
jpayne@69
|
495 parent_taxon_id,
|
jpayne@69
|
496 ncbi_taxon_id,
|
jpayne@69
|
497 rank,
|
jpayne@69
|
498 genetic_code,
|
jpayne@69
|
499 mito_genetic_code,
|
jpayne@69
|
500 left_value,
|
jpayne@69
|
501 right_value,
|
jpayne@69
|
502 ),
|
jpayne@69
|
503 )
|
jpayne@69
|
504
|
jpayne@69
|
505 taxon_id = self.adaptor.last_id("taxon")
|
jpayne@69
|
506
|
jpayne@69
|
507 # Record the scientific name, common name, etc
|
jpayne@69
|
508 for name_class, name in species_names:
|
jpayne@69
|
509 self.adaptor.execute(
|
jpayne@69
|
510 "INSERT INTO taxon_name(taxon_id, name, name_class)"
|
jpayne@69
|
511 " VALUES (%s, %s, %s)",
|
jpayne@69
|
512 (taxon_id, name[:255], name_class),
|
jpayne@69
|
513 )
|
jpayne@69
|
514 return taxon_id
|
jpayne@69
|
515
|
jpayne@69
|
516 def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
|
jpayne@69
|
517 """Recursive method to get taxon ID from NCBI lineage (PRIVATE).
|
jpayne@69
|
518
|
jpayne@69
|
519 Arguments:
|
jpayne@69
|
520 - taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez
|
jpayne@69
|
521
|
jpayne@69
|
522 First dictionary in list is the taxonomy root, highest would be
|
jpayne@69
|
523 the species. Each dictionary includes:
|
jpayne@69
|
524
|
jpayne@69
|
525 - TaxID (string, NCBI taxon id)
|
jpayne@69
|
526 - Rank (string, e.g. "species", "genus", ..., "phylum", ...)
|
jpayne@69
|
527 - ScientificName (string)
|
jpayne@69
|
528
|
jpayne@69
|
529 (and that is all at the time of writing)
|
jpayne@69
|
530
|
jpayne@69
|
531 This method will record all the lineage given, returning the taxon id
|
jpayne@69
|
532 (database key, not NCBI taxon id) of the final entry (the species).
|
jpayne@69
|
533 """
|
jpayne@69
|
534 ncbi_taxon_id = int(taxonomic_lineage[-1]["TaxId"])
|
jpayne@69
|
535 left_value = None
|
jpayne@69
|
536 right_value = None
|
jpayne@69
|
537 parent_left_value = None
|
jpayne@69
|
538 parent_right_value = None
|
jpayne@69
|
539 # Is this in the database already? Check the taxon table...
|
jpayne@69
|
540 rows = self.adaptor.execute_and_fetchall(
|
jpayne@69
|
541 "SELECT taxon_id, left_value, right_value FROM taxon"
|
jpayne@69
|
542 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id
|
jpayne@69
|
543 )
|
jpayne@69
|
544 if rows:
|
jpayne@69
|
545 # we could verify that the Scientific Name etc in the database
|
jpayne@69
|
546 # is the same and update it or print a warning if not...
|
jpayne@69
|
547 if len(rows) != 1:
|
jpayne@69
|
548 raise ValueError(f"Expected 1 reponse, got {len(rows)}")
|
jpayne@69
|
549 return rows[0]
|
jpayne@69
|
550
|
jpayne@69
|
551 # We have to record this.
|
jpayne@69
|
552 if len(taxonomic_lineage) > 1:
|
jpayne@69
|
553 # Use recursion to find out the taxon id (database key) of the
|
jpayne@69
|
554 # parent.
|
jpayne@69
|
555 (
|
jpayne@69
|
556 parent_taxon_id,
|
jpayne@69
|
557 parent_left_value,
|
jpayne@69
|
558 parent_right_value,
|
jpayne@69
|
559 ) = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1])
|
jpayne@69
|
560 left_value = parent_right_value
|
jpayne@69
|
561 right_value = parent_right_value + 1
|
jpayne@69
|
562 if not isinstance(parent_taxon_id, int):
|
jpayne@69
|
563 raise ValueError(
|
jpayne@69
|
564 f"Expected parent_taxon_id to be an int, got {parent_taxon_id}"
|
jpayne@69
|
565 )
|
jpayne@69
|
566 else:
|
jpayne@69
|
567 # we have reached the top of the lineage but no current taxonomy
|
jpayne@69
|
568 # id has been found
|
jpayne@69
|
569 parent_taxon_id = None
|
jpayne@69
|
570 left_value = self.adaptor.execute_one("SELECT MAX(left_value) FROM taxon")[
|
jpayne@69
|
571 0
|
jpayne@69
|
572 ]
|
jpayne@69
|
573 if not left_value:
|
jpayne@69
|
574 left_value = 0
|
jpayne@69
|
575
|
jpayne@69
|
576 right_value = left_value + 1
|
jpayne@69
|
577
|
jpayne@69
|
578 self._update_left_right_taxon_values(left_value)
|
jpayne@69
|
579
|
jpayne@69
|
580 # INSERT new taxon
|
jpayne@69
|
581 rank = str(taxonomic_lineage[-1].get("Rank"))
|
jpayne@69
|
582 self.adaptor.execute(
|
jpayne@69
|
583 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank, "
|
jpayne@69
|
584 "left_value, right_value) VALUES (%s, %s, %s, %s, %s)",
|
jpayne@69
|
585 (ncbi_taxon_id, parent_taxon_id, rank, left_value, right_value),
|
jpayne@69
|
586 )
|
jpayne@69
|
587
|
jpayne@69
|
588 taxon_id = self.adaptor.last_id("taxon")
|
jpayne@69
|
589 # assert isinstance(taxon_id, int), repr(taxon_id)
|
jpayne@69
|
590 # ... and its name in taxon_name
|
jpayne@69
|
591 scientific_name = taxonomic_lineage[-1].get("ScientificName")
|
jpayne@69
|
592 if scientific_name:
|
jpayne@69
|
593 self.adaptor.execute(
|
jpayne@69
|
594 "INSERT INTO taxon_name(taxon_id, name, name_class) "
|
jpayne@69
|
595 "VALUES (%s, %s, 'scientific name')",
|
jpayne@69
|
596 (taxon_id, scientific_name[:255]),
|
jpayne@69
|
597 )
|
jpayne@69
|
598 return taxon_id, left_value, right_value
|
jpayne@69
|
599
|
jpayne@69
|
600 def _load_bioentry_table(self, record):
|
jpayne@69
|
601 """Fill the bioentry table with sequence information (PRIVATE).
|
jpayne@69
|
602
|
jpayne@69
|
603 Arguments:
|
jpayne@69
|
604 - record - SeqRecord object to add to the database.
|
jpayne@69
|
605
|
jpayne@69
|
606 """
|
jpayne@69
|
607 # get the pertinent info and insert it
|
jpayne@69
|
608
|
jpayne@69
|
609 if record.id.count(".") == 1: # try to get a version from the id
|
jpayne@69
|
610 # This assumes the string is something like "XXXXXXXX.123"
|
jpayne@69
|
611 accession, version = record.id.split(".")
|
jpayne@69
|
612 try:
|
jpayne@69
|
613 version = int(version)
|
jpayne@69
|
614 except ValueError:
|
jpayne@69
|
615 accession = record.id
|
jpayne@69
|
616 version = 0
|
jpayne@69
|
617 else: # otherwise just use a version of 0
|
jpayne@69
|
618 accession = record.id
|
jpayne@69
|
619 version = 0
|
jpayne@69
|
620
|
jpayne@69
|
621 if (
|
jpayne@69
|
622 "accessions" in record.annotations
|
jpayne@69
|
623 and isinstance(record.annotations["accessions"], list)
|
jpayne@69
|
624 and record.annotations["accessions"]
|
jpayne@69
|
625 ):
|
jpayne@69
|
626 # Take the first accession (one if there is more than one)
|
jpayne@69
|
627 accession = record.annotations["accessions"][0]
|
jpayne@69
|
628
|
jpayne@69
|
629 # Find the taxon id (this is not just the NCBI Taxon ID)
|
jpayne@69
|
630 # NOTE - If the species isn't defined in the taxon table,
|
jpayne@69
|
631 # a new minimal entry is created.
|
jpayne@69
|
632 taxon_id = self._get_taxon_id(record)
|
jpayne@69
|
633
|
jpayne@69
|
634 if "gi" in record.annotations:
|
jpayne@69
|
635 identifier = record.annotations["gi"]
|
jpayne@69
|
636 else:
|
jpayne@69
|
637 identifier = record.id
|
jpayne@69
|
638
|
jpayne@69
|
639 # Allow description and division to default to NULL as in BioPerl.
|
jpayne@69
|
640 description = getattr(record, "description", None)
|
jpayne@69
|
641 division = record.annotations.get("data_file_division")
|
jpayne@69
|
642
|
jpayne@69
|
643 sql = """
|
jpayne@69
|
644 INSERT INTO bioentry (
|
jpayne@69
|
645 biodatabase_id,
|
jpayne@69
|
646 taxon_id,
|
jpayne@69
|
647 name,
|
jpayne@69
|
648 accession,
|
jpayne@69
|
649 identifier,
|
jpayne@69
|
650 division,
|
jpayne@69
|
651 description,
|
jpayne@69
|
652 version)
|
jpayne@69
|
653 VALUES (
|
jpayne@69
|
654 %s,
|
jpayne@69
|
655 %s,
|
jpayne@69
|
656 %s,
|
jpayne@69
|
657 %s,
|
jpayne@69
|
658 %s,
|
jpayne@69
|
659 %s,
|
jpayne@69
|
660 %s,
|
jpayne@69
|
661 %s)"""
|
jpayne@69
|
662 # print(self.dbid, taxon_id, record.name, accession, identifier, \
|
jpayne@69
|
663 # division, description, version)
|
jpayne@69
|
664 self.adaptor.execute(
|
jpayne@69
|
665 sql,
|
jpayne@69
|
666 (
|
jpayne@69
|
667 self.dbid,
|
jpayne@69
|
668 taxon_id,
|
jpayne@69
|
669 record.name,
|
jpayne@69
|
670 accession,
|
jpayne@69
|
671 identifier,
|
jpayne@69
|
672 division,
|
jpayne@69
|
673 description,
|
jpayne@69
|
674 version,
|
jpayne@69
|
675 ),
|
jpayne@69
|
676 )
|
jpayne@69
|
677 # now retrieve the id for the bioentry
|
jpayne@69
|
678 return self.adaptor.last_id("bioentry")
|
jpayne@69
|
679
|
jpayne@69
|
680 def _load_bioentry_date(self, record, bioentry_id):
|
jpayne@69
|
681 """Add the effective date of the entry into the database (PRIVATE).
|
jpayne@69
|
682
|
jpayne@69
|
683 record - a SeqRecord object with an annotated date
|
jpayne@69
|
684 bioentry_id - corresponding database identifier
|
jpayne@69
|
685 """
|
jpayne@69
|
686 # dates are GenBank style, like:
|
jpayne@69
|
687 # 14-SEP-2000
|
jpayne@69
|
688 date = record.annotations.get("date", strftime("%d-%b-%Y", gmtime()).upper())
|
jpayne@69
|
689 if isinstance(date, list):
|
jpayne@69
|
690 date = date[0]
|
jpayne@69
|
691 annotation_tags_id = self._get_ontology_id("Annotation Tags")
|
jpayne@69
|
692 date_id = self._get_term_id("date_changed", annotation_tags_id)
|
jpayne@69
|
693 sql = (
|
jpayne@69
|
694 "INSERT INTO bioentry_qualifier_value"
|
jpayne@69
|
695 ' (bioentry_id, term_id, value, "rank")'
|
jpayne@69
|
696 " VALUES (%s, %s, %s, 1)"
|
jpayne@69
|
697 )
|
jpayne@69
|
698 self.adaptor.execute(sql, (bioentry_id, date_id, date))
|
jpayne@69
|
699
|
jpayne@69
|
700 def _load_biosequence(self, record, bioentry_id):
|
jpayne@69
|
701 """Record SeqRecord's sequence and alphabet in DB (PRIVATE).
|
jpayne@69
|
702
|
jpayne@69
|
703 Arguments:
|
jpayne@69
|
704 - record - a SeqRecord object with a seq property
|
jpayne@69
|
705 - bioentry_id - corresponding database identifier
|
jpayne@69
|
706
|
jpayne@69
|
707 """
|
jpayne@69
|
708 if record.seq is None:
|
jpayne@69
|
709 # The biosequence table entry is optional, so if we haven't
|
jpayne@69
|
710 # got a sequence, we don't need to write to the table.
|
jpayne@69
|
711 return
|
jpayne@69
|
712
|
jpayne@69
|
713 molecule_type = record.annotations.get("molecule_type", "")
|
jpayne@69
|
714 if "DNA" in molecule_type:
|
jpayne@69
|
715 alphabet = "dna"
|
jpayne@69
|
716 elif "RNA" in molecule_type:
|
jpayne@69
|
717 alphabet = "rna"
|
jpayne@69
|
718 elif "protein" in molecule_type:
|
jpayne@69
|
719 alphabet = "protein"
|
jpayne@69
|
720 else:
|
jpayne@69
|
721 alphabet = "unknown"
|
jpayne@69
|
722
|
jpayne@69
|
723 try:
|
jpayne@69
|
724 seq_str = str(record.seq)
|
jpayne@69
|
725 except UndefinedSequenceError:
|
jpayne@69
|
726 seq_str = None
|
jpayne@69
|
727
|
jpayne@69
|
728 sql = (
|
jpayne@69
|
729 "INSERT INTO biosequence (bioentry_id, version, "
|
jpayne@69
|
730 "length, seq, alphabet) "
|
jpayne@69
|
731 "VALUES (%s, 0, %s, %s, %s)"
|
jpayne@69
|
732 )
|
jpayne@69
|
733 self.adaptor.execute(sql, (bioentry_id, len(record.seq), seq_str, alphabet))
|
jpayne@69
|
734
|
jpayne@69
|
735 def _load_comment(self, record, bioentry_id):
|
jpayne@69
|
736 """Record a SeqRecord's annotated comment in the database (PRIVATE).
|
jpayne@69
|
737
|
jpayne@69
|
738 Arguments:
|
jpayne@69
|
739 - record - a SeqRecord object with an annotated comment
|
jpayne@69
|
740 - bioentry_id - corresponding database identifier
|
jpayne@69
|
741
|
jpayne@69
|
742 """
|
jpayne@69
|
743 comments = record.annotations.get("comment")
|
jpayne@69
|
744 if not comments:
|
jpayne@69
|
745 return
|
jpayne@69
|
746 if not isinstance(comments, list):
|
jpayne@69
|
747 # It should be a string then...
|
jpayne@69
|
748 comments = [comments]
|
jpayne@69
|
749
|
jpayne@69
|
750 for index, comment in enumerate(comments):
|
jpayne@69
|
751 comment = comment.replace("\n", " ")
|
jpayne@69
|
752 # TODO - Store each line as a separate entry? This would preserve
|
jpayne@69
|
753 # the newlines, but we should check BioPerl etc to be consistent.
|
jpayne@69
|
754 sql = (
|
jpayne@69
|
755 'INSERT INTO comment (bioentry_id, comment_text, "rank")'
|
jpayne@69
|
756 " VALUES (%s, %s, %s)"
|
jpayne@69
|
757 )
|
jpayne@69
|
758 self.adaptor.execute(sql, (bioentry_id, comment, index + 1))
|
jpayne@69
|
759
|
jpayne@69
|
760 def _load_annotations(self, record, bioentry_id):
|
jpayne@69
|
761 """Record a SeqRecord's misc annotations in the database (PRIVATE).
|
jpayne@69
|
762
|
jpayne@69
|
763 The annotation strings are recorded in the bioentry_qualifier_value
|
jpayne@69
|
764 table, except for special cases like the reference, comment and
|
jpayne@69
|
765 taxonomy which are handled with their own tables.
|
jpayne@69
|
766
|
jpayne@69
|
767 Arguments:
|
jpayne@69
|
768 - record - a SeqRecord object with an annotations dictionary
|
jpayne@69
|
769 - bioentry_id - corresponding database identifier
|
jpayne@69
|
770
|
jpayne@69
|
771 """
|
jpayne@69
|
772 mono_sql = (
|
jpayne@69
|
773 "INSERT INTO bioentry_qualifier_value"
|
jpayne@69
|
774 "(bioentry_id, term_id, value)"
|
jpayne@69
|
775 " VALUES (%s, %s, %s)"
|
jpayne@69
|
776 )
|
jpayne@69
|
777 many_sql = (
|
jpayne@69
|
778 "INSERT INTO bioentry_qualifier_value"
|
jpayne@69
|
779 '(bioentry_id, term_id, value, "rank")'
|
jpayne@69
|
780 " VALUES (%s, %s, %s, %s)"
|
jpayne@69
|
781 )
|
jpayne@69
|
782 tag_ontology_id = self._get_ontology_id("Annotation Tags")
|
jpayne@69
|
783 for key, value in record.annotations.items():
|
jpayne@69
|
784 if key in ["molecule_type", "references", "comment", "ncbi_taxid", "date"]:
|
jpayne@69
|
785 # Handled separately
|
jpayne@69
|
786 continue
|
jpayne@69
|
787 term_id = self._get_term_id(key, ontology_id=tag_ontology_id)
|
jpayne@69
|
788 if isinstance(value, (list, tuple)):
|
jpayne@69
|
789 rank = 0
|
jpayne@69
|
790 for entry in value:
|
jpayne@69
|
791 if isinstance(entry, (str, int)):
|
jpayne@69
|
792 # Easy case
|
jpayne@69
|
793 rank += 1
|
jpayne@69
|
794 self.adaptor.execute(
|
jpayne@69
|
795 many_sql, (bioentry_id, term_id, str(entry), rank)
|
jpayne@69
|
796 )
|
jpayne@69
|
797 else:
|
jpayne@69
|
798 pass
|
jpayne@69
|
799 elif isinstance(value, (str, int)):
|
jpayne@69
|
800 # Have a simple single entry, leave rank as the DB default
|
jpayne@69
|
801 self.adaptor.execute(mono_sql, (bioentry_id, term_id, str(value)))
|
jpayne@69
|
802 else:
|
jpayne@69
|
803 pass
|
jpayne@69
|
804 # print("Ignoring annotation '%s' entry of type '%s'" \
|
jpayne@69
|
805 # % (key, type(value)))
|
jpayne@69
|
806
|
jpayne@69
|
807 def _load_reference(self, reference, rank, bioentry_id):
|
jpayne@69
|
808 """Record SeqRecord's annotated references in the database (PRIVATE).
|
jpayne@69
|
809
|
jpayne@69
|
810 Arguments:
|
jpayne@69
|
811 - record - a SeqRecord object with annotated references
|
jpayne@69
|
812 - bioentry_id - corresponding database identifier
|
jpayne@69
|
813
|
jpayne@69
|
814 """
|
jpayne@69
|
815 refs = None
|
jpayne@69
|
816 if reference.medline_id:
|
jpayne@69
|
817 refs = self.adaptor.execute_and_fetch_col0(
|
jpayne@69
|
818 "SELECT reference_id"
|
jpayne@69
|
819 " FROM reference JOIN dbxref USING (dbxref_id)"
|
jpayne@69
|
820 " WHERE dbname = 'MEDLINE' AND accession = %s",
|
jpayne@69
|
821 (reference.medline_id,),
|
jpayne@69
|
822 )
|
jpayne@69
|
823 if not refs and reference.pubmed_id:
|
jpayne@69
|
824 refs = self.adaptor.execute_and_fetch_col0(
|
jpayne@69
|
825 "SELECT reference_id"
|
jpayne@69
|
826 " FROM reference JOIN dbxref USING (dbxref_id)"
|
jpayne@69
|
827 " WHERE dbname = 'PUBMED' AND accession = %s",
|
jpayne@69
|
828 (reference.pubmed_id,),
|
jpayne@69
|
829 )
|
jpayne@69
|
830 if not refs:
|
jpayne@69
|
831 s = []
|
jpayne@69
|
832 for f in reference.authors, reference.title, reference.journal:
|
jpayne@69
|
833 s.append(f or "<undef>")
|
jpayne@69
|
834 crc = crc64("".join(s))
|
jpayne@69
|
835 refs = self.adaptor.execute_and_fetch_col0(
|
jpayne@69
|
836 "SELECT reference_id FROM reference WHERE crc = %s", (crc,)
|
jpayne@69
|
837 )
|
jpayne@69
|
838 if not refs:
|
jpayne@69
|
839 if reference.medline_id:
|
jpayne@69
|
840 dbxref_id = self._add_dbxref("MEDLINE", reference.medline_id, 0)
|
jpayne@69
|
841 elif reference.pubmed_id:
|
jpayne@69
|
842 dbxref_id = self._add_dbxref("PUBMED", reference.pubmed_id, 0)
|
jpayne@69
|
843 else:
|
jpayne@69
|
844 dbxref_id = None
|
jpayne@69
|
845 authors = reference.authors or None
|
jpayne@69
|
846 title = reference.title or None
|
jpayne@69
|
847 # The location/journal field cannot be Null, so default
|
jpayne@69
|
848 # to an empty string rather than None:
|
jpayne@69
|
849 journal = reference.journal or ""
|
jpayne@69
|
850 self.adaptor.execute(
|
jpayne@69
|
851 "INSERT INTO reference (dbxref_id, location,"
|
jpayne@69
|
852 " title, authors, crc)"
|
jpayne@69
|
853 " VALUES (%s, %s, %s, %s, %s)",
|
jpayne@69
|
854 (dbxref_id, journal, title, authors, crc),
|
jpayne@69
|
855 )
|
jpayne@69
|
856 reference_id = self.adaptor.last_id("reference")
|
jpayne@69
|
857 else:
|
jpayne@69
|
858 reference_id = refs[0]
|
jpayne@69
|
859
|
jpayne@69
|
860 if reference.location:
|
jpayne@69
|
861 start = 1 + int(str(reference.location[0].start))
|
jpayne@69
|
862 end = int(str(reference.location[0].end))
|
jpayne@69
|
863 else:
|
jpayne@69
|
864 start = None
|
jpayne@69
|
865 end = None
|
jpayne@69
|
866
|
jpayne@69
|
867 sql = (
|
jpayne@69
|
868 "INSERT INTO bioentry_reference (bioentry_id, reference_id,"
|
jpayne@69
|
869 ' start_pos, end_pos, "rank") VALUES (%s, %s, %s, %s, %s)'
|
jpayne@69
|
870 )
|
jpayne@69
|
871 self.adaptor.execute(sql, (bioentry_id, reference_id, start, end, rank + 1))
|
jpayne@69
|
872
|
jpayne@69
|
873 def _load_seqfeature(self, feature, feature_rank, bioentry_id):
|
jpayne@69
|
874 """Load a biopython SeqFeature into the database (PRIVATE)."""
|
jpayne@69
|
875 # records loaded from a gff file using BCBio.GFF will contain value
|
jpayne@69
|
876 # of 2nd column of the gff as a feature qualifier. The BioSQL wiki
|
jpayne@69
|
877 # suggests that the source should not go in with the other feature
|
jpayne@69
|
878 # mappings but instead be put in the term table
|
jpayne@69
|
879 # (http://www.biosql.org/wiki/Annotation_Mapping)
|
jpayne@69
|
880 try:
|
jpayne@69
|
881 source = feature.qualifiers["source"]
|
jpayne@69
|
882 if isinstance(source, list):
|
jpayne@69
|
883 source = source[0]
|
jpayne@69
|
884 seqfeature_id = self._load_seqfeature_basic(
|
jpayne@69
|
885 feature.type, feature_rank, bioentry_id, source=source
|
jpayne@69
|
886 )
|
jpayne@69
|
887 except KeyError:
|
jpayne@69
|
888 seqfeature_id = self._load_seqfeature_basic(
|
jpayne@69
|
889 feature.type, feature_rank, bioentry_id
|
jpayne@69
|
890 )
|
jpayne@69
|
891
|
jpayne@69
|
892 self._load_seqfeature_locations(feature, seqfeature_id)
|
jpayne@69
|
893 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
|
jpayne@69
|
894
|
jpayne@69
|
895 def _load_seqfeature_basic(
|
jpayne@69
|
896 self, feature_type, feature_rank, bioentry_id, source="EMBL/GenBank/SwissProt"
|
jpayne@69
|
897 ):
|
jpayne@69
|
898 """Load the first tables of a seqfeature and returns the id (PRIVATE).
|
jpayne@69
|
899
|
jpayne@69
|
900 This loads the "key" of the seqfeature (ie. CDS, gene) and
|
jpayne@69
|
901 the basic seqfeature table itself.
|
jpayne@69
|
902 """
|
jpayne@69
|
903 ontology_id = self._get_ontology_id("SeqFeature Keys")
|
jpayne@69
|
904 seqfeature_key_id = self._get_term_id(feature_type, ontology_id=ontology_id)
|
jpayne@69
|
905 source_cat_id = self._get_ontology_id("SeqFeature Sources")
|
jpayne@69
|
906 source_term_id = self._get_term_id(source, ontology_id=source_cat_id)
|
jpayne@69
|
907
|
jpayne@69
|
908 sql = (
|
jpayne@69
|
909 "INSERT INTO seqfeature (bioentry_id, type_term_id, "
|
jpayne@69
|
910 'source_term_id, "rank") VALUES (%s, %s, %s, %s)'
|
jpayne@69
|
911 )
|
jpayne@69
|
912 self.adaptor.execute(
|
jpayne@69
|
913 sql, (bioentry_id, seqfeature_key_id, source_term_id, feature_rank + 1)
|
jpayne@69
|
914 )
|
jpayne@69
|
915 return self.adaptor.last_id("seqfeature")
|
jpayne@69
|
916
|
jpayne@69
|
917 def _load_seqfeature_locations(self, feature, seqfeature_id):
|
jpayne@69
|
918 """Load all of the locations for a SeqFeature into tables (PRIVATE).
|
jpayne@69
|
919
|
jpayne@69
|
920 This adds the locations related to the SeqFeature into the
|
jpayne@69
|
921 seqfeature_location table. Fuzzies are not handled right now.
|
jpayne@69
|
922 For a simple location, ie (1..2), we have a single table row
|
jpayne@69
|
923 with seq_start = 1, seq_end = 2, location_rank = 1.
|
jpayne@69
|
924
|
jpayne@69
|
925 For split locations, ie (1..2, 3..4, 5..6) we would have three
|
jpayne@69
|
926 row tables with::
|
jpayne@69
|
927
|
jpayne@69
|
928 start = 1, end = 2, rank = 1
|
jpayne@69
|
929 start = 3, end = 4, rank = 2
|
jpayne@69
|
930 start = 5, end = 6, rank = 3
|
jpayne@69
|
931
|
jpayne@69
|
932 """
|
jpayne@69
|
933 # TODO - Record an ontology for the locations (using location.term_id)
|
jpayne@69
|
934 # which for now as in BioPerl we leave defaulting to NULL.
|
jpayne@69
|
935 try:
|
jpayne@69
|
936 if feature.location.operator != "join":
|
jpayne@69
|
937 # e.g. order locations... we don't record "order" so it
|
jpayne@69
|
938 # will become a "join" on reloading. What does BioPerl do?
|
jpayne@69
|
939 import warnings
|
jpayne@69
|
940 from Bio import BiopythonWarning
|
jpayne@69
|
941
|
jpayne@69
|
942 warnings.warn(
|
jpayne@69
|
943 "%s location operators are not fully supported"
|
jpayne@69
|
944 % feature.location_operator,
|
jpayne@69
|
945 BiopythonWarning,
|
jpayne@69
|
946 )
|
jpayne@69
|
947 except AttributeError:
|
jpayne@69
|
948 pass
|
jpayne@69
|
949 # This will be a list of length one for a SimpleLocation:
|
jpayne@69
|
950 parts = feature.location.parts
|
jpayne@69
|
951 if parts and {loc.strand for loc in parts} == {-1}:
|
jpayne@69
|
952 # To mimic prior behaviour of Biopython+BioSQL, reverse order
|
jpayne@69
|
953 parts = parts[::-1]
|
jpayne@69
|
954 # TODO - Check what BioPerl does; see also BioSeq.py code
|
jpayne@69
|
955 for rank, loc in enumerate(parts):
|
jpayne@69
|
956 self._insert_location(loc, rank + 1, seqfeature_id)
|
jpayne@69
|
957
|
jpayne@69
|
958 def _insert_location(self, location, rank, seqfeature_id):
|
jpayne@69
|
959 """Add SeqFeature location to seqfeature_location table (PRIVATE).
|
jpayne@69
|
960
|
jpayne@69
|
961 TODO - Add location operator to location_qualifier_value?
|
jpayne@69
|
962 """
|
jpayne@69
|
963 # convert biopython locations to the 1-based location system
|
jpayne@69
|
964 # used in bioSQL
|
jpayne@69
|
965 # XXX This could also handle fuzzies
|
jpayne@69
|
966
|
jpayne@69
|
967 try:
|
jpayne@69
|
968 start = int(location.start) + 1
|
jpayne@69
|
969 except TypeError:
|
jpayne@69
|
970 # Handle SwissProt unknown position (?)
|
jpayne@69
|
971 if isinstance(location.start, UnknownPosition):
|
jpayne@69
|
972 start = None
|
jpayne@69
|
973 else:
|
jpayne@69
|
974 raise
|
jpayne@69
|
975
|
jpayne@69
|
976 try:
|
jpayne@69
|
977 end = int(location.end)
|
jpayne@69
|
978 except TypeError:
|
jpayne@69
|
979 # Handle SwissProt unknown position (?)
|
jpayne@69
|
980 if isinstance(location.end, UnknownPosition):
|
jpayne@69
|
981 end = None
|
jpayne@69
|
982 else:
|
jpayne@69
|
983 raise
|
jpayne@69
|
984
|
jpayne@69
|
985 # Biopython uses None when we don't know strand information but
|
jpayne@69
|
986 # BioSQL requires something (non null) and sets this as zero
|
jpayne@69
|
987 # So we'll use the strand or 0 if Biopython spits out None
|
jpayne@69
|
988 strand = location.strand or 0
|
jpayne@69
|
989
|
jpayne@69
|
990 # TODO - Record an ontology term for the location (location.term_id)
|
jpayne@69
|
991 # which for now like BioPerl we'll leave as NULL.
|
jpayne@69
|
992 # This might allow us to record "between" positions properly, but I
|
jpayne@69
|
993 # don't really see how it could work for before/after fuzzy positions
|
jpayne@69
|
994 loc_term_id = None
|
jpayne@69
|
995
|
jpayne@69
|
996 if location.ref:
|
jpayne@69
|
997 # sub_feature remote locations when they are in the same db as the
|
jpayne@69
|
998 # current record do not have a value for ref_db, which SeqFeature
|
jpayne@69
|
999 # object stores as None. BioSQL schema requires a varchar and is
|
jpayne@69
|
1000 # not NULL
|
jpayne@69
|
1001 dbxref_id = self._get_dbxref_id(location.ref_db or "", location.ref)
|
jpayne@69
|
1002 else:
|
jpayne@69
|
1003 dbxref_id = None
|
jpayne@69
|
1004
|
jpayne@69
|
1005 sql = (
|
jpayne@69
|
1006 "INSERT INTO location (seqfeature_id, dbxref_id, term_id,"
|
jpayne@69
|
1007 'start_pos, end_pos, strand, "rank") '
|
jpayne@69
|
1008 "VALUES (%s, %s, %s, %s, %s, %s, %s)"
|
jpayne@69
|
1009 )
|
jpayne@69
|
1010 self.adaptor.execute(
|
jpayne@69
|
1011 sql, (seqfeature_id, dbxref_id, loc_term_id, start, end, strand, rank)
|
jpayne@69
|
1012 )
|
jpayne@69
|
1013
|
jpayne@69
|
1014 """
|
jpayne@69
|
1015 # See Bug 2677
|
jpayne@69
|
1016 # TODO - Record the location_operator (e.g. "join" or "order")
|
jpayne@69
|
1017 # using the location_qualifier_value table (which we and BioPerl
|
jpayne@69
|
1018 # have historically left empty).
|
jpayne@69
|
1019 # Note this will need an ontology term for the location qualifier
|
jpayne@69
|
1020 # (location_qualifier_value.term_id) for which oddly the schema
|
jpayne@69
|
1021 # does not allow NULL.
|
jpayne@69
|
1022 if feature.location_operator:
|
jpayne@69
|
1023 #e.g. "join" (common),
|
jpayne@69
|
1024 #or "order" (see Tests/GenBank/protein_refseq2.gb)
|
jpayne@69
|
1025 location_id = self.adaptor.last_id('location')
|
jpayne@69
|
1026 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1
|
jpayne@69
|
1027 sql = ("INSERT INTO location_qualifier_value"
|
jpayne@69
|
1028 "(location_id, term_id, value) "
|
jpayne@69
|
1029 "VALUES (%s, %s, %s)")
|
jpayne@69
|
1030 self.adaptor.execute(sql, (location_id, loc_qual_term_id,
|
jpayne@69
|
1031 feature.location_operator))
|
jpayne@69
|
1032 """
|
jpayne@69
|
1033
|
jpayne@69
|
1034 def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
|
jpayne@69
|
1035 """Insert feature's (key, value) pair qualifiers (PRIVATE).
|
jpayne@69
|
1036
|
jpayne@69
|
1037 Qualifiers should be a dictionary of the form::
|
jpayne@69
|
1038
|
jpayne@69
|
1039 {key : [value1, value2]}
|
jpayne@69
|
1040
|
jpayne@69
|
1041 """
|
jpayne@69
|
1042 tag_ontology_id = self._get_ontology_id("Annotation Tags")
|
jpayne@69
|
1043 for qualifier_key in qualifiers:
|
jpayne@69
|
1044 # Treat db_xref qualifiers differently to sequence annotation
|
jpayne@69
|
1045 # qualifiers by populating the seqfeature_dbxref and dbxref
|
jpayne@69
|
1046 # tables. Other qualifiers go into the seqfeature_qualifier_value
|
jpayne@69
|
1047 # and (if new) term tables.
|
jpayne@69
|
1048 if qualifier_key != "db_xref":
|
jpayne@69
|
1049 qualifier_key_id = self._get_term_id(
|
jpayne@69
|
1050 qualifier_key, ontology_id=tag_ontology_id
|
jpayne@69
|
1051 )
|
jpayne@69
|
1052 # now add all of the values to their table
|
jpayne@69
|
1053 entries = qualifiers[qualifier_key]
|
jpayne@69
|
1054 if not isinstance(entries, list):
|
jpayne@69
|
1055 # Could be a plain string, or an int or a float.
|
jpayne@69
|
1056 # However, we exect a list of strings here.
|
jpayne@69
|
1057 entries = [entries]
|
jpayne@69
|
1058 for qual_value_rank in range(len(entries)):
|
jpayne@69
|
1059 qualifier_value = entries[qual_value_rank]
|
jpayne@69
|
1060 sql = (
|
jpayne@69
|
1061 "INSERT INTO seqfeature_qualifier_value "
|
jpayne@69
|
1062 ' (seqfeature_id, term_id, "rank", value) VALUES'
|
jpayne@69
|
1063 " (%s, %s, %s, %s)"
|
jpayne@69
|
1064 )
|
jpayne@69
|
1065 self.adaptor.execute(
|
jpayne@69
|
1066 sql,
|
jpayne@69
|
1067 (
|
jpayne@69
|
1068 seqfeature_id,
|
jpayne@69
|
1069 qualifier_key_id,
|
jpayne@69
|
1070 qual_value_rank + 1,
|
jpayne@69
|
1071 qualifier_value,
|
jpayne@69
|
1072 ),
|
jpayne@69
|
1073 )
|
jpayne@69
|
1074 else:
|
jpayne@69
|
1075 # The dbxref_id qualifier/value sets go into the dbxref table
|
jpayne@69
|
1076 # as dbname, accession, version tuples, with dbxref.dbxref_id
|
jpayne@69
|
1077 # being automatically assigned, and into the seqfeature_dbxref
|
jpayne@69
|
1078 # table as seqfeature_id, dbxref_id, and rank tuples
|
jpayne@69
|
1079 self._load_seqfeature_dbxref(qualifiers[qualifier_key], seqfeature_id)
|
jpayne@69
|
1080
|
jpayne@69
|
1081 def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
|
jpayne@69
|
1082 """Add SeqFeature's DB cross-references to the database (PRIVATE).
|
jpayne@69
|
1083
|
jpayne@69
|
1084 Arguments:
|
jpayne@69
|
1085 - dbxrefs - List, dbxref data from the source file in the
|
jpayne@69
|
1086 format <database>:<accession>
|
jpayne@69
|
1087 - seqfeature_id - Int, the identifier for the seqfeature in the
|
jpayne@69
|
1088 seqfeature table
|
jpayne@69
|
1089
|
jpayne@69
|
1090 Insert dbxref qualifier data for a seqfeature into the
|
jpayne@69
|
1091 seqfeature_dbxref and, if required, dbxref tables.
|
jpayne@69
|
1092 The dbxref_id qualifier/value sets go into the dbxref table
|
jpayne@69
|
1093 as dbname, accession, version tuples, with dbxref.dbxref_id
|
jpayne@69
|
1094 being automatically assigned, and into the seqfeature_dbxref
|
jpayne@69
|
1095 table as seqfeature_id, dbxref_id, and rank tuples.
|
jpayne@69
|
1096 """
|
jpayne@69
|
1097 # NOTE - In older versions of Biopython, we would map the GenBank
|
jpayne@69
|
1098 # db_xref "name", for example "GI" to "GeneIndex", and give a warning
|
jpayne@69
|
1099 # for any unknown terms. This was a long term maintenance problem,
|
jpayne@69
|
1100 # and differed from BioPerl and BioJava's implementation. See bug 2405
|
jpayne@69
|
1101 for rank, value in enumerate(dbxrefs):
|
jpayne@69
|
1102 # Split the DB:accession format string at colons. We have to
|
jpayne@69
|
1103 # account for multiple-line and multiple-accession entries
|
jpayne@69
|
1104 try:
|
jpayne@69
|
1105 dbxref_data = value.replace(" ", "").replace("\n", "").split(":")
|
jpayne@69
|
1106 db = dbxref_data[0]
|
jpayne@69
|
1107 accessions = dbxref_data[1:]
|
jpayne@69
|
1108 except Exception:
|
jpayne@69
|
1109 raise ValueError(f"Parsing of db_xref failed: '{value}'") from None
|
jpayne@69
|
1110 # Loop over all the grabbed accessions, and attempt to fill the
|
jpayne@69
|
1111 # table
|
jpayne@69
|
1112 for accession in accessions:
|
jpayne@69
|
1113 # Get the dbxref_id value for the dbxref data
|
jpayne@69
|
1114 dbxref_id = self._get_dbxref_id(db, accession)
|
jpayne@69
|
1115 # Insert the seqfeature_dbxref data
|
jpayne@69
|
1116 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank + 1)
|
jpayne@69
|
1117
|
jpayne@69
|
1118 def _get_dbxref_id(self, db, accession):
|
jpayne@69
|
1119 """Get DB cross-reference for accession (PRIVATE).
|
jpayne@69
|
1120
|
jpayne@69
|
1121 Arguments:
|
jpayne@69
|
1122 - db - String, the name of the external database containing
|
jpayne@69
|
1123 the accession number
|
jpayne@69
|
1124 - accession - String, the accession of the dbxref data
|
jpayne@69
|
1125
|
jpayne@69
|
1126 Finds and returns the dbxref_id for the passed data. The method
|
jpayne@69
|
1127 attempts to find an existing record first, and inserts the data
|
jpayne@69
|
1128 if there is no record.
|
jpayne@69
|
1129 """
|
jpayne@69
|
1130 # Check for an existing record
|
jpayne@69
|
1131 sql = "SELECT dbxref_id FROM dbxref WHERE dbname = %s AND accession = %s"
|
jpayne@69
|
1132 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession))
|
jpayne@69
|
1133 # If there was a record, return the dbxref_id, else create the
|
jpayne@69
|
1134 # record and return the created dbxref_id
|
jpayne@69
|
1135 if dbxref_id:
|
jpayne@69
|
1136 return dbxref_id[0]
|
jpayne@69
|
1137 return self._add_dbxref(db, accession, 0)
|
jpayne@69
|
1138
|
jpayne@69
|
1139 def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
|
jpayne@69
|
1140 """Get DB cross-reference, creating it if needed (PRIVATE).
|
jpayne@69
|
1141
|
jpayne@69
|
1142 Check for a pre-existing seqfeature_dbxref entry with the passed
|
jpayne@69
|
1143 seqfeature_id and dbxref_id. If one does not exist, insert new
|
jpayne@69
|
1144 data.
|
jpayne@69
|
1145 """
|
jpayne@69
|
1146 # Check for an existing record
|
jpayne@69
|
1147 sql = (
|
jpayne@69
|
1148 "SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref "
|
jpayne@69
|
1149 "WHERE seqfeature_id = %s AND dbxref_id = %s"
|
jpayne@69
|
1150 )
|
jpayne@69
|
1151 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, dbxref_id))
|
jpayne@69
|
1152 # If there was a record, return without executing anything, else create
|
jpayne@69
|
1153 # the record and return
|
jpayne@69
|
1154 if result:
|
jpayne@69
|
1155 return result
|
jpayne@69
|
1156 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
|
jpayne@69
|
1157
|
jpayne@69
|
1158 def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
|
jpayne@69
|
1159 """Add DB cross-reference (PRIVATE).
|
jpayne@69
|
1160
|
jpayne@69
|
1161 Insert a seqfeature_dbxref row and return the seqfeature_id and
|
jpayne@69
|
1162 dbxref_id
|
jpayne@69
|
1163 """
|
jpayne@69
|
1164 sql = (
|
jpayne@69
|
1165 "INSERT INTO seqfeature_dbxref "
|
jpayne@69
|
1166 '(seqfeature_id, dbxref_id, "rank") VALUES'
|
jpayne@69
|
1167 "(%s, %s, %s)"
|
jpayne@69
|
1168 )
|
jpayne@69
|
1169 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank))
|
jpayne@69
|
1170 return (seqfeature_id, dbxref_id)
|
jpayne@69
|
1171
|
jpayne@69
|
1172 def _load_dbxrefs(self, record, bioentry_id):
|
jpayne@69
|
1173 """Load any sequence level cross references into the database (PRIVATE).
|
jpayne@69
|
1174
|
jpayne@69
|
1175 See table bioentry_dbxref.
|
jpayne@69
|
1176 """
|
jpayne@69
|
1177 for rank, value in enumerate(record.dbxrefs):
|
jpayne@69
|
1178 # Split the DB:accession string at first colon.
|
jpayne@69
|
1179 # We have to cope with things like:
|
jpayne@69
|
1180 # "MGD:MGI:892" (db="MGD", accession="MGI:892")
|
jpayne@69
|
1181 # "GO:GO:123" (db="GO", accession="GO:123")
|
jpayne@69
|
1182 #
|
jpayne@69
|
1183 # Annoyingly I have seen the NCBI use both the style
|
jpayne@69
|
1184 # "GO:GO:123" and "GO:123" in different vintages.
|
jpayne@69
|
1185 newline_escape_count = value.count("\n")
|
jpayne@69
|
1186 if newline_escape_count != 0:
|
jpayne@69
|
1187 raise ValueError(
|
jpayne@69
|
1188 "Expected a single line in value, got {newline_escape_count}"
|
jpayne@69
|
1189 )
|
jpayne@69
|
1190 try:
|
jpayne@69
|
1191 db, accession = value.split(":", 1)
|
jpayne@69
|
1192 db = db.strip()
|
jpayne@69
|
1193 accession = accession.strip()
|
jpayne@69
|
1194 except Exception:
|
jpayne@69
|
1195 raise ValueError(f"Parsing of dbxrefs list failed: '{value}'") from None
|
jpayne@69
|
1196 # Get the dbxref_id value for the dbxref data
|
jpayne@69
|
1197 dbxref_id = self._get_dbxref_id(db, accession)
|
jpayne@69
|
1198 # Insert the bioentry_dbxref data
|
jpayne@69
|
1199 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank + 1)
|
jpayne@69
|
1200
|
jpayne@69
|
1201 def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
|
jpayne@69
|
1202 """Get pre-existing db-xref, or create and return it (PRIVATE).
|
jpayne@69
|
1203
|
jpayne@69
|
1204 Check for a pre-existing bioentry_dbxref entry with the passed
|
jpayne@69
|
1205 seqfeature_id and dbxref_id. If one does not exist, insert new
|
jpayne@69
|
1206 data
|
jpayne@69
|
1207 """
|
jpayne@69
|
1208 # Check for an existing record
|
jpayne@69
|
1209 sql = (
|
jpayne@69
|
1210 "SELECT bioentry_id, dbxref_id FROM bioentry_dbxref "
|
jpayne@69
|
1211 "WHERE bioentry_id = %s AND dbxref_id = %s"
|
jpayne@69
|
1212 )
|
jpayne@69
|
1213 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, dbxref_id))
|
jpayne@69
|
1214 # If there was a record, return without executing anything, else create
|
jpayne@69
|
1215 # the record and return
|
jpayne@69
|
1216 if result:
|
jpayne@69
|
1217 return result
|
jpayne@69
|
1218 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
|
jpayne@69
|
1219
|
jpayne@69
|
1220 def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
|
jpayne@69
|
1221 """Insert a bioentry_dbxref row (PRIVATE).
|
jpayne@69
|
1222
|
jpayne@69
|
1223 Returns the seqfeature_id and dbxref_id (PRIVATE).
|
jpayne@69
|
1224 """
|
jpayne@69
|
1225 sql = (
|
jpayne@69
|
1226 "INSERT INTO bioentry_dbxref "
|
jpayne@69
|
1227 '(bioentry_id,dbxref_id,"rank") VALUES '
|
jpayne@69
|
1228 "(%s, %s, %s)"
|
jpayne@69
|
1229 )
|
jpayne@69
|
1230 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank))
|
jpayne@69
|
1231 return (bioentry_id, dbxref_id)
|
jpayne@69
|
1232
|
jpayne@69
|
1233
|
jpayne@69
|
1234 class DatabaseRemover:
|
jpayne@69
|
1235 """Complement the Loader functionality by fully removing a database.
|
jpayne@69
|
1236
|
jpayne@69
|
1237 This probably isn't really useful for normal purposes, since you
|
jpayne@69
|
1238 can just do a::
|
jpayne@69
|
1239
|
jpayne@69
|
1240 DROP DATABASE db_name
|
jpayne@69
|
1241
|
jpayne@69
|
1242 and then recreate the database. But, it's really useful for testing
|
jpayne@69
|
1243 purposes.
|
jpayne@69
|
1244 """
|
jpayne@69
|
1245
|
jpayne@69
|
1246 def __init__(self, adaptor, dbid):
|
jpayne@69
|
1247 """Initialize with a database id and adaptor connection."""
|
jpayne@69
|
1248 self.adaptor = adaptor
|
jpayne@69
|
1249 self.dbid = dbid
|
jpayne@69
|
1250
|
jpayne@69
|
1251 def remove(self):
|
jpayne@69
|
1252 """Remove everything related to the given database id."""
|
jpayne@69
|
1253 sql = "DELETE FROM bioentry WHERE biodatabase_id = %s"
|
jpayne@69
|
1254 self.adaptor.execute(sql, (self.dbid,))
|
jpayne@69
|
1255 sql = "DELETE FROM biodatabase WHERE biodatabase_id = %s"
|
jpayne@69
|
1256 self.adaptor.execute(sql, (self.dbid,))
|