"""Builds SQLite3 databases"""

import logging, os, pickle, re, requests, sqlite3, sys, time
import lexmapr.ontology_reasoner as ontr
from nltk.tokenize import word_tokenize
from lexmapr.pipeline_helpers import punctuation_treatment
from lexmapr.definitions import embl_ontologies, synonym_db, ontol_db
from lexmapr.definitions import owl_dir, purl_link, missing_ontol_labels
from lexmapr.pipeline_resources import get_resource_label_permutations

logging.getLogger('requests').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
|
kkonganti@0
|


# TODO: might replace pickle with ujson
def _pickle_save(data_to_save, file_path):
    '''Write a pickle file'''
    with open(file_path, 'wb') as SAVE_file:
        pickle.dump(data_to_save, SAVE_file)


def _pickle_load(file_path):
    '''Read a pickle file'''
    with open(file_path, 'rb') as LOAD_file:
        return pickle.load(LOAD_file)
|
kkonganti@0
|


def _get_ontols(ontol_interest):
    '''Obtain URLs for ontologies of interest'''
    ontol_dic = {}
    embl_resp = requests.get(embl_ontologies)
    resp_blocks = re.findall(r'<tr>([\s\S]+?)</tr>', embl_resp.content.decode('utf-8'))
    for resp_block in resp_blocks:
        try:
            embl_abbr = re.search(r'class="ontology-source">([\s\S]+?)<', resp_block).group(1)
            embl_name = re.search(r'<a href="/ols/ontologies/' + embl_abbr.lower() +
                                  r'">([\s\S]+?)</a>', resp_block).group(1)
            embl_link = re.search(r'href="(\S+)">Download', resp_block).group(1)
            if embl_link.startswith('ontologies'):
                embl_link = embl_link[len('ontologies'):]
                # TODO: with Python 3.9+, use embl_link.removeprefix('ontologies')
        except AttributeError:
            continue
        if embl_abbr in ontol_interest:
            ontol_dic[embl_abbr] = (embl_name, embl_link)
    # Warn, but continue, if not all ontologies of interest from definitions.py were found
    not_found = set(ontol_interest).difference(set(ontol_dic.keys()))
    if not_found:
        if len(not_found) == 1:
            logging.warning('Did not find ontology: ' + ', '.join(not_found))
        else:
            logging.warning('Did not find ontologies: ' + ', '.join(not_found))
    if not ontol_dic:
        sys.exit('Zero ontologies found from user-given list')
    return ontol_dic
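
# Illustrative shape of the returned mapping (values are hypothetical, assuming
# the OLS listing contains a FOODON row):
#     {'FOODON': ('Food Ontology', '/foodon/releases/2021-09-15/foodon.owl')}
# The second tuple element is later appended to embl_ontologies to download the OWL file.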
|
kkonganti@0
|


def _check_make(db_file, remake_cache, ontol_interest):
    '''Check if database file should be remade'''
    if os.path.exists(db_file) and not remake_cache:
        if os.path.exists(os.path.join(owl_dir, 'cached_ontologies.pickle')):
            if ontol_interest == _pickle_load(os.path.join(owl_dir, 'cached_ontologies.pickle')):
                return False
    try:
        os.remove(db_file)
    except FileNotFoundError:
        pass
    return True
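
# Note: cached_ontologies.pickle records the ontology list the existing caches
# were built from; any mismatch with ontol_interest forces a rebuild, so a stale
# database file is deleted above before the caller recreates it.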
|
kkonganti@0
|


def _db_insert(db_cursor, table_name, key_term, val_term):
    '''Insert new data into a database table'''
    if key_term.strip() == val_term.strip() or key_term.strip() == '' or val_term.strip() == '':
        return
    db_cursor.execute(f"INSERT OR IGNORE INTO {table_name} VALUES (:key,:value)",
                      {'key': key_term.strip(), 'value': val_term.strip()})
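
# Hedged example: _db_insert(c, 'label_synonyms', 'chook', 'chicken') would add
# one row to label_synonyms (the terms are illustrative). Identical key/value
# pairs, empty strings, and rows violating a UNIQUE constraint are all
# silently skipped thanks to INSERT OR IGNORE.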
|
kkonganti@0
|


def _get_imports(file_handle):
    '''Find URLs of imported ontologies; append any new match patterns to pattern_strs'''
    pattern_strs = []
    pattern_strs.append(r'<owl:imports rdf:resource="&obo;(.*)"/>')
    pattern_strs.append(r'<owl:imports rdf:resource="(http://.*)"/>')
    whole_file = str(file_handle.read())
    import_match = []
    for patt_str in pattern_strs:
        import_match = re.findall(patt_str, whole_file)
        if import_match:
            import_match = [x if re.search('^http:', x) else purl_link + x for x in import_match]
            break
    return import_match
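
# Illustrative input line that the second pattern matches (URL is hypothetical):
#     <owl:imports rdf:resource="http://purl.obolibrary.org/obo/foodon/imports/chebi_import.owl"/>
# Matches written with the &obo; entity instead of a full URL are prefixed with purl_link.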
|
kkonganti@0
|


def _section_file(file_handle, break_pattern,
                  stt_at=' // Classes', end_at=' // Annotations'):
    '''Break OWL files into readable sections for each ontology accession'''
    whole_file = str(file_handle.read())
    if stt_at != '':
        if re.search(stt_at, whole_file):
            whole_file = ''.join(whole_file.split(stt_at)[1:])
    if end_at != '':
        if re.search(end_at, whole_file):
            whole_file = ''.join(whole_file.split(end_at)[:-1])
    file_sections = whole_file.split(break_pattern)
    return file_sections[1:-1]
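
# Hedged example call (mirrors the use in get_resource_ids below): sectioning an
# OLS-style OWL file on the comment markers that precede each class block,
#     _section_file(OWL_file, '<!-- ' + purl_link)
# yields one string per term; the first and last fragments are dropped because
# they fall outside the marked class blocks.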
|
kkonganti@0
|


def _labels_synonyms(obo_list, have_label=False):
    '''Identify labels, ids and exact ontology synonyms'''
    obo_ids = []
    id_pattern = r'(\w+) -->'
    lab_pattern = r'<rdfs:label[\s\S]*?>([\s\S]+?)</rdfs:label>'
    syn_pattern = r'<oboInOwl:hasExactSynonym[\s\S]*?>(.*?)</oboInOwl:hasExactSynonym'
    for obo_string in obo_list:
        obo_id = re.search(id_pattern, obo_string).group(1)
        # Do not save ids that are not formatted as expected, or that are
        # placeholders, obsolete terms or Apollo:SV terms
        if re.search('APOLLO:SV', obo_id) or re.search('APOLLO_SV', obo_id):
            continue
        elif obo_id in missing_ontol_labels:
            continue
        elif re.search(':', obo_id) or re.search('_', obo_id):
            try:
                obo_label = re.findall(lab_pattern, obo_string)[-1]
                if re.search('^obsolete:', obo_label):
                    continue
            except IndexError:
                obo_label = ''
            obo_synonyms = re.findall(syn_pattern, obo_string)
            obo_ids.append([str(obo_id.lstrip('_')), str(obo_label), obo_synonyms])
    if have_label:
        obo_ids = [x for x in obo_ids if x[1] != '']
    return obo_ids
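
# Illustrative return value (accession, label and synonym are hypothetical):
#     [['FOODON_00001234', 'chicken meat', ['chicken meat (raw)']], ...]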
|
kkonganti@0
|


def get_synonyms(remake_cache, ontol_interest):
    '''Create database of predefined synonyms'''
    os.makedirs(owl_dir, exist_ok=True)
    if not _check_make(synonym_db, remake_cache, ontol_interest):
        return

    conn = sqlite3.connect(synonym_db)
    c = conn.cursor()
    c.execute("CREATE TABLE label_synonyms (key TEXT, value TEXT, UNIQUE(key, value))")
    print('{:<40}'.format('\tRetrieving predefined synonyms'), end='\r')
    with open('lexmapr/predefined_resources/resource_synonyms.csv', 'r') as IN_file:
        for read_line in IN_file.readlines():
            split_line = read_line.split(',')
            try:
                ontol_name = split_line[2].split('_')[0].upper()
            except IndexError:
                ontol_name = ontol_interest[0]
            if ontol_name in ontol_interest:
                _db_insert(c, 'label_synonyms', punctuation_treatment(split_line[0]),
                           punctuation_treatment(split_line[1]))
        conn.commit()
    with open('lexmapr/predefined_resources/SynLex.csv', 'r') as IN_file:
        IN_file.readline()  # skip the first line (presumably a header row)
        for read_line in IN_file.readlines():
            split_line = read_line.split(',')
            try:
                ontol_name = split_line[2].split('_')[0].upper().strip()
            except IndexError:
                ontol_name = ontol_interest[0]
            if ontol_name in ontol_interest:
                _db_insert(c, 'label_synonyms', punctuation_treatment(split_line[0]),
                           punctuation_treatment(split_line[1]))
        conn.commit()
    conn.close()
    return
|
kkonganti@0
|


def get_resource_ids(remake_cache, ontol_interest):
    '''Create database of online resources and update synonym database'''
    if not _check_make(ontol_db, remake_cache, ontol_interest):
        return

    conn = sqlite3.connect(ontol_db)
    c = conn.cursor()
    c.execute("CREATE TABLE non_standard_resource_ids (key TEXT, value TEXT, UNIQUE(key))")
    c.execute("CREATE TABLE standard_resource_labels (key TEXT, value TEXT, UNIQUE(key, value))")
    c.execute("""CREATE TABLE standard_resource_permutations (key TEXT, value TEXT,
                                                              UNIQUE(key, value))""")
    sonn = sqlite3.connect(synonym_db)
    s = sonn.cursor()

    ontol_dic = _get_ontols(ontol_interest)
    ontol_urls = [purl_link + x[1] for x in ontol_dic.values()]
    for ontol_name in ontol_dic:
        print('{:<40}'.format('\tRetrieving ' + ontol_name[:15] + ' terms'), end='\r')
        no_label = set()
        found_label = set()

        if not os.path.isfile(os.path.join(owl_dir, ontol_name + '.owl')):
            owl_download = requests.get(embl_ontologies + ontol_dic[ontol_name][1])
            with open(os.path.join(owl_dir, ontol_name + '.owl'), 'w', encoding='utf-8') as O_file:
                O_file.write(owl_download.content.decode('utf-8'))

        with open(os.path.join(owl_dir, ontol_name + '.owl'), 'r', encoding='utf-8') as OWL_file:
            owl_sections = _section_file(OWL_file, '<!-- ' + purl_link)
            obo_ids = _labels_synonyms(owl_sections)
            OWL_file.seek(0)
            import_urls = [x for x in _get_imports(OWL_file) if x not in ontol_urls]

        for imp_url in import_urls:
            imp_download = requests.get(imp_url)
            imp_file = os.path.join(owl_dir, imp_url.split('/')[-1])
            with open(imp_file, 'w', encoding='utf-8') as T_file:
                T_file.write(imp_download.content.decode('utf-8'))
            # Different imports could share a file name, so rewrite and remove each file
            with open(imp_file, 'r', encoding='utf-8') as T_file:
                owl_sections = _section_file(T_file, '<!-- ' + purl_link, stt_at='', end_at='')
                obo_ids.extend(_labels_synonyms(owl_sections, have_label=True))
            os.remove(imp_file)

        # TODO: more elegant way to find which ids don't have labels and to remove duplicates?
        for obo_entry in obo_ids:
            if obo_entry[1] == '':
                no_label.add(obo_entry[0])
            else:
                found_label.add(obo_entry[0])
        no_label_found = list(no_label.difference(found_label))

        for ontol_term in obo_ids:
            if ontol_term[0] in no_label_found:
                ontol_acc = ontr.Ontology_accession(':' + ontol_term[0])
                if ontol_acc.label != 'unk':
                    if not re.search('^obsolete:', ontol_acc.label):
                        ontol_term[1] = ontol_acc.label
                time.sleep(0.05)
            if ontol_term[1] == '':
                continue
            ontol_label = punctuation_treatment(ontol_term[1])
            _db_insert(c, 'non_standard_resource_ids', ontol_term[0], ontol_term[1])
            _db_insert(c, 'standard_resource_labels', ontol_label, ontol_term[0])
            # Note from v 0.7: to limit performance overhead, ignore resource labels
            # with seven or more tokens, as permutating that many tokens is costly
            num_tokens = len(word_tokenize(ontol_label))
            if 1 < num_tokens < 7:
                # Ignore NCBI taxon terms when generating full permutations
                if not re.search('NCBITaxon', ontol_term[0]):
                    for permutation in get_resource_label_permutations(ontol_label):
                        _db_insert(c, 'standard_resource_permutations', permutation, ontol_term[0])
                # Add abbreviated binomials for NCBITaxons, so a label like
                # 'Bacillus subtilis' also maps from 'B subtilis'
                # TODO: may get wrong combinations?
                elif num_tokens == 2:
                    bi_name = ontol_label.split()
                    _db_insert(c, 'standard_resource_permutations',
                               bi_name[0][0] + ' ' + bi_name[1], ontol_term[0])
            for syn_term in ontol_term[2]:
                _db_insert(s, 'label_synonyms', punctuation_treatment(str(syn_term)), ontol_label)
        conn.commit()
        sonn.commit()

    conn.close()
    sonn.close()
    _pickle_save(ontol_interest, os.path.join(owl_dir, 'cached_ontologies.pickle'))
    return
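
# Minimal usage sketch (hedged; the ontology abbreviation is illustrative and
# must match one shown on the EMBL-EBI OLS download page):
#     get_synonyms(remake_cache=False, ontol_interest=['FOODON'])
#     get_resource_ids(remake_cache=False, ontol_interest=['FOODON'])
# get_synonyms() should run first so the label_synonyms table exists before
# get_resource_ids() inserts ontology synonyms into it.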