comparison lexmapr/create_databases.py @ 0:f5c39d0447be (vs -1:000000000000, file added)
"planemo upload"

author: kkonganti
date:   Wed, 31 Aug 2022 14:32:07 -0400
parents: (none)
children: (none)
1 """Builds SQLite3 databases""" | |
2 | |
3 import logging, os, pickle, re, requests, sqlite3, sys, time | |
4 import lexmapr.ontology_reasoner as ontr | |
5 from nltk.tokenize import word_tokenize | |
6 from lexmapr.pipeline_helpers import punctuation_treatment | |
7 from lexmapr.definitions import embl_ontologies, synonym_db, ontol_db | |
8 from lexmapr.definitions import owl_dir, purl_link, missing_ontol_labels | |
9 from lexmapr.pipeline_resources import get_resource_label_permutations | |
10 | |
11 logging.getLogger('requests').setLevel(logging.WARNING) | |
12 logging.getLogger('urllib3').setLevel(logging.WARNING) | |
13 | |
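
# Module entry points: get_synonyms() builds the synonym database and
# get_resource_ids() builds the ontology term database; see the usage
# sketch at the end of the file.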

# TODO: might replace pickle with ujson
def _pickle_save(data_to_save, file_path):
    '''Write a pickle file'''
    with open(file_path, 'wb') as SAVE_file:
        pickle.dump(data_to_save, SAVE_file)


def _pickle_load(file_path):
    '''Read a pickle file'''
    with open(file_path, 'rb') as LOAD_file:
        return pickle.load(LOAD_file)
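
# Illustrative round-trip (a sketch; the path is hypothetical):
#   _pickle_save(['FOODON'], '/tmp/cached_ontologies.pickle')
#   _pickle_load('/tmp/cached_ontologies.pickle')  # -> ['FOODON']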

def _get_ontols(ontol_interest):
    '''Obtain URLs for ontologies of interest'''
    ontol_dic = {}
    embl_resp = requests.get(embl_ontologies)
    resp_blocks = re.findall(r'<tr>([\s\S]+?)</tr>', embl_resp.content.decode('utf-8'))
    for resp_block in resp_blocks:
        try:
            embl_abbr = re.search(r'class="ontology-source">([\s\S]+?)<', resp_block).group(1)
            embl_name = re.search(r'<a href="/ols/ontologies/' + embl_abbr.lower() +
                                  r'">([\s\S]+?)</a>', resp_block).group(1)
            embl_link = re.search(r'href="(\S+)">Download', resp_block).group(1)
            if embl_link.startswith('ontologies'):
                # TODO: with Python 3.9+, use embl_link.removeprefix('ontologies')
                embl_link = embl_link[len('ontologies'):]
        except AttributeError:
            continue
        if embl_abbr in ontol_interest:
            ontol_dic[embl_abbr] = (embl_name, embl_link)
    # Warn, but continue, if not all ontologies of interest from definitions.py were found
    not_found = set(ontol_interest).difference(set(ontol_dic.keys()))
    if not_found:
        if len(not_found) == 1:
            logging.warning('Did not find ontology: ' + ', '.join(not_found))
        else:
            logging.warning('Did not find ontologies: ' + ', '.join(not_found))
    if not ontol_dic:
        sys.exit('Zero ontologies found from user-given list')
    return ontol_dic
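
# Shape sketch of the returned mapping, abbreviation -> (name, download link)
# (the entry below is illustrative, not a real OLS listing):
#   {'FOODON': ('Food Ontology', '/foodon/download')}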

def _check_make(db_file, remake_cache, ontol_interest):
    '''Check if database file should be remade'''
    if os.path.exists(db_file) and not remake_cache:
        if os.path.exists(os.path.join(owl_dir, 'cached_ontologies.pickle')):
            if ontol_interest == _pickle_load(os.path.join(owl_dir, 'cached_ontologies.pickle')):
                return False
    try:
        os.remove(db_file)
    except FileNotFoundError:
        pass
    return True
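
# Illustrative call (a sketch; 'onto.db' is a hypothetical path):
#   if _check_make('onto.db', remake_cache=False, ontol_interest=['FOODON']):
#       pass  # (re)build the database here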

def _db_insert(db_cursor, table_name, key_term, val_term):
    '''Insert new data into a database table'''
    if key_term.strip() == val_term.strip() or key_term.strip() == '' or val_term.strip() == '':
        return
    db_cursor.execute(f"INSERT OR IGNORE INTO {table_name} VALUES (:key,:value)",
                      {'key': key_term.strip(), 'value': val_term.strip()})
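
# Illustrative usage against an in-memory database (a sketch, not part of the pipeline):
#   conn = sqlite3.connect(':memory:')
#   c = conn.cursor()
#   c.execute("CREATE TABLE label_synonyms (key TEXT, value TEXT, UNIQUE(key, value))")
#   _db_insert(c, 'label_synonyms', 'chicken breast', 'chicken')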

def _get_imports(file_handle):
    '''Find required ontology imports; append any new import patterns to pattern_strs'''
    pattern_strs = []
    pattern_strs.append(r'<owl:imports rdf:resource="&obo;(.*)"/>')
    pattern_strs.append(r'<owl:imports rdf:resource="(http://.*)"/>')
    whole_file = str(file_handle.read())
    import_match = []
    for patt_str in pattern_strs:
        import_match = re.findall(patt_str, whole_file)
        if import_match != []:
            import_match = [x if re.search('^http:', x) else purl_link + x for x in import_match]
            break
    return import_match
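
# Sketch of a possible return value (the URL is illustrative):
#   ['http://purl.obolibrary.org/obo/bfo.owl']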

def _section_file(file_handle, break_pattern,
                  stt_at=' // Classes', end_at=' // Annotations'):
    '''Break OWL files into readable sections for each ontology accession'''
    whole_file = str(file_handle.read())
    if stt_at != '' and re.search(stt_at, whole_file):
        whole_file = ''.join(whole_file.split(stt_at)[1:])
    if end_at != '' and re.search(end_at, whole_file):
        whole_file = ''.join(whole_file.split(end_at)[:-1])
    file_sections = whole_file.split(break_pattern)
    return file_sections[1:-1]
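
# Illustrative call on an open OWL file, as used below in get_resource_ids():
#   owl_sections = _section_file(OWL_file, '<!-- ' + purl_link)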

def _labels_synonyms(obo_list, have_label=False):
    '''Identify labels, ids and exact ontology synonyms'''
    obo_ids = []
    id_pattern = r'(\w+) -->'
    lab_pattern = r'<rdfs:label[\s\S]*?>([\s\S]+?)</rdfs:label>'
    syn_pattern = r'<oboInOwl:hasExactSynonym[\s\S]*?>(.*?)</oboInOwl:hasExactSynonym'
    for obo_string in obo_list:
        obo_id = re.search(id_pattern, obo_string).group(1)
        # Skip ids that are not formatted as expected, as well as placeholders,
        # obsolete terms and APOLLO:SV terms
        if re.search('APOLLO:SV', obo_id) or re.search('APOLLO_SV', obo_id):
            continue
        elif obo_id in missing_ontol_labels:
            continue
        elif re.search(':', obo_id) or re.search('_', obo_id):
            try:
                obo_label = re.findall(lab_pattern, obo_string)[-1]
                if re.search('^obsolete:', obo_label):
                    continue
            except IndexError:
                obo_label = ''
            obo_synonyms = re.findall(syn_pattern, obo_string)
            obo_ids.append([str(obo_id.lstrip('_')), str(obo_label), obo_synonyms])
    if have_label:
        obo_ids = [x for x in obo_ids if x[1] != '']
    return obo_ids
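
# Shape sketch of one returned entry, [id, label, synonyms]
# (values are invented placeholders, not actual ontology content):
#   ['FOODON_12345678', 'example label', ['exact synonym of label']]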

def get_synonyms(remake_cache, ontol_interest):
    '''Create database of predefined synonyms'''
    os.makedirs(owl_dir, exist_ok=True)
    if not _check_make(synonym_db, remake_cache, ontol_interest):
        return

    conn = sqlite3.connect(synonym_db)
    c = conn.cursor()
    c.execute("CREATE TABLE label_synonyms (key TEXT, value TEXT, UNIQUE(key, value))")
    print('{:<40}'.format('\tRetrieving predefined synonyms'), end='\r')
    with open('lexmapr/predefined_resources/resource_synonyms.csv', 'r') as IN_file:
        for read_line in IN_file.readlines():
            split_line = read_line.split(',')
            try:
                ontol_name = split_line[2].split('_')[0].upper()
            except IndexError:
                ontol_name = ontol_interest[0]
            if ontol_name in ontol_interest:
                _db_insert(c, 'label_synonyms', punctuation_treatment(split_line[0]),
                           punctuation_treatment(split_line[1]))
    conn.commit()
    with open('lexmapr/predefined_resources/SynLex.csv', 'r') as IN_file:
        IN_file.readline()  # Skip the header line
        for read_line in IN_file.readlines():
            split_line = read_line.split(',')
            try:
                ontol_name = split_line[2].split('_')[0].upper().strip()
            except IndexError:
                ontol_name = ontol_interest[0]
            if ontol_name in ontol_interest:
                _db_insert(c, 'label_synonyms', punctuation_treatment(split_line[0]),
                           punctuation_treatment(split_line[1]))
    conn.commit()
    conn.close()
    return
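
# Typical call (a sketch; the ontology list is an assumption):
#   get_synonyms(remake_cache=False, ontol_interest=['FOODON'])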

def get_resource_ids(remake_cache, ontol_interest):
    '''Create database of online resources and update synonym database'''
    if not _check_make(ontol_db, remake_cache, ontol_interest):
        return

    conn = sqlite3.connect(ontol_db)
    c = conn.cursor()
    c.execute("CREATE TABLE non_standard_resource_ids (key TEXT, value TEXT, UNIQUE(key))")
    c.execute("CREATE TABLE standard_resource_labels (key TEXT, value TEXT, UNIQUE(key, value))")
    c.execute("""CREATE TABLE standard_resource_permutations (key TEXT, value TEXT,
                 UNIQUE(key, value))""")
    sonn = sqlite3.connect(synonym_db)
    s = sonn.cursor()

    ontol_dic = _get_ontols(ontol_interest)
    ontol_urls = [purl_link + x[1] for x in ontol_dic.values()]
    for ontol_name in ontol_dic:
        print('{:<40}'.format('\tRetrieving ' + ontol_name[:15] + ' terms'), end='\r')
        no_label = set()
        found_label = set()

        if not os.path.isfile(os.path.join(owl_dir, ontol_name + '.owl')):
            owl_download = requests.get(embl_ontologies + ontol_dic[ontol_name][1])
            with open(os.path.join(owl_dir, ontol_name + '.owl'), 'w', encoding='utf-8') as O_file:
                O_file.write(owl_download.content.decode('utf-8'))

        with open(os.path.join(owl_dir, ontol_name + '.owl'), 'r', encoding='utf-8') as OWL_file:
            owl_sections = _section_file(OWL_file, '<!-- ' + purl_link)
            obo_ids = _labels_synonyms(owl_sections)
            OWL_file.seek(0)
            import_urls = [x for x in _get_imports(OWL_file) if x not in ontol_urls]

        for imp_url in import_urls:
            imp_download = requests.get(imp_url)
            imp_file = os.path.join(owl_dir, imp_url.split('/')[-1])
            with open(imp_file, 'w', encoding='utf-8') as T_file:
                T_file.write(imp_download.content.decode('utf-8'))
            # Different imports may share a file name, so write then reread each import file
            with open(imp_file, 'r', encoding='utf-8') as T_file:
                owl_sections = _section_file(T_file, '<!-- ' + purl_link, stt_at='', end_at='')
                obo_ids.extend(_labels_synonyms(owl_sections, have_label=True))
            os.remove(imp_file)

        # Track which ids lack labels; an id counts as labeled if any duplicate entry has one
        for obo_entry in obo_ids:
            if obo_entry[1] == '':
                no_label.add(obo_entry[0])
            else:
                found_label.add(obo_entry[0])
        no_label_found = list(no_label.difference(found_label))
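
        # Permutation sketch (assuming token-order permutations from
        # get_resource_label_permutations; values are illustrative):
        #   'chicken breast' -> 'chicken breast', 'breast chicken'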
        for ontol_term in obo_ids:
            if ontol_term[0] in no_label_found:
                # Use ontology_reasoner to try to resolve a missing label
                ontol_acc = ontr.Ontology_accession(':' + ontol_term[0])
                if ontol_acc.label != 'unk':
                    if not re.search('^obsolete:', ontol_acc.label):
                        ontol_term[1] = ontol_acc.label
                time.sleep(0.05)
            if ontol_term[1] == '':
                continue
            ontol_label = punctuation_treatment(ontol_term[1])
            _db_insert(c, 'non_standard_resource_ids', ontol_term[0], ontol_term[1])
            _db_insert(c, 'standard_resource_labels', ontol_label, ontol_term[0])
            # Note from v 0.7: to limit performance overhead, ignore resource labels with
            # seven or more tokens, as permutating too many tokens can be costly
            num_tokens = len(word_tokenize(ontol_label))
            if 1 < num_tokens < 7:
                if not re.search('NCBITaxon', ontol_term[0]):
                    for permutation in get_resource_label_permutations(ontol_label):
                        _db_insert(c, 'standard_resource_permutations', permutation, ontol_term[0])
                # Add abbreviated binomials for two-token NCBITaxon labels, e.g.
                # 'homo sapiens' -> 'h sapiens'; TODO: may get wrong combinations?
                elif num_tokens == 2:
                    bi_name = ontol_label.split()
                    _db_insert(c, 'standard_resource_permutations',
                               bi_name[0][0] + ' ' + bi_name[1], ontol_term[0])
            for syn_term in ontol_term[2]:
                _db_insert(s, 'label_synonyms', punctuation_treatment(str(syn_term)), ontol_label)
        conn.commit()
        sonn.commit()

    conn.close()
    sonn.close()
    _pickle_save(ontol_interest, os.path.join(owl_dir, 'cached_ontologies.pickle'))
    return
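
# Typical build sequence (a sketch; argument values are assumptions):
#   get_synonyms(remake_cache=False, ontol_interest=['FOODON'])
#   get_resource_ids(remake_cache=False, ontol_interest=['FOODON'])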