kkonganti@0: """Builds SQLite3 databases""" kkonganti@0: kkonganti@0: import logging, os, pickle, re, requests, sqlite3, sys, time kkonganti@0: import lexmapr.ontology_reasoner as ontr kkonganti@0: from nltk.tokenize import word_tokenize kkonganti@0: from lexmapr.pipeline_helpers import punctuation_treatment kkonganti@0: from lexmapr.definitions import embl_ontologies, synonym_db, ontol_db kkonganti@0: from lexmapr.definitions import owl_dir, purl_link, missing_ontol_labels kkonganti@0: from lexmapr.pipeline_resources import get_resource_label_permutations kkonganti@0: kkonganti@0: logging.getLogger('requests').setLevel(logging.WARNING) kkonganti@0: logging.getLogger('urllib3').setLevel(logging.WARNING) kkonganti@0: kkonganti@0: kkonganti@0: # TODO: might replace pickle with ujson kkonganti@0: def _pickle_save(data_to_save, file_path): kkonganti@0: '''Write a pickle file''' kkonganti@0: with open(file_path,'wb') as SAVE_file: kkonganti@0: pickle.dump(data_to_save, SAVE_file) kkonganti@0: kkonganti@0: kkonganti@0: def _pickle_load(file_path): kkonganti@0: '''Read a pickle file''' kkonganti@0: with open(file_path,'rb') as LOAD_file: kkonganti@0: return(pickle.load(LOAD_file)) kkonganti@0: kkonganti@0: kkonganti@0: def _get_ontols(ontol_interest): kkonganti@0: '''Obtain URLs for ontologies of interest''' kkonganti@0: ontol_dic = {} kkonganti@0: embl_resp = requests.get(embl_ontologies) kkonganti@0: resp_blocks = re.findall('([\s\S]+?)',embl_resp.content.decode('utf-8')) kkonganti@0: for resp_block in resp_blocks: kkonganti@0: try: kkonganti@0: embl_abbr = re.search('class=\"ontology-source\">([\s\S]+?)<', resp_block).group(1) kkonganti@0: embl_name = re.search('([\s\S]+?)', resp_block).group(1) kkonganti@0: embl_link = re.search('href=\"(\S+)\">Download', resp_block).group(1) kkonganti@0: if embl_link.startswith('ontologies'): kkonganti@0: embl_link = embl_link[len('ontologies'):] kkonganti@0: # TODO: with Python 3.9- embl_link.removeprefix('ontologies') kkonganti@0: except(AttributeError): kkonganti@0: continue kkonganti@0: if embl_abbr in ontol_interest: kkonganti@0: ontol_dic[embl_abbr] = (embl_name, embl_link) kkonganti@0: # Continue if not find all ontologies of interest specified in definitions.py kkonganti@0: not_found = set(ontol_interest).difference(set(ontol_dic.keys())) kkonganti@0: if not_found: kkonganti@0: if len(not_found) == 1: kkonganti@0: logging.warning(f'Did not find ontology: ' + ', '.join(not_found)) kkonganti@0: else: kkonganti@0: logging.warning(f'Did not find ontologies: ' + ', '.join(not_found)) kkonganti@0: if ontol_dic == {}: kkonganti@0: sys.exit('Zero ontologies found from user-given list') kkonganti@0: return(ontol_dic) kkonganti@0: kkonganti@0: kkonganti@0: def _check_make(db_file, remake_cache, ontol_interest): kkonganti@0: '''Check if database file should be remade''' kkonganti@0: if os.path.exists(db_file) and remake_cache == False: kkonganti@0: if os.path.exists(os.path.join(owl_dir, 'cached_ontologies.pickle')): kkonganti@0: if ontol_interest == _pickle_load(os.path.join(owl_dir, 'cached_ontologies.pickle')): kkonganti@0: return(False) kkonganti@0: try: kkonganti@0: os.remove(db_file) kkonganti@0: except(FileNotFoundError): kkonganti@0: pass kkonganti@0: return(True) kkonganti@0: kkonganti@0: kkonganti@0: def _db_insert(db_cursor, table_name, key_term, val_term): kkonganti@0: '''Insert new data into a database table''' kkonganti@0: if key_term.strip()==val_term.strip() or key_term.strip()=='' or val_term.strip()=='': kkonganti@0: return kkonganti@0: 
    db_cursor.execute(f"INSERT OR IGNORE INTO {table_name} VALUES (:key,:value)",
                      {'key':key_term.strip(), 'value':val_term.strip()})


def _get_imports(file_handle):
    '''Check for required imports; append any new patterns to pattern_strs'''
    pattern_strs = []
    pattern_strs.append('')
    pattern_strs.append('')
    whole_file = str(file_handle.read())
    for patt_str in pattern_strs:
        import_match = re.findall(patt_str, whole_file)
        if import_match != []:
            import_match = [x if re.search('^http:',x) else purl_link+x for x in import_match]
            break
    return(import_match)


def _section_file(file_handle, break_pattern,
                  stt_at=' // Classes', end_at=' // Annotations'):
    '''Break OWL files into readable sections for each ontology accession'''
    whole_file = str(file_handle.read())
    if stt_at != '':
        if re.search(stt_at, whole_file):
            whole_file = ''.join(whole_file.split(stt_at)[1:])
    if end_at != '':
        if re.search(end_at, whole_file):
            whole_file = ''.join(whole_file.split(end_at)[:-1])
    file_sections = whole_file.split(break_pattern)
    return(file_sections[1:-1])


def _labels_synonyms(obo_list, have_label=False):
    '''Identify labels, ids and exact ontology synonyms'''
    obo_ids = []
    for obo_string in obo_list:
        id_pattern = '(\w+) -->'
        # The tag text in the two patterns below is assumed: rdfs:label for term labels
        # and oboInOwl:hasExactSynonym for exact synonyms
        lab_pattern = '\<rdfs:label[\s\S]*?\>([\s\S]+?)\<\/rdfs:label\>'
        syn_pattern = '\<oboInOwl:hasExactSynonym[\s\S]*?\>(.*?)\<\/oboInOwl:hasExactSynonym\>'


            if len(word_tokenize(ontol_label)) > 1:
                if not re.search('NCBITaxon', ontol_term[0]):
                    for permutation in get_resource_label_permutations(ontol_label):
                        _db_insert(c,'standard_resource_permutations',permutation,ontol_term[0])
                # Add abbreviated binomials from NCBITaxons; TODO: may get wrong combinations?
                elif len(word_tokenize(ontol_label)) == 2:
                    bi_name = ontol_label.split()
                    _db_insert(c, 'standard_resource_permutations',
                               bi_name[0][0]+' '+bi_name[1], ontol_term[0])
            for syn_term in ontol_term[2]:
                _db_insert(s,'label_synonyms',punctuation_treatment(str(syn_term)),ontol_label)
        conn.commit()
        sonn.commit()

    conn.close()
    sonn.close()
    _pickle_save(ontol_interest, os.path.join(owl_dir,'cached_ontologies.pickle'))
    return
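

# A minimal usage sketch of the _db_insert helper, not part of the pipeline itself.
# The 'label_synonyms' table name matches the insert calls above, but the two-column
# key/value schema and the UNIQUE constraint used here are assumptions for the demo.
if __name__ == '__main__':
    demo_conn = sqlite3.connect(':memory:')
    demo_cur = demo_conn.cursor()
    # A uniqueness constraint lets the "INSERT OR IGNORE" in _db_insert skip duplicates
    demo_cur.execute('CREATE TABLE label_synonyms (key TEXT, value TEXT, UNIQUE(key, value))')
    _db_insert(demo_cur, 'label_synonyms', 'mungbean', 'mung bean')
    _db_insert(demo_cur, 'label_synonyms', 'mungbean', 'mung bean')  # duplicate, ignored
    _db_insert(demo_cur, 'label_synonyms', 'mungbean', 'mungbean')   # skipped: key equals value
    _db_insert(demo_cur, 'label_synonyms', '', 'mung bean')          # skipped: empty key
    demo_conn.commit()
    print(demo_cur.execute('SELECT * FROM label_synonyms').fetchall())  # [('mungbean', 'mung bean')]
    demo_conn.close()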