kkonganti@0: """Builds SQLite3 databases"""
kkonganti@0:
kkonganti@0: import logging, os, pickle, re, requests, sqlite3, sys, time
kkonganti@0: import lexmapr.ontology_reasoner as ontr
kkonganti@0: from nltk.tokenize import word_tokenize
kkonganti@0: from lexmapr.pipeline_helpers import punctuation_treatment
kkonganti@0: from lexmapr.definitions import embl_ontologies, synonym_db, ontol_db
kkonganti@0: from lexmapr.definitions import owl_dir, purl_link, missing_ontol_labels
kkonganti@0: from lexmapr.pipeline_resources import get_resource_label_permutations
kkonganti@0:
kkonganti@0: logging.getLogger('requests').setLevel(logging.WARNING)
kkonganti@0: logging.getLogger('urllib3').setLevel(logging.WARNING)
kkonganti@0:
kkonganti@0:
kkonganti@0: # TODO: might replace pickle with ujson
kkonganti@0: def _pickle_save(data_to_save, file_path):
kkonganti@0: '''Write a pickle file'''
kkonganti@0: with open(file_path,'wb') as SAVE_file:
kkonganti@0: pickle.dump(data_to_save, SAVE_file)
kkonganti@0:
kkonganti@0:
kkonganti@0: def _pickle_load(file_path):
kkonganti@0: '''Read a pickle file'''
kkonganti@0: with open(file_path,'rb') as LOAD_file:
kkonganti@0: return(pickle.load(LOAD_file))
kkonganti@0:
kkonganti@0:
def _get_ontols(ontol_interest):
    '''Obtain download URLs for the ontologies of interest.

    Scrapes the EMBL-EBI ontology listing page (``embl_ontologies``) and
    returns a dict mapping ontology abbreviation -> (full name, download
    link). Logs a warning for requested ontologies that are not found and
    exits the program if none are found at all.

    :param ontol_interest: iterable of ontology abbreviations to look for
    :return: dict of {abbreviation: (name, download link)}
    '''
    ontol_dic = {}
    embl_resp = requests.get(embl_ontologies)
    # NOTE(review): the original table-row pattern was destroyed by a
    # markup-stripping pass in this copy; reconstructed as one <tr>...</tr>
    # block per ontology — confirm against the live page markup.
    resp_blocks = re.findall(r'<tr>([\s\S]+?)</tr>', embl_resp.content.decode('utf-8'))
    for resp_block in resp_blocks:
        try:
            embl_abbr = re.search(r'class="ontology-source">([\s\S]+?)<', resp_block).group(1)
            # NOTE(review): anchor pattern reconstructed (original was stripped);
            # presumably the ontology name is the text of a link — verify.
            embl_name = re.search(r'<a [^>]*?>([\s\S]+?)</a>', resp_block).group(1)
            embl_link = re.search(r'href="(\S+)">Download', resp_block).group(1)
            # Download links on the listing page may be relative
            if embl_link.startswith('ontologies'):
                embl_link = embl_link[len('ontologies'):]
                # TODO: with Python 3.9+ use embl_link.removeprefix('ontologies')
        except AttributeError:
            # Row lacked one of the three fields; not an ontology row
            continue
        if embl_abbr in ontol_interest:
            ontol_dic[embl_abbr] = (embl_name, embl_link)
    # Warn about, but continue without, any requested ontologies not found
    not_found = set(ontol_interest).difference(ontol_dic)
    if not_found:
        noun = 'ontology' if len(not_found) == 1 else 'ontologies'
        logging.warning(f'Did not find {noun}: ' + ', '.join(not_found))
    if not ontol_dic:
        sys.exit('Zero ontologies found from user-given list')
    return ontol_dic
kkonganti@0:
kkonganti@0:
def _check_make(db_file, remake_cache, ontol_interest):
    '''Check if database file should be remade.

    Returns False (no rebuild) only when the database file exists, the user
    did not ask for a cache remake, and the cached ontology list matches
    ``ontol_interest``. Otherwise removes any stale database file and
    returns True (rebuild needed).
    '''
    cached_list = os.path.join(owl_dir, 'cached_ontologies.pickle')
    if os.path.exists(db_file) and remake_cache == False:
        if os.path.exists(cached_list):
            if ontol_interest == _pickle_load(cached_list):
                return False
    # Stale or mismatched cache: discard the old database if present
    try:
        os.remove(db_file)
    except FileNotFoundError:
        pass
    return True
kkonganti@0:
kkonganti@0:
kkonganti@0: def _db_insert(db_cursor, table_name, key_term, val_term):
kkonganti@0: '''Insert new data into a database table'''
kkonganti@0: if key_term.strip()==val_term.strip() or key_term.strip()=='' or val_term.strip()=='':
kkonganti@0: return
kkonganti@0: db_cursor.execute(f"INSERT OR IGNORE INTO {table_name} VALUES (:key,:value)",
kkonganti@0: {'key':key_term.strip(), 'value':val_term.strip()})
kkonganti@0:
kkonganti@0:
kkonganti@0: def _get_imports(file_handle):
kkonganti@0: '''Check for required imports; append any new patterns to pattern_strs'''
kkonganti@0: pattern_strs = []
kkonganti@0: pattern_strs.append('')
kkonganti@0: pattern_strs.append('')
kkonganti@0: whole_file = str(file_handle.read())
kkonganti@0: for patt_str in pattern_strs:
kkonganti@0: import_match = re.findall(patt_str, whole_file)
kkonganti@0: if import_match != []:
kkonganti@0: import_match = [x if re.search('^http:',x) else purl_link+x for x in import_match]
kkonganti@0: break
kkonganti@0: return(import_match)
kkonganti@0:
kkonganti@0:
kkonganti@0: def _section_file(file_handle, break_pattern,
kkonganti@0: stt_at=' // Classes', end_at=' // Annotations'):
kkonganti@0: '''Break OWL files into readable sections for each ontology accession'''
kkonganti@0: whole_file = str(file_handle.read())
kkonganti@0: if stt_at != '':
kkonganti@0: if re.search(stt_at, whole_file):
kkonganti@0: whole_file = ''.join(whole_file.split(stt_at)[1:])
kkonganti@0: if end_at != '':
kkonganti@0: if re.search(end_at, whole_file):
kkonganti@0: whole_file = ''.join(whole_file.split(end_at)[:-1])
kkonganti@0: file_sections = whole_file.split(break_pattern)
kkonganti@0: return(file_sections[1:-1])
kkonganti@0:
kkonganti@0:
# NOTE(review): this entire region is corrupted. A markup-stripping pass ate
# everything between the '<' inside syn_pattern below and a '>' many lines
# later, deleting the remainder of _labels_synonyms AND the header of the
# database-building function whose tail follows (conn/sonn appear to be the
# ontology/synonym DB connections, c/s their cursors — TODO confirm against
# version control). Recover the original file before editing this code.
def _labels_synonyms(obo_list, have_label=False):
    '''Identify labels, ids and exact ontology synonyms'''
    obo_ids = []
    for obo_string in obo_list:
        # Accession id, e.g. from an XML comment ending in '-->'
        id_pattern = '(\w+) -->'
        # rdfs:label text; opening tag was stripped — pattern is broken as-is
        lab_pattern = '\([\s\S]+?)\<\/rdfs:label\>'
        # Truncated mid-literal: the original synonym pattern and several
        # lines of code are missing between here and the next line.
        syn_pattern = '\(.*?)\ 1:
            # Permutations are skipped for NCBITaxon terms (taxon names)
            if not re.search('NCBITaxon', ontol_term[0]):
                for permutation in get_resource_label_permutations(ontol_label):
                    _db_insert(c,'standard_resource_permutations',permutation,ontol_term[0])
        # Add abbreviated binomials from NCBITaxons; TODO: may get wrong combinations?
            elif len(word_tokenize(ontol_label)) == 2:
                # e.g. 'Homo sapiens' -> 'H sapiens'
                bi_name = ontol_label.split()
                _db_insert(c, 'standard_resource_permutations',
                           bi_name[0][0]+' '+bi_name[1], ontol_term[0])
            # Exact synonyms map back to the primary label in the synonym DB
            for syn_term in ontol_term[2]:
                _db_insert(s,'label_synonyms',punctuation_treatment(str(syn_term)),ontol_label)
        conn.commit()
        sonn.commit()

    conn.close()
    sonn.close()
    # Record which ontologies were cached so _check_make can skip rebuilds
    _pickle_save(ontol_interest, os.path.join(owl_dir,'cached_ontologies.pickle'))
    return