cstrittmatter@0: """Builds SQLite3 databases""" cstrittmatter@0: cstrittmatter@0: import logging, os, pickle, re, requests, sqlite3, sys, time cstrittmatter@0: import lexmapr.ontology_reasoner as ontr cstrittmatter@0: from nltk.tokenize import word_tokenize cstrittmatter@0: from lexmapr.pipeline_helpers import punctuation_treatment cstrittmatter@0: from lexmapr.definitions import embl_ontologies, synonym_db, ontol_db cstrittmatter@0: from lexmapr.definitions import owl_dir, purl_link, missing_ontol_labels cstrittmatter@0: from lexmapr.pipeline_resources import get_resource_label_permutations cstrittmatter@0: cstrittmatter@0: logging.getLogger('requests').setLevel(logging.WARNING) cstrittmatter@0: logging.getLogger('urllib3').setLevel(logging.WARNING) cstrittmatter@0: cstrittmatter@0: cstrittmatter@0: # TODO: might replace pickle with ujson cstrittmatter@0: def _pickle_save(data_to_save, file_path): cstrittmatter@0: '''Write a pickle file''' cstrittmatter@0: with open(file_path,'wb') as SAVE_file: cstrittmatter@0: pickle.dump(data_to_save, SAVE_file) cstrittmatter@0: cstrittmatter@0: cstrittmatter@0: def _pickle_load(file_path): cstrittmatter@0: '''Read a pickle file''' cstrittmatter@0: with open(file_path,'rb') as LOAD_file: cstrittmatter@0: return(pickle.load(LOAD_file)) cstrittmatter@0: cstrittmatter@0: cstrittmatter@0: def _get_ontols(ontol_interest): cstrittmatter@0: '''Obtain URLs for ontologies of interest''' cstrittmatter@0: ontol_dic = {} cstrittmatter@0: embl_resp = requests.get(embl_ontologies) cstrittmatter@0: resp_blocks = re.findall('([\s\S]+?)',embl_resp.content.decode('utf-8')) cstrittmatter@0: for resp_block in resp_blocks: cstrittmatter@0: try: cstrittmatter@0: embl_abbr = re.search('class=\"ontology-source\">([\s\S]+?)<', resp_block).group(1) cstrittmatter@0: embl_name = re.search('([\s\S]+?)', resp_block).group(1) cstrittmatter@0: embl_link = re.search('href=\"(\S+)\">Download', resp_block).group(1) cstrittmatter@0: if embl_link.startswith('ontologies'): cstrittmatter@0: embl_link = embl_link[len('ontologies'):] cstrittmatter@0: # TODO: with Python 3.9- embl_link.removeprefix('ontologies') cstrittmatter@0: except(AttributeError): cstrittmatter@0: continue cstrittmatter@0: if embl_abbr in ontol_interest: cstrittmatter@0: ontol_dic[embl_abbr] = (embl_name, embl_link) cstrittmatter@0: # Continue if not find all ontologies of interest specified in definitions.py cstrittmatter@0: not_found = set(ontol_interest).difference(set(ontol_dic.keys())) cstrittmatter@0: if not_found: cstrittmatter@0: if len(not_found) == 1: cstrittmatter@0: logging.warning(f'Did not find ontology: ' + ', '.join(not_found)) cstrittmatter@0: else: cstrittmatter@0: logging.warning(f'Did not find ontologies: ' + ', '.join(not_found)) cstrittmatter@0: if ontol_dic == {}: cstrittmatter@0: sys.exit('Zero ontologies found from user-given list') cstrittmatter@0: return(ontol_dic) cstrittmatter@0: cstrittmatter@0: cstrittmatter@0: def _check_make(db_file, remake_cache, ontol_interest): cstrittmatter@0: '''Check if database file should be remade''' cstrittmatter@0: if os.path.exists(db_file) and remake_cache == False: cstrittmatter@0: if os.path.exists(os.path.join(owl_dir, 'cached_ontologies.pickle')): cstrittmatter@0: if ontol_interest == _pickle_load(os.path.join(owl_dir, 'cached_ontologies.pickle')): cstrittmatter@0: return(False) cstrittmatter@0: try: cstrittmatter@0: os.remove(db_file) cstrittmatter@0: except(FileNotFoundError): cstrittmatter@0: pass cstrittmatter@0: return(True) cstrittmatter@0: cstrittmatter@0: cstrittmatter@0: def _db_insert(db_cursor, table_name, key_term, val_term): cstrittmatter@0: '''Insert new data into a database table''' cstrittmatter@0: if key_term.strip()==val_term.strip() or key_term.strip()=='' or val_term.strip()=='': cstrittmatter@0: return cstrittmatter@0: db_cursor.execute(f"INSERT OR IGNORE INTO {table_name} VALUES (:key,:value)", cstrittmatter@0: {'key':key_term.strip(), 'value':val_term.strip()}) cstrittmatter@0: cstrittmatter@0: cstrittmatter@0: def _get_imports(file_handle): cstrittmatter@0: '''Check for required imports; append any new patterns to pattern_strs''' cstrittmatter@0: pattern_strs = [] cstrittmatter@0: pattern_strs.append('') cstrittmatter@0: pattern_strs.append('') cstrittmatter@0: whole_file = str(file_handle.read()) cstrittmatter@0: for patt_str in pattern_strs: cstrittmatter@0: import_match = re.findall(patt_str, whole_file) cstrittmatter@0: if import_match != []: cstrittmatter@0: import_match = [x if re.search('^http:',x) else purl_link+x for x in import_match] cstrittmatter@0: break cstrittmatter@0: return(import_match) cstrittmatter@0: cstrittmatter@0: cstrittmatter@0: def _section_file(file_handle, break_pattern, cstrittmatter@0: stt_at=' // Classes', end_at=' // Annotations'): cstrittmatter@0: '''Break OWL files into readable sections for each ontology accession''' cstrittmatter@0: whole_file = str(file_handle.read()) cstrittmatter@0: if stt_at != '': cstrittmatter@0: if re.search(stt_at, whole_file): cstrittmatter@0: whole_file = ''.join(whole_file.split(stt_at)[1:]) cstrittmatter@0: if end_at != '': cstrittmatter@0: if re.search(end_at, whole_file): cstrittmatter@0: whole_file = ''.join(whole_file.split(end_at)[:-1]) cstrittmatter@0: file_sections = whole_file.split(break_pattern) cstrittmatter@0: return(file_sections[1:-1]) cstrittmatter@0: cstrittmatter@0: cstrittmatter@0: def _labels_synonyms(obo_list, have_label=False): cstrittmatter@0: '''Identify labels, ids and exact ontology synonyms''' cstrittmatter@0: obo_ids = [] cstrittmatter@0: for obo_string in obo_list: cstrittmatter@0: id_pattern = '(\w+) -->' cstrittmatter@0: lab_pattern = '\([\s\S]+?)\<\/rdfs:label\>' cstrittmatter@0: syn_pattern = '\(.*?)\ 1: cstrittmatter@0: if not re.search('NCBITaxon', ontol_term[0]): cstrittmatter@0: for permutation in get_resource_label_permutations(ontol_label): cstrittmatter@0: _db_insert(c,'standard_resource_permutations',permutation,ontol_term[0]) cstrittmatter@0: # Add abbreviated binomials from NCBITaxons; TODO: may get wrong combinations? cstrittmatter@0: elif len(word_tokenize(ontol_label)) == 2: cstrittmatter@0: bi_name = ontol_label.split() cstrittmatter@0: _db_insert(c, 'standard_resource_permutations', cstrittmatter@0: bi_name[0][0]+' '+bi_name[1], ontol_term[0]) cstrittmatter@0: for syn_term in ontol_term[2]: cstrittmatter@0: _db_insert(s,'label_synonyms',punctuation_treatment(str(syn_term)),ontol_label) cstrittmatter@0: conn.commit() cstrittmatter@0: sonn.commit() cstrittmatter@0: cstrittmatter@0: conn.close() cstrittmatter@0: sonn.close() cstrittmatter@0: _pickle_save(ontol_interest, os.path.join(owl_dir,'cached_ontologies.pickle')) cstrittmatter@0: return