cstrittmatter@0: """Builds SQLite3 databases"""
cstrittmatter@0:
cstrittmatter@0: import logging, os, pickle, re, requests, sqlite3, sys, time
cstrittmatter@0: import lexmapr.ontology_reasoner as ontr
cstrittmatter@0: from nltk.tokenize import word_tokenize
cstrittmatter@0: from lexmapr.pipeline_helpers import punctuation_treatment
cstrittmatter@0: from lexmapr.definitions import embl_ontologies, synonym_db, ontol_db
cstrittmatter@0: from lexmapr.definitions import owl_dir, purl_link, missing_ontol_labels
cstrittmatter@0: from lexmapr.pipeline_resources import get_resource_label_permutations
cstrittmatter@0:
cstrittmatter@0: logging.getLogger('requests').setLevel(logging.WARNING)
cstrittmatter@0: logging.getLogger('urllib3').setLevel(logging.WARNING)
cstrittmatter@0:
cstrittmatter@0:
cstrittmatter@0: # TODO: might replace pickle with ujson
def _pickle_save(data_to_save, file_path):
    '''Serialize an object to disk as a pickle file

    data_to_save -- any picklable Python object
    file_path    -- destination path; created or overwritten, opened in binary mode
    '''
    with open(file_path, 'wb') as out_handle:
        # Pickler(...).dump(obj) is the long form of pickle.dump(obj, file)
        pickle.Pickler(out_handle).dump(data_to_save)
cstrittmatter@0:
cstrittmatter@0:
def _pickle_load(file_path):
    '''Deserialize and return the object stored in a pickle file

    file_path -- path of an existing pickle file, opened in binary mode

    NOTE: unpickling executes whatever the file encodes, so only files
    written by this program should be passed here.
    '''
    with open(file_path, 'rb') as in_handle:
        restored = pickle.Unpickler(in_handle).load()
    return restored
cstrittmatter@0:
cstrittmatter@0:
cstrittmatter@0: def _get_ontols(ontol_interest):
cstrittmatter@0: '''Obtain URLs for ontologies of interest'''
cstrittmatter@0: ontol_dic = {}
cstrittmatter@0: embl_resp = requests.get(embl_ontologies)
cstrittmatter@0: resp_blocks = re.findall('
([\s\S]+?)
',embl_resp.content.decode('utf-8'))
cstrittmatter@0: for resp_block in resp_blocks:
cstrittmatter@0: try:
cstrittmatter@0: embl_abbr = re.search('class=\"ontology-source\">([\s\S]+?)<', resp_block).group(1)
cstrittmatter@0: embl_name = re.search('([\s\S]+?)', resp_block).group(1)
cstrittmatter@0: embl_link = re.search('href=\"(\S+)\">Download', resp_block).group(1)
cstrittmatter@0: if embl_link.startswith('ontologies'):
cstrittmatter@0: embl_link = embl_link[len('ontologies'):]
cstrittmatter@0: # TODO: with Python 3.9- embl_link.removeprefix('ontologies')
cstrittmatter@0: except(AttributeError):
cstrittmatter@0: continue
cstrittmatter@0: if embl_abbr in ontol_interest:
cstrittmatter@0: ontol_dic[embl_abbr] = (embl_name, embl_link)
cstrittmatter@0: # Continue if not find all ontologies of interest specified in definitions.py
cstrittmatter@0: not_found = set(ontol_interest).difference(set(ontol_dic.keys()))
cstrittmatter@0: if not_found:
cstrittmatter@0: if len(not_found) == 1:
cstrittmatter@0: logging.warning(f'Did not find ontology: ' + ', '.join(not_found))
cstrittmatter@0: else:
cstrittmatter@0: logging.warning(f'Did not find ontologies: ' + ', '.join(not_found))
cstrittmatter@0: if ontol_dic == {}:
cstrittmatter@0: sys.exit('Zero ontologies found from user-given list')
cstrittmatter@0: return(ontol_dic)
cstrittmatter@0:
cstrittmatter@0:
def _check_make(db_file, remake_cache, ontol_interest):
    '''Decide whether the database file has to be rebuilt

    Returns False only when the database file exists, remake_cache equals
    False, the cached ontology list exists in owl_dir and it equals
    ontol_interest. In every other case any existing database file is
    deleted and True is returned.
    '''
    cache_file = os.path.join(owl_dir, 'cached_ontologies.pickle')
    # Short-circuit order matters: the pickle is only read once the cheaper
    # existence checks have all passed.
    still_valid = (os.path.exists(db_file)
                   and remake_cache == False
                   and os.path.exists(cache_file)
                   and ontol_interest == _pickle_load(cache_file))
    if still_valid:
        return False

    try:
        os.remove(db_file)
    except FileNotFoundError:
        # Nothing to delete; the database simply has not been built yet
        pass
    return True
cstrittmatter@0:
cstrittmatter@0:
def _db_insert(db_cursor, table_name, key_term, val_term):
    '''Add one key/value row to a database table

    Both terms are stripped of surrounding whitespace first. Nothing is
    written when either stripped term is empty or when the two are equal.
    Rows the table rejects are silently skipped (INSERT OR IGNORE).
    '''
    clean_key = key_term.strip()
    clean_val = val_term.strip()
    if clean_key == clean_val or clean_key == '' or clean_val == '':
        return
    # The table name cannot be bound as a parameter, so it is interpolated;
    # the values themselves are passed as named parameters.
    insert_sql = f"INSERT OR IGNORE INTO {table_name} VALUES (:key,:value)"
    db_cursor.execute(insert_sql, {'key': clean_key, 'value': clean_val})
cstrittmatter@0:
cstrittmatter@0:
def _get_imports(file_handle):
    '''Check for required imports; append any new patterns to pattern_strs

    Reads the whole of file_handle, then tries each pattern in pattern_strs
    in order. The first pattern that yields any match wins: its matches are
    returned, with purl_link prepended to every match that does not start
    with "http:". If no pattern matches, the (empty) result of the last
    pattern is returned.
    '''
    pattern_strs = []
    # NOTE(review): both patterns are empty strings in this copy of the file;
    # presumably the original markup patterns were lost -- confirm against the
    # upstream source before relying on this function.
    pattern_strs.append('')
    pattern_strs.append('')
    whole_file = str(file_handle.read())
    for patt_str in pattern_strs:
        import_match = re.findall(patt_str, whole_file)
        if import_match != []:
            # Matches that do not begin with "http:" get purl_link prepended
            import_match = [x if re.search('^http:',x) else purl_link+x for x in import_match]
            break
    return(import_match)
cstrittmatter@0:
cstrittmatter@0:
def _section_file(file_handle, break_pattern,
                  stt_at=' // Classes', end_at=' // Annotations'):
    '''Break OWL files into readable sections for each ontology accession

    file_handle   -- open file object; its whole content is read
    break_pattern -- literal text separating one section from the next
    stt_at        -- literal marker; text up to and including its first
                     occurrence is dropped ('' disables the trim)
    end_at        -- literal marker; text from its last occurrence onward
                     is dropped ('' disables the trim)

    Returns the list of sections between the first and last break_pattern;
    the leading and trailing fragments are discarded.
    '''
    whole_file = str(file_handle.read())
    # The markers are cut with str.split, i.e. as plain text, so they are
    # also detected as plain text. Detecting them with a regex could match
    # where no literal marker exists (emptying the text) or raise re.error
    # on markers containing characters such as "(" or "+".
    if stt_at != '' and stt_at in whole_file:
        whole_file = ''.join(whole_file.split(stt_at)[1:])
    if end_at != '' and end_at in whole_file:
        whole_file = ''.join(whole_file.split(end_at)[:-1])
    file_sections = whole_file.split(break_pattern)
    # First and last pieces are header/footer fragments, not full sections
    return file_sections[1:-1]
cstrittmatter@0:
cstrittmatter@0:
cstrittmatter@0: def _labels_synonyms(obo_list, have_label=False):
cstrittmatter@0: '''Identify labels, ids and exact ontology synonyms'''
cstrittmatter@0: obo_ids = []
cstrittmatter@0: for obo_string in obo_list:
cstrittmatter@0: id_pattern = '(\w+) -->'
cstrittmatter@0: lab_pattern = '\([\s\S]+?)\<\/rdfs:label\>'
cstrittmatter@0: syn_pattern = '\(.*?)\ 1:
cstrittmatter@0: if not re.search('NCBITaxon', ontol_term[0]):
cstrittmatter@0: for permutation in get_resource_label_permutations(ontol_label):
cstrittmatter@0: _db_insert(c,'standard_resource_permutations',permutation,ontol_term[0])
cstrittmatter@0: # Add abbreviated binomials from NCBITaxons; TODO: may get wrong combinations?
cstrittmatter@0: elif len(word_tokenize(ontol_label)) == 2:
cstrittmatter@0: bi_name = ontol_label.split()
cstrittmatter@0: _db_insert(c, 'standard_resource_permutations',
cstrittmatter@0: bi_name[0][0]+' '+bi_name[1], ontol_term[0])
cstrittmatter@0: for syn_term in ontol_term[2]:
cstrittmatter@0: _db_insert(s,'label_synonyms',punctuation_treatment(str(syn_term)),ontol_label)
cstrittmatter@0: conn.commit()
cstrittmatter@0: sonn.commit()
cstrittmatter@0:
cstrittmatter@0: conn.close()
cstrittmatter@0: sonn.close()
cstrittmatter@0: _pickle_save(ontol_interest, os.path.join(owl_dir,'cached_ontologies.pickle'))
cstrittmatter@0: return