diff lexmapr/create_databases.py @ 3:be95a7ce968a tip
"planemo upload"
| author | kkonganti |
|---|---|
| date | Tue, 13 Sep 2022 11:32:24 -0400 |
| parents | 5244e7465767 |
| children | |
```diff
--- a/lexmapr/create_databases.py	Wed Aug 31 14:32:14 2022 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,248 +0,0 @@
-"""Builds SQLite3 databases"""
-
-import logging, os, pickle, re, requests, sqlite3, sys, time
-import lexmapr.ontology_reasoner as ontr
-from nltk.tokenize import word_tokenize
-from lexmapr.pipeline_helpers import punctuation_treatment
-from lexmapr.definitions import embl_ontologies, synonym_db, ontol_db
-from lexmapr.definitions import owl_dir, purl_link, missing_ontol_labels
-from lexmapr.pipeline_resources import get_resource_label_permutations
-
-logging.getLogger('requests').setLevel(logging.WARNING)
-logging.getLogger('urllib3').setLevel(logging.WARNING)
-
-
-# TODO: might replace pickle with ujson
-def _pickle_save(data_to_save, file_path):
-    '''Write a pickle file'''
-    with open(file_path,'wb') as SAVE_file:
-        pickle.dump(data_to_save, SAVE_file)
-
-
-def _pickle_load(file_path):
-    '''Read a pickle file'''
-    with open(file_path,'rb') as LOAD_file:
-        return(pickle.load(LOAD_file))
-
-
-def _get_ontols(ontol_interest):
-    '''Obtain URLs for ontologies of interest'''
-    ontol_dic = {}
-    embl_resp = requests.get(embl_ontologies)
-    resp_blocks = re.findall('<tr>([\s\S]+?)</tr>',embl_resp.content.decode('utf-8'))
-    for resp_block in resp_blocks:
-        try:
-            embl_abbr = re.search('class=\"ontology-source\">([\s\S]+?)<', resp_block).group(1)
-            embl_name = re.search('<a href=\"/ols/ontologies/' + embl_abbr.lower() +\
-                                  '\">([\s\S]+?)</a>', resp_block).group(1)
-            embl_link = re.search('href=\"(\S+)\">Download', resp_block).group(1)
-            if embl_link.startswith('ontologies'):
-                embl_link = embl_link[len('ontologies'):]
-                # TODO: with Python 3.9- embl_link.removeprefix('ontologies')
-        except(AttributeError):
-            continue
-        if embl_abbr in ontol_interest:
-            ontol_dic[embl_abbr] = (embl_name, embl_link)
-    # Continue if not find all ontologies of interest specified in definitions.py
-    not_found = set(ontol_interest).difference(set(ontol_dic.keys()))
-    if not_found:
-        if len(not_found) == 1:
-            logging.warning(f'Did not find ontology: ' + ', '.join(not_found))
-        else:
-            logging.warning(f'Did not find ontologies: ' + ', '.join(not_found))
-    if ontol_dic == {}:
-        sys.exit('Zero ontologies found from user-given list')
-    return(ontol_dic)
-
-
-def _check_make(db_file, remake_cache, ontol_interest):
-    '''Check if database file should be remade'''
-    if os.path.exists(db_file) and remake_cache == False:
-        if os.path.exists(os.path.join(owl_dir, 'cached_ontologies.pickle')):
-            if ontol_interest == _pickle_load(os.path.join(owl_dir, 'cached_ontologies.pickle')):
-                return(False)
-    try:
-        os.remove(db_file)
-    except(FileNotFoundError):
-        pass
-    return(True)
-
-
-def _db_insert(db_cursor, table_name, key_term, val_term):
-    '''Insert new data into a database table'''
-    if key_term.strip()==val_term.strip() or key_term.strip()=='' or val_term.strip()=='':
-        return
-    db_cursor.execute(f"INSERT OR IGNORE INTO {table_name} VALUES (:key,:value)",
-                      {'key':key_term.strip(), 'value':val_term.strip()})
-
-
-def _get_imports(file_handle):
-    '''Check for required imports; append any new patterns to pattern_strs'''
-    pattern_strs = []
-    pattern_strs.append('<owl:imports rdf:resource=\"&obo;(.*)\"/>')
-    pattern_strs.append('<owl:imports rdf:resource=\"(http://.*)\"/>')
-    whole_file = str(file_handle.read())
-    for patt_str in pattern_strs:
-        import_match = re.findall(patt_str, whole_file)
-        if import_match != []:
-            import_match = [x if re.search('^http:',x) else purl_link+x for x in import_match]
-            break
-    return(import_match)
-
-
-def _section_file(file_handle, break_pattern,
-                  stt_at=' // Classes', end_at=' // Annotations'):
-    '''Break OWL files into readable sections for each ontology accession'''
-    whole_file = str(file_handle.read())
-    if stt_at != '':
-        if re.search(stt_at, whole_file):
-            whole_file = ''.join(whole_file.split(stt_at)[1:])
-    if end_at != '':
-        if re.search(end_at, whole_file):
-            whole_file = ''.join(whole_file.split(end_at)[:-1])
-    file_sections = whole_file.split(break_pattern)
-    return(file_sections[1:-1])
-
-
-def _labels_synonyms(obo_list, have_label=False):
-    '''Identify labels, ids and exact ontology synonyms'''
-    obo_ids = []
-    for obo_string in obo_list:
-        id_pattern = '(\w+) -->'
-        lab_pattern = '\<rdfs:label[\s\S]*?\>([\s\S]+?)\<\/rdfs:label\>'
-        syn_pattern = '\<oboInOwl:hasExactSynonym[\s\S]*?\>(.*?)\</oboInOwl:hasExactSynonym'
-        obo_id = re.search(id_pattern, obo_string).group(1)
-        # Do not save ids that are not formatted as expected, or placeholders/obsolete/Apollo:SV
-        if re.search('APOLLO:SV', obo_id) or re.search('APOLLO_SV', obo_id):
-            continue
-        elif obo_id in missing_ontol_labels:
-            continue
-        elif re.search(':', obo_id) or re.search('_', obo_id):
-            try:
-                obo_label = re.findall(lab_pattern, obo_string)[-1]
-                if re.search('^obsolete:', obo_label):
-                    continue
-            except(IndexError):
-                obo_label = ''
-            obo_synonyms = re.findall(syn_pattern, obo_string)
-            obo_ids.append([str(obo_id.lstrip('_')), str(obo_label), obo_synonyms])
-    if have_label:
-        obo_ids = [x for x in obo_ids if x[1] != '']
-    return(obo_ids)
-
-
-def get_synonyms(remake_cache, ontol_interest):
-    '''Create database of predefined synonyms'''
-    os.makedirs(owl_dir, exist_ok=True)
-    if _check_make(synonym_db, remake_cache, ontol_interest) == False:
-        return
-
-    conn = sqlite3.connect(synonym_db)
-    c = conn.cursor()
-    c.execute("CREATE TABLE label_synonyms (key TEXT, value TEXT, UNIQUE(key, value))")
-    print('{:<40}'.format('\tRetrieving predefined synonyms'),end='\r')
-    with open('lexmapr/predefined_resources/resource_synonyms.csv', 'r') as IN_file:
-        for read_line in IN_file.readlines():
-            split_line = read_line.split(',')
-            try:
-                ontol_name = split_line[2].split('_')[0].upper()
-            except(IndexError):
-                ontol_name = ontol_interest[0]
-            if ontol_name in ontol_interest:
-                _db_insert(c, 'label_synonyms', punctuation_treatment(split_line[0]),
-                           punctuation_treatment(split_line[1]))
-    conn.commit()
-    with open('lexmapr/predefined_resources/SynLex.csv', 'r') as IN_file:
-        IN_file.readline()
-        for read_line in IN_file.readlines():
-            split_line = read_line.split(',')
-            try:
-                ontol_name = split_line[2].split('_')[0].upper().strip()
-            except(IndexError):
-                ontol_name = ontol_interest[0]
-            if ontol_name in ontol_interest:
-                _db_insert(c, 'label_synonyms', punctuation_treatment(split_line[0]),
-                           punctuation_treatment(split_line[1]))
-    conn.commit()
-    conn.close()
-    return
-
-
-def get_resource_ids(remake_cache, ontol_interest):
-    '''Create database of online resources and update synonym database'''
-    if _check_make(ontol_db, remake_cache, ontol_interest) == False:
-        return
-
-    conn = sqlite3.connect(ontol_db)
-    c = conn.cursor()
-    c.execute("CREATE TABLE non_standard_resource_ids (key TEXT, value TEXT, UNIQUE(key))")
-    c.execute("CREATE TABLE standard_resource_labels (key TEXT, value TEXT, UNIQUE(key, value))")
-    c.execute("""CREATE TABLE standard_resource_permutations (key TEXT, value TEXT,
-                 UNIQUE(key, value))""")
-    sonn = sqlite3.connect(synonym_db)
-    s = sonn.cursor()
-
-    ontol_dic = _get_ontols(ontol_interest)
-    ontol_urls = [purl_link+x[1] for x in ontol_dic.values()]
-    for ontol_name in ontol_dic:
-        print('{:<40}'.format('\tRetrieving '+ontol_name[:15]+' terms'), end='\r')
-        no_label = set()
-        found_label = set()
-
-        if not os.path.isfile(os.path.join(owl_dir,ontol_name+'.owl')):
-            owl_download = requests.get(embl_ontologies+ontol_dic[ontol_name][1])
-            with open(os.path.join(owl_dir,ontol_name+'.owl'), 'w', encoding='utf-8') as O_file:
-                O_file.write(owl_download.content.decode('utf-8'))
-
-        with open(os.path.join(owl_dir,ontol_name+'.owl'),'r', encoding='utf-8') as OWL_file:
-            owl_sections = _section_file(OWL_file, '<!-- '+purl_link)
-            obo_ids = _labels_synonyms(owl_sections)
-            OWL_file.seek(0)
-            import_urls = [x for x in _get_imports(OWL_file) if x not in ontol_urls]
-
-        for imp_url in import_urls:
-            imp_download = requests.get(imp_url)
-            imp_file = os.path.join(owl_dir,imp_url.split('/')[-1])
-            with open(imp_file, 'w', encoding='utf-8') as T_file:
-                T_file.write(imp_download.content.decode('utf-8'))
-            # Assume possible to get duplicate names for different imports, so rewrite file
-            with open(imp_file, 'r', encoding='utf-8') as T_file:
-                owl_sections = _section_file(T_file, '<!-- '+purl_link, stt_at='', end_at='')
-                obo_ids.extend(_labels_synonyms(owl_sections, have_label=True))
-            os.remove(imp_file)
-        # TODO: more elegant way to find which ids don't have labels and to remove duplicates?
-        [no_label.add(x[0]) if x[1]=='' else found_label.add(x[0]) for x in obo_ids]
-        no_label_found = list(no_label.difference(found_label))
-        for ontol_term in obo_ids:
-            if ontol_term[0] in no_label_found:
-                ontol_acc = ontr.Ontology_accession(':'+ontol_term[0])
-                if ontol_acc.label != 'unk':
-                    if not re.search('^obsolete:', ontol_acc.label):
-                        ontol_term[1] = ontol_acc.label
-                time.sleep(0.05)
-            if ontol_term[1] == '':
-                continue
-            ontol_label = punctuation_treatment(ontol_term[1])
-            _db_insert(c, 'non_standard_resource_ids', ontol_term[0], ontol_term[1])
-            _db_insert(c, 'standard_resource_labels', ontol_label, ontol_term[0])
-            # Note from v 0.7: To limit performance overhead, we ignore resource labels with
-            # more than 7 tokens, as permutating too many tokens can be costly.
-            # Ignore NCBI taxon terms
-            if len(word_tokenize(ontol_label)) < 7 and len(word_tokenize(ontol_label)) > 1:
-                if not re.search('NCBITaxon', ontol_term[0]):
-                    for permutation in get_resource_label_permutations(ontol_label):
-                        _db_insert(c,'standard_resource_permutations',permutation,ontol_term[0])
-            # Add abbreviated binomials from NCBITaxons; TODO: may get wrong combinations?
-            elif len(word_tokenize(ontol_label)) == 2:
-                bi_name = ontol_label.split()
-                _db_insert(c, 'standard_resource_permutations',
-                           bi_name[0][0]+' '+bi_name[1], ontol_term[0])
-            for syn_term in ontol_term[2]:
-                _db_insert(s,'label_synonyms',punctuation_treatment(str(syn_term)),ontol_label)
-        conn.commit()
-        sonn.commit()
-
-    conn.close()
-    sonn.close()
-    _pickle_save(ontol_interest, os.path.join(owl_dir,'cached_ontologies.pickle'))
-    return
```
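A few notes on the module this changeset deletes, each with a short sketch. First, the rebuild decision: `_check_make()` reused an existing database only when `cached_ontologies.pickle` in `owl_dir` recorded the same ontology list as the current request, and otherwise removed the database file so it would be rebuilt from scratch. A minimal sketch of that logic, assuming the same pickle layout (`owl_dir` below is a stand-in for the value imported from `lexmapr.definitions`):

```python
import os
import pickle

owl_dir = 'owl_cache'  # stand-in for lexmapr.definitions.owl_dir

def needs_remake(db_file, remake_cache, ontol_interest):
    '''Rebuild unless the database exists, no rebuild was forced, and the
    pickled ontology list matches what is being requested now.'''
    cache_file = os.path.join(owl_dir, 'cached_ontologies.pickle')
    if os.path.exists(db_file) and not remake_cache and os.path.exists(cache_file):
        with open(cache_file, 'rb') as fh:
            if pickle.load(fh) == ontol_interest:
                return False  # cache is current; reuse the existing database
    # Stale or missing cache: delete the old database so it gets rebuilt
    try:
        os.remove(db_file)
    except FileNotFoundError:
        pass
    return True
```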
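Second, every write in `get_synonyms()` and `get_resource_ids()` went through `_db_insert()`, which skips empty and self-mapping terms and leans on each table's UNIQUE constraint so that `INSERT OR IGNORE` silently drops duplicates. A self-contained demo against an in-memory database, using the same `label_synonyms` schema the module created (the example terms are invented):

```python
import sqlite3

conn = sqlite3.connect(':memory:')  # the real code wrote to lexmapr.definitions.synonym_db
c = conn.cursor()
# Same schema get_synonyms() created; UNIQUE(key, value) is what makes
# "INSERT OR IGNORE" a cheap deduplication mechanism.
c.execute("CREATE TABLE label_synonyms (key TEXT, value TEXT, UNIQUE(key, value))")

def _db_insert(db_cursor, table_name, key_term, val_term):
    '''Insert pattern from the deleted module: skip empty or self-mapping terms.'''
    if key_term.strip() == val_term.strip() or key_term.strip() == '' or val_term.strip() == '':
        return
    db_cursor.execute(f"INSERT OR IGNORE INTO {table_name} VALUES (:key,:value)",
                      {'key': key_term.strip(), 'value': val_term.strip()})

_db_insert(c, 'label_synonyms', 'romaine lettuce', 'lettuce')
_db_insert(c, 'label_synonyms', 'romaine lettuce', 'lettuce')  # duplicate, ignored
_db_insert(c, 'label_synonyms', 'lettuce', 'lettuce')          # self-mapping, skipped
conn.commit()
print(c.execute("SELECT * FROM label_synonyms").fetchall())
# [('romaine lettuce', 'lettuce')]
conn.close()
```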
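Third, the OWL handling is regex-based rather than built on an XML parser: `_section_file()` splits a downloaded file on `'<!-- ' + purl_link` comment markers, and `_labels_synonyms()` pulls the accession, label, and exact synonyms out of each section with three patterns. A toy fragment (invented here; real files come from the EMBL-EBI OLS downloads) showing what those patterns capture:

```python
import re

owl_snippet = '''
<!-- http://purl.obolibrary.org/obo/FOODON_03301904 -->
<owl:Class rdf:about="&obo;FOODON_03301904">
    <rdfs:label xml:lang="en">chicken breast</rdfs:label>
    <oboInOwl:hasExactSynonym xml:lang="en">breast of chicken</oboInOwl:hasExactSynonym>
</owl:Class>
'''

# The three patterns from the deleted _labels_synonyms()
id_pattern = r'(\w+) -->'
lab_pattern = r'\<rdfs:label[\s\S]*?\>([\s\S]+?)\<\/rdfs:label\>'
syn_pattern = r'\<oboInOwl:hasExactSynonym[\s\S]*?\>(.*?)\</oboInOwl:hasExactSynonym'

print(re.search(id_pattern, owl_snippet).group(1))  # FOODON_03301904
print(re.findall(lab_pattern, owl_snippet))         # ['chicken breast']
print(re.findall(syn_pattern, owl_snippet))         # ['breast of chicken']
```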
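Finally, the permutation logic near the end of `get_resource_ids()` merits a gloss: labels of two to six tokens are indexed under every token ordering so lookups become word-order independent, with NCBITaxon terms excluded, and the `elif` branch was meant to add abbreviated binomials for two-token taxon names. As written, though, that `elif` appears unreachable (a two-token label always takes the first branch), so the sketch below shows the behavior the comments describe rather than the exact code path, with `itertools.permutations` standing in for the real `get_resource_label_permutations()` helper from `lexmapr.pipeline_resources`:

```python
from itertools import permutations

def permutation_keys(ontol_label, ontol_id):
    '''Sketch of the two permutation branches in the deleted get_resource_ids().'''
    tokens = ontol_label.split()  # the module used nltk.word_tokenize here
    if 1 < len(tokens) < 7 and 'NCBITaxon' not in ontol_id:
        # Index every ordering so word order does not matter at lookup time
        return [' '.join(p) for p in permutations(tokens)]
    if len(tokens) == 2:
        # Abbreviated binomial for taxa: 'listeria monocytogenes' -> 'l monocytogenes'
        return [tokens[0][0] + ' ' + tokens[1]]
    return []

print(len(permutation_keys('cooked chicken breast', 'FOODON_03301904')))  # 6 orderings
print(permutation_keys('listeria monocytogenes', 'NCBITaxon_1639'))       # ['l monocytogenes']
```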