diff lexmapr/create_databases.py @ 3:be95a7ce968a tip

"planemo upload"
author kkonganti
date Tue, 13 Sep 2022 11:32:24 -0400
parents 5244e7465767
children
--- a/lexmapr/create_databases.py	Wed Aug 31 14:32:14 2022 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,248 +0,0 @@
-"""Builds SQLite3 databases"""
-
-import logging, os, pickle, re, requests, sqlite3, sys, time
-import lexmapr.ontology_reasoner as ontr
-from nltk.tokenize import word_tokenize
-from lexmapr.pipeline_helpers import punctuation_treatment
-from lexmapr.definitions import embl_ontologies, synonym_db, ontol_db
-from lexmapr.definitions import owl_dir, purl_link, missing_ontol_labels
-from lexmapr.pipeline_resources import get_resource_label_permutations
-
-logging.getLogger('requests').setLevel(logging.WARNING)
-logging.getLogger('urllib3').setLevel(logging.WARNING)
-
-
-# TODO: might replace pickle with ujson
-def _pickle_save(data_to_save, file_path):
-    '''Write a pickle file'''
-    with open(file_path,'wb') as SAVE_file:
-        pickle.dump(data_to_save, SAVE_file)
-
-
-def _pickle_load(file_path):
-    '''Read a pickle file'''
-    with open(file_path,'rb') as LOAD_file:
-        return(pickle.load(LOAD_file))
-
-
-def _get_ontols(ontol_interest):
-    '''Obtain URLs for ontologies of interest'''
-    ontol_dic = {}
-    embl_resp = requests.get(embl_ontologies)
-    resp_blocks = re.findall(r'<tr>([\s\S]+?)</tr>', embl_resp.content.decode('utf-8'))
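-    # Assumed OLS listing markup (illustrative): each ontology sits in its own <tr> holding a
-    # 'class="ontology-source"' abbreviation cell, a '/ols/ontologies/<abbr>' name link and a
-    # 'Download' link, which the patterns below scrape; an upstream markup change breaks this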
-    for resp_block in resp_blocks:
-        try:
-            embl_abbr = re.search(r'class="ontology-source">([\s\S]+?)<', resp_block).group(1)
-            embl_name = re.search(r'<a href="/ols/ontologies/' + embl_abbr.lower() +
-                                  r'">([\s\S]+?)</a>', resp_block).group(1)
-            embl_link = re.search(r'href="(\S+)">Download', resp_block).group(1)
-            if embl_link.startswith('ontologies'):
-                # TODO: with Python 3.9+ use embl_link.removeprefix('ontologies')
-                embl_link = embl_link[len('ontologies'):]
-        except AttributeError:
-            continue
-        if embl_abbr in ontol_interest:
-            ontol_dic[embl_abbr] = (embl_name, embl_link)
-    # Warn about any ontologies of interest specified in definitions.py that were not found
-    not_found = set(ontol_interest).difference(set(ontol_dic.keys()))
-    if not_found:
-        if len(not_found) == 1:
-            logging.warning('Did not find ontology: ' + ', '.join(not_found))
-        else:
-            logging.warning('Did not find ontologies: ' + ', '.join(not_found))
-    if not ontol_dic:
-        sys.exit('Zero ontologies found from user-given list')
-    return(ontol_dic)
-
-
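-# A rebuild is skipped only when the database file exists, no remake was requested, and the
-# cached_ontologies.pickle written by get_resource_ids() still matches ontol_interest, so
-# e.g. adding one more ontology to the list of interest invalidates the cache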
-def _check_make(db_file, remake_cache, ontol_interest):
-    '''Check if database file should be remade'''
-    if os.path.exists(db_file) and not remake_cache:
-        if os.path.exists(os.path.join(owl_dir, 'cached_ontologies.pickle')):
-            if ontol_interest == _pickle_load(os.path.join(owl_dir, 'cached_ontologies.pickle')):
-                return(False)
-    try:
-        os.remove(db_file)
-    except FileNotFoundError:
-        pass
-    return(True)
-
-
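-# Usage sketch (hypothetical values): _db_insert(c, 'label_synonyms', 'broiler', 'chicken').
-# Duplicate pairs are skipped silently because INSERT OR IGNORE pairs with the UNIQUE
-# constraints declared in the CREATE TABLE statements below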
-def _db_insert(db_cursor, table_name, key_term, val_term):
-    '''Insert new data into a database table'''
-    if key_term.strip() == val_term.strip() or key_term.strip() == '' or val_term.strip() == '':
-        return
-    db_cursor.execute(f"INSERT OR IGNORE INTO {table_name} VALUES (:key,:value)",
-                      {'key':key_term.strip(), 'value':val_term.strip()})
-
-
-def _get_imports(file_handle):
-    '''Check for required imports; append any new patterns to pattern_strs'''
-    pattern_strs = []
-    pattern_strs.append(r'<owl:imports rdf:resource="&obo;(.*)"/>')
-    pattern_strs.append(r'<owl:imports rdf:resource="(http://.*)"/>')
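-    # Sample import lines (for illustration):
-    #   <owl:imports rdf:resource="&obo;bfo.owl"/>                 -> first pattern
-    #   <owl:imports rdf:resource="http://purl.obolibrary.org/obo/bfo.owl"/>  -> second pattern
-    # relative '&obo;' paths are prefixed with purl_link below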
-    whole_file = str(file_handle.read())
-    for patt_str in pattern_strs:
-        import_match = re.findall(patt_str, whole_file)
-        if import_match != []:
-            import_match = [x if re.search('^http:',x) else purl_link+x for x in import_match]
-            break
-    return(import_match)
-
-
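-# Protege-style RDF/XML serializations place a comment header of the form
-# '<!-- http://purl.obolibrary.org/obo/<ACCESSION> -->' before each class, so splitting on
-# '<!-- '+purl_link yields one section per accession (assumed serialization layout)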
-def _section_file(file_handle, break_pattern,
-                  stt_at='    // Classes', end_at='    // Annotations'):
-    '''Break OWL files into readable sections for each ontology accession'''
-    whole_file = str(file_handle.read())
-    if stt_at != '':
-        if re.search(stt_at, whole_file):
-            whole_file = ''.join(whole_file.split(stt_at)[1:])
-    if end_at != '':
-        if re.search(end_at, whole_file):
-            whole_file = ''.join(whole_file.split(end_at)[:-1])
-    file_sections = whole_file.split(break_pattern)
-    return(file_sections[1:-1])
-
-
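-# Each entry collected below has the shape [accession, label, synonyms], e.g.
-# ['FOODON_00000000', 'some label', ['synonym one', 'synonym two']] (illustrative values)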
-def _labels_synonyms(obo_list, have_label=False):
-    '''Identify labels, ids and exact ontology synonyms'''
-    obo_ids = []
-    for obo_string in obo_list:
-        id_pattern = r'(\w+) -->'
-        lab_pattern = r'<rdfs:label[\s\S]*?>([\s\S]+?)</rdfs:label>'
-        syn_pattern = r'<oboInOwl:hasExactSynonym[\s\S]*?>(.*?)</oboInOwl:hasExactSynonym'
-        id_match = re.search(id_pattern, obo_string)
-        if id_match is None:
-            continue
-        obo_id = id_match.group(1)
-        # Do not save ids that are not formatted as expected, or placeholders/obsolete/Apollo:SV
-        if re.search('APOLLO:SV', obo_id) or re.search('APOLLO_SV', obo_id):
-            continue
-        elif obo_id in missing_ontol_labels:
-            continue
-        elif re.search(':', obo_id) or re.search('_', obo_id):
-            try:
-                obo_label = re.findall(lab_pattern, obo_string)[-1]
-                if re.search('^obsolete:', obo_label):
-                    continue
-            except IndexError:
-                obo_label = ''
-            obo_synonyms = re.findall(syn_pattern, obo_string)
-            obo_ids.append([str(obo_id.lstrip('_')), str(obo_label), obo_synonyms])
-    if have_label:
-        obo_ids = [x for x in obo_ids if x[1] != '']
-    return(obo_ids)
-
-
-def get_synonyms(remake_cache, ontol_interest):
-    '''Create database of predefined synonyms'''
-    os.makedirs(owl_dir, exist_ok=True)
-    if not _check_make(synonym_db, remake_cache, ontol_interest):
-        return
-
-    conn = sqlite3.connect(synonym_db)
-    c = conn.cursor()
-    c.execute("CREATE TABLE label_synonyms (key TEXT, value TEXT, UNIQUE(key, value))")
-    print('{:<40}'.format('\tRetrieving predefined synonyms'),end='\r')
-    with open('lexmapr/predefined_resources/resource_synonyms.csv', 'r') as IN_file:
-        for read_line in IN_file:
-            split_line = read_line.split(',')
-            try:
-                ontol_name = split_line[2].split('_')[0].upper()
-            except IndexError:
-                ontol_name = ontol_interest[0]
-            if ontol_name in ontol_interest:
-                _db_insert(c, 'label_synonyms', punctuation_treatment(split_line[0]),
-                           punctuation_treatment(split_line[1]))
-    conn.commit()
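-    # Both CSV files are assumed to hold 'term,synonym,ontology-id' rows; the naive comma
-    # splits here would mis-parse quoted fields that contain embedded commas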
-    with open('lexmapr/predefined_resources/SynLex.csv', 'r') as IN_file:
-        IN_file.readline()
-        for read_line in IN_file:
-            split_line = read_line.split(',')
-            try:
-                ontol_name = split_line[2].split('_')[0].upper().strip()
-            except IndexError:
-                ontol_name = ontol_interest[0]
-            if ontol_name in ontol_interest:
-                _db_insert(c, 'label_synonyms', punctuation_treatment(split_line[0]),
-                           punctuation_treatment(split_line[1]))
-    conn.commit()
-    conn.close()
-    return
-
-
-def get_resource_ids(remake_cache, ontol_interest):
-    '''Create database of online resources and update synonym database'''
-    if not _check_make(ontol_db, remake_cache, ontol_interest):
-        return
-
-    conn = sqlite3.connect(ontol_db)
-    c = conn.cursor()
-    c.execute("CREATE TABLE non_standard_resource_ids (key TEXT, value TEXT, UNIQUE(key))")
-    c.execute("CREATE TABLE standard_resource_labels (key TEXT, value TEXT, UNIQUE(key, value))")
-    c.execute("""CREATE TABLE standard_resource_permutations (key TEXT, value TEXT,
-                                                              UNIQUE(key, value))""")
-    sonn = sqlite3.connect(synonym_db)
-    s = sonn.cursor()
-
-    ontol_dic = _get_ontols(ontol_interest)
-    ontol_urls = [purl_link+x[1] for x in ontol_dic.values()]
-    for ontol_name in ontol_dic:
-        print('{:<40}'.format('\tRetrieving '+ontol_name[:15]+' terms'), end='\r')
-        no_label = set()
-        found_label = set()
-
-        if not os.path.isfile(os.path.join(owl_dir,ontol_name+'.owl')):
-            owl_download = requests.get(embl_ontologies+ontol_dic[ontol_name][1])
-            with open(os.path.join(owl_dir,ontol_name+'.owl'), 'w', encoding='utf-8') as O_file:
-                O_file.write(owl_download.content.decode('utf-8'))
-
-        with open(os.path.join(owl_dir,ontol_name+'.owl'),'r', encoding='utf-8') as OWL_file:
-            owl_sections = _section_file(OWL_file, '<!-- '+purl_link)
-            obo_ids = _labels_synonyms(owl_sections)
-            OWL_file.seek(0)
-            import_urls = [x for x in _get_imports(OWL_file) if x not in ontol_urls]
-
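-        # Imported ontology files (e.g. BFO or an NCBITaxon subset) are fetched, parsed and
-        # then deleted; have_label=True keeps only entries that carry a label in the import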
-        for imp_url in import_urls:
-            imp_download = requests.get(imp_url)
-            imp_file = os.path.join(owl_dir,imp_url.split('/')[-1])
-            with open(imp_file, 'w', encoding='utf-8') as T_file:
-                T_file.write(imp_download.content.decode('utf-8'))
-            # Assume possible to get duplicate names for different imports, so rewrite file
-            with open(imp_file, 'r', encoding='utf-8') as T_file:
-                owl_sections = _section_file(T_file, '<!-- '+purl_link, stt_at='', end_at='')
-                obo_ids.extend(_labels_synonyms(owl_sections, have_label=True))
-            os.remove(imp_file)
-        # Collect ids that never appear with a label; the sets also collapse duplicates
-        for obo_id, obo_label, _ in obo_ids:
-            (no_label if obo_label == '' else found_label).add(obo_id)
-        no_label_found = list(no_label.difference(found_label))
-        for ontol_term in obo_ids:
-            if ontol_term[0] in no_label_found:
-                ontol_acc = ontr.Ontology_accession(':'+ontol_term[0])
-                if ontol_acc.label != 'unk':
-                    if not re.search('^obsolete:', ontol_acc.label):
-                        ontol_term[1] = ontol_acc.label
-                time.sleep(0.05)
-            if ontol_term[1] == '':
-                continue
-            ontol_label = punctuation_treatment(ontol_term[1])
-            _db_insert(c, 'non_standard_resource_ids', ontol_term[0], ontol_term[1])
-            _db_insert(c, 'standard_resource_labels', ontol_label, ontol_term[0])
-            # Note from v 0.7: to limit performance overhead, ignore resource labels with
-            # 7 or more tokens, as permutating that many tokens is costly; single-token
-            # labels have nothing to permutate
-            num_tokens = len(word_tokenize(ontol_label))
-            if 1 < num_tokens < 7:
-                # Ignore NCBI taxon terms, except for the two-token binomial case below
-                if not re.search('NCBITaxon', ontol_term[0]):
-                    for permutation in get_resource_label_permutations(ontol_label):
-                        _db_insert(c,'standard_resource_permutations',permutation,ontol_term[0])
-                # Add abbreviated binomials from NCBITaxons; TODO: may get wrong combinations?
-                elif num_tokens == 2:
-                    bi_name = ontol_label.split()
-                    _db_insert(c, 'standard_resource_permutations',
-                               bi_name[0][0]+' '+bi_name[1], ontol_term[0])
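-            # E.g. a two-token NCBITaxon label 'homo sapiens' is also stored in abbreviated
-            # form as 'h sapiens' (illustrative label)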
-            for syn_term in ontol_term[2]:
-                _db_insert(s,'label_synonyms',punctuation_treatment(str(syn_term)),ontol_label)
-        conn.commit()
-        sonn.commit()
-
-    conn.close()
-    sonn.close()
-    _pickle_save(ontol_interest, os.path.join(owl_dir,'cached_ontologies.pickle'))
-    return