diff lexmapr/create_databases.py @ 0:f5c39d0447be

"planemo upload"
author kkonganti
date Wed, 31 Aug 2022 14:32:07 -0400
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lexmapr/create_databases.py	Wed Aug 31 14:32:07 2022 -0400
@@ -0,0 +1,248 @@
+"""Builds SQLite3 databases"""
+
+import logging, os, pickle, re, requests, sqlite3, sys, time
+import lexmapr.ontology_reasoner as ontr
+from nltk.tokenize import word_tokenize
+from lexmapr.pipeline_helpers import punctuation_treatment
+from lexmapr.definitions import embl_ontologies, synonym_db, ontol_db
+from lexmapr.definitions import owl_dir, purl_link, missing_ontol_labels
+from lexmapr.pipeline_resources import get_resource_label_permutations
+
+logging.getLogger('requests').setLevel(logging.WARNING)
+logging.getLogger('urllib3').setLevel(logging.WARNING)
+
+
+# TODO: might replace pickle with ujson
+def _pickle_save(data_to_save, file_path):
+    '''Write a pickle file'''
+    with open(file_path,'wb') as SAVE_file:
+        pickle.dump(data_to_save, SAVE_file)
+
+
+def _pickle_load(file_path):
+    '''Read a pickle file'''
+    with open(file_path,'rb') as LOAD_file:
+        return(pickle.load(LOAD_file))
+
+
+def _get_ontols(ontol_interest):
+    '''Obtain URLs for ontologies of interest'''
+    ontol_dic = {}
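+    # Scrape the EMBL-EBI OLS ontology listing (embl_ontologies): each <tr> row holds an
+    # ontology's abbreviation, display name and download link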
+    embl_resp = requests.get(embl_ontologies)
+    resp_blocks = re.findall(r'<tr>([\s\S]+?)</tr>', embl_resp.content.decode('utf-8'))
+    for resp_block in resp_blocks:
+        try:
+            embl_abbr = re.search(r'class="ontology-source">([\s\S]+?)<', resp_block).group(1)
+            embl_name = re.search(r'<a href="/ols/ontologies/' + embl_abbr.lower() +
+                                  r'">([\s\S]+?)</a>', resp_block).group(1)
+            embl_link = re.search(r'href="(\S+)">Download', resp_block).group(1)
+            if embl_link.startswith('ontologies'):
+                # TODO: with Python 3.9+ use embl_link.removeprefix('ontologies')
+                embl_link = embl_link[len('ontologies'):]
+        except AttributeError:
+            continue
+        if embl_abbr in ontol_interest:
+            ontol_dic[embl_abbr] = (embl_name, embl_link)
+    # Warn if any ontologies of interest specified in definitions.py were not found
+    not_found = set(ontol_interest).difference(set(ontol_dic.keys()))
+    if not_found:
+        if len(not_found) == 1:
+            logging.warning('Did not find ontology: ' + ', '.join(not_found))
+        else:
+            logging.warning('Did not find ontologies: ' + ', '.join(not_found))
+    if ontol_dic == {}:
+        sys.exit('Zero ontologies found from user-given list')
+    return(ontol_dic)
+
+
+def _check_make(db_file, remake_cache, ontol_interest):
+    '''Check if database file should be remade'''
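+    # Reuse the existing database only if it exists, no rebuild was requested and the
+    # cached ontology list matches the current ontologies of interest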
+    if os.path.exists(db_file) and not remake_cache:
+        if os.path.exists(os.path.join(owl_dir, 'cached_ontologies.pickle')):
+            if ontol_interest == _pickle_load(os.path.join(owl_dir, 'cached_ontologies.pickle')):
+                return(False)
+    try:
+        os.remove(db_file)
+    except FileNotFoundError:
+        pass
+    return(True)
+
+
+def _db_insert(db_cursor, table_name, key_term, val_term):
+    '''Insert new data into a database table'''
+    if key_term.strip() == val_term.strip() or not key_term.strip() or not val_term.strip():
+        return
+    db_cursor.execute(f"INSERT OR IGNORE INTO {table_name} VALUES (:key,:value)",
+                      {'key':key_term.strip(), 'value':val_term.strip()})
+    
+
+def _get_imports(file_handle):
+    '''Check for required imports; append any new patterns to pattern_strs'''
+    pattern_strs = []
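+    # owl:imports entries reference either an &obo; entity or a full http URL; relative
+    # &obo; references are expanded with purl_link below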
+    pattern_strs.append(r'<owl:imports rdf:resource="&obo;(.*)"/>')
+    pattern_strs.append(r'<owl:imports rdf:resource="(http://.*)"/>')
+    whole_file = str(file_handle.read())
+    for patt_str in pattern_strs:
+        import_match = re.findall(patt_str, whole_file)
+        if import_match:
+            import_match = [x if re.search('^http:',x) else purl_link+x for x in import_match]
+            break
+    return(import_match)
+
+
+def _section_file(file_handle, break_pattern,
+                 stt_at='    // Classes', end_at='    // Annotations'):
+    '''Break OWL files into readable sections for each ontology accession'''
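+    # Trim the file to the text between stt_at and end_at (the class definitions by
+    # default), then split what remains on break_pattern; the first and last pieces are dropped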
+    whole_file = str(file_handle.read())
+    if stt_at != '':
+        if re.search(stt_at, whole_file):
+            whole_file = ''.join(whole_file.split(stt_at)[1:])
+    if end_at != '':
+        if re.search(end_at, whole_file):
+            whole_file = ''.join(whole_file.split(end_at)[:-1])
+    file_sections = whole_file.split(break_pattern)
+    return(file_sections[1:-1])
+
+
+def _labels_synonyms(obo_list, have_label=False):
+    '''Identify labels, ids and exact ontology synonyms'''
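+    # Each obo_string is one class section; pull the accession from its '<!-- ... -->' header,
+    # the last rdfs:label and any oboInOwl:hasExactSynonym values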
+    obo_ids = []
+    for obo_string in obo_list:
+        id_pattern = r'(\w+) -->'
+        lab_pattern = r'<rdfs:label[\s\S]*?>([\s\S]+?)</rdfs:label>'
+        syn_pattern = r'<oboInOwl:hasExactSynonym[\s\S]*?>(.*?)</oboInOwl:hasExactSynonym'
+        obo_id = re.search(id_pattern, obo_string).group(1)
+        # Skip ids not formatted as expected, plus placeholder, obsolete and APOLLO:SV terms
+        if re.search('APOLLO:SV', obo_id) or re.search('APOLLO_SV', obo_id):
+            continue
+        elif obo_id in missing_ontol_labels:
+            continue
+        elif re.search(':', obo_id) or re.search('_', obo_id):
+            try:
+                obo_label = re.findall(lab_pattern, obo_string)[-1] 
+                if re.search('^obsolete:', obo_label):
+                    continue
+            except IndexError:
+                obo_label = ''
+            obo_synonyms = re.findall(syn_pattern, obo_string)
+            obo_ids.append([str(obo_id.lstrip('_')), str(obo_label), obo_synonyms])
+    if have_label:
+        obo_ids = [x for x in obo_ids if x[1] != '']
+    return(obo_ids)
+
+
+def get_synonyms(remake_cache, ontol_interest):
+    '''Create database of predefined synonyms'''
+    os.makedirs(owl_dir, exist_ok=True)
+    if not _check_make(synonym_db, remake_cache, ontol_interest):
+        return
+
+    conn = sqlite3.connect(synonym_db)
+    c = conn.cursor()
+    c.execute("CREATE TABLE label_synonyms (key TEXT, value TEXT, UNIQUE(key, value))")
+    print('{:<40}'.format('\tRetrieving predefined synonyms'),end='\r')
+    with open('lexmapr/predefined_resources/resource_synonyms.csv', 'r') as IN_file:
+        for read_line in IN_file.readlines():
+            split_line = read_line.split(',')
+            try:
+                ontol_name = split_line[2].split('_')[0].upper()
+            except IndexError:
+                ontol_name = ontol_interest[0]
+            if ontol_name in ontol_interest:
+                _db_insert(c, 'label_synonyms', punctuation_treatment(split_line[0]),
+                           punctuation_treatment(split_line[1]))
+    conn.commit()
+    with open('lexmapr/predefined_resources/SynLex.csv', 'r') as IN_file:
+        IN_file.readline()
+        for read_line in IN_file.readlines():
+            split_line = read_line.split(',')
+            try:
+                ontol_name = split_line[2].split('_')[0].upper().strip()
+            except IndexError:
+                ontol_name = ontol_interest[0]
+            if ontol_name in ontol_interest:
+                _db_insert(c, 'label_synonyms', punctuation_treatment(split_line[0]),
+                           punctuation_treatment(split_line[1]))
+    conn.commit()
+    conn.close()
+    return
+
+
+def get_resource_ids(remake_cache, ontol_interest):
+    '''Create database of online resources and update synonym database'''
+    if not _check_make(ontol_db, remake_cache, ontol_interest):
+        return
+
+    conn = sqlite3.connect(ontol_db)
+    c = conn.cursor()
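+    # non_standard_resource_ids maps id -> label, standard_resource_labels maps cleaned
+    # label -> id, and standard_resource_permutations maps label permutations -> id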
+    c.execute("CREATE TABLE non_standard_resource_ids (key TEXT, value TEXT, UNIQUE(key))")
+    c.execute("CREATE TABLE standard_resource_labels (key TEXT, value TEXT, UNIQUE(key, value))")
+    c.execute("""CREATE TABLE standard_resource_permutations (key TEXT, value TEXT,
+                                                              UNIQUE(key, value))""")
+    sonn = sqlite3.connect(synonym_db)
+    s = sonn.cursor()
+
+    ontol_dic = _get_ontols(ontol_interest)
+    ontol_urls = [purl_link+x[1] for x in ontol_dic.values()]
+    for ontol_name in ontol_dic:
+        print('{:<40}'.format('\tRetrieving '+ontol_name[:15]+' terms'), end='\r')
+        no_label = set()
+        found_label = set()
+
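+        # Download the OWL file once and cache it in owl_dir so later runs can reuse it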
+        if not os.path.isfile(os.path.join(owl_dir,ontol_name+'.owl')):
+            owl_download = requests.get(embl_ontologies+ontol_dic[ontol_name][1])
+            with open(os.path.join(owl_dir,ontol_name+'.owl'), 'w', encoding='utf-8') as O_file:
+                O_file.write(owl_download.content.decode('utf-8'))
+
+        with open(os.path.join(owl_dir,ontol_name+'.owl'),'r', encoding='utf-8') as OWL_file:
+            owl_sections = _section_file(OWL_file, '<!-- '+purl_link)
+            obo_ids = _labels_synonyms(owl_sections)
+            OWL_file.seek(0)
+            import_urls = [x for x in _get_imports(OWL_file) if x not in ontol_urls]            
+
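+        # Download each imported ontology to a temporary file, parse the whole file (no
+        # Classes/Annotations markers assumed) and keep only terms that have labels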
+        for imp_url in import_urls:
+            imp_download = requests.get(imp_url)
+            imp_file = os.path.join(owl_dir,imp_url.split('/')[-1])
+            with open(imp_file, 'w', encoding='utf-8') as T_file:
+                T_file.write(imp_download.content.decode('utf-8'))
+            # Different imports could share a file name, so write each one, parse it and remove it
+            with open(imp_file, 'r', encoding='utf-8') as T_file:
+                owl_sections = _section_file(T_file, '<!-- '+purl_link, stt_at='', end_at='')
+                obo_ids.extend(_labels_synonyms(owl_sections, have_label=True))
+            os.remove(imp_file)
+        # Find which ids never have a label across the main file and its imports
+        for obo_id, obo_label, _ in obo_ids:
+            (no_label if obo_label == '' else found_label).add(obo_id)
+        no_label_found = list(no_label.difference(found_label))
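+        # Try to resolve ids that never had a label with lexmapr.ontology_reasoner's
+        # Ontology_accession, pausing briefly between lookups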
+        for ontol_term in obo_ids:
+            if ontol_term[0] in no_label_found:
+                ontol_acc = ontr.Ontology_accession(':'+ontol_term[0])
+                if ontol_acc.label != 'unk':
+                    if not re.search('^obsolete:', ontol_acc.label):
+                        ontol_term[1] = ontol_acc.label
+                time.sleep(0.05)
+            if ontol_term[1] == '':
+                continue
+            ontol_label = punctuation_treatment(ontol_term[1])
+            _db_insert(c, 'non_standard_resource_ids', ontol_term[0], ontol_term[1])
+            _db_insert(c, 'standard_resource_labels', ontol_label, ontol_term[0])
+            # Note from v 0.7: to limit performance overhead, ignore resource labels with
+            # seven or more tokens, as permuting too many tokens is costly; single-token
+            # labels need no permutation. NCBITaxon terms are not permuted either
+            label_token_count = len(word_tokenize(ontol_label))
+            if 1 < label_token_count < 7:
+                if not re.search('NCBITaxon', ontol_term[0]):
+                    for permutation in get_resource_label_permutations(ontol_label):
+                        _db_insert(c,'standard_resource_permutations',permutation,ontol_term[0])
+                # Add abbreviated binomials (e.g. 'e coli') for two-token NCBITaxon labels;
+                # TODO: may get wrong combinations?
+                elif label_token_count == 2:
+                    bi_name = ontol_label.split()
+                    _db_insert(c, 'standard_resource_permutations',
+                               bi_name[0][0]+' '+bi_name[1], ontol_term[0])
+            for syn_term in ontol_term[2]:
+                _db_insert(s,'label_synonyms',punctuation_treatment(str(syn_term)),ontol_label)
+        conn.commit()
+        sonn.commit()
+
+    conn.close()
+    sonn.close()
+    _pickle_save(ontol_interest, os.path.join(owl_dir,'cached_ontologies.pickle'))
+    return