cfsan_lexmapr2: lexmapr/pipeline_resources.py comparison

comparison lexmapr/pipeline_resources.py @ 0:f5c39d0447be

"planemo upload"

author	kkonganti
date	Wed, 31 Aug 2022 14:32:07 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:f5c39d0447be
+"""Cache, generate and load resources"""
+import csv, json, os, sys
+from nltk import word_tokenize
+from itertools import permutations
+from collections import OrderedDict
+from lexmapr.definitions import ROOT
+from lexmapr.pipeline_helpers import punctuation_treatment
+def _create_lookup_table_skeleton():
+'''Generate an empty lookup table'''
+return({'abbreviations':{}, 'inflection_exceptions':{}, 'non_english_words':{},
+'spelling_mistakes':{}, 'stop_words':{}, 'suffixes':{}})
def _get_resource_dict(resource_file_name):
    '''Get dictionary of resources from CSV file

    Opens ``resource_file_name`` inside the ``predefined_resources`` directory
    under ROOT, skips its first line and maps the first column of every
    remaining row to the second column. Both columns are stripped and passed
    through punctuation_treatment. A row with a single column maps to an
    empty string; blank rows are ignored.
    '''
    ret_dic = {}
    resource_path = os.path.join(ROOT, 'predefined_resources', resource_file_name)
    # newline='' leaves line-ending handling to the csv module
    with open(resource_path, newline='') as RES_file:
        # Skip the first line; the default keeps an empty file from raising
        # StopIteration
        next(RES_file, None)
        for row in csv.reader(RES_file, delimiter=','):
            if not row:
                # csv.reader yields [] for a blank line; there is no key to store
                continue
            key = punctuation_treatment(row[0].strip())
            if len(row) > 1:
                ret_dic[key] = punctuation_treatment(row[1].strip())
            else:
                ret_dic[key] = ''
    return ret_dic
def _add_predefined_resources_to_lookup_table(lookup_table):
    '''Fill each lookup table category from its CSV file(s) in
    lexmapr/predefined_resources and return the same table'''
    # (category, files) pairs; later files of a category override earlier ones
    resource_files = (
        ('abbreviations', ('abbrv.csv',)),
        ('inflection_exceptions', ('inflection_exceptions.csv',)),
        ('non_english_words', ('nengwords.csv',)),
        # ScorLex.csv only from v 0.7
        ('spelling_mistakes', ('ScorLex.csv', 'misspellings.csv')),
        ('stop_words', ('mining_stopwords.csv',)),
        ('suffixes', ('suffixes.csv',)),
    )
    for category, file_names in resource_files:
        entries = {}
        for file_name in file_names:
            entries.update(_get_resource_dict(file_name))
        lookup_table[category] = entries
    return lookup_table
def get_predefined_resources():
    '''Creates lookup table

    Returns the table cached in ``predefined_resources/lookup_table.json``
    under ROOT when that file exists and parses as JSON. Otherwise the table
    is built from the predefined resource CSV files and written to that path
    for later calls.
    '''
    lookup_table_path = os.path.join(ROOT, 'predefined_resources', 'lookup_table.json')
    if os.path.exists(lookup_table_path):
        try:
            with open(lookup_table_path) as LT_file:
                return json.load(LT_file)
        except ValueError:
            # json.JSONDecodeError is a ValueError: the cache is truncated or
            # corrupt (e.g. read while being written), so rebuild it below
            pass

    lookup_table = _add_predefined_resources_to_lookup_table(
        _create_lookup_table_skeleton())
    try:
        with open(lookup_table_path, 'w') as LT_file:
            json.dump(lookup_table, LT_file)
    except OSError:
        # The cache is only an optimisation; an unwritable resources directory
        # must not prevent returning the table that was just built
        pass
    return lookup_table
def get_resource_label_permutations(resource_label):
    '''Return every distinct ordering of the whitespace-separated tokens of
    a term, each joined back with single spaces'''
    tokens = resource_label.split()
    # OrderedDict.fromkeys drops duplicate orderings (repeated tokens) while
    # keeping the order in which permutations() produced them
    distinct_orderings = OrderedDict.fromkeys(permutations(tokens))
    return [' '.join(ordering) for ordering in distinct_orderings]

Mercurial > repos > kkonganti > cfsan_lexmapr2

comparison lexmapr/pipeline_resources.py @ 0:f5c39d0447be