Mercurial > repos > kkonganti > cfsan_lexmapr2
diff lexmapr/pipeline_resources.py @ 3:be95a7ce968a tip
"planemo upload"
author | kkonganti |
---|---|
date | Tue, 13 Sep 2022 11:32:24 -0400 |
parents | 5244e7465767 |
children |
line wrap: on
line diff
--- a/lexmapr/pipeline_resources.py Wed Aug 31 14:32:14 2022 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ -"""Cache, generate and load resources""" - -import csv, json, os, sys -from nltk import word_tokenize -from itertools import permutations -from collections import OrderedDict -from lexmapr.definitions import ROOT -from lexmapr.pipeline_helpers import punctuation_treatment - - -def _create_lookup_table_skeleton(): - '''Generate an empty lookup table''' - return({'abbreviations':{}, 'inflection_exceptions':{}, 'non_english_words':{}, - 'spelling_mistakes':{}, 'stop_words':{}, 'suffixes':{}}) - - -def _get_resource_dict(resource_file_name): - '''Get dictionary of resources from CSV file''' - ret_dic = {} - with open(os.path.join(ROOT, 'predefined_resources', resource_file_name)) as RES_file: - next(RES_file) - for row in csv.reader(RES_file, delimiter=','): - try: - ret_dic[punctuation_treatment(row[0].strip())] = \ - punctuation_treatment(row[1].strip()) - except IndexError: - ret_dic[punctuation_treatment(row[0].strip())] = '' - return(ret_dic) - - -def _add_predefined_resources_to_lookup_table(lookup_table): - '''Adds elements from lexmapr/predefined_resources to lookup table''' - lookup_table['abbreviations'] = _get_resource_dict('abbrv.csv') - lookup_table['inflection_exceptions'] = _get_resource_dict('inflection_exceptions.csv') - lookup_table['non_english_words'] = _get_resource_dict('nengwords.csv') - lookup_table['spelling_mistakes'] = _get_resource_dict('ScorLex.csv') #only from v 0.7 - lookup_table['spelling_mistakes'].update(_get_resource_dict('misspellings.csv')) - lookup_table['stop_words'] = _get_resource_dict('mining_stopwords.csv') - lookup_table['suffixes'] = _get_resource_dict('suffixes.csv') - return(lookup_table) - - -def get_predefined_resources(): - '''Creates lookup table''' - lookup_table_path = os.path.join(ROOT, 'predefined_resources', 'lookup_table.json') - if os.path.exists(lookup_table_path): - with open(lookup_table_path) as LT_file: - lookup_table = json.load(LT_file) - else: - lookup_table = _create_lookup_table_skeleton() - lookup_table = _add_predefined_resources_to_lookup_table(lookup_table) - with open(lookup_table_path, 'w') as LT_file: - json.dump(lookup_table, LT_file) - return(lookup_table) - - -def get_resource_label_permutations(resource_label): - '''Get permutations of some term''' - permutations_set = list(OrderedDict.fromkeys(permutations(resource_label.split()))) - ret_list = [] - for permutation_tuple in permutations_set: - ret_list.append(' '.join(permutation_tuple)) - return(ret_list)