"""Cache, generate and load resources"""

import csv
import json
import os
import sys
from collections import OrderedDict
from itertools import permutations

from nltk import word_tokenize

from lexmapr.definitions import ROOT
from lexmapr.pipeline_helpers import punctuation_treatment


def _create_lookup_table_skeleton():
    '''Generate an empty lookup table

    Returns a dict with one empty dict per resource category.
    '''
    return {'abbreviations': {}, 'inflection_exceptions': {}, 'non_english_words': {},
            'spelling_mistakes': {}, 'stop_words': {}, 'suffixes': {}}


def _get_resource_dict(resource_file_name):
    '''Get dictionary of resources from CSV file

    Reads ``<ROOT>/predefined_resources/<resource_file_name>``, skips the
    first line (treated as a header) and maps the first column of every row
    to the second column, both passed through ``punctuation_treatment``
    after stripping whitespace.  Rows that lack a second column map to ``''``.
    Completely empty rows are ignored.
    '''
    ret_dic = {}
    resource_path = os.path.join(ROOT, 'predefined_resources', resource_file_name)
    # newline='' lets the csv module do its own newline handling.
    with open(resource_path, newline='') as RES_file:
        # Default of None keeps an empty file from raising StopIteration.
        next(RES_file, None)
        for row in csv.reader(RES_file, delimiter=','):
            if not row:
                # csv.reader yields [] for blank lines; previously this hit
                # row[0] inside the IndexError handler and crashed.
                continue
            key = punctuation_treatment(row[0].strip())
            try:
                value = punctuation_treatment(row[1].strip())
            except IndexError:
                # Single-column row: no replacement value available.
                value = ''
            ret_dic[key] = value
    return ret_dic


def _add_predefined_resources_to_lookup_table(lookup_table):
    '''Adds elements from lexmapr/predefined_resources to lookup table

    Fills every category of ``lookup_table`` in place from its CSV file and
    returns the same dict.
    '''
    lookup_table['abbreviations'] = _get_resource_dict('abbrv.csv')
    lookup_table['inflection_exceptions'] = _get_resource_dict('inflection_exceptions.csv')
    lookup_table['non_english_words'] = _get_resource_dict('nengwords.csv')
    lookup_table['spelling_mistakes'] = _get_resource_dict('ScorLex.csv')  # only from v 0.7
    # Entries from misspellings.csv override ScorLex.csv entries with the same key.
    lookup_table['spelling_mistakes'].update(_get_resource_dict('misspellings.csv'))
    lookup_table['stop_words'] = _get_resource_dict('mining_stopwords.csv')
    lookup_table['suffixes'] = _get_resource_dict('suffixes.csv')
    return lookup_table


def get_predefined_resources():
    '''Creates lookup table

    Loads ``lookup_table.json`` from ``<ROOT>/predefined_resources`` when it
    exists.  Otherwise builds the table from the CSV resources and writes it
    to that path so later calls can load it directly.
    '''
    lookup_table_path = os.path.join(ROOT, 'predefined_resources', 'lookup_table.json')
    if os.path.exists(lookup_table_path):
        with open(lookup_table_path) as LT_file:
            lookup_table = json.load(LT_file)
    else:
        lookup_table = _create_lookup_table_skeleton()
        lookup_table = _add_predefined_resources_to_lookup_table(lookup_table)
        with open(lookup_table_path, 'w') as LT_file:
            json.dump(lookup_table, LT_file)
    return lookup_table


def get_resource_label_permutations(resource_label):
    '''Get permutations of some term

    Splits ``resource_label`` on whitespace and returns every distinct
    ordering of its tokens, each joined by a single space, in the order
    first produced by ``itertools.permutations``.
    '''
    # OrderedDict.fromkeys drops duplicate orderings (from repeated tokens)
    # while keeping first-seen order.
    unique_orderings = OrderedDict.fromkeys(permutations(resource_label.split()))
    return [' '.join(ordering) for ordering in unique_orderings]