From cea385c0bc7abadb4d6cfafd4eb0fe110e952d70 Mon Sep 17 00:00:00 2001 From: Arian Pasquali Date: Mon, 5 Aug 2019 22:58:25 +0100 Subject: [PATCH] fix text highlight bug --- yake/textNormalization.py | 105 -------------------------------------- 1 file changed, 105 deletions(-) delete mode 100644 yake/textNormalization.py diff --git a/yake/textNormalization.py b/yake/textNormalization.py deleted file mode 100644 index b3af454..0000000 --- a/yake/textNormalization.py +++ /dev/null @@ -1,105 +0,0 @@ -import re - - -def format_one_gram_text(text, relevant_words_array): - text_tokens = text.replace('\n',' ').split(' ') - try: - for tk in range(len(text_tokens)): - kw = re.sub('[!",:.;?()]$|^[!",:.;?()]|\W["!,:.;?()]', '', text_tokens[tk]).lower() - if kw in relevant_words_array: - text_tokens[tk] = text_tokens[tk].lower().replace(kw, '' + kw + '') - except: - pass - new_text = ' '.join(text_tokens) - return new_text - - -def format_n_gram_text(text, relevant_words_array, n_gram): - text_tokens = text.replace('\n',' ').split(' ') - y = 0 - final_splited_text = [] - while y < len(text_tokens): - - splited_n_gram_kw_list = [] - n_gram_kw_list = [] - n_gram_word_list, splited_n_gram_kw_list = find_more_relevant(y, text_tokens, n_gram, relevant_words_array, n_gram_kw_list, splited_n_gram_kw_list) - if n_gram_word_list: - - if len(n_gram_word_list[0].split(' ')) == 1: - y, new_expression = replace_token(text_tokens, y, n_gram_word_list) - final_splited_text.append(new_expression) - else: - kw_list = [] - splited_n_gram_kw_list = [] - splited_one = n_gram_word_list[0].split() - - for len_kw in range(0, len(splited_one)): - kw_list, splited_n_gram_kw_list = find_more_relevant(y+len_kw, text_tokens, n_gram, relevant_words_array, kw_list, splited_n_gram_kw_list) - min_score_word = min(kw_list, key=lambda x: relevant_words_array.index(x)) - - if kw_list.index(min_score_word) == 0: - term_list = [min_score_word] - y, new_expression = replace_token(text_tokens, y, term_list) - final_splited_text.append(new_expression) - - elif kw_list.index(min_score_word) >= 1: - index_of_more_relevant = splited_n_gram_kw_list[0].index(min_score_word.split()[0]) - temporal_kw = ' '.join(splited_n_gram_kw_list[0][:index_of_more_relevant]) - if temporal_kw.lower() in relevant_words_array: - - if relevant_words_array.index(temporal_kw.lower()) > relevant_words_array.index(final_splited_text[-1].lower() +' '+temporal_kw.lower()) and not re.findall('', final_splited_text[-1].lower()): - term_list = [final_splited_text[-1].lower() +' '+temporal_kw.lower()] - del final_splited_text[-1] - y -= 1 - y, new_expression = replace_token(text_tokens, y, term_list) - final_splited_text.append(new_expression) - else: - term_list = [temporal_kw.lower()] - y, new_expression = replace_token(text_tokens, y, term_list) - final_splited_text.append(new_expression) - else: - for tmp_kw in splited_n_gram_kw_list[0][:index_of_more_relevant]: - if tmp_kw.lower() in relevant_words_array: - term_list = [tmp_kw.lower()] - y, new_expression = replace_token(text_tokens, y, term_list) - final_splited_text.append(new_expression) - else: - final_splited_text.append(text_tokens[y]) - y += 1 - - else: - final_splited_text.append(text_tokens[y]) - y += 1 - new_text = ' '.join(final_splited_text) - - return new_text - - -def find_more_relevant(y, text_tokens, n_gram, relevant_words_array, kw_list, splited_n_gram_word_list): - temporary_list = [] - temporary_list_two = [] - for i in range(n_gram): - - temporary_list.append(text_tokens[y:y + i + 1]) - k = re.sub('''[!",:.;?()]$|^[!",':.;?()]|\W["!,:.;?()]''', '', ' '.join(temporary_list[i])).lower() - - if k.lower() in relevant_words_array: - temporary_list_two.append(k) - - n_gram_word_list = sorted(temporary_list_two, key=lambda x: relevant_words_array.index(x)) - - try: - kw_list.append(n_gram_word_list[0]) - splited_n_gram_word_list.append(n_gram_word_list[0].split()) - except: - pass - - return kw_list, splited_n_gram_word_list - - -def replace_token(text_tokens, y, n_gram_word_list): - txt = ' '.join(text_tokens[y:y + len(n_gram_word_list[0].split(' '))]) - - new_expression = txt.replace(re.sub('[!",:.;?()]$|^[!",:.;?()]|\W["!,:.;?()]', '', txt), '' + n_gram_word_list[0] + '') - y += len(n_gram_word_list[0].split(' ')) - return y, new_expression \ No newline at end of file