From 5f23f5da05739f3dde80c9cb85142d708b33872b Mon Sep 17 00:00:00 2001 From: parisa-zahedi Date: Thu, 19 Dec 2024 16:08:03 +0100 Subject: [PATCH] average tf-idf score of keywords --- dataQuest/article_final_selection/process_articles.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dataQuest/article_final_selection/process_articles.py b/dataQuest/article_final_selection/process_articles.py index d1f1419..43f0546 100644 --- a/dataQuest/article_final_selection/process_articles.py +++ b/dataQuest/article_final_selection/process_articles.py @@ -3,6 +3,7 @@ and similarity scores. """ from typing import List, Tuple, Dict, Union +import numpy as np import pandas as pd from sklearn.metrics.pairwise import cosine_similarity from dataQuest.models.tfidf import TfidfEmbedder @@ -51,13 +52,15 @@ def apply_tfidf_similarity(documents: List[str], keywords: List[str]) -> ( Returns: List[float]: A list of similarity scores. """ - model = TfidfEmbedder(ngram_max=1, norm="l1", sublinear_tf=False, min_df=1, + model = TfidfEmbedder(ngram_max=1, norm="l2", sublinear_tf=False, min_df=1, max_df=1.0) keywords_list = [" ".join(keywords)] model.fit(documents) embeddings_documents = model.transform(documents).tocsr() embeddings_keywords = model.transform(keywords_list).tocsr() - similarity_scores = cosine_similarity(embeddings_keywords, + avg_keywords_embedding = embeddings_keywords.mean(axis=0) + avg_keywords_embedding = np.asarray(avg_keywords_embedding).flatten() + similarity_scores = cosine_similarity([avg_keywords_embedding], embeddings_documents) return similarity_scores[0]