From 5f23f5da05739f3dde80c9cb85142d708b33872b Mon Sep 17 00:00:00 2001
From: parisa-zahedi
Date: Thu, 19 Dec 2024 16:08:03 +0100
Subject: [PATCH] average tf-idf score of keywords
---
dataQuest/article_final_selection/process_articles.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/dataQuest/article_final_selection/process_articles.py b/dataQuest/article_final_selection/process_articles.py
index d1f1419..43f0546 100644
--- a/dataQuest/article_final_selection/process_articles.py
+++ b/dataQuest/article_final_selection/process_articles.py
@@ -3,6 +3,7 @@
and similarity scores.
"""
from typing import List, Tuple, Dict, Union
+import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from dataQuest.models.tfidf import TfidfEmbedder
@@ -51,13 +52,15 @@ def apply_tfidf_similarity(documents: List[str], keywords: List[str]) -> (
Returns:
List[float]: A list of similarity scores.
"""
- model = TfidfEmbedder(ngram_max=1, norm="l1", sublinear_tf=False, min_df=1,
+ model = TfidfEmbedder(ngram_max=1, norm="l2", sublinear_tf=False, min_df=1,
max_df=1.0)
keywords_list = [" ".join(keywords)]
model.fit(documents)
embeddings_documents = model.transform(documents).tocsr()
embeddings_keywords = model.transform(keywords_list).tocsr()
- similarity_scores = cosine_similarity(embeddings_keywords,
+ avg_keywords_embedding = embeddings_keywords.mean(axis=0)
+ avg_keywords_embedding = np.asarray(avg_keywords_embedding).flatten()
+ similarity_scores = cosine_similarity([avg_keywords_embedding],
embeddings_documents)
return similarity_scores[0]