Paragraph (#13)

* add segmented_text to list of output_units * add percentage to the output options * change parameter name * modify readme * fix linting error * fix pylint error * fix flake8 --------- Co-authored-by: parisa-zahedi <p.zahedi@uu.nl>
UtrechtUniversity · Apr 11, 2024 · 01702c2 · 01702c2
1 parent b003648
commit 01702c2
Show file tree

Hide file tree

Showing 10 changed files with 234 additions and 85 deletions.
diff --git a/README.md b/README.md
@@ -161,11 +161,21 @@ Before applying tf-idf, articles containing any of the specified keywords in the
 
 From the rest of articles, to choose the most relevant ones, you can specify one of the following criteria in [config.py](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/config.json):
 
-- Threshold for the tf-idf score value
-- Maximum number of selected articles with the top scores
+- Percentage of articles to select, taken from those with the top scores
+- Maximum number of articles to select, taken from those with the top scores
+- Threshold for the value of cosine similarity between the embeddings of the list of keywords and each article
+
 
 ```commandline
-"article_selector":
+  "article_selector":
+    {
+      "type": "percentage",
+      "value": "30"
+    },
+    
+    OR
+  
+  "article_selector":
     {
       "type": "threshold",
       "value": "0.02"
@@ -186,14 +196,18 @@ python3 scripts/3_select_final_articles.py --input_dir "output/output_timestampe
 ```
 
 ### 5. Generate output
-As the final step of the pipeline, the text of the selected articles is saved in a .csv file, which can be used for manual labeling. The user has the option to choose whether the text should be divided into paragraphs.
+As the final step of the pipeline, the text of the selected articles is saved in a .csv file, which can be used for manual labeling. The user can choose whether the text is saved as a whole, divided into paragraphs, or divided into segments of a fixed number of sentences.
 This feature can be set in [config.py](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/config.json).
 ```commandline
 "output_unit": "paragraph"
 
 OR
 
-"output_unit": "text"
+"output_unit": "full_text"
+
+OR
+"output_unit": "segmented_text",
+"sentences_per_segment": 10
 ```
 
 ```commandline

diff --git a/config.json b/config.json
@@ -10,8 +10,9 @@
   ],
   "article_selector":
     {
-      "type": "threshold",
-      "value": "0.02"
+      "type": "percentage",
+      "value": "30"
     },
-  "output_unit": "paragraph"
+  "output_unit": "segmented_text",
+  "sentences_per_segment": 10
 }
diff --git a/interest/article_final_selection/article_selector.py b/interest/article_final_selection/article_selector.py
@@ -45,4 +45,10 @@ def select_articles(self) -> List[int]:
             num_articles = int(self.config["value"])
             selected_indices.extend(sorted_indices[:num_articles])
 
+        elif self.config["type"] == "percentage":
+            percentage = float(self.config["value"])
+            num_articles = int(len(self.similarity_scores) *
+                               (percentage / 100.0))
+            selected_indices.extend(sorted_indices[:num_articles])
+
         return selected_indices
diff --git a/interest/article_final_selection/process_article.py b/interest/article_final_selection/process_article.py
@@ -8,7 +8,7 @@
 text_cleaner = TextCleaner()
 
 
-def clean(text: str) -> str:
+def clean(text:  Union[str, List[str]]) -> str:
     """
     Clean the input text using TextCleaner.
 
@@ -47,13 +47,13 @@ def __init__(self, gzip_file_path: str, article_id: int):
         self._body: Union[str, list, None] = ''
         self.selected: bool = False
 
-    def read_article_from_gzip(self, in_paragraph: bool = False) -> (
-            Tuple)[Union[str, None], Union[str, list, None]]:
+    def read_article_from_gzip(self) -> (
+            Tuple)[Union[str, None], Union[List[str], None]]:
         """
         Read article content from a gzip file.
 
         Returns:
-            Tuple[Union[str, None], Union[str, None]]: A tuple containing
+            Tuple[Union[str, None], Union[list, None]]: A tuple containing
             the title and body of the article.
         """
         try:
@@ -63,7 +63,7 @@ def read_article_from_gzip(self, in_paragraph: bool = False) -> (
                 article = articles.get(str(self._article_id), {})
                 title = article.get('title', {})
                 body = article.get('body', {})
-                return title, body if in_paragraph else " ".join(body)
+                return title, body
         except Exception as e:  # pylint: disable=broad-except
             logging.error("Error reading article %s from %s: %s",
                           str(self._article_id), self._file_path, e)
@@ -88,6 +88,5 @@ def process_article(self, clean_keywords: List[str]) -> str:
         if title_with_keyword:
             self.selected = True
             return ""
-        if isinstance(self._body, str):
-            return clean(self._body)
-        return ""
+
+        return clean(self._body)
diff --git a/interest/output_generator/text_formater.py b/interest/output_generator/text_formater.py
@@ -0,0 +1,117 @@
+""" This module defines a TextFormatter class for formatting text based on
+specified output units. """
+from typing import List, Union
+import logging
+from interest.settings import SPACY_MODEL
+from interest.utils import load_spacy_model
+
+PARAGRAPH_FORMATTER = 'paragraph'  # output_unit: keep input texts as a list
+FULLTEXT_FORMATTER = 'full_text'  # output_unit: join texts into one string
+SEGMENTED_TEXT_FORMATTER = 'segmented_text'  # output_unit: sentence segments
+
+
+class TextFormatter:
+    # pylint: disable=R0903
+    """Class for formatting a list of texts into the selected output unit."""
+
+    def __init__(self, output_unit: str, sentences_per_segment: int,
+                 spacy_model=SPACY_MODEL):  # : Union[str, Language]
+        """
+        Initializes the TextFormatter object.
+
+        Args:
+            output_unit (str): The type of output unit ('paragraph',
+             'full_text', 'segmented_text').
+            sentences_per_segment (int): Number of sentences joined into
+             one segment when output_unit is 'segmented_text'.
+            spacy_model (Union[str, Language], optional): Spacy model
+             or model name used for text processing. Defaults to the global
+             SPACY_MODEL value.
+        """
+        self.nlp = (
+            load_spacy_model(spacy_model)
+            if isinstance(spacy_model, str)
+            else spacy_model
+        )
+        self.sentences_per_segment = sentences_per_segment
+        self.formatter = output_unit
+        self.is_fulltext = self._is_fulltext()
+        self.texts: List[str] = []
+
+    def format_output(self, texts: Union[None, List[str]]) -> (
+            Union)[str, List[str], None]:
+        """
+        Formats input texts based on the specified output unit.
+
+        Args:
+            texts (List[str]): List of input texts to be formatted.
+
+        Returns:
+            Union[str, List[str], None]: For 'full_text', a single
+            newline-separated string. For 'paragraph', the input list
+            unchanged. For 'segmented_text', a list holding one string
+            per segment. None if 'texts' is not a list of strings, if
+            formatting raises a ValueError, or if the output unit is
+            unsupported; these cases are logged instead of raised.
+        """
+        try:
+            # isinstance() is False for None, so no extra None check.
+            if (not isinstance(texts, list) or
+                    not all(isinstance(text, str) for text in texts)):
+                raise ValueError("Input 'texts' must be a list of strings.")
+
+            self.texts = texts
+
+            if self.formatter == PARAGRAPH_FORMATTER:
+                return self._format_paragraph()
+            if self.formatter == FULLTEXT_FORMATTER:
+                return self._format_fulltext()
+            if self.formatter == SEGMENTED_TEXT_FORMATTER:
+                return self._format_segmented_text()
+
+        except ValueError as e:
+            logging.error("Cannot format texts as %s: %s", self.formatter, e)
+            return None
+        logging.error("Unsupported formatter %s", self.formatter)
+        return None
+
+    def _format_paragraph(self) -> List[str]:
+        """Returns the input texts unchanged, one list item per text.
+
+        Returns:
+            List[str]: The list of input texts as given.
+        """
+        return self.texts
+
+    def _format_fulltext(self) -> str:
+        """Formats texts as full text with newline separators.
+
+        Returns:
+            str: Newline-separated string of input texts.
+        """
+        return '\n'.join(self.texts)
+
+    def _format_segmented_text(self) -> List[str]:
+        """Formats texts as segments of sentences_per_segment sentences.
+
+        Returns:
+            List[str]: One space-joined string per segment, in input order.
+        """
+        step = self.sentences_per_segment
+        segmented_texts = []
+        for text in self.texts:
+            doc = self.nlp(text)
+            sentences = [sent.text for sent in doc.sents]
+            # Join every chunk so one list item is one whole segment.
+            for i in range(0, len(sentences), step):
+                segmented_texts.append(' '.join(sentences[i:i + step]))
+
+        return segmented_texts
+
+    def _is_fulltext(self) -> bool:
+        """Checks if the formatter type is 'full_text'.
+
+        Returns:
+            bool: True if formatter is 'full_text', False otherwise.
+        """
+        return self.formatter == FULLTEXT_FORMATTER
diff --git a/interest/preprocessor/text_cleaner.py b/interest/preprocessor/text_cleaner.py
@@ -3,11 +3,28 @@
 data using various cleaning techniques.
 """
 import re
-# from typing import Optional
+from typing import Union, List
 from interest.settings import SPACY_MODEL
 from interest.utils import load_spacy_model
 
 
+def merge_texts_list(text: Union[str, List[str]]) -> str:
+    """
+    Collapse a list of texts into one space-separated string.
+
+    Args:
+        text (Union[str, List[str]]): A single text or a list of texts.
+
+    Returns:
+        str: The list items joined with single spaces when a list is
+        given; any other input is handed back untouched.
+    """
+    if not isinstance(text, list):
+        return text
+    joined = ' '.join(text)
+    return joined
+
+
 class TextCleaner:
     """A class for cleaning text data using various preprocessing
        techniques."""
@@ -82,15 +99,12 @@ def preprocess(self, text):
         """Preprocess the given text using a series of cleaning steps.
 
         Args:
-            text (str): The text to preprocess.
+            text ( List[str]): The text to preprocess.
 
         Returns:
             str: The preprocessed text.
         """
-        self.text = text
-        # self.get_words()
-        # self.lower()
-        # self.remove_stopwords()
+        self.text = merge_texts_list(text)
         self.get_lower_lemma_tokens()
         self.remove_numeric()
         self.remove_extra_whitespace_tabs()
@@ -107,6 +121,7 @@ def clean(self, text):
         Returns:
             str: The cleaned text.
         """
+        self.text = merge_texts_list(text)
         self.text = text
         self.get_words()
         self.keep_standard_chars()

diff --git a/interest/utils.py b/interest/utils.py
@@ -40,6 +40,7 @@ def load_spacy_model(model_name: str, retry: bool = True) \
             spacy.cli.download(model_name)
             return load_spacy_model(model_name, False)
         raise exc
+    nlp.add_pipe("sentencizer")
     return nlp
 
 
@@ -106,60 +107,30 @@ def get_keywords_from_config(config_file: Path) -> List[str]:
         raise KeyError("Keywords not found in config file") from exc
 
 
-def get_article_selector_from_config(config_file: Path) -> dict:
+def read_config(config_file: Path, item_key: str) -> Dict[str, str]:
     """
-        Get the article selector configuration from a JSON file.
+        Get the value of the given key item from a JSON file.
 
         Args:
             config_file (Path): The path to the JSON config file.
-
-        Returns:
-            Dict[str, str]: The article selector configuration.
-
-        Raises:
-            ArticleSelectorNotFoundError: If the article selector
-            is not found in the config file.
-            FileNotFoundError: If the config file is not found.
-    """
-    try:
-        with open(config_file, 'r', encoding=ENCODING) as f:
-            config: Dict[str, str] = json.load(f)["article_selector"]
-        if not config:
-            raise ValueError("Config is empty")
-        return config
-    except FileNotFoundError as exc:
-        raise FileNotFoundError("Config file not found") from exc
-    except KeyError as exc:
-        raise KeyError("Article selector not found in config file") \
-            from exc
-
-
-def get_output_unit_from_config(config_file: Path) -> dict:
-    """
-        Get the article selector configuration from a JSON file.
-
-        Args:
-            config_file (Path): The path to the JSON config file.
-
+            item_key (str): Key item defined in config file.
         Returns:
             Dict[str, str]: The article selector configuration.
 
         Raises:
-            ArticleSelectorNotFoundError: If the article selector
-            is not found in the config file.
+            KeyError: If the key item is not found in the config file.
             FileNotFoundError: If the config file is not found.
+            ValueError: If the value stored under the key item is empty.
     """
     try:
         with open(config_file, 'r', encoding=ENCODING) as f:
-            config: Dict[str, str] = json.load(f)["output_unit"]
+            config: Dict[str, str] = json.load(f)[item_key]
         if not config:
             raise ValueError("Config is empty")
         return config
     except FileNotFoundError as exc:
         raise FileNotFoundError("Config file not found") from exc
     except KeyError as exc:
-        raise KeyError("Article selector not found in config file") \
-            from exc
+        raise KeyError(
+            f"Key item {item_key} not found in config file") from exc
 
 
 def save_filtered_articles(input_file: Any, article_id: str,

diff --git a/scripts/step1_filter_articles.py b/scripts/step1_filter_articles.py
@@ -29,7 +29,7 @@
         help="Glob pattern for find input files; e.g. '*.gz' ",
     )
     parser.add_argument(
-        "--config_path",
+        "--config-path",
         type=Path,
         default="config.json",
         help="File path of config file.",