Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Paragraph #13

Merged
merged 7 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,11 +161,21 @@ Before applying tf-idf, articles containing any of the specified keywords in the

From the rest of the articles, to choose the most relevant ones, you can specify one of the following criteria in [config.json](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/config.json):

- Threshold for the tf-idf score value
- Maximum number of selected articles with the top scores
- Percentage of selected articles with the top scores
- Maximum number of selected articles with the top scores
- Threshold for the value of cosine similarity between the embeddings of the list of keywords and each article.


```commandline
"article_selector":
"article_selector":
{
"type": "percentage",
"value": "30"
},

OR

"article_selector":
{
"type": "threshold",
"value": "0.02"
Expand All @@ -186,14 +196,18 @@ python3 scripts/3_select_final_articles.py --input_dir "output/output_timestampe
```

### 5. Generate output
As the final step of the pipeline, the text of the selected articles is saved in a .csv file, which can be used for manual labeling. The user has the option to choose whether the text should be divided into paragraphs.
As the final step of the pipeline, the text of the selected articles is saved in a .csv file, which can be used for manual labeling. The user can choose whether the text should be kept as full text, divided into paragraphs, or segmented into groups of sentences.
This feature can be set in [config.json](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/config.json).
```commandline
"output_unit": "paragraph"

OR

"output_unit": "text"
"output_unit": "full_text"

OR
"output_unit": "segmented_text"
"sentences_per_segment": 10
```

```commandline
Expand Down
7 changes: 4 additions & 3 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
],
"article_selector":
{
"type": "threshold",
"value": "0.02"
"type": "percentage",
"value": "30"
},
"output_unit": "paragraph"
"output_unit": "segmented_text",
"sentences_per_segment": 10
}
6 changes: 6 additions & 0 deletions interest/article_final_selection/article_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,10 @@ def select_articles(self) -> List[int]:
num_articles = int(self.config["value"])
selected_indices.extend(sorted_indices[:num_articles])

elif self.config["type"] == "percentage":
percentage = float(self.config["value"])
num_articles = int(len(self.similarity_scores) *
(percentage / 100.0))
selected_indices.extend(sorted_indices[:num_articles])

return selected_indices
15 changes: 7 additions & 8 deletions interest/article_final_selection/process_article.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
text_cleaner = TextCleaner()


def clean(text: str) -> str:
def clean(text: Union[str, List[str]]) -> str:
"""
Clean the input text using TextCleaner.

Expand Down Expand Up @@ -47,13 +47,13 @@ def __init__(self, gzip_file_path: str, article_id: int):
self._body: Union[str, list, None] = ''
self.selected: bool = False

def read_article_from_gzip(self, in_paragraph: bool = False) -> (
Tuple)[Union[str, None], Union[str, list, None]]:
def read_article_from_gzip(self) -> (
Tuple)[Union[str, None], Union[List[str], None]]:
"""
Read article content from a gzip file.

Returns:
Tuple[Union[str, None], Union[str, None]]: A tuple containing
Tuple[Union[str, None], Union[list, None]]: A tuple containing
the title and body of the article.
"""
try:
Expand All @@ -63,7 +63,7 @@ def read_article_from_gzip(self, in_paragraph: bool = False) -> (
article = articles.get(str(self._article_id), {})
title = article.get('title', {})
body = article.get('body', {})
return title, body if in_paragraph else " ".join(body)
return title, body
except Exception as e: # pylint: disable=broad-except
logging.error("Error reading article %s from %s: %s",
str(self._article_id), self._file_path, e)
Expand All @@ -88,6 +88,5 @@ def process_article(self, clean_keywords: List[str]) -> str:
if title_with_keyword:
self.selected = True
return ""
if isinstance(self._body, str):
return clean(self._body)
return ""

return clean(self._body)
117 changes: 117 additions & 0 deletions interest/output_generator/text_formater.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
""" This module defines a TextFormatter class for formatting text based on
specified output units. """
from typing import List, Union
import logging
from interest.settings import SPACY_MODEL
from interest.utils import load_spacy_model

PARAGRAPH_FORMATTER = 'paragraph'
FULLTEXT_FORMATTER = 'full_text'
SEGMENTED_TEXT_FORMATTER = 'segmented_text'


class TextFormatter:
    # pylint: disable=R0903
    """Format article texts according to a configured output unit.

    Supported units:
        - 'paragraph': texts are returned as-is, one element per paragraph.
        - 'full_text': texts are joined into a single newline-separated string.
        - 'segmented_text': texts are split into sentences and regrouped into
          segments of ``sentences_per_segment`` sentences each.
    """

    def __init__(self, output_unit: str, sentences_per_segment: int,
                 spacy_model=SPACY_MODEL):  # : Union[str, Language]
        """
        Initializes the TextFormatter object.

        Args:
            output_unit (str): The type of output unit ('paragraph',
                'full_text', 'segmented_text').
            sentences_per_segment (int): Number of sentences per
                segment when output_unit is 'segmented_text'.
            spacy_model (Union[str, Language], optional): Spacy model
                or model name used for sentence splitting. Defaults to the
                global SPACY_MODEL value.
        """
        # Accept either a model name (loaded on demand) or an
        # already-instantiated spaCy Language object.
        self.nlp = (
            load_spacy_model(spacy_model)
            if isinstance(spacy_model, str)
            else spacy_model
        )
        self.sentences_per_segment = sentences_per_segment
        self.formatter = output_unit
        self.is_fulltext = self._is_fulltext()
        self.texts: List[str] = []

    def format_output(self, texts: Union[None, List[str]]) -> (
            Union)[str, List[str], None]:
        """
        Formats input texts based on the specified output unit.

        Args:
            texts (Union[None, List[str]]): List of input texts to be
                formatted.

        Returns:
            Union[str, List[str], None]: Formatted output based on the
            selected output_unit. For 'full_text', a single string.
            For 'paragraph' and 'segmented_text', a list of strings.
            None when the input is invalid or the formatter type is
            unsupported (the ValueError is logged, not propagated).
        """
        try:
            # isinstance(None, list) is False, so None input is already
            # rejected here; no separate None check is needed.
            if (not isinstance(texts, list) or
                    not all(isinstance(text, str) for text in texts)):
                raise ValueError("Input 'texts' must be a list of strings.")

            self.texts = texts

            if self.formatter == PARAGRAPH_FORMATTER:
                return self._format_paragraph()
            if self.formatter == FULLTEXT_FORMATTER:
                return self._format_fulltext()
            if self.formatter == SEGMENTED_TEXT_FORMATTER:
                return self._format_segmented_text()

        except ValueError as e:
            logging.error("Unsupported formatter %s: %s", self.formatter, e)
            return None
        # Unknown formatter type that slipped through without raising.
        return None

    def _format_paragraph(self) -> List[str]:
        """Formats texts as paragraphs.

        Returns:
            List[str]: The input texts unchanged, one paragraph per element.
        """
        return self.texts

    def _format_fulltext(self) -> str:
        """Formats texts as full text with newline separators.

        Returns:
            str: Newline-separated string of input texts.
        """
        return '\n'.join(self.texts)

    def _format_segmented_text(self) -> List[str]:
        """Formats texts as segments of sentences_per_segment sentences.

        Returns:
            List[str]: One string per segment, each containing up to
            sentences_per_segment space-joined sentences.
        """
        segmented_texts = []
        for text in self.texts:
            doc = self.nlp(text)
            sentences = [sent.text for sent in doc.sents]

            for i in range(0, len(sentences), self.sentences_per_segment):
                segment = sentences[i:i + self.sentences_per_segment]
                # Join the chunk into ONE string per segment. The previous
                # extend(segment) flattened the list back to one sentence
                # per element, which made sentences_per_segment a no-op.
                segmented_texts.append(' '.join(segment))

        return segmented_texts

    def _is_fulltext(self) -> bool:
        """Checks if the formatter type is 'full_text'.

        Returns:
            bool: True if formatter is 'full_text', False otherwise.
        """
        return self.formatter == FULLTEXT_FORMATTER
27 changes: 21 additions & 6 deletions interest/preprocessor/text_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,28 @@
data using various cleaning techniques.
"""
import re
# from typing import Optional
from typing import Union, List
from interest.settings import SPACY_MODEL
from interest.utils import load_spacy_model


def merge_texts_list(text: Union[str, List[str]]) -> str:
    """
    Join a list of texts into one space-separated string.

    Args:
        text (Union[str, List[str]]): A single text or a list of texts.

    Returns:
        str: The space-joined string when given a list; otherwise the
        input returned unchanged.
    """
    if not isinstance(text, list):
        return text
    return ' '.join(text)


class TextCleaner:
"""A class for cleaning text data using various preprocessing
techniques."""
Expand Down Expand Up @@ -82,15 +99,12 @@ def preprocess(self, text):
"""Preprocess the given text using a series of cleaning steps.

Args:
text (str): The text to preprocess.
text ( List[str]): The text to preprocess.

Returns:
str: The preprocessed text.
"""
self.text = text
# self.get_words()
# self.lower()
# self.remove_stopwords()
self.text = merge_texts_list(text)
self.get_lower_lemma_tokens()
self.remove_numeric()
self.remove_extra_whitespace_tabs()
Expand All @@ -107,6 +121,7 @@ def clean(self, text):
Returns:
str: The cleaned text.
"""
self.text = merge_texts_list(text)
self.text = text
self.get_words()
self.keep_standard_chars()
Expand Down
43 changes: 7 additions & 36 deletions interest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def load_spacy_model(model_name: str, retry: bool = True) \
spacy.cli.download(model_name)
return load_spacy_model(model_name, False)
raise exc
nlp.add_pipe("sentencizer")
return nlp


Expand Down Expand Up @@ -106,60 +107,30 @@ def get_keywords_from_config(config_file: Path) -> List[str]:
raise KeyError("Keywords not found in config file") from exc


def get_article_selector_from_config(config_file: Path) -> dict:
def read_config(config_file: Path, item_key: str) -> Dict[str, str]:
"""
Get the article selector configuration from a JSON file.
Get the value of the given key item from a JSON file.

Args:
config_file (Path): The path to the JSON config file.

Returns:
Dict[str, str]: The article selector configuration.

Raises:
ArticleSelectorNotFoundError: If the article selector
is not found in the config file.
FileNotFoundError: If the config file is not found.
"""
try:
with open(config_file, 'r', encoding=ENCODING) as f:
config: Dict[str, str] = json.load(f)["article_selector"]
if not config:
raise ValueError("Config is empty")
return config
except FileNotFoundError as exc:
raise FileNotFoundError("Config file not found") from exc
except KeyError as exc:
raise KeyError("Article selector not found in config file") \
from exc


def get_output_unit_from_config(config_file: Path) -> dict:
"""
Get the article selector configuration from a JSON file.

Args:
config_file (Path): The path to the JSON config file.

item_key (str): Key item defined in config file.
Returns:
Dict[str, str]: The article selector configuration.

Raises:
ArticleSelectorNotFoundError: If the article selector
is not found in the config file.
KeyError: If the key item is not found in the config file.
FileNotFoundError: If the config file is not found.
"""
try:
with open(config_file, 'r', encoding=ENCODING) as f:
config: Dict[str, str] = json.load(f)["output_unit"]
config: Dict[str, str] = json.load(f)[item_key]
if not config:
raise ValueError("Config is empty")
return config
except FileNotFoundError as exc:
raise FileNotFoundError("Config file not found") from exc
except KeyError as exc:
raise KeyError("Article selector not found in config file") \
from exc
raise KeyError("Key item %s not found in config file") from exc


def save_filtered_articles(input_file: Any, article_id: str,
Expand Down
2 changes: 1 addition & 1 deletion scripts/step1_filter_articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
help="Glob pattern for find input files; e.g. '*.gz' ",
)
parser.add_argument(
"--config_path",
"--config-path",
type=Path,
default="config.json",
help="File path of config file.",
Expand Down
Loading
Loading