Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Paragraph #13

Merged
merged 7 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
],
"article_selector":
{
"type": "threshold",
"value": "0.02"
"type": "percentage",
"value": "30"
},
"output_unit": "paragraph"
"output_unit": "segmented_text",
"sentences_per_segment": 10
}
5 changes: 5 additions & 0 deletions interest/article_final_selection/article_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,9 @@ def select_articles(self) -> List[int]:
num_articles = int(self.config["value"])
selected_indices.extend(sorted_indices[:num_articles])

elif self.config["type"] == "percentage":
percentage = float(self.config["value"])
num_articles = int(len(self.similarity_scores) * (percentage / 100.0))
selected_indices.extend(sorted_indices[:num_articles])

return selected_indices
15 changes: 7 additions & 8 deletions interest/article_final_selection/process_article.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
text_cleaner = TextCleaner()


def clean(text: str) -> str:
def clean(text: Union[str, List[str]]) -> str:
"""
Clean the input text using TextCleaner.

Expand Down Expand Up @@ -47,13 +47,13 @@ def __init__(self, gzip_file_path: str, article_id: int):
self._body: Union[str, list, None] = ''
self.selected: bool = False

def read_article_from_gzip(self, in_paragraph: bool = False) -> (
Tuple)[Union[str, None], Union[str, list, None]]:
def read_article_from_gzip(self) -> (
Tuple)[Union[str, None], Union[List[str], None]]:
"""
Read article content from a gzip file.

Returns:
Tuple[Union[str, None], Union[str, None]]: A tuple containing
Tuple[Union[str, None], Union[list, None]]: A tuple containing
the title and body of the article.
"""
try:
Expand All @@ -63,7 +63,7 @@ def read_article_from_gzip(self, in_paragraph: bool = False) -> (
article = articles.get(str(self._article_id), {})
title = article.get('title', {})
body = article.get('body', {})
return title, body if in_paragraph else " ".join(body)
return title, body
except Exception as e: # pylint: disable=broad-except
logging.error("Error reading article %s from %s: %s",
str(self._article_id), self._file_path, e)
Expand All @@ -88,6 +88,5 @@ def process_article(self, clean_keywords: List[str]) -> str:
if title_with_keyword:
self.selected = True
return ""
if isinstance(self._body, str):
return clean(self._body)
return ""

return clean(self._body)
115 changes: 115 additions & 0 deletions interest/output_generator/text_formater.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
""" This module defines a TextFormatter class for formatting text based on
specified output units. """
from typing import List, Union
from spacy.language import Language
from interest.settings import SPACY_MODEL
from interest.utils import load_spacy_model
import logging

PARAGRAPH_FORMATTER = 'paragraph'
FULLTEXT_FORMATTER = 'full_text'
SEGMENTED_TEXT_FORMATTER = 'segmented_text'


class TextFormatter:
    # pylint: disable=R0903
    """Class for formatting text based on specified output units."""

    def __init__(self, output_unit: str, sentences_per_segment: int,
                 spacy_model: Union[str, Language] = SPACY_MODEL):
        """
        Initializes the TextFormatter object.

        Args:
            output_unit (str): The type of output unit ('paragraph',
                'full_text', 'segmented_text').
            sentences_per_segment (int): Number of sentences per
                segment when output_unit is 'segmented_text'.
            spacy_model (Union[str, Language], optional): Spacy model
                or model name used for text processing. Defaults to the
                global SPACY_MODEL value.
        """
        # NOTE(review): the spacy model is loaded for every output unit even
        # though only _format_segmented_text uses self.nlp — confirm whether
        # lazy loading is acceptable for callers before changing this.
        self.nlp = (
            load_spacy_model(spacy_model)
            if isinstance(spacy_model, str)
            else spacy_model
        )
        self.sentences_per_segment = sentences_per_segment
        self.formatter = output_unit
        self.is_fulltext = self._is_fulltext()
        self.texts: List[str] = []

    def format_output(self, texts: Union[None, List[str]]
                      ) -> Union[str, List[str], None]:
        """
        Formats input texts based on the specified output unit.

        Args:
            texts (Union[None, List[str]]): List of input texts to be
                formatted.

        Returns:
            Union[str, List[str], None]: For 'full_text', a single
            newline-joined string. For 'paragraph' and 'segmented_text',
            a list of strings. None when validation fails or the output
            unit is unsupported (the error is logged instead of raised).
        """
        try:
            # `not isinstance(texts, list)` already rejects None, so the
            # previous separate `texts is None` clause was unreachable.
            if (not isinstance(texts, list)
                    or not all(isinstance(text, str) for text in texts)):
                raise ValueError("Input 'texts' must be a list of strings.")

            self.texts = texts

            if self.formatter == PARAGRAPH_FORMATTER:
                return self._format_paragraph()
            if self.formatter == FULLTEXT_FORMATTER:
                return self._format_fulltext()
            if self.formatter == SEGMENTED_TEXT_FORMATTER:
                return self._format_segmented_text()

            # Previously an unknown output unit fell through the `try`
            # silently and returned None with no diagnostic; raise so the
            # misconfiguration is logged below.
            raise ValueError(f"Unknown output unit '{self.formatter}'.")
        except ValueError as e:
            logging.error("Unsupported formatter %s: %s", self.formatter, e)
            return None

    def _format_paragraph(self) -> List[str]:
        """Formats texts as paragraphs (one string per input text).

        Returns:
            List[str]: The input texts, unchanged, one paragraph each.
        """
        return self.texts

    def _format_fulltext(self) -> str:
        """Formats texts as full text with newline separators.

        Returns:
            str: Newline-separated string of input texts.
        """
        return '\n'.join(self.texts)

    def _format_segmented_text(self) -> List[str]:
        """Formats texts as segmented text based on sentences_per_segment.

        Each input text is split into sentences with spacy; sentences are
        then emitted in order, so the result is a flattened list of
        sentence strings grouped by segments of `sentences_per_segment`.

        Returns:
            List[str]: Flattened list of segmented text strings.
        """
        segmented_texts = []
        for text in self.texts:
            doc = self.nlp(text)
            sentences = [sent.text for sent in doc.sents]

            for i in range(0, len(sentences), self.sentences_per_segment):
                segment = sentences[i:i + self.sentences_per_segment]
                segmented_texts.extend(segment)

        return segmented_texts

    def _is_fulltext(self) -> bool:
        """Checks if the formatter type is 'full_text'.

        Returns:
            bool: True if formatter is 'full_text', False otherwise.
        """
        return self.formatter == FULLTEXT_FORMATTER
27 changes: 21 additions & 6 deletions interest/preprocessor/text_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,28 @@
data using various cleaning techniques.
"""
import re
# from typing import Optional
from typing import Union, List
from interest.settings import SPACY_MODEL
from interest.utils import load_spacy_model


def merge_texts_list(text: Union[str, List[str]]) -> str:
    """
    Collapse a list of texts into one space-separated string.

    Args:
        text (Union[str, List[str]]): A single text, or a list of texts
            to be joined.

    Returns:
        str: The space-joined string when given a list; otherwise the
        input text unchanged.
    """
    if not isinstance(text, list):
        return text
    return ' '.join(text)


class TextCleaner:
"""A class for cleaning text data using various preprocessing
techniques."""
Expand Down Expand Up @@ -82,15 +99,12 @@ def preprocess(self, text):
"""Preprocess the given text using a series of cleaning steps.

Args:
text (str): The text to preprocess.
text ( List[str]): The text to preprocess.

Returns:
str: The preprocessed text.
"""
self.text = text
# self.get_words()
# self.lower()
# self.remove_stopwords()
self.text = merge_texts_list(text)
self.get_lower_lemma_tokens()
self.remove_numeric()
self.remove_extra_whitespace_tabs()
Expand All @@ -107,6 +121,7 @@ def clean(self, text):
Returns:
str: The cleaned text.
"""
self.text = merge_texts_list(text)
self.text = text
self.get_words()
self.keep_standard_chars()
Expand Down
43 changes: 7 additions & 36 deletions interest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def load_spacy_model(model_name: str, retry: bool = True) \
spacy.cli.download(model_name)
return load_spacy_model(model_name, False)
raise exc
nlp.add_pipe("sentencizer")
return nlp


Expand Down Expand Up @@ -106,60 +107,30 @@ def get_keywords_from_config(config_file: Path) -> List[str]:
raise KeyError("Keywords not found in config file") from exc


def read_config(config_file: Path, item_key: str) -> Dict[str, str]:
    """
    Get the value of the given key item from a JSON config file.

    Args:
        config_file (Path): The path to the JSON config file.
        item_key (str): Key of the item to read from the config file.

    Returns:
        Dict[str, str]: The configuration stored under ``item_key``.

    Raises:
        ValueError: If the configuration under ``item_key`` is empty.
        KeyError: If the key item is not found in the config file.
        FileNotFoundError: If the config file is not found.
    """
    try:
        with open(config_file, 'r', encoding=ENCODING) as f:
            config: Dict[str, str] = json.load(f)[item_key]
            if not config:
                raise ValueError("Config is empty")
            return config
    except FileNotFoundError as exc:
        raise FileNotFoundError("Config file not found") from exc
    except KeyError as exc:
        # The original message contained a literal '%s' that was never
        # substituted; use an f-string so the missing key is reported.
        raise KeyError(f"Key item {item_key} not found in config file") \
            from exc


def save_filtered_articles(input_file: Any, article_id: str,
Expand Down
2 changes: 1 addition & 1 deletion scripts/step1_filter_articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
help="Glob pattern for find input files; e.g. '*.gz' ",
)
parser.add_argument(
"--config_path",
"--config-path",
type=Path,
default="config.json",
help="File path of config file.",
Expand Down
12 changes: 7 additions & 5 deletions scripts/step3_select_final_articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
from pathlib import Path
import pandas as pd
from interest.utils import get_keywords_from_config
from interest.utils import get_article_selector_from_config
from interest.utils import read_config
from interest.article_final_selection.process_articles import select_articles

ARTICLE_SELECTOR_FIELD = "article_selector"


def update_selected_indices_in_file(filepath: str,
indices_selected: List[int]) -> None:
Expand Down Expand Up @@ -42,7 +44,7 @@ def update_selected_indices_in_file(filepath: str,
parser = argparse.ArgumentParser("Select final articles.")

parser.add_argument(
"--input_dir",
"--input-dir",
type=Path,
required=True,
help="Base directory for reading input files.",
Expand All @@ -54,7 +56,7 @@ def update_selected_indices_in_file(filepath: str,
help="Glob pattern for find input files; e.g. '*.csv'.",
)
parser.add_argument(
"--config_path",
"--config-path",
type=Path,
default="config.json",
help="File path of config file.",
Expand All @@ -66,8 +68,8 @@ def update_selected_indices_in_file(filepath: str,
parser.error(f"Not a directory: '{str(args.input_dir.absolute())}'")

keywords = get_keywords_from_config(args.config_path)
config_article_selector = get_article_selector_from_config(
args.config_path)
config_article_selector = read_config(
args.config_path, ARTICLE_SELECTOR_FIELD)

if (len(keywords) > 0) and config_article_selector:
for articles_filepath in args.input_dir.rglob(args.glob):
Expand Down
Loading
Loading