From a48aac7faa54a89b68a9fd9159fae0f56d479549 Mon Sep 17 00:00:00 2001 From: parisa-zahedi Date: Mon, 15 Apr 2024 14:35:40 +0200 Subject: [PATCH] Fix output (#14) * fix the error that output was splitted by comma * fix doc * fix flake8 error --------- Co-authored-by: parisa-zahedi --- interest/output_generator/text_formater.py | 16 ++++++++-------- scripts/step4_generate_output.py | 6 ++++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/interest/output_generator/text_formater.py b/interest/output_generator/text_formater.py index 42ee92b..93bb85b 100644 --- a/interest/output_generator/text_formater.py +++ b/interest/output_generator/text_formater.py @@ -39,7 +39,7 @@ def __init__(self, output_unit: str, sentences_per_segment: int, self.texts: List[str] = [] def format_output(self, texts: Union[None, List[str]]) -> ( - Union)[str, List[str], None]: + Union)[str, List[str], List[List[str]], None]: """ Formats input texts based on the specified output unit. @@ -47,10 +47,10 @@ def format_output(self, texts: Union[None, List[str]]) -> ( texts (List[str]): List of input texts to be formatted. Returns: - Union[str, List[List[str]]]: Formatted output text based on the - selected output_unit. For 'full_text', returns a single string. - For 'paragraph' and 'segmented_text', returns a list of segmented - text lists. + Union[str, List[str], List[List[str]]]: Formatted output text + based on the selected output_unit. For 'full_text', returns a + single string. For 'paragraph' and 'segmented_text', returns a + list of segmented text lists. Raises: ValueError: If input 'texts' is not a list of strings. @@ -91,11 +91,11 @@ def _format_fulltext(self) -> str: """ return '\n'.join(self.texts) - def _format_segmented_text(self) -> List[str]: + def _format_segmented_text(self) -> List[List[str]]: """Formats texts as segmented text based on sentences_per_segment. Returns: - List[str]: Flattened list of segmented text strings. + List[List[str]]: Flattened list of segmented text strings. """ segmented_texts = [] for text in self.texts: @@ -104,7 +104,7 @@ def _format_segmented_text(self) -> List[str]: for i in range(0, len(sentences), self.sentences_per_segment): segment = sentences[i:i + self.sentences_per_segment] - segmented_texts.extend(segment) + segmented_texts.append(segment) return segmented_texts diff --git a/scripts/step4_generate_output.py b/scripts/step4_generate_output.py index ad89f61..b9904c0 100644 --- a/scripts/step4_generate_output.py +++ b/scripts/step4_generate_output.py @@ -135,6 +135,8 @@ def find_articles_in_file(filepath: str, formatter: TextFormatter) -> ( spacy_model=SPACY_MODEL) for articles_filepath in args.input_dir.rglob(args.glob): df = find_articles_in_file(articles_filepath, text_formatter) + if df is None: + continue file_name = get_file_name_without_extension(articles_filepath) - df.to_csv(os.path.join(args.output_dir, 'articles_to_label_'+file_name+'.csv'), index = False) - + df.to_csv(os.path.join(args.output_dir, 'articles_to_label_' + + file_name+'.csv'), index=False)