Skip to content

Commit

Permalink
save seperate csv file for each timeperiod
Browse files Browse the repository at this point in the history
  • Loading branch information
parisa-zahedi committed Apr 12, 2024
1 parent 5dad77d commit 122d367
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 7 deletions.
16 changes: 16 additions & 0 deletions interest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,19 @@ def save_filtered_articles(input_file: Any, article_id: str,
print('output_fp', output_fp)
with open(output_fp, "w", encoding=ENCODING) as json_file:
json.dump(data, json_file, indent=4)


def get_file_name_without_extension(full_path: str) -> str:
"""
Extracts the file name without extension from a full path.
Args:
full_path (str): The full path of the file.
Returns:
str: The file name without extension.
"""
base_name = os.path.basename(full_path)
file_name_without_ext = os.path.splitext(base_name)[0]
return file_name_without_ext
11 changes: 4 additions & 7 deletions scripts/step4_generate_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
from pandas import DataFrame
from interest.settings import SPACY_MODEL
from interest.article_final_selection.process_article import ArticleProcessor
from interest.utils import read_config
from interest.utils import read_config, get_file_name_without_extension
from interest.output_generator.text_formater import (TextFormatter,
SEGMENTED_TEXT_FORMATTER)


FILE_PATH_FIELD = "file_path"
ARTICLE_ID_FIELD = "article_id"
TITLE_FIELD = "title"
ARTICLE_ID_FIELD = "article_id"
BODY_FIELD = "body"
LABEL_FIELD = "label"
SELECTED_FIELD = "selected"
Expand Down Expand Up @@ -130,14 +130,11 @@ def find_articles_in_file(filepath: str, formatter: TextFormatter) -> (
SENTENCES_PER_SEGMENT = str(read_config(args.config_path,
SENTENCE_PER_SEGMENT_KEY))

result_df = pd.DataFrame(columns=[FILE_PATH_FIELD, ARTICLE_ID_FIELD,
TITLE_FIELD, BODY_FIELD, LABEL_FIELD])

text_formatter = TextFormatter(str(output_unit),
int(SENTENCES_PER_SEGMENT),
spacy_model=SPACY_MODEL)
for articles_filepath in args.input_dir.rglob(args.glob):
df = find_articles_in_file(articles_filepath, text_formatter)
result_df = pd.concat([result_df, df], ignore_index=True)
file_name = get_file_name_without_extension(articles_filepath)
df.to_csv(os.path.join(args.output_dir, 'articles_to_label_'+file_name+'.csv'), index = False)

result_df.to_csv(os.path.join(args.output_dir, 'articles_to_label.csv'))

0 comments on commit 122d367

Please sign in to comment.