diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4a5cb18..4fc8315 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -18,7 +18,7 @@ on: # Replace package-name with your package name env: - PACKAGE_NAME: interest + PACKAGE_NAME: dataQuest jobs: build: diff --git a/README.md b/README.md index 0116b7c..a949991 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# INTEREST +# dataQuest The code in this repository implements a pipeline to extract specific articles from a large corpus. @@ -10,7 +10,7 @@ Articles can be filtered based on individual or multiple features such as title, ## Getting Started Clone this repository to your working station to obtain examples and python scripts: ``` -git clone https://github.com/UtrechtUniversity/historical-news-sentiment.git +git clone https://github.com/UtrechtUniversity/dataQuest.git ``` ### Prerequisites @@ -20,10 +20,10 @@ To install and run this project you need to have the following prerequisites ins ``` ### Installation -#### Option 1 - Install interest package -To run the project, ensure to install the interest package that is part of this project. +#### Option 1 - Install dataQuest package +To run the project, ensure to install the dataQuest package that is part of this project. ``` -pip install interest +pip install dataQuest ``` #### Option 2 - Run from source code If you want to run the scripts without installation you need to: @@ -42,7 +42,7 @@ pip install . On Linux and Mac OS, you might have to set the PYTHONPATH environment variable to point to this directory. 
```commandline -export PYTHONPATH="current working directory/historical-news-sentiment:${PYTHONPATH}" +export PYTHONPATH="current working directory/dataQuest:${PYTHONPATH}" ``` ### Built with These packages are automatically installed in the step above: @@ -85,7 +85,7 @@ Below is a snapshot of the JSON file format: In our use case, the harvested KB data is in XML format. We have provided the following script to transform the original data into the expected format. ``` -from interest.preprocessor.parser import XMLExtractor +from dataQuest.preprocessor.parser import XMLExtractor extractor = XMLExtractor(Path(input_dir), Path(output_dir)) extractor.extract_xml_string() @@ -99,9 +99,9 @@ python3 convert_input_files.py --input_dir path/to/raw/xml/data --output_dir pat In order to define a corpus with a new data format you should: -- add a new input_file_type to [INPUT_FILE_TYPES](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/interest/filter/__init__.py) -- implement a class that inherits from [input_file.py](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/interest/filter/input_file.py). -This class is customized to read a new data format. In our case-study we defined [delpher_kranten.py](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/interest/filter/delpher_kranten.py). +- add a new input_file_type to [INPUT_FILE_TYPES](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/__init__.py) +- implement a class that inherits from [input_file.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/input_file.py). +This class is customized to read a new data format. In our case-study we defined [delpher_kranten.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/filter/delpher_kranten.py). ### 2. Filtering @@ -144,7 +144,7 @@ The output of this script is a JSON file for each selected article in the follow } ``` ### 3. 
Categorization by timestamp -The output files generated in the previous step are categorized based on a specified [period-type](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/interest/temporal_categorization/__init__.py), +The output files generated in the previous step are categorized based on a specified [period-type](https://github.com/UtrechtUniversity/dataQuest/blob/main/dataQuest/temporal_categorization/__init__.py), such as ```year``` or ```decade```. This categorization is essential for subsequent steps, especially if you intend to apply tf-idf or other models to specific periods. In our case, we applied tf-idf per decade. ```commandline @@ -159,7 +159,7 @@ By utilizing tf-idf, the most relevant articles related to the specified topic ( Before applying tf-idf, articles containing any of the specified keywords in their title are selected. -From the rest of articles, to choose the most relevant ones, you can specify one of the following criteria in [config.py](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/config.json): +From the rest of articles, to choose the most relevant ones, you can specify one of the following criteria in [config.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json): - Percentage of selected articles with the top scores - Maximum number of selected articles with the top scores @@ -192,12 +192,12 @@ From the rest of articles, to choose the most relevant ones, you can specify one The following script, add a new column, ```selected``` to the .csv files from the previous step. ```commandline -python3 scripts/3_select_final_articles.py --input_dir "output/output_timestamped/" +python3 scripts/step3_select_final_articles.py --input-dir "output/output_timestamped/" ``` ### 5. Generate output As the final step of the pipeline, the text of the selected articles is saved in a .csv file, which can be used for manual labeling. 
The user has the option to choose whether the text should be divided into paragraphs or a segmentation of the text. -This feature can be set in [config.py](https://github.com/UtrechtUniversity/historical-news-sentiment/blob/main/config.json). +This feature can be set in [config.py](https://github.com/UtrechtUniversity/dataQuest/blob/main/config.json). ```commandline "output_unit": "paragraph" @@ -211,7 +211,7 @@ OR ``` ```commandline -python3 scripts/step4_generate_output.py --input_dir "output/output_timestamped/” --output-dir “output/output_results/“ --glob “*.csv” +python3 scripts/step4_generate_output.py --input-dir "output/output_timestamped/" --output-dir "output/output_results/" --glob "*.csv" ``` ## About the Project **Date**: February 2024 @@ -248,5 +248,5 @@ To contribute: Pim Huijnen - p.huijnen@uu.nl -Project Link: [https://github.com/UtrechtUniversity/historical-news-sentiment](https://github.com/UtrechtUniversity/historical-news-sentiment) +Project Link: [https://github.com/UtrechtUniversity/dataQuest](https://github.com/UtrechtUniversity/dataQuest) diff --git a/interest/__init__.py b/dataQuest/__init__.py similarity index 100% rename from interest/__init__.py rename to dataQuest/__init__.py diff --git a/interest/article_final_selection/__init__.py b/dataQuest/article_final_selection/__init__.py similarity index 100% rename from interest/article_final_selection/__init__.py rename to dataQuest/article_final_selection/__init__.py diff --git a/interest/article_final_selection/article_selector.py b/dataQuest/article_final_selection/article_selector.py similarity index 100% rename from interest/article_final_selection/article_selector.py rename to dataQuest/article_final_selection/article_selector.py diff --git a/interest/article_final_selection/process_article.py b/dataQuest/article_final_selection/process_article.py similarity index 98% rename from interest/article_final_selection/process_article.py rename to 
dataQuest/article_final_selection/process_article.py index 763e166..f78036a 100644 --- a/interest/article_final_selection/process_article.py +++ b/dataQuest/article_final_selection/process_article.py @@ -3,7 +3,7 @@ import json import logging from typing import List, Union, Tuple -from interest.preprocessor.text_cleaner import TextCleaner +from dataQuest.preprocessor.text_cleaner import TextCleaner text_cleaner = TextCleaner() diff --git a/interest/article_final_selection/process_articles.py b/dataQuest/article_final_selection/process_articles.py similarity index 93% rename from interest/article_final_selection/process_articles.py rename to dataQuest/article_final_selection/process_articles.py index c768271..d1f1419 100644 --- a/interest/article_final_selection/process_articles.py +++ b/dataQuest/article_final_selection/process_articles.py @@ -5,10 +5,10 @@ from typing import List, Tuple, Dict, Union import pandas as pd from sklearn.metrics.pairwise import cosine_similarity -from interest.models.tfidf import TfidfEmbedder -from interest.article_final_selection.process_article import ArticleProcessor -from interest.article_final_selection.process_article import clean -from interest.article_final_selection.article_selector import ArticleSelector +from dataQuest.models.tfidf import TfidfEmbedder +from dataQuest.article_final_selection.process_article import ArticleProcessor +from dataQuest.article_final_selection.process_article import clean +from dataQuest.article_final_selection.article_selector import ArticleSelector def process_articles(articles_filepath: str, clean_keywords: List[str]) -> ( diff --git a/interest/filter/__init__.py b/dataQuest/filter/__init__.py similarity index 60% rename from interest/filter/__init__.py rename to dataQuest/filter/__init__.py index 5618aa7..1351c2a 100644 --- a/interest/filter/__init__.py +++ b/dataQuest/filter/__init__.py @@ -1,5 +1,5 @@ """define input-file type""" -from interest.filter.delpher_kranten import KrantenFile +from 
dataQuest.filter.delpher_kranten import KrantenFile INPUT_FILE_TYPES = { "delpher_kranten": KrantenFile diff --git a/interest/filter/delpher_kranten.py b/dataQuest/filter/delpher_kranten.py similarity index 97% rename from interest/filter/delpher_kranten.py rename to dataQuest/filter/delpher_kranten.py index ec2dc85..f3544ff 100644 --- a/interest/filter/delpher_kranten.py +++ b/dataQuest/filter/delpher_kranten.py @@ -8,8 +8,8 @@ import logging import os from typing import Optional -from interest.filter.document import Document, Article -from interest.filter.input_file import InputFile +from dataQuest.filter.document import Document, Article +from dataQuest.filter.input_file import InputFile class KrantenFile(InputFile): diff --git a/interest/filter/document.py b/dataQuest/filter/document.py similarity index 100% rename from interest/filter/document.py rename to dataQuest/filter/document.py diff --git a/interest/filter/document_filter.py b/dataQuest/filter/document_filter.py similarity index 99% rename from interest/filter/document_filter.py rename to dataQuest/filter/document_filter.py index 19f5412..ee5bb8c 100644 --- a/interest/filter/document_filter.py +++ b/dataQuest/filter/document_filter.py @@ -4,7 +4,7 @@ """ from abc import ABC, abstractmethod from typing import List -from interest.filter.document import Document, Article +from dataQuest.filter.document import Document, Article class DocumentFilter(ABC): diff --git a/interest/filter/input_file.py b/dataQuest/filter/input_file.py similarity index 96% rename from interest/filter/input_file.py rename to dataQuest/filter/input_file.py index dcb7504..fc799fb 100644 --- a/interest/filter/input_file.py +++ b/dataQuest/filter/input_file.py @@ -8,8 +8,8 @@ import logging from pathlib import Path from typing import Iterable, TextIO, cast, Optional -from interest.filter.document import Document, Article -from interest.filter.document_filter import DocumentFilter +from dataQuest.filter.document import Document, Article 
+from dataQuest.filter.document_filter import DocumentFilter class InputFile(abc.ABC): diff --git a/interest/models/base.py b/dataQuest/models/base.py similarity index 100% rename from interest/models/base.py rename to dataQuest/models/base.py diff --git a/interest/models/tfidf.py b/dataQuest/models/tfidf.py similarity index 95% rename from interest/models/tfidf.py rename to dataQuest/models/tfidf.py index c443843..8583f07 100644 --- a/interest/models/tfidf.py +++ b/dataQuest/models/tfidf.py @@ -6,9 +6,9 @@ import scipy from sklearn.feature_extraction.text import TfidfVectorizer -from interest.models.base import BaseEmbedder -from interest.utils import load_spacy_model -from interest.settings import SPACY_MODEL +from dataQuest.models.base import BaseEmbedder +from dataQuest.utils import load_spacy_model +from dataQuest.settings import SPACY_MODEL class TfidfEmbedder(BaseEmbedder): diff --git a/interest/output_generator/text_formater.py b/dataQuest/output_generator/text_formater.py similarity index 97% rename from interest/output_generator/text_formater.py rename to dataQuest/output_generator/text_formater.py index 93bb85b..1bad6a0 100644 --- a/interest/output_generator/text_formater.py +++ b/dataQuest/output_generator/text_formater.py @@ -2,8 +2,8 @@ specified output units. 
""" from typing import List, Union import logging -from interest.settings import SPACY_MODEL -from interest.utils import load_spacy_model +from dataQuest.settings import SPACY_MODEL +from dataQuest.utils import load_spacy_model PARAGRAPH_FORMATTER = 'paragraph' FULLTEXT_FORMATTER = 'full_text' diff --git a/dataQuest/preprocessor/__init__.py b/dataQuest/preprocessor/__init__.py new file mode 100644 index 0000000..f6b7579 --- /dev/null +++ b/dataQuest/preprocessor/__init__.py @@ -0,0 +1 @@ +# from dataQuest.preprocessor.parser import XMLExtractor diff --git a/interest/preprocessor/parser.py b/dataQuest/preprocessor/parser.py similarity index 100% rename from interest/preprocessor/parser.py rename to dataQuest/preprocessor/parser.py diff --git a/interest/preprocessor/text_cleaner.py b/dataQuest/preprocessor/text_cleaner.py similarity index 97% rename from interest/preprocessor/text_cleaner.py rename to dataQuest/preprocessor/text_cleaner.py index ca96945..33cd27b 100644 --- a/interest/preprocessor/text_cleaner.py +++ b/dataQuest/preprocessor/text_cleaner.py @@ -4,8 +4,8 @@ """ import re from typing import Union, List -from interest.settings import SPACY_MODEL -from interest.utils import load_spacy_model +from dataQuest.settings import SPACY_MODEL +from dataQuest.utils import load_spacy_model def merge_texts_list(text: Union[str, List[str]]) -> str: diff --git a/interest/settings.py b/dataQuest/settings.py similarity index 100% rename from interest/settings.py rename to dataQuest/settings.py diff --git a/interest/temporal_categorization/__init__.py b/dataQuest/temporal_categorization/__init__.py similarity index 77% rename from interest/temporal_categorization/__init__.py rename to dataQuest/temporal_categorization/__init__.py index 60ec91d..ca3bb25 100644 --- a/interest/temporal_categorization/__init__.py +++ b/dataQuest/temporal_categorization/__init__.py @@ -1,5 +1,5 @@ """Mapping from string format descriptions to corresponding classes.""" -from 
interest.temporal_categorization.timestamped_data \ +from dataQuest.temporal_categorization.timestamped_data \ import (YearPeriodData, DecadePeriodData) PERIOD_TYPES = { diff --git a/interest/temporal_categorization/timestamped_data.py b/dataQuest/temporal_categorization/timestamped_data.py similarity index 100% rename from interest/temporal_categorization/timestamped_data.py rename to dataQuest/temporal_categorization/timestamped_data.py diff --git a/interest/utils.py b/dataQuest/utils.py similarity index 93% rename from interest/utils.py rename to dataQuest/utils.py index c601162..7d8fd65 100644 --- a/interest/utils.py +++ b/dataQuest/utils.py @@ -8,13 +8,13 @@ import json import spacy import spacy.cli -from interest.filter.document_filter import (YearFilter, - TitleFilter, - DocumentFilter) -from interest.filter.document_filter import (CompoundFilter, - DecadeFilter, - KeywordsFilter) -from interest.settings import ENCODING +from dataQuest.filter.document_filter import (YearFilter, + TitleFilter, + DocumentFilter) +from dataQuest.filter.document_filter import (CompoundFilter, + DecadeFilter, + KeywordsFilter) +from dataQuest.settings import ENCODING @cache diff --git a/interest/preprocessor/__init__.py b/interest/preprocessor/__init__.py deleted file mode 100644 index 3cec932..0000000 --- a/interest/preprocessor/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# from interest.preprocessor.parser import XMLExtractor diff --git a/pyproject.toml b/pyproject.toml index 6878f35..23e1c3d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools", "setuptools-scm"] build-backend = "setuptools.build_meta" [project] -name = "interest" +name = "dataQuest" description = "A package to extract hystorical news sentiments" authors = [ {name = "Shiva Nadi", email = "s.nadi@uu.nl"}, @@ -31,7 +31,7 @@ lint = ["flake8"] test = ["pytest", "mypy"] [tool.setuptools] -packages = ["interest"] +packages = ["dataQuest"] [tool.flake8] max-line-length = 99 diff --git 
a/scripts/convert_input_files.py b/scripts/convert_input_files.py index b6d2dea..023d152 100644 --- a/scripts/convert_input_files.py +++ b/scripts/convert_input_files.py @@ -1,4 +1,4 @@ -from interest.preprocessor.parser import XMLExtractor +from dataQuest.preprocessor.parser import XMLExtractor from argparse import ArgumentParser from pathlib import Path import logging diff --git a/scripts/step1_filter_articles.py b/scripts/step1_filter_articles.py index 99d59a0..fa638f2 100644 --- a/scripts/step1_filter_articles.py +++ b/scripts/step1_filter_articles.py @@ -9,10 +9,10 @@ from tqdm import tqdm -from interest.filter import INPUT_FILE_TYPES -from interest.filter.input_file import InputFile -from interest.utils import load_filters_from_config -from interest.utils import save_filtered_articles +from dataQuest.filter import INPUT_FILE_TYPES +from dataQuest.filter.input_file import InputFile +from dataQuest.utils import load_filters_from_config +from dataQuest.utils import save_filtered_articles if __name__ == "__main__": parser = argparse.ArgumentParser("Filter articles from input files.") diff --git a/scripts/step2_categorize_by_timestamp.py b/scripts/step2_categorize_by_timestamp.py index 0979642..586eb3d 100644 --- a/scripts/step2_categorize_by_timestamp.py +++ b/scripts/step2_categorize_by_timestamp.py @@ -9,8 +9,8 @@ from pathlib import Path import pandas as pd from tqdm import tqdm # type: ignore -from interest.temporal_categorization import PERIOD_TYPES -from interest.temporal_categorization.timestamped_data import TimestampedData +from dataQuest.temporal_categorization import PERIOD_TYPES +from dataQuest.temporal_categorization.timestamped_data import TimestampedData OUTPUT_FILE_NAME = 'articles' FILENAME_COLUMN = 'file_path' diff --git a/scripts/step3_select_final_articles.py b/scripts/step3_select_final_articles.py index 37f723c..82a71f9 100644 --- a/scripts/step3_select_final_articles.py +++ b/scripts/step3_select_final_articles.py @@ -4,9 +4,9 @@ from 
typing import List from pathlib import Path import pandas as pd -from interest.utils import get_keywords_from_config -from interest.utils import read_config -from interest.article_final_selection.process_articles import select_articles +from dataQuest.utils import get_keywords_from_config +from dataQuest.utils import read_config +from dataQuest.article_final_selection.process_articles import select_articles ARTICLE_SELECTOR_FIELD = "article_selector" diff --git a/scripts/step4_generate_output.py b/scripts/step4_generate_output.py index 161140c..5a62e5a 100644 --- a/scripts/step4_generate_output.py +++ b/scripts/step4_generate_output.py @@ -7,11 +7,11 @@ from typing import Union import pandas as pd from pandas import DataFrame -from interest.settings import SPACY_MODEL -from interest.article_final_selection.process_article import ArticleProcessor -from interest.utils import read_config, get_file_name_without_extension -from interest.output_generator.text_formater import (TextFormatter, - SEGMENTED_TEXT_FORMATTER) +from dataQuest.settings import SPACY_MODEL +from dataQuest.article_final_selection.process_article import ArticleProcessor +from dataQuest.utils import read_config, get_file_name_without_extension +from dataQuest.output_generator.text_formater import (TextFormatter, + SEGMENTED_TEXT_FORMATTER) FILE_PATH_FIELD = "file_path"