diff --git a/README.md b/README.md index a949991..a32351b 100644 --- a/README.md +++ b/README.md @@ -108,24 +108,38 @@ This class is customized to read a new data format. In our case-study we defined In this step, you may select articles based on a filter or a collection of filters. Articles can be filtered by title, year, decade, or a set of keywords defined in the ```config.json``` file. ```commandline "filters": [ - { - "type": "TitleFilter", - "title": "example" - }, - { - "type": "YearFilter", - "year": 2022 - }, - { - "type": "DecadeFilter", - "decade": 1960 - }, - { - "type": "KeywordsFilter", - "keywords": ["sustainability", "green"] - } - ] - } + { + "type": "AndFilter", + "filters": [ + { + "type": "OrFilter", + "filters": [ + { + "type": "YearFilter", + "start_year": 1800, + "end_year": 1910 + }, + { + "type": "DecadeFilter", + "decade": 1960 + } + ] + }, + { + "type": "NotFilter", + "filter": { + "type": "ArticleTitleFilter", + "article_title": "Advertentie" + }, + "level": "article" + }, + { + "type": "KeywordsFilter", + "keywords": ["sustainability", "green"] + } + ] + } + ] ``` run the following to filter the articles: diff --git a/dataQuest/filter/document_filter.py b/dataQuest/filter/document_filter.py index ee5bb8c..9cb0136 100644 --- a/dataQuest/filter/document_filter.py +++ b/dataQuest/filter/document_filter.py @@ -74,26 +74,34 @@ def filter_document(self, document: Document) -> bool: class YearFilter(DocumentFilter): """ - Filter documents by year. + Filter documents by a range of years. - Attributes: - year (int): The year to filter by. + Attributes: + start_year (int): The start year of the range. + end_year (int): The end year of the range. """ - def __init__(self, year: int): - self.year = year + def __init__(self, start_year: int, end_year: int): + self.start_year = start_year + self.end_year = end_year def filter_document(self, document: Document) -> bool: """ - Filter documents by year. + Filter documents by a range of years. 
class ArticleTitleFilter(DocumentFilter):
    """
    Filter articles by a substring of their title.

    Selection happens at article level only: every document passes
    ``filter_document`` and each article is then checked by
    ``filter_article``.

    Attributes:
        article_title (str): The text that must occur in an article's title.
    """
    def __init__(self, article_title: str):
        self.article_title = article_title

    def filter_document(self, document: Document) -> bool:
        """
        Accept every document; titles are only checked per article.

        Args:
            document (Document): The document to be filtered.

        Returns:
            bool: Always returns True.
        """
        return True

    def filter_article(self, article: Article) -> bool:
        """
        Filter articles by article title.

        Args:
            article (Article): The article to be filtered.

        Returns:
            bool: True if the article's title contains ``article_title``
                (case-sensitive substring match), False otherwise. An
                article without a title never matches.
        """
        title = article.title
        # NOTE(review): assumes an article may come without a title (None);
        # confirm against Article. ``"x" in None`` raises TypeError, so an
        # untitled article is reported as a non-match instead of aborting
        # the whole filtering run.
        if title is None:
            return False
        return self.article_title in title
def create_filter(filter_config: Dict[str, Any]) -> DocumentFilter:
    """
    Factory function to create filters based on configuration.

    Composite filters (``AndFilter``, ``OrFilter``, ``NotFilter``) are built
    recursively from their nested configurations.

    Args:
        filter_config (Dict[str, Any]): The filter configuration. Its
            ``type`` entry selects the filter class; the remaining entries
            supply that filter's arguments.

    Returns:
        DocumentFilter: The created filter instance.

    Raises:
        ValueError: If ``type`` is missing or unknown, or if a ``NotFilter``
            is configured with an unsupported ``level``.
        KeyError: If an entry required by the selected filter is missing.
    """
    filter_type = filter_config.get('type')

    if filter_type == 'TitleFilter':
        return TitleFilter(filter_config['title'])
    if filter_type == 'YearFilter':
        has_range = 'start_year' in filter_config or 'end_year' in filter_config
        if 'year' in filter_config and not has_range:
            # Older configuration files describe a single year with a
            # ``year`` entry; treat it as a one-year range so they keep
            # loading.
            year = filter_config['year']
            return YearFilter(year, year)
        return YearFilter(filter_config['start_year'], filter_config['end_year'])
    if filter_type == 'DecadeFilter':
        return DecadeFilter(filter_config['decade'])
    if filter_type == 'KeywordsFilter':
        return KeywordsFilter(filter_config['keywords'])
    if filter_type == 'ArticleTitleFilter':
        return ArticleTitleFilter(filter_config['article_title'])
    if filter_type == 'AndFilter':
        return AndFilter([create_filter(f) for f in filter_config['filters']])
    if filter_type == 'OrFilter':
        return OrFilter([create_filter(f) for f in filter_config['filters']])
    if filter_type == 'NotFilter':
        level = filter_config.get('level', 'both')
        # NotFilter lets everything through at a level it does not recognise,
        # so a misspelled level would silently switch the negation off.
        # Reject it here, before building the nested filter.
        if level not in ('document', 'article', 'both'):
            raise ValueError(
                f"Unknown NotFilter level: {level} "
                "(expected 'document', 'article' or 'both')")
        return NotFilter(create_filter(filter_config['filter']), level)

    raise ValueError(f"Unknown filter type: {filter_type}")
Args: @@ -58,19 +98,9 @@ def load_filters_from_config(config_file: Path) -> CompoundFilter: with open(config_file, 'r', encoding=ENCODING) as f: config: Dict[str, List[Dict[str, Any]]] = json.load(f) - filters: List[DocumentFilter] = [] - for filter_config in config['filters']: - filter_type = filter_config['type'] - if filter_type == 'TitleFilter': - filters.append(TitleFilter(filter_config['title'])) - elif filter_type == 'YearFilter': - filters.append(YearFilter(filter_config['year'])) - elif filter_type == 'DecadeFilter': - filters.append(DecadeFilter(filter_config['decade'])) - elif filter_type == 'KeywordsFilter': - filters.append(KeywordsFilter(filter_config['keywords'])) - - return CompoundFilter(filters) + filters = [create_filter(filter_config) for filter_config in config['filters']] + compound_filter = AndFilter(filters) + return compound_filter def get_keywords_from_config(config_file: Path) -> List[str]: @@ -152,7 +182,9 @@ def save_filtered_articles(input_file: Any, article_id: str, "Title": input_file.doc().title, } - output_fp = os.path.join(output_dir, input_file.base_file_name() + '.json') + output_fp = os.path.join(output_dir, input_file.base_file_name() + '_' + + str(article_id) + '.json') + print('output_fp', output_fp) with open(output_fp, "w", encoding=ENCODING) as json_file: json.dump(data, json_file, indent=4) diff --git a/pyproject.toml b/pyproject.toml index 23e1c3d..c9ca36f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,8 @@ classifiers = [ ] dynamic = ["version"] -dependencies = ["tqdm","pandas","pandas-stubs", "types-tqdm","spacy","scikit-learn","numpy","scipy" +dependencies = ["tqdm","pandas","pandas-stubs", "types-tqdm","spacy","scikit-learn","numpy","scipy", + "flake8-pyproject" ] [project.optional-dependencies] @@ -40,6 +41,9 @@ max-line-length = 99 max-line-length=100 max-locals=35 max-args=10 +disable = [ + "R0911" +] [[tool.mypy.overrides]] module = [ diff --git a/scripts/step1_filter_articles.py 
b/scripts/step1_filter_articles.py index fa638f2..3abf8ab 100644 --- a/scripts/step1_filter_articles.py +++ b/scripts/step1_filter_articles.py @@ -60,10 +60,9 @@ args.output_dir.mkdir(parents=True, exist_ok=True) compound_filter = load_filters_from_config(args.config_path) - with_keyword_filter = compound_filter.include_keyword_filter() for input_file in tqdm(input_files, desc="Filtering articles", unit="file"): for article in input_file.selected_articles(compound_filter): save_filtered_articles(input_file, article.id, - args.output_dir) + args.output_dir)