Skip to content

Commit

Permalink
Logical operators (#19)
Browse files Browse the repository at this point in the history
* change repo_name

* remove files in old directory

* modify project name

* change project name

* add logical operators

* fix pylint error

* add logical operators in description of config file

* add range to year filter

* add range to filter years

* fix flake8 errors

---------

Co-authored-by: parisa-zahedi <p.zahedi@uu.nl>
  • Loading branch information
parisa-zahedi and parisa-zahedi authored Aug 16, 2024
1 parent 80e00cd commit bc18f13
Show file tree
Hide file tree
Showing 5 changed files with 175 additions and 74 deletions.
50 changes: 32 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,24 +108,38 @@ This class is customized to read a new data format. In our case-study we defined
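In this step, you may select articles based on a filter or a collection of filters. Articles can be filtered by title, article title, a range of years, decade, or a set of keywords, and filters can be combined with the logical operators `AndFilter`, `OrFilter`, and `NotFilter`. All filters are defined in the ```config.json``` file.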
```commandline
"filters": [
{
"type": "TitleFilter",
"title": "example"
},
{
"type": "YearFilter",
"year": 2022
},
{
"type": "DecadeFilter",
"decade": 1960
},
{
"type": "KeywordsFilter",
"keywords": ["sustainability", "green"]
}
]
}
{
"type": "AndFilter",
"filters": [
{
"type": "OrFilter",
"filters": [
{
"type": "YearFilter",
"start_year": 1800,
"end_year": 1910
},
{
"type": "DecadeFilter",
"decade": 1960
}
]
},
{
"type": "NotFilter",
"filter": {
"type": "ArticleTitleFilter",
"article_title": "Advertentie"
},
"level": "article"
},
{
"type": "KeywordsFilter",
"keywords": ["sustainability", "green"]
}
]
}
]
```
Run the following to filter the articles:
Expand Down
124 changes: 88 additions & 36 deletions dataQuest/filter/document_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,26 +74,34 @@ def filter_document(self, document: Document) -> bool:

class YearFilter(DocumentFilter):
"""
Filter documents by year.
Filter documents by a range of years.
Attributes:
year (int): The year to filter by.
Attributes:
start_year (int): The start year of the range.
end_year (int): The end year of the range.
"""
def __init__(self, year: int):
self.year = year
def __init__(self, start_year: int, end_year: int):
self.start_year = start_year
self.end_year = end_year

def filter_document(self, document: Document) -> bool:
"""
Filter documents by year.
Filter documents by a range of years.
Args:
document (Document): The document to be filtered.
Args:
document (Document): The document to be filtered.
Returns:
bool: True if the document's year matches the specified
year, False otherwise.
Returns:
bool: True if the document's year is within the specified range,
False otherwise.
"""
return document.year == self.year
if document.year is None:
return False
if self.start_year is not None and document.year < self.start_year:
return False
if self.end_year is not None and document.year > self.end_year:
return False
return True


class DecadeFilter(DocumentFilter):
Expand Down Expand Up @@ -157,52 +165,96 @@ def filter_article(self, article: Article) -> bool:
keyword in self.keywords)


class CompoundFilter(DocumentFilter):
class ArticleTitleFilter(DocumentFilter):
"""
Compound filter combining multiple filters.
Filter documents and articles by article title.
Attributes:
filters (List[DocumentFilter]): The list of filters to apply.
article_title (str): The article title to filter by.
"""
def __init__(self, filters: List[DocumentFilter]):
self.filters = filters
def __init__(self, article_title: str):
self.article_title = article_title

def filter_document(self, document: Document) -> bool:
"""
Filter documents by applying all filters.
Filter documents by article title.
Args:
document (Document): The document to be filtered.
Returns:
bool: True if the document passes all filters,
False otherwise.
bool: Always True; the article title is only checked at the article level.
"""
return all(filter_.filter_document(document)
for filter_ in self.filters)
return True

def filter_article(self, article: Article) -> bool:
"""
Filter articles by applying all filters.
Filter articles by article title.
Args:
article (Article): The article to be filtered.
Returns:
bool: True if the article passes all filters,
False otherwise.
bool: True if the article's title contains the specified
article title, False otherwise.
"""
return self.article_title in article.title


class AndFilter(DocumentFilter):
    """
    Conjunction of filters: an item passes only when every sub-filter
    accepts it.

    Attributes:
        filters (List[DocumentFilter]): The sub-filters to combine.
    """
    def __init__(self, filters: List[DocumentFilter]):
        self.filters = filters

    def filter_document(self, document: Document) -> bool:
        """
        Check a document against all sub-filters.

        Args:
            document (Document): The document to be filtered.

        Returns:
            bool: False as soon as one sub-filter rejects the document,
                True otherwise (also when there are no sub-filters).
        """
        for sub_filter in self.filters:
            if not sub_filter.filter_document(document):
                return False
        return True

    def filter_article(self, article: Article) -> bool:
        """
        Check an article against all sub-filters.

        Args:
            article (Article): The article to be filtered.

        Returns:
            bool: False as soon as one sub-filter rejects the article,
                True otherwise (also when there are no sub-filters).
        """
        for sub_filter in self.filters:
            if not sub_filter.filter_article(article):
                return False
        return True

def include_keyword_filter(self) -> bool:
    """
    Report whether a KeywordsFilter is among this filter's direct members.

    Returns:
        bool: True if at least one entry of ``self.filters`` is a
            KeywordsFilter instance, False otherwise.
    """
    return any(isinstance(member, KeywordsFilter)
               for member in self.filters)
class OrFilter(DocumentFilter):
    """
    Disjunction of filters: an item passes when at least one sub-filter
    accepts it.

    Attributes:
        filters (List[DocumentFilter]): The sub-filters to combine.
    """
    def __init__(self, filters: List[DocumentFilter]):
        self.filters = filters

    def filter_document(self, document: Document) -> bool:
        """
        Check a document against the sub-filters.

        Args:
            document (Document): The document to be filtered.

        Returns:
            bool: True as soon as one sub-filter accepts the document,
                False otherwise (also when there are no sub-filters).
        """
        for sub_filter in self.filters:
            if sub_filter.filter_document(document):
                return True
        return False

    def filter_article(self, article: Article) -> bool:
        """
        Check an article against the sub-filters.

        Args:
            article (Article): The article to be filtered.

        Returns:
            bool: True as soon as one sub-filter accepts the article,
                False otherwise (also when there are no sub-filters).
        """
        for sub_filter in self.filters:
            if sub_filter.filter_article(article):
                return True
        return False


class NotFilter(DocumentFilter):
    """
    Logical NOT filter to negate a filter's result.

    Attributes:
        filter (DocumentFilter): The filter to negate.
        level (str): The level at which the negation is applied:
            'document', 'article', or 'both'. At a level that is not
            selected, this filter accepts everything.
    """
    # Accepted values for ``level``; anything else is rejected up front.
    _LEVELS = ('document', 'article', 'both')

    def __init__(self, _filter: DocumentFilter, level: str = 'both'):
        """
        Args:
            _filter (DocumentFilter): The filter to negate.
            level (str): 'document', 'article', or 'both'.

        Raises:
            ValueError: If ``level`` is not one of the accepted values.
        """
        # Without this check a misspelled level (e.g. 'articles') would
        # silently turn the filter into a pass-through at both levels.
        if level not in self._LEVELS:
            raise ValueError(
                f"Unknown NotFilter level: {level!r}; "
                f"expected one of {self._LEVELS}")
        self.filter = _filter
        self.level = level

    def filter_document(self, document: Document) -> bool:
        """
        Negate the wrapped filter's document decision.

        Args:
            document (Document): The document to be filtered.

        Returns:
            bool: The inverse of the wrapped filter's result when level
                is 'document' or 'both'; True otherwise.
        """
        if self.level in ('document', 'both'):
            return not self.filter.filter_document(document)
        return True

    def filter_article(self, article: Article) -> bool:
        """
        Negate the wrapped filter's article decision.

        Args:
            article (Article): The article to be filtered.

        Returns:
            bool: The inverse of the wrapped filter's result when level
                is 'article' or 'both'; True otherwise.
        """
        if self.level in ('article', 'both'):
            return not self.filter.filter_article(article)
        return True
66 changes: 49 additions & 17 deletions dataQuest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@
from dataQuest.filter.document_filter import (YearFilter,
TitleFilter,
DocumentFilter)
from dataQuest.filter.document_filter import (CompoundFilter,

from dataQuest.filter.document_filter import (AndFilter,
OrFilter,
NotFilter,
DecadeFilter,
KeywordsFilter)
KeywordsFilter,
ArticleTitleFilter)
from dataQuest.settings import ENCODING


Expand Down Expand Up @@ -44,7 +48,43 @@ def load_spacy_model(model_name: str, retry: bool = True) \
return nlp


def load_filters_from_config(config_file: Path) -> CompoundFilter:
def create_filter(filter_config: Dict[str, Any]) -> DocumentFilter:
    """
    Factory function to create filters based on configuration.

    Composite types ('AndFilter', 'OrFilter', 'NotFilter') are built
    recursively from their nested filter configurations.

    Args:
        filter_config (Dict[str, Any]): The filter configuration. Its
            'type' entry selects the filter class; the remaining entries
            are the arguments of that filter.

    Returns:
        DocumentFilter: The created filter instance.

    Raises:
        ValueError: If the filter type is unknown, or if a YearFilter
            configuration specifies no year bound at all.
        KeyError: If a required entry of the selected filter is missing.
    """
    filter_type = filter_config.get('type')

    if filter_type == 'TitleFilter':
        return TitleFilter(filter_config['title'])
    if filter_type == 'YearFilter':
        # YearFilter treats a bound of None as open-ended, so a missing
        # bound must not raise. The legacy single "year" entry is mapped
        # to a one-year range so older configuration files keep working.
        year = filter_config.get('year')
        start_year = filter_config.get('start_year', year)
        end_year = filter_config.get('end_year', year)
        if start_year is None and end_year is None:
            raise ValueError(
                "YearFilter requires 'start_year' and/or 'end_year'")
        return YearFilter(start_year, end_year)
    if filter_type == 'DecadeFilter':
        return DecadeFilter(filter_config['decade'])
    if filter_type == 'KeywordsFilter':
        return KeywordsFilter(filter_config['keywords'])
    if filter_type == 'ArticleTitleFilter':
        return ArticleTitleFilter(filter_config['article_title'])
    if filter_type == 'AndFilter':
        return AndFilter([create_filter(f) for f in filter_config['filters']])
    if filter_type == 'OrFilter':
        return OrFilter([create_filter(f) for f in filter_config['filters']])
    if filter_type == 'NotFilter':
        inner_filter = create_filter(filter_config['filter'])
        level = filter_config.get('level', 'both')
        return NotFilter(inner_filter, level)

    raise ValueError(f"Unknown filter type: {filter_type}")


def load_filters_from_config(config_file: Path) -> AndFilter:
"""Load document filters from a configuration file.
Args:
Expand All @@ -58,19 +98,9 @@ def load_filters_from_config(config_file: Path) -> CompoundFilter:
with open(config_file, 'r', encoding=ENCODING) as f:
config: Dict[str, List[Dict[str, Any]]] = json.load(f)

filters: List[DocumentFilter] = []
for filter_config in config['filters']:
filter_type = filter_config['type']
if filter_type == 'TitleFilter':
filters.append(TitleFilter(filter_config['title']))
elif filter_type == 'YearFilter':
filters.append(YearFilter(filter_config['year']))
elif filter_type == 'DecadeFilter':
filters.append(DecadeFilter(filter_config['decade']))
elif filter_type == 'KeywordsFilter':
filters.append(KeywordsFilter(filter_config['keywords']))

return CompoundFilter(filters)
filters = [create_filter(filter_config) for filter_config in config['filters']]
compound_filter = AndFilter(filters)
return compound_filter


def get_keywords_from_config(config_file: Path) -> List[str]:
Expand Down Expand Up @@ -152,7 +182,9 @@ def save_filtered_articles(input_file: Any, article_id: str,
"Title": input_file.doc().title,
}

output_fp = os.path.join(output_dir, input_file.base_file_name() + '.json')
output_fp = os.path.join(output_dir, input_file.base_file_name() + '_' +
str(article_id) + '.json')

print('output_fp', output_fp)
with open(output_fp, "w", encoding=ENCODING) as json_file:
json.dump(data, json_file, indent=4)
Expand Down
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ classifiers = [
]
dynamic = ["version"]

dependencies = ["tqdm","pandas","pandas-stubs", "types-tqdm","spacy","scikit-learn","numpy","scipy"
dependencies = ["tqdm","pandas","pandas-stubs", "types-tqdm","spacy","scikit-learn","numpy","scipy",
"flake8-pyproject"
]

[project.optional-dependencies]
Expand All @@ -40,6 +41,9 @@ max-line-length = 99
max-line-length=100
max-locals=35
max-args=10
disable = [
"R0911"
]

[[tool.mypy.overrides]]
module = [
Expand Down
3 changes: 1 addition & 2 deletions scripts/step1_filter_articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,9 @@
args.output_dir.mkdir(parents=True, exist_ok=True)

compound_filter = load_filters_from_config(args.config_path)
with_keyword_filter = compound_filter.include_keyword_filter()

for input_file in tqdm(input_files, desc="Filtering articles",
unit="file"):
for article in input_file.selected_articles(compound_filter):
save_filtered_articles(input_file, article.id,
args.output_dir)
args.output_dir)

0 comments on commit bc18f13

Please sign in to comment.