Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Logical operators #19

Merged
merged 11 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 32 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,24 +108,38 @@ This class is customized to read a new data format. In our case-study we defined
In this step, you may select articles based on a filter or a collection of filters. Articles can be filtered by title, year, decade, or a set of keywords defined in the ```config.json``` file.
```commandline
"filters": [
{
"type": "TitleFilter",
"title": "example"
},
{
"type": "YearFilter",
"year": 2022
},
{
"type": "DecadeFilter",
"decade": 1960
},
{
"type": "KeywordsFilter",
"keywords": ["sustainability", "green"]
}
]
}
{
"type": "AndFilter",
"filters": [
{
"type": "OrFilter",
"filters": [
{
"type": "YearFilter",
"start_year": 1800,
"end_year": 1910
},
{
"type": "DecadeFilter",
"decade": 1960
}
]
},
{
"type": "NotFilter",
"filter": {
"type": "ArticleTitleFilter",
"article_title": "Advertentie"
},
"level": "article"
},
{
"type": "KeywordsFilter",
"keywords": ["sustainability", "green"]
}
]
}
]

```
Run the following command to filter the articles:
Expand Down
124 changes: 88 additions & 36 deletions dataQuest/filter/document_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,26 +74,34 @@ def filter_document(self, document: Document) -> bool:

class YearFilter(DocumentFilter):
"""
Filter documents by year.
Filter documents by a range of years.

Attributes:
year (int): The year to filter by.
Attributes:
start_year (int): The start year of the range.
end_year (int): The end year of the range.
"""
def __init__(self, year: int):
self.year = year
def __init__(self, start_year: int, end_year: int):
self.start_year = start_year
self.end_year = end_year

def filter_document(self, document: Document) -> bool:
"""
Filter documents by year.
Filter documents by a range of years.

Args:
document (Document): The document to be filtered.
Args:
document (Document): The document to be filtered.

Returns:
bool: True if the document's year matches the specified
year, False otherwise.
Returns:
bool: True if the document's year is within the specified range,
False otherwise.
"""
return document.year == self.year
if document.year is None:
return False
if self.start_year is not None and document.year < self.start_year:
return False
if self.end_year is not None and document.year > self.end_year:
return False
return True


class DecadeFilter(DocumentFilter):
Expand Down Expand Up @@ -157,52 +165,96 @@ def filter_article(self, article: Article) -> bool:
keyword in self.keywords)


class CompoundFilter(DocumentFilter):
class ArticleTitleFilter(DocumentFilter):
"""
Compound filter combining multiple filters.
Filter documents and articles by article title.

Attributes:
filters (List[DocumentFilter]): The list of filters to apply.
article_title (str): The article title to filter by.
"""
def __init__(self, filters: List[DocumentFilter]):
self.filters = filters
def __init__(self, article_title: str):
self.article_title = article_title

def filter_document(self, document: Document) -> bool:
"""
Filter documents by applying all filters.
Filter documents by article title.

Args:
document (Document): The document to be filtered.

Returns:
bool: True if the document passes all filters,
False otherwise.
bool: Always returns True.
"""
return all(filter_.filter_document(document)
for filter_ in self.filters)
return True

def filter_article(self, article: Article) -> bool:
"""
Filter articles by applying all filters.
Filter articles by article title.

Args:
article (Article): The article to be filtered.

Returns:
bool: True if the article passes all filters,
False otherwise.
bool: True if the article's title contains the specified
article title, False otherwise.
"""
return self.article_title in article.title


class AndFilter(DocumentFilter):
    """
    Conjunction of filters: an item passes only if every member accepts it.

    Attributes:
        filters (List[DocumentFilter]): The member filters, consulted in
            list order.
    """
    def __init__(self, filters: List[DocumentFilter]):
        self.filters = filters

    def filter_document(self, document: Document) -> bool:
        """
        Check a document against every member filter.

        Args:
            document (Document): The document to be filtered.

        Returns:
            bool: False as soon as one member rejects the document,
                True otherwise (also when there are no members).
        """
        for member in self.filters:
            if not member.filter_document(document):
                return False
        return True

    def filter_article(self, article: Article) -> bool:
        """
        Check an article against every member filter.

        Args:
            article (Article): The article to be filtered.

        Returns:
            bool: False as soon as one member rejects the article,
                True otherwise (also when there are no members).
        """
        for member in self.filters:
            if not member.filter_article(article):
                return False
        return True

def include_keyword_filter(self) -> bool:
    """
    Tell whether a KeywordsFilter is among this filter's members.

    Only the entries of ``self.filters`` themselves are inspected;
    filters held inside those entries are not examined.

    Returns:
        bool: True if at least one entry of ``self.filters`` is a
            KeywordsFilter instance, False otherwise.
    """
    return any(isinstance(member, KeywordsFilter)
               for member in self.filters)
class OrFilter(DocumentFilter):
    """
    Disjunction of filters: an item passes if at least one member accepts it.

    Attributes:
        filters (List[DocumentFilter]): The member filters, consulted in
            list order.
    """
    def __init__(self, filters: List[DocumentFilter]):
        self.filters = filters

    def filter_document(self, document: Document) -> bool:
        """
        Check a document against the member filters.

        Args:
            document (Document): The document to be filtered.

        Returns:
            bool: True as soon as one member accepts the document,
                False otherwise (also when there are no members).
        """
        for member in self.filters:
            if member.filter_document(document):
                return True
        return False

    def filter_article(self, article: Article) -> bool:
        """
        Check an article against the member filters.

        Args:
            article (Article): The article to be filtered.

        Returns:
            bool: True as soon as one member accepts the article,
                False otherwise (also when there are no members).
        """
        for member in self.filters:
            if member.filter_article(article):
                return True
        return False


class NotFilter(DocumentFilter):
    """
    Logical NOT filter to negate a filter's result.

    The negation is applied only at the configured level; at the other
    level the filter accepts everything, so it does not interfere with
    sibling filters there.

    Attributes:
        filter (DocumentFilter): The filter to negate.
        level (str): The level at which to apply the negation
            ('document', 'article', or 'both').
    """
    _VALID_LEVELS = ('document', 'article', 'both')

    def __init__(self, _filter: DocumentFilter, level: str = 'both'):
        """
        Args:
            _filter (DocumentFilter): The filter whose result is negated.
            level (str): 'document', 'article', or 'both'.

        Raises:
            ValueError: If ``level`` is not one of the supported values.
        """
        # An unrecognised level would make both filter methods return True,
        # silently disabling the negation; reject it up front instead.
        if level not in self._VALID_LEVELS:
            raise ValueError(
                f"Unknown NotFilter level: {level!r} "
                f"(expected one of {self._VALID_LEVELS})")
        self.filter = _filter
        self.level = level

    def filter_document(self, document: Document) -> bool:
        """
        Negate the wrapped filter's verdict on a document.

        Args:
            document (Document): The document to be filtered.

        Returns:
            bool: The inverse of the wrapped filter's result when the level
                is 'document' or 'both'; True otherwise.
        """
        if self.level in ('document', 'both'):
            return not self.filter.filter_document(document)
        return True

    def filter_article(self, article: Article) -> bool:
        """
        Negate the wrapped filter's verdict on an article.

        Args:
            article (Article): The article to be filtered.

        Returns:
            bool: The inverse of the wrapped filter's result when the level
                is 'article' or 'both'; True otherwise.
        """
        if self.level in ('article', 'both'):
            return not self.filter.filter_article(article)
        return True
66 changes: 49 additions & 17 deletions dataQuest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@
from dataQuest.filter.document_filter import (YearFilter,
TitleFilter,
DocumentFilter)
from dataQuest.filter.document_filter import (CompoundFilter,

from dataQuest.filter.document_filter import (AndFilter,
OrFilter,
NotFilter,
DecadeFilter,
KeywordsFilter)
KeywordsFilter,
ArticleTitleFilter)
from dataQuest.settings import ENCODING


Expand Down Expand Up @@ -44,7 +48,43 @@ def load_spacy_model(model_name: str, retry: bool = True) \
return nlp


def load_filters_from_config(config_file: Path) -> CompoundFilter:
def create_filter(filter_config: Dict[str, Any]) -> DocumentFilter:
    """
    Build a filter instance from one configuration entry.

    The entry's ``'type'`` value selects the filter class. Entries of type
    ``AndFilter``, ``OrFilter`` and ``NotFilter`` contain further entries,
    which are built by calling this function recursively.

    Args:
        filter_config (Dict[str, Any]): The filter configuration.

    Returns:
        DocumentFilter: The created filter instance.

    Raises:
        ValueError: If the ``'type'`` value is missing or not recognised.
        KeyError: If a key required by the selected filter type is absent.
    """
    filter_type = filter_config.get('type')

    if filter_type == 'TitleFilter':
        built = TitleFilter(filter_config['title'])
    elif filter_type == 'YearFilter':
        first_year = filter_config['start_year']
        last_year = filter_config['end_year']
        built = YearFilter(first_year, last_year)
    elif filter_type == 'DecadeFilter':
        built = DecadeFilter(filter_config['decade'])
    elif filter_type == 'KeywordsFilter':
        built = KeywordsFilter(filter_config['keywords'])
    elif filter_type == 'ArticleTitleFilter':
        built = ArticleTitleFilter(filter_config['article_title'])
    elif filter_type == 'AndFilter':
        members = [create_filter(entry) for entry in filter_config['filters']]
        built = AndFilter(members)
    elif filter_type == 'OrFilter':
        members = [create_filter(entry) for entry in filter_config['filters']]
        built = OrFilter(members)
    elif filter_type == 'NotFilter':
        # The wrapped filter is built first, then the optional level is read.
        negated = create_filter(filter_config['filter'])
        built = NotFilter(negated, filter_config.get('level', 'both'))
    else:
        raise ValueError(f"Unknown filter type: {filter_type}")

    return built


def load_filters_from_config(config_file: Path) -> AndFilter:
"""Load document filters from a configuration file.

Args:
Expand All @@ -58,19 +98,9 @@ def load_filters_from_config(config_file: Path) -> CompoundFilter:
with open(config_file, 'r', encoding=ENCODING) as f:
config: Dict[str, List[Dict[str, Any]]] = json.load(f)

filters: List[DocumentFilter] = []
for filter_config in config['filters']:
filter_type = filter_config['type']
if filter_type == 'TitleFilter':
filters.append(TitleFilter(filter_config['title']))
elif filter_type == 'YearFilter':
filters.append(YearFilter(filter_config['year']))
elif filter_type == 'DecadeFilter':
filters.append(DecadeFilter(filter_config['decade']))
elif filter_type == 'KeywordsFilter':
filters.append(KeywordsFilter(filter_config['keywords']))

return CompoundFilter(filters)
filters = [create_filter(filter_config) for filter_config in config['filters']]
compound_filter = AndFilter(filters)
return compound_filter


def get_keywords_from_config(config_file: Path) -> List[str]:
Expand Down Expand Up @@ -152,7 +182,9 @@ def save_filtered_articles(input_file: Any, article_id: str,
"Title": input_file.doc().title,
}

output_fp = os.path.join(output_dir, input_file.base_file_name() + '.json')
output_fp = os.path.join(output_dir, input_file.base_file_name() + '_' +
str(article_id) + '.json')

print('output_fp', output_fp)
with open(output_fp, "w", encoding=ENCODING) as json_file:
json.dump(data, json_file, indent=4)
Expand Down
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ classifiers = [
]
dynamic = ["version"]

dependencies = ["tqdm","pandas","pandas-stubs", "types-tqdm","spacy","scikit-learn","numpy","scipy"
dependencies = ["tqdm","pandas","pandas-stubs", "types-tqdm","spacy","scikit-learn","numpy","scipy",
"flake8-pyproject"
]

[project.optional-dependencies]
Expand All @@ -40,6 +41,9 @@ max-line-length = 99
max-line-length=100
max-locals=35
max-args=10
disable = [
"R0911"
]

[[tool.mypy.overrides]]
module = [
Expand Down
3 changes: 1 addition & 2 deletions scripts/step1_filter_articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,9 @@
args.output_dir.mkdir(parents=True, exist_ok=True)

compound_filter = load_filters_from_config(args.config_path)
with_keyword_filter = compound_filter.include_keyword_filter()

for input_file in tqdm(input_files, desc="Filtering articles",
unit="file"):
for article in input_file.selected_articles(compound_filter):
save_filtered_articles(input_file, article.id,
args.output_dir)
args.output_dir)
Loading