
Commit

Merge branch 'main' into filter
parisa-zahedi authored Mar 7, 2024
2 parents 7fb2de2 + 57235ab commit ac93487
Showing 8 changed files with 259 additions and 43 deletions.
4 changes: 2 additions & 2 deletions interest/__init__.py
@@ -1,7 +1,7 @@
from .delpher_kranten import KrantenFile
# from interest.preprocessor.parser import XMLExtractor
from interest.delpher_kranten import KrantenFile

INPUT_FILE_TYPES = {
"delpher_kranten": KrantenFile

}
"""Mapping from string format descriptions to corresponding classes."""
5 changes: 3 additions & 2 deletions interest/delpher_kranten.py
@@ -8,8 +8,9 @@
import logging
import os
from typing import Optional
from .document import Document, Article
from .input_file import InputFile
from interest.document import Document, Article
from interest.input_file import InputFile



class KrantenFile(InputFile):
31 changes: 6 additions & 25 deletions interest/document.py
@@ -3,7 +3,7 @@
This module defines the Document class, which represents a document
containing articles.
"""
from typing import Optional, List
from typing import Optional, List, Union
from datetime import datetime


@@ -20,14 +20,15 @@ class Article:
body (str): The body text of the article, represented as
a single string.
"""
def __init__(self, article_id: str, title: str, body: list) -> None:
def __init__(self, article_id: str, title: str,
body: Union[str, List[str]]) -> None:
"""Initialize an Article object with the given ID, title, and body.
Args:
article_id (str): The unique identifier of the article.
title (str): The title of the article.
body (list): The body text of the article, provided as a
list of paragraphs.
body (Union[str, List[str]]): The body text of the article,
provided as a single string or a list of paragraphs.
"""
self.id = article_id
self.title = title
@@ -72,7 +73,7 @@ class Document:
"""
def __init__(self, title: str, publish_date: str, language: str,
articles: List[Article]) -> None:
self._year = None
self._year: Optional[int] = None
self._articles = articles
self._title = title
self._publish_date = publish_date
@@ -88,16 +89,6 @@ def title(self) -> str:
"""
return self._title

@property
def publish_date(self) -> str:
"""
Getter for the publication date of the document.
Returns:
str: The publication date of the document.
"""
return self._publish_date

@property
def year(self) -> Optional[int]:
"""
@@ -129,16 +120,6 @@ def decade(self) -> Optional[int]:
_ = self.year
return int(self._year / 10) * 10 if self._year is not None else None

@property
def language(self) -> str:
"""
Getter for the language of the document.
Returns:
str: The language of the document.
"""
return self._language

@property
def articles(self) -> List[Article]:
"""
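
A short usage sketch of the widened body type and the derived decade. The date string format is an assumption here, since the year parsing logic is collapsed in this view:

    from interest.document import Article, Document

    # body may now be a single string or a list of paragraphs
    plain = Article(article_id="a1", title="Storm", body="Heavy rain expected.")
    split = Article(article_id="a2", title="Trade", body=["First.", "Second."])

    doc = Document(title="De Courant", publish_date="1912-05-04",
                   language="nl", articles=[plain, split])
    # If the year property parses publish_date to 1912, doc.decade yields 1910.
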
29 changes: 17 additions & 12 deletions interest/input_file.py
@@ -5,10 +5,11 @@

import abc
import gzip
import logging
from pathlib import Path
from typing import Iterable, TextIO
from .document import Document, Article
from .document_filter import DocumentFilter
from typing import Iterable, TextIO, cast, Optional
from interest.document import Document, Article
from interest.document_filter import DocumentFilter


class InputFile(abc.ABC):
@@ -72,10 +73,12 @@ def open(self, mode: str = "rt", encoding=None) -> TextIO:
TextIO: A file object for reading the input file.
"""
if self._filepath.suffix.startswith(".gz"):
return gzip.open(self._filepath, mode=mode, encoding=encoding)
return cast(TextIO, gzip.open(self._filepath, mode=mode,
encoding=encoding))

# Default to text file
return open(self._filepath, mode=mode, encoding=encoding)
return cast(TextIO, open(self._filepath,
mode=mode, encoding=encoding))

# pylint: disable=no-member
def articles(self) -> Iterable[Article]:
@@ -85,15 +88,16 @@ def articles(self) -> Iterable[Article]:
Yields:
Article: An article object.
"""

yield from self.doc().articles()
# for document in self.doc(): # Iterate over each Document object
# for article in self.doc.articles(): # Iterate over articles in
# the Document
# yield article
doc = self.doc()
if doc is not None:
yield from doc.articles
else:
logging.error("Document not found or is None for filepath: %s",
self.filepath)
return

@abc.abstractmethod
def doc(self) -> Document:
def doc(self) -> Optional[Document]:
"""
Output the document parsed from the input file, or None if it is unavailable.
@@ -112,3 +116,4 @@ def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]:
for article in document.articles:
if filter.filter_article(article):
yield article
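
A sketch of a concrete filter, assuming DocumentFilter requires only the filter_article hook called above; the keyword filter and the commented-out KrantenFile usage are illustrative:

    from interest.document_filter import DocumentFilter


    class TitleKeywordFilter(DocumentFilter):
        """Hypothetical filter keeping articles whose title has a keyword."""

        def __init__(self, keyword: str) -> None:
            self.keyword = keyword.lower()

        def filter_article(self, article) -> bool:
            return self.keyword in (article.title or "").lower()


    # input_file = KrantenFile(Path("data/krant.xml.gz"))  # assumed constructor
    # for article in input_file.selected_articles(TitleKeywordFilter("staking")):
    #     print(article.title)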

1 change: 1 addition & 0 deletions interest/preprocessor/__init__.py
@@ -0,0 +1 @@
# from interest.preprocessor.parser import XMLExtractor
207 changes: 207 additions & 0 deletions interest/preprocessor/parser.py
@@ -0,0 +1,207 @@

import os
import tarfile
import gzip
import json
import xml.etree.ElementTree as ET
from typing import Dict, Union, Any, Optional, List
import logging


class XMLExtractor:
"""Class for extracting XML content and metadata from nested .tgz files.""" # noqa: E501
def __init__(self, root_dir: str, output_dir: str):
"""
Initializes the XMLExtractor object.
Parameters:
root_dir (str): The root directory containing .tgz files.
output_dir (str): The output directory for saving extracted JSON files. # noqa: E501
"""
self.root_dir = root_dir
self.output_dir = output_dir
self.fields = [
"title", "language", "issuenumber", "date", "identifier",
"temporal", "recordRights", "publisher", "spatial", "source",
"recordIdentifier", "type", "isPartOf"
]

def extract_xml_string(self) -> None:
"""
Extracts XML content and metadata from .tgz files in the root directory. # noqa: E501
"""
for folder_name in os.listdir(self.root_dir):
folder_path = os.path.join(self.root_dir, folder_name)
if not os.path.isdir(folder_path):
continue
if not folder_name.isdigit(): # Exclude in_progress, manifests, and ocr_complete folders and log files. # noqa: E501
continue
self.process_folder(folder_name, folder_path)

def process_folder(self, folder_name: str, folder_path: str) -> None:
"""
Processes .tgz files within a folder.
Parameters:
folder_name (str): Name of the folder being processed.
folder_path (str): Path to the folder being processed.
"""
for tgz_filename in os.listdir(folder_path):
if not tgz_filename.endswith('.tgz'):
continue
tgz_file_path = os.path.join(folder_path, tgz_filename)
base_name = os.path.splitext(tgz_filename)[0]
output_folder = os.path.join(self.output_dir, folder_name)
os.makedirs(output_folder, exist_ok=True)
try:
with tarfile.open(tgz_file_path, "r:gz") as outer_tar:
news_dict = self.process_tar(outer_tar)
except tarfile.TarError as e:
logging.error(f"Error extracting {tgz_filename}: {e}")
continue
output_file = os.path.join(output_folder, f"{base_name}.json")
self.save_as_json_compressed(news_dict, output_file)
# self.save_as_json(news_dict, output_file)

def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: # noqa: E501
"""
Processes a .tgz file and extracts XML content and metadata.
Parameters:
outer_tar (tarfile.TarFile): The .tgz file being processed.
Returns:
Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A dictionary containing extracted content and metadata. # noqa: E501
"""
news_dict: Dict[str, Any] = {"newsletter_metadata": {}, "articles": {}}
id = 0
for entry in outer_tar:
try:
if entry.name.endswith(".xml"):
file = outer_tar.extractfile(entry)
if file is not None:
content = file.read()
xml_content = content.decode('utf-8', 'ignore')
article = self.extract_article(xml_content, entry.name)
id += 1
news_dict["articles"][id] = article

elif entry.name.endswith(".gz"):
gz_member = next(member for member in outer_tar.getmembers() if member.name.endswith('.gz')) # noqa: E501
with outer_tar.extractfile(gz_member) as gz_file: # type: ignore # noqa: E501
with gzip.open(gz_file, 'rt') as xml_file:
xml_string = xml_file.read()
if isinstance(xml_string, bytes):
xml_string = xml_string.decode('utf-8')
newsletter_metadata = self.extract_meta(xml_string)
news_dict["newsletter_metadata"] = newsletter_metadata # noqa: E501
else:
continue
except Exception as e:
logging.error(f"Error processing file {entry.name}: {e}")
return news_dict

@staticmethod
def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: # noqa: E501
"""
Saves data as compressed JSON using gzip.
Parameters:
data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. # noqa: E501
output_file (str): Path to the output JSON file.
"""
try:
with gzip.open(output_file, 'wt') as json_file:
json.dump(data, json_file, indent=4)
except Exception as e:
logging.error(f"Error saving compressed JSON to {output_file}: {e}") # noqa: E501

# @staticmethod
# def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: # noqa: E501
# """
# Saves data as JSON to a specified file.

# Parameters:
# data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. # noqa: E501
# output_file (str): Path to the output JSON file.
# """
# try:
# with open(output_file, 'w') as json_file:
# json.dump(data, json_file, indent=4)
# except Exception as e:
# logging.error(f"Error saving JSON to {output_file}: {e}")

@staticmethod
def extract_article(xml_content: str, file_name: str) -> Dict[str, Union[str, List[Optional[str]]]]: # noqa: E501
"""
Extracts article title and body from XML content.
Parameters:
xml_content (str): XML content of the article.
file_name (str): Name of the XML file.
Returns:
Dict[str, Union[str, List[Optional[str]]]]: A dictionary containing the extracted title and body of the article.
body contains a list of paragraphs. # noqa: E501
"""
try:
root = ET.fromstring(xml_content)
except ET.ParseError:
logging.error(f"Failed to parse XML from file: {file_name}")
return {}

title_values = [element.text for element in root.iter() if element.tag.endswith('title')] # noqa: E501
if len(title_values) > 1:
logging.warning("More than one titles are extracted for the article.") # noqa: E501
if not title_values:
logging.warning("No title is extracted for the article.")
title = ""
else:
title = title_values[0] if title_values[0] is not None else ""
# title = title_values[0]

body_values = [element.text for element in root.iter() if element.tag.endswith('p')] # noqa: E501
if not body_values:
logging.warning("No body is extracted.")
body = []
# elif len(body_values) > 1:
# logging.warning("There are more than one paragraphs in the article.") # noqa: E501
# body = ' '.join(body_values)
else:
# body = body_values[0]
body = body_values

return {"title": title, "body": body}

def extract_meta(self, xml_string: str) -> Dict[str, Union[str, None]]:
"""
Extracts metadata from XML string.
Parameters:
xml_string (str): XML string containing metadata.
Returns:
Dict[str, Union[str, None]]: A dictionary containing the extracted metadata. # noqa: E501
"""
newsletter_metadata: Dict[str, Union[str, None]] = {}

try:
root = ET.fromstring(xml_string)
except ET.ParseError:
logging.error("Failed to parse XML from file")
return newsletter_metadata

for field in self.fields:
field_values = [element.text for element in root.iter() if element.tag.endswith(field)] # noqa: E501
if len(field_values) > 1:
logging.warning(f"More than one {field}s are extracted from metadata.") # noqa: E501
if not field_values:
logging.warning(f"No {field} is extracted.")
newsletter_metadata[field] = None
else:
filtered_field_values = [value for value in field_values if value is not None] # noqa: E501
newsletter_metadata[field] = filtered_field_values[0] if field != "spatial" else ", ".join(filtered_field_values) # noqa: E501

# newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) # noqa: E501

return newsletter_metadata
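
A minimal driver for the new extractor, with illustrative paths. Per the code above, it scans digit-named folders under root_dir, unpacks each .tgz archive, and writes one gzip-compressed JSON file per archive into a matching folder under output_dir:

    from interest.preprocessor.parser import XMLExtractor

    # root_dir and output_dir values are placeholders
    extractor = XMLExtractor(root_dir="data/raw", output_dir="data/json")
    extractor.extract_xml_string()
    # Each output file holds {"newsletter_metadata": {...}, "articles": {...}},
    # written with gzip compression by save_as_json_compressed.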
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -8,6 +8,7 @@ description = "A package to extract historical news sentiments"
authors = [
{name = "Shiva Nadi", email = "s.nadi@uu.nl"},
{name = "Parisa Zahedi", email = "p.zahedi@uu.nl"},
{name = "Matty Vermet", email = "m.s.vermet@uu.nl"}
]
readme = "README.md"
requires-python = ">=3.8"
@@ -23,8 +24,6 @@ classifiers = [
]
dynamic = ["version"]
dependencies = [
# "numpy ~= 1.23.4",
# "scikit-learn ~= 0.19.1",
]

[project.optional-dependencies]
