From ed6ec18dc15f1341576be806112abd0e0ed1b4f2 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 13 Feb 2024 11:01:23 +0100 Subject: [PATCH 01/22] Add parser --- interest/preprocessor/__init__.py | 0 interest/preprocessor/parser.py | 55 +++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 interest/preprocessor/__init__.py create mode 100644 interest/preprocessor/parser.py diff --git a/interest/preprocessor/__init__.py b/interest/preprocessor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py new file mode 100644 index 0000000..fbe4148 --- /dev/null +++ b/interest/preprocessor/parser.py @@ -0,0 +1,55 @@ +import json +import lzma +import re +from collections import Counter, defaultdict +from pathlib import Path +from typing import List, Union, Dict + +import xml.etree.cElementTree as et + +def parse_raw_article(article_input_fp: Union[Path, str]) -> Dict: + """Parse a raw article file into a structured list + + Arguments + --------- + article_input_fp: + Input file to process. + + Returns + -------- + articles: List[Dict] + A list of dictionaries, where each item is for one article and includes + the title and the body of article. + + """ + if article_input_fp !=None: + tree = et.parse(article_input_fp) + root = tree.getroot() + for title_item in root.findall('./title'): + title = title_item.text + for article_item in root.findall('./p'): + body = article_item.text + + return title, body + + +def parse_journal_articles(input_dir: Union[Path, str]) -> Dict: + input_dir = Path(input_dir) + file_list = list(input_dir.glob("*.xml")) + meta_file_list = list(input_dir.glob("*.didl.xml")) + file_list = [item for item in file_list if item not in meta_file_list] + articles: List[Dict] = [] + for file in file_list: + title, body = parse_raw_article(file) + articles.append({"title": title, "body":body}) + return articles + + + + + + +if __name__ == "__main__": + # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) + print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + From bf1f4d840bab468d017b3af3cfd0c18dbe3d944d Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 16 Feb 2024 09:25:39 +0100 Subject: [PATCH 02/22] Add parse_meta_data function --- interest/preprocessor/parser.py | 46 ++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index fbe4148..e71f687 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -39,17 +39,61 @@ def parse_journal_articles(input_dir: Union[Path, str]) -> Dict: meta_file_list = list(input_dir.glob("*.didl.xml")) file_list = [item for item in file_list if item not in meta_file_list] articles: List[Dict] = [] + for file in file_list: title, body = parse_raw_article(file) articles.append({"title": title, "body":body}) return articles +def parse_meta_file(input_dir: Union[Path, str]) -> Dict: + input_dir = Path(input_dir) + meta_file_list = list(input_dir.glob("*.didl.xml")) + + tree=et.parse(meta_file_list[0]) + root=tree.getroot() + + title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + language_values = [element.text for element in root.iter() if element.tag.endswith('language')] + issuenumber_values = [element.text for element in root.iter() if 
element.tag.endswith('issuenumber')] + date_values = [element.text for element in root.iter() if element.tag.endswith('date')] + identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] + + print(title_values[0], '*') + print(language_values[0], '*') + print(issuenumber_values[0], '*') + print(date_values[0], '*') + print(identifier_values[0], '*') + + + + + + + # for item in root.findall('.//{urn:mpeg:mpeg21:2002:02-DIDL-NS}Item'): + # for x in item.iter(): + # for t in x.findall('{http://purl.org/dc/elements/1.1/}title'): + # title = t.text + # for l in x.findall('./{http://purl.org/dc/elements/1.1/}language'): + # language = l.text + # for issuenumber in x.findall('./{http://krait.kb.nl/coop/tel/handbook/telterms.html}issuenumber'): + # issue_number = issuenumber.text + # for d in x.findall('./{http://purl.org/dc/elements/1.1/}date'): + # date = d.text + # for i in x.findall('./{http://purl.org/dc/elements/1.1/}identifier'): + # identifier = i.text + + + + + + if __name__ == "__main__": # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) - print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100') From 43467d8b70ef8792daa4343138ab69ad1253dfb5 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 16 Feb 2024 12:18:53 +0100 Subject: [PATCH 03/22] Extract metadata --- interest/preprocessor/parser.py | 179 ++++++++++++++++++++++++++------ 1 file changed, 147 insertions(+), 32 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index e71f687..94830f1 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -4,6 +4,7 @@ from collections import Counter, defaultdict from pathlib import Path from typing import List, Union, Dict +import logging import xml.etree.cElementTree as et @@ -40,6 +41,7 @@ def parse_journal_articles(input_dir: Union[Path, str]) -> Dict: file_list = [item for item in file_list if item not in meta_file_list] articles: List[Dict] = [] + for file in file_list: title, body = parse_raw_article(file) articles.append({"title": title, "body":body}) @@ -48,46 +50,159 @@ def parse_journal_articles(input_dir: Union[Path, str]) -> Dict: def parse_meta_file(input_dir: Union[Path, str]) -> Dict: input_dir = Path(input_dir) meta_file_list = list(input_dir.glob("*.didl.xml")) - - tree=et.parse(meta_file_list[0]) - root=tree.getroot() - - title_values = [element.text for element in root.iter() if element.tag.endswith('title')] - language_values = [element.text for element in root.iter() if element.tag.endswith('language')] - issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] - date_values = [element.text for element in root.iter() if element.tag.endswith('date')] - identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] - - print(title_values[0], '*') - print(language_values[0], '*') - print(issuenumber_values[0], '*') - print(date_values[0], '*') - print(identifier_values[0], '*') + newsletter_metadata: List[Dict] = [] + try: + tree=et.parse(meta_file_list[0]) + root=tree.getroot() + except et.ParseError as e: + logging.error("Failed to parse the xml file:%s", e) - - + + 
title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + if len(title_values)>1: + logging.warning("More than one titles are extracted from metadata.") + if not title_values: + logging.warning("No title is extracted.") + title = None + else: + title = title_values[0] + language_values = [element.text for element in root.iter() if element.tag.endswith('language')] + if len(language_values)>1: + logging.warning("More than one language are extracted from metadata.") + if not language_values: + logging.warning("No language is extracted.") + language = None + else: + language = language_values[0] - - # for item in root.findall('.//{urn:mpeg:mpeg21:2002:02-DIDL-NS}Item'): - # for x in item.iter(): - # for t in x.findall('{http://purl.org/dc/elements/1.1/}title'): - # title = t.text - # for l in x.findall('./{http://purl.org/dc/elements/1.1/}language'): - # language = l.text - # for issuenumber in x.findall('./{http://krait.kb.nl/coop/tel/handbook/telterms.html}issuenumber'): - # issue_number = issuenumber.text - # for d in x.findall('./{http://purl.org/dc/elements/1.1/}date'): - # date = d.text - # for i in x.findall('./{http://purl.org/dc/elements/1.1/}identifier'): - # identifier = i.text - + issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] + if len(issuenumber_values)>1: + logging.warning("More than one issuenumbers are extracted from metadata.") + if not issuenumber_values: + logging.warning("No issuenumber is extracted.") + issuenumber = None + else: + issuenumber = issuenumber_values[0] + date_values = [element.text for element in root.iter() if element.tag.endswith('date')] + if len(date_values)>1: + logging.warning("More than one dates are extracted from metadata.") + if not date_values: + logging.warning("No date is extracted.") + date = None + else: + date = date_values[0] + identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] + if len(identifier_values)>1: + logging.warning("More than one identifiers are extracted from metadata.") + if not identifier_values: + logging.warning("No identifier is extracted.") + identifier = None + else: + identifier = identifier_values[0] + + temporal_values = [element.text for element in root.iter() if element.tag.endswith('temporal')] + if len(temporal_values)>1: + logging.warning("More than one temporal are extracted from metadata.") + if not temporal_values: + logging.warning("No temporal is extracted.") + temporal = None + else: + temporal = temporal_values[0] + + recordRights_values = [element.text for element in root.iter() if element.tag.endswith('recordRights')] + if len(recordRights_values)>1: + logging.warning("More than one recordRights are extracted from metadata.") + if not recordRights_values: + logging.warning("No recordRights is extracted.") + recordRights = None + else: + recordRights = recordRights_values[0] + + publisher_values = [element.text for element in root.iter() if element.tag.endswith('publisher')] + if len(publisher_values)>1: + logging.warning("More than one publisher are extracted from metadata.") + if not publisher_values: + logging.warning("No publisher is extracted.") + publisher = None + else: + publisher = publisher_values[0] + + spatial_values = [element.text for element in root.iter() if element.tag.endswith('spatial')] + if len(spatial_values)>1: + logging.warning("More than one spatial are extracted from metadata.") + if not spatial_values: + logging.warning("No spatial is 
extracted.") + spatial_1 = None + spatial_2 = None + else: + spatial_1 = spatial_values[0] + spatial_2 = spatial_values[1] + + source_values = [element.text for element in root.iter() if element.tag.endswith('source')] + if len(source_values)>1: + logging.warning("More than one source are extracted from metadata.") + if not source_values: + logging.warning("No source is extracted.") + source = None + else: + source = source_values[1] + + recordIdentifier_values = [element.text for element in root.iter() if element.tag.endswith('recordIdentifier')] + if len(recordIdentifier_values)>1: + logging.warning("More than one recordIdentifier are extracted from metadata.") + if not recordIdentifier_values: + logging.warning("No recordIdentifier is extracted.") + recordIdentifier = None + else: + recordIdentifier = recordIdentifier_values[0] + + type_values = [element.text for element in root.iter() if element.tag.endswith('type')] + if len(type_values)>1: + logging.warning("More than one type are extracted from metadata.") + if not type_values: + logging.warning("No type is extracted.") + type = None + else: + type = type_values[0] + + isPartOf_values = [element.text for element in root.iter() if element.tag.endswith('isPartOf')] + if len(isPartOf_values)>1: + logging.warning("More than one isPartOf are extracted from metadata.") + if not isPartOf_values: + logging.warning("No isPartOf is extracted.") + isPartOf_1 = None + isPartOf_2 = None + else: + isPartOf_1 = isPartOf_values[0] + isPartOf_2 = isPartOf_values[1] + + + newsletter_metadata.append({ + "title": title, + "language":language, + "issue_number":issuenumber, + "date": date, + "identifier": identifier, + "temporal": temporal, + "recordRights": recordRights, + "publisher": publisher, + "spatial_1": spatial_1, + "spatial_2": spatial_2, + "source": source, + "recordIdentifier": recordIdentifier, + "type": type, + "isPartOf_1":isPartOf_1, + "isPartOf_2":isPartOf_2 + }) + + return newsletter_metadata @@ -95,5 +210,5 @@ def parse_meta_file(input_dir: Union[Path, str]) -> Dict: if __name__ == "__main__": # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100') + print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) From 215f20953b3d88743699bc2712812a2b15025732 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 16 Feb 2024 15:22:57 +0100 Subject: [PATCH 04/22] Add parse_all_articles --- interest/preprocessor/parser.py | 87 +++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 94830f1..9acdfe8 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -3,17 +3,38 @@ import re from collections import Counter, defaultdict from pathlib import Path -from typing import List, Union, Dict +from typing import List, Union, Dict, Optional import logging import xml.etree.cElementTree as et + +def parse_all_articles(input_dir: Union[Path, str]) -> Dict: + + input_dir = Path(input_dir) + file_list = list(input_dir.glob("*.xml")) + # List of meta files + meta_file_list = list(input_dir.glob("*.didl.xml")) + # List of xml files excluded meta file + article_list = [item for item in file_list if item not in meta_file_list] + + 
articles: List[Dict] = [] + -def parse_raw_article(article_input_fp: Union[Path, str]) -> Dict: + for file in article_list: + article = parse_raw_article(file) + articles.append(article) + + newsletter_metadata= parse_meta_file(meta_file_list[0]) + + news_dict = {"newsletter_metadata": newsletter_metadata, "articles": articles} + return news_dict + +def parse_raw_article(article_fp: Union[Path, str]) -> Dict: """Parse a raw article file into a structured list Arguments --------- - article_input_fp: + article_input_fp: Union[Path, str] Input file to process. Returns @@ -23,41 +44,45 @@ def parse_raw_article(article_input_fp: Union[Path, str]) -> Dict: the title and the body of article. """ - if article_input_fp !=None: - tree = et.parse(article_input_fp) + try: + tree = et.parse(article_fp) root = tree.getroot() - for title_item in root.findall('./title'): - title = title_item.text - for article_item in root.findall('./p'): - body = article_item.text - - return title, body - + except et.ParseError as e: + logging.error("Failed to parse the article file:%s", e) -def parse_journal_articles(input_dir: Union[Path, str]) -> Dict: - input_dir = Path(input_dir) - file_list = list(input_dir.glob("*.xml")) - meta_file_list = list(input_dir.glob("*.didl.xml")) - file_list = [item for item in file_list if item not in meta_file_list] - articles: List[Dict] = [] + title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + if len(title_values)>1: + logging.warning("More than one titles are extracted for the article.") + if not title_values: + logging.warning("No title is extracted for the article.") + title = None + else: + title = title_values[0] + body_values = [element.text for element in root.iter() if element.tag.endswith('p')] + if not body_values: + logging.warning("No body is extracted.") + body = None + if len(body_values)>1: + logging.warning("There are more than on paragraphs in the article.") + body = ' '.join(body_values) + else: + body = body_values[0] - for file in file_list: - title, body = parse_raw_article(file) - articles.append({"title": title, "body":body}) - return articles + return {"title": title, "body":body} -def parse_meta_file(input_dir: Union[Path, str]) -> Dict: - input_dir = Path(input_dir) - meta_file_list = list(input_dir.glob("*.didl.xml")) + +def parse_meta_file(meta_fp: Union[Path, str]) -> Dict: + # input_dir = Path(input_dir) + # meta_file_list = list(input_dir.glob("*.didl.xml")) newsletter_metadata: List[Dict] = [] try: - tree=et.parse(meta_file_list[0]) + tree=et.parse(meta_fp) root=tree.getroot() except et.ParseError as e: - logging.error("Failed to parse the xml file:%s", e) + logging.error("Failed to parse the meta file:%s", e) title_values = [element.text for element in root.iter() if element.tag.endswith('title')] @@ -185,8 +210,8 @@ def parse_meta_file(input_dir: Union[Path, str]) -> Dict: newsletter_metadata.append({ - "title": title, - "language":language, + "title": title, + "language":language, "issue_number":issuenumber, "date": date, "identifier": identifier, @@ -210,5 +235,7 @@ def parse_meta_file(input_dir: Union[Path, str]) -> Dict: if __name__ == "__main__": # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + # 
print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + x = parse_all_articles("../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100") + print(x) From 00573dbefe222e846bfc8a68b12de9b3e127dc8c Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 16 Feb 2024 16:07:38 +0100 Subject: [PATCH 05/22] Convert to class --- interest/preprocessor/parser.py | 457 +++++++++++++++++--------------- 1 file changed, 237 insertions(+), 220 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 9acdfe8..d61049f 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -7,235 +7,252 @@ import logging import xml.etree.cElementTree as et - -def parse_all_articles(input_dir: Union[Path, str]) -> Dict: - - input_dir = Path(input_dir) - file_list = list(input_dir.glob("*.xml")) - # List of meta files - meta_file_list = list(input_dir.glob("*.didl.xml")) - # List of xml files excluded meta file - article_list = [item for item in file_list if item not in meta_file_list] - - articles: List[Dict] = [] - - - for file in article_list: - article = parse_raw_article(file) - articles.append(article) - - newsletter_metadata= parse_meta_file(meta_file_list[0]) - - news_dict = {"newsletter_metadata": newsletter_metadata, "articles": articles} - return news_dict - -def parse_raw_article(article_fp: Union[Path, str]) -> Dict: - """Parse a raw article file into a structured list - - Arguments - --------- - article_input_fp: Union[Path, str] - Input file to process. - - Returns - -------- - articles: List[Dict] - A list of dictionaries, where each item is for one article and includes - the title and the body of article. - - """ - try: - tree = et.parse(article_fp) - root = tree.getroot() - except et.ParseError as e: - logging.error("Failed to parse the article file:%s", e) - - title_values = [element.text for element in root.iter() if element.tag.endswith('title')] - if len(title_values)>1: - logging.warning("More than one titles are extracted for the article.") - if not title_values: - logging.warning("No title is extracted for the article.") - title = None - else: - title = title_values[0] - - body_values = [element.text for element in root.iter() if element.tag.endswith('p')] - if not body_values: - logging.warning("No body is extracted.") - body = None - if len(body_values)>1: - logging.warning("There are more than on paragraphs in the article.") - body = ' '.join(body_values) - else: - body = body_values[0] - - return {"title": title, "body":body} - -def parse_meta_file(meta_fp: Union[Path, str]) -> Dict: - # input_dir = Path(input_dir) - # meta_file_list = list(input_dir.glob("*.didl.xml")) - newsletter_metadata: List[Dict] = [] +class NewsletterFile: + """ Class for parsing xml files to json """ + + def __init__( + self, + input_dir: Union[Path, str], + output_dir: Union[Path, str] + ): + + self.input_dir = Path(input_dir) + self.output_dir = Path(output_dir) - try: - tree=et.parse(meta_fp) - root=tree.getroot() - except et.ParseError as e: - logging.error("Failed to parse the meta file:%s", e) + def parse_all_articles(self) -> Dict: - title_values = [element.text for element in root.iter() if element.tag.endswith('title')] - if len(title_values)>1: - logging.warning("More than one titles are extracted from metadata.") - if not title_values: - logging.warning("No title is extracted.") - title = None - else: - title = title_values[0] - - language_values = [element.text for element in root.iter() if 
element.tag.endswith('language')] - if len(language_values)>1: - logging.warning("More than one language are extracted from metadata.") - if not language_values: - logging.warning("No language is extracted.") - language = None - else: - language = language_values[0] + file_list = list(self.input_dir.glob("*.xml")) + # List of meta files + meta_file_list = list(self.input_dir.glob("*.didl.xml")) + # List of xml files excluded meta file + article_file_list = [item for item in file_list if item not in meta_file_list] - - issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] - if len(issuenumber_values)>1: - logging.warning("More than one issuenumbers are extracted from metadata.") - if not issuenumber_values: - logging.warning("No issuenumber is extracted.") - issuenumber = None - else: - issuenumber = issuenumber_values[0] - - - date_values = [element.text for element in root.iter() if element.tag.endswith('date')] - if len(date_values)>1: - logging.warning("More than one dates are extracted from metadata.") - if not date_values: - logging.warning("No date is extracted.") - date = None - else: - date = date_values[0] - - identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] - if len(identifier_values)>1: - logging.warning("More than one identifiers are extracted from metadata.") - if not identifier_values: - logging.warning("No identifier is extracted.") - identifier = None - else: - identifier = identifier_values[0] - - temporal_values = [element.text for element in root.iter() if element.tag.endswith('temporal')] - if len(temporal_values)>1: - logging.warning("More than one temporal are extracted from metadata.") - if not temporal_values: - logging.warning("No temporal is extracted.") - temporal = None - else: - temporal = temporal_values[0] - - recordRights_values = [element.text for element in root.iter() if element.tag.endswith('recordRights')] - if len(recordRights_values)>1: - logging.warning("More than one recordRights are extracted from metadata.") - if not recordRights_values: - logging.warning("No recordRights is extracted.") - recordRights = None - else: - recordRights = recordRights_values[0] - - publisher_values = [element.text for element in root.iter() if element.tag.endswith('publisher')] - if len(publisher_values)>1: - logging.warning("More than one publisher are extracted from metadata.") - if not publisher_values: - logging.warning("No publisher is extracted.") - publisher = None - else: - publisher = publisher_values[0] - - spatial_values = [element.text for element in root.iter() if element.tag.endswith('spatial')] - if len(spatial_values)>1: - logging.warning("More than one spatial are extracted from metadata.") - if not spatial_values: - logging.warning("No spatial is extracted.") - spatial_1 = None - spatial_2 = None - else: - spatial_1 = spatial_values[0] - spatial_2 = spatial_values[1] - - source_values = [element.text for element in root.iter() if element.tag.endswith('source')] - if len(source_values)>1: - logging.warning("More than one source are extracted from metadata.") - if not source_values: - logging.warning("No source is extracted.") - source = None - else: - source = source_values[1] - - recordIdentifier_values = [element.text for element in root.iter() if element.tag.endswith('recordIdentifier')] - if len(recordIdentifier_values)>1: - logging.warning("More than one recordIdentifier are extracted from metadata.") - if not recordIdentifier_values: - 
logging.warning("No recordIdentifier is extracted.") - recordIdentifier = None - else: - recordIdentifier = recordIdentifier_values[0] - - type_values = [element.text for element in root.iter() if element.tag.endswith('type')] - if len(type_values)>1: - logging.warning("More than one type are extracted from metadata.") - if not type_values: - logging.warning("No type is extracted.") - type = None - else: - type = type_values[0] - - isPartOf_values = [element.text for element in root.iter() if element.tag.endswith('isPartOf')] - if len(isPartOf_values)>1: - logging.warning("More than one isPartOf are extracted from metadata.") - if not isPartOf_values: - logging.warning("No isPartOf is extracted.") - isPartOf_1 = None - isPartOf_2 = None - else: - isPartOf_1 = isPartOf_values[0] - isPartOf_2 = isPartOf_values[1] - - - newsletter_metadata.append({ - "title": title, - "language":language, - "issue_number":issuenumber, - "date": date, - "identifier": identifier, - "temporal": temporal, - "recordRights": recordRights, - "publisher": publisher, - "spatial_1": spatial_1, - "spatial_2": spatial_2, - "source": source, - "recordIdentifier": recordIdentifier, - "type": type, - "isPartOf_1":isPartOf_1, - "isPartOf_2":isPartOf_2 - }) - - return newsletter_metadata + articles: List[Dict] = [] + + + for file in article_file_list: + article = self._parse_raw_article(file) + articles.append(article) + + newsletter_metadata= self._parse_meta_file(meta_file_list[0]) + + news_dict = {"newsletter_metadata": newsletter_metadata, "articles": articles} + return news_dict + + def _parse_raw_article(self, article_fp: Union[Path, str]) -> Dict: + """Parse a raw article file into a structured list + + Arguments + --------- + article_input_fp: Union[Path, str] + Input file to process. + + Returns + -------- + articles: List[Dict] + A list of dictionaries, where each item is for one article and includes + the title and the body of article. 
+ + """ + try: + tree = et.parse(article_fp) + root = tree.getroot() + except et.ParseError as e: + logging.error("Failed to parse the article file:%s", e) + + title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + if len(title_values)>1: + logging.warning("More than one titles are extracted for the article.") + if not title_values: + logging.warning("No title is extracted for the article.") + title = None + else: + title = title_values[0] + + body_values = [element.text for element in root.iter() if element.tag.endswith('p')] + if not body_values: + logging.warning("No body is extracted.") + body = None + if len(body_values)>1: + logging.warning("There are more than on paragraphs in the article.") + body = ' '.join(body_values) + else: + body = body_values[0] + + return {"title": title, "body":body} + + + def _parse_meta_file(self, meta_fp: Union[Path, str]) -> Dict: + + newsletter_metadata: List[Dict] = [] + + + try: + tree=et.parse(meta_fp) + root=tree.getroot() + except et.ParseError as e: + logging.error("Failed to parse the meta file:%s", e) + + + title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + if len(title_values)>1: + logging.warning("More than one titles are extracted from metadata.") + if not title_values: + logging.warning("No title is extracted.") + title = None + else: + title = title_values[0] + + language_values = [element.text for element in root.iter() if element.tag.endswith('language')] + if len(language_values)>1: + logging.warning("More than one language are extracted from metadata.") + if not language_values: + logging.warning("No language is extracted.") + language = None + else: + language = language_values[0] + + + issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] + if len(issuenumber_values)>1: + logging.warning("More than one issuenumbers are extracted from metadata.") + if not issuenumber_values: + logging.warning("No issuenumber is extracted.") + issuenumber = None + else: + issuenumber = issuenumber_values[0] + + + date_values = [element.text for element in root.iter() if element.tag.endswith('date')] + if len(date_values)>1: + logging.warning("More than one dates are extracted from metadata.") + if not date_values: + logging.warning("No date is extracted.") + date = None + else: + date = date_values[0] + + identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] + if len(identifier_values)>1: + logging.warning("More than one identifiers are extracted from metadata.") + if not identifier_values: + logging.warning("No identifier is extracted.") + identifier = None + else: + identifier = identifier_values[0] + + temporal_values = [element.text for element in root.iter() if element.tag.endswith('temporal')] + if len(temporal_values)>1: + logging.warning("More than one temporal are extracted from metadata.") + if not temporal_values: + logging.warning("No temporal is extracted.") + temporal = None + else: + temporal = temporal_values[0] + + recordRights_values = [element.text for element in root.iter() if element.tag.endswith('recordRights')] + if len(recordRights_values)>1: + logging.warning("More than one recordRights are extracted from metadata.") + if not recordRights_values: + logging.warning("No recordRights is extracted.") + recordRights = None + else: + recordRights = recordRights_values[0] + + publisher_values = [element.text for element in root.iter() if 
element.tag.endswith('publisher')] + if len(publisher_values)>1: + logging.warning("More than one publisher are extracted from metadata.") + if not publisher_values: + logging.warning("No publisher is extracted.") + publisher = None + else: + publisher = publisher_values[0] + + spatial_values = [element.text for element in root.iter() if element.tag.endswith('spatial')] + if len(spatial_values)>1: + logging.warning("More than one spatial are extracted from metadata.") + if not spatial_values: + logging.warning("No spatial is extracted.") + spatial_1 = None + spatial_2 = None + else: + spatial_1 = spatial_values[0] + spatial_2 = spatial_values[1] + + source_values = [element.text for element in root.iter() if element.tag.endswith('source')] + if len(source_values)>1: + logging.warning("More than one source are extracted from metadata.") + if not source_values: + logging.warning("No source is extracted.") + source = None + else: + source = source_values[1] + + recordIdentifier_values = [element.text for element in root.iter() if element.tag.endswith('recordIdentifier')] + if len(recordIdentifier_values)>1: + logging.warning("More than one recordIdentifier are extracted from metadata.") + if not recordIdentifier_values: + logging.warning("No recordIdentifier is extracted.") + recordIdentifier = None + else: + recordIdentifier = recordIdentifier_values[0] + + type_values = [element.text for element in root.iter() if element.tag.endswith('type')] + if len(type_values)>1: + logging.warning("More than one type are extracted from metadata.") + if not type_values: + logging.warning("No type is extracted.") + type = None + else: + type = type_values[0] + + isPartOf_values = [element.text for element in root.iter() if element.tag.endswith('isPartOf')] + if len(isPartOf_values)>1: + logging.warning("More than one isPartOf are extracted from metadata.") + if not isPartOf_values: + logging.warning("No isPartOf is extracted.") + isPartOf_1 = None + isPartOf_2 = None + else: + isPartOf_1 = isPartOf_values[0] + isPartOf_2 = isPartOf_values[1] + + + newsletter_metadata.append({ + "title": title, + "language":language, + "issue_number":issuenumber, + "date": date, + "identifier": identifier, + "temporal": temporal, + "recordRights": recordRights, + "publisher": publisher, + "spatial_1": spatial_1, + "spatial_2": spatial_2, + "source": source, + "recordIdentifier": recordIdentifier, + "type": type, + "isPartOf_1":isPartOf_1, + "isPartOf_2":isPartOf_2 + }) + + return newsletter_metadata if __name__ == "__main__": - # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) - # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - # print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - x = parse_all_articles("../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100") - print(x) + x = NewsletterFile(input_dir = '../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100', output_dir=' ') + print(x.parse_all_articles()) + # print(x.input_dir) + + # # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) + # # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + # # print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + # x = 
parse_all_articles("../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100") + # print(x) + From be5a852d77de85410afa7902dfca2c1652b43b43 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Mon, 19 Feb 2024 10:04:25 +0100 Subject: [PATCH 06/22] Add id to the articles --- interest/preprocessor/parser.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index d61049f..64dcf3d 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -30,12 +30,17 @@ def parse_all_articles(self) -> Dict: # List of xml files excluded meta file article_file_list = [item for item in file_list if item not in meta_file_list] - articles: List[Dict] = [] + # articles: List[Dict] = [] + articles: dict[Dict] = {} + id = 0 + for file in article_file_list: article = self._parse_raw_article(file) - articles.append(article) + id += 1 + articles[id] = article + # articles.append(article) newsletter_metadata= self._parse_meta_file(meta_file_list[0]) @@ -246,7 +251,12 @@ def _parse_meta_file(self, meta_fp: Union[Path, str]) -> Dict: if __name__ == "__main__": x = NewsletterFile(input_dir = '../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100', output_dir=' ') - print(x.parse_all_articles()) + h = x.parse_all_articles() + # print(h.keys()) + print(h['articles'][1]) + # print(h['newsletter_metadata']) + # print(h['articles'][38]['title']) + # print(x.input_dir) # # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) From 55855c7db56b790520ec6d5ded90b67045c4be79 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 27 Feb 2024 12:52:50 +0100 Subject: [PATCH 07/22] Add new layout --- interest/preprocessor/parser.py | 269 +++++++++++++++----------------- 1 file changed, 125 insertions(+), 144 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 64dcf3d..6daaa39 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -1,75 +1,90 @@ + +import os +import tarfile +import gzip import json -import lzma -import re -from collections import Counter, defaultdict -from pathlib import Path -from typing import List, Union, Dict, Optional +import xml.etree.ElementTree as ET +from typing import Dict, Union import logging -import xml.etree.cElementTree as et - - -class NewsletterFile: - """ Class for parsing xml files to json """ - - def __init__( - self, - input_dir: Union[Path, str], - output_dir: Union[Path, str] - ): - - self.input_dir = Path(input_dir) - self.output_dir = Path(output_dir) - - - def parse_all_articles(self) -> Dict: - - file_list = list(self.input_dir.glob("*.xml")) - # List of meta files - meta_file_list = list(self.input_dir.glob("*.didl.xml")) - # List of xml files excluded meta file - article_file_list = [item for item in file_list if item not in meta_file_list] - # articles: List[Dict] = [] - articles: dict[Dict] = {} +class XMLExtractor: + def __init__(self, root_dir: str, output_dir: str): + self.root_dir = root_dir + self.output_dir = output_dir + + def extract_xml_string(self) -> None: + for folder_name in os.listdir(self.root_dir): + folder_path = os.path.join(self.root_dir, folder_name) + if not os.path.isdir(folder_path): + continue + if not folder_name.isdigit(): # Exclude in_progress, manifests, and ocr_complete folders and log files + continue + self.process_folder(folder_name, folder_path) + + def 
process_folder(self, folder_name: str, folder_path: str) -> None: + for tgz_filename in os.listdir(folder_path): + if not tgz_filename.endswith('.tgz'): + continue + tgz_file_path = os.path.join(folder_path, tgz_filename) + base_name = os.path.splitext(tgz_filename)[0] + output_folder = os.path.join(self.output_dir, folder_name) + os.makedirs(output_folder, exist_ok=True) + try: + with tarfile.open(tgz_file_path, "r:gz") as outer_tar: + news_dict = self.process_tar(outer_tar) + except tarfile.TarError as e: + logging.error(f"Error extracting {tgz_filename}: {e}") + continue + output_file = os.path.join(output_folder, f"{base_name}.json") + self.save_as_json(news_dict, output_file) + + def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: + news_dict = {"newsletter_metadata": {}, "articles": {}} + articles: Dict[int, Dict[str, str]] = {} id = 0 - - - - for file in article_file_list: - article = self._parse_raw_article(file) - id += 1 - articles[id] = article - # articles.append(article) - - newsletter_metadata= self._parse_meta_file(meta_file_list[0]) - - news_dict = {"newsletter_metadata": newsletter_metadata, "articles": articles} + for entry in outer_tar: + try: + if entry.name.endswith(".xml"): + file = outer_tar.extractfile(entry) + if file is not None: + content = file.read() + xml_content = content.decode('utf-8', 'ignore') + article = self.extract_article(xml_content, entry.name) + id += 1 + news_dict["articles"][id] = article + + elif entry.name.endswith(".gz"): + gz_member = next(member for member in outer_tar.getmembers() if member.name.endswith('.gz')) + with outer_tar.extractfile(gz_member) as gz_file: + with gzip.open(gz_file, 'rt') as xml_file: + xml_string = xml_file.read() + newsletter_metadata = self.extract_meta(xml_string) + news_dict["newsletter_metadata"] = newsletter_metadata + else: + continue + except Exception as e: + logging.error(f"Error processing file {entry.name}: {e}") return news_dict - def _parse_raw_article(self, article_fp: Union[Path, str]) -> Dict: - """Parse a raw article file into a structured list - - Arguments - --------- - article_input_fp: Union[Path, str] - Input file to process. - - Returns - -------- - articles: List[Dict] - A list of dictionaries, where each item is for one article and includes - the title and the body of article. 
- - """ + @staticmethod + def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: try: - tree = et.parse(article_fp) - root = tree.getroot() - except et.ParseError as e: - logging.error("Failed to parse the article file:%s", e) + with open(output_file, 'w') as json_file: + json.dump(data, json_file, indent=4) + except Exception as e: + logging.error(f"Error saving JSON to {output_file}: {e}") + + @staticmethod + def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: + try: + root = ET.fromstring(xml_content) + except ET.ParseError: + logging.error(f"Failed to parse XML from file: {file_name}") + return {} title_values = [element.text for element in root.iter() if element.tag.endswith('title')] - if len(title_values)>1: + if len(title_values) > 1: logging.warning("More than one titles are extracted for the article.") if not title_values: logging.warning("No title is extracted for the article.") @@ -81,54 +96,52 @@ def _parse_raw_article(self, article_fp: Union[Path, str]) -> Dict: if not body_values: logging.warning("No body is extracted.") body = None - if len(body_values)>1: - logging.warning("There are more than on paragraphs in the article.") + elif len(body_values) > 1: + logging.warning("There are more than one paragraphs in the article.") body = ' '.join(body_values) else: body = body_values[0] - return {"title": title, "body":body} - - - def _parse_meta_file(self, meta_fp: Union[Path, str]) -> Dict: - - newsletter_metadata: List[Dict] = [] + return {"title": title, "body": body} + @staticmethod + def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: + newsletter_metadata: Dict[str, Union[str, None]] = {} try: - tree=et.parse(meta_fp) - root=tree.getroot() - except et.ParseError as e: - logging.error("Failed to parse the meta file:%s", e) + root = ET.fromstring(xml_string) + except ET.ParseError: + logging.error("Failed to parse XML from file") + return newsletter_metadata - + # Extracting metadata title_values = [element.text for element in root.iter() if element.tag.endswith('title')] if len(title_values)>1: logging.warning("More than one titles are extracted from metadata.") if not title_values: logging.warning("No title is extracted.") - title = None + newsletter_metadata['title'] = None else: - title = title_values[0] + newsletter_metadata['title'] = title_values[0] language_values = [element.text for element in root.iter() if element.tag.endswith('language')] if len(language_values)>1: logging.warning("More than one language are extracted from metadata.") if not language_values: logging.warning("No language is extracted.") - language = None + newsletter_metadata['language'] = None else: - language = language_values[0] + newsletter_metadata['language'] = language_values[0] + - issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] if len(issuenumber_values)>1: logging.warning("More than one issuenumbers are extracted from metadata.") if not issuenumber_values: logging.warning("No issuenumber is extracted.") - issuenumber = None + newsletter_metadata['issuenumber'] = None else: - issuenumber = issuenumber_values[0] + newsletter_metadata['issuenumber'] = issuenumber_values[0] date_values = [element.text for element in root.iter() if element.tag.endswith('date')] @@ -136,133 +149,101 @@ def _parse_meta_file(self, meta_fp: Union[Path, str]) -> Dict: logging.warning("More than one dates are extracted from metadata.") if not date_values: 
logging.warning("No date is extracted.") - date = None + newsletter_metadata['date'] = None else: - date = date_values[0] + newsletter_metadata['date'] = date_values[0] identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] if len(identifier_values)>1: logging.warning("More than one identifiers are extracted from metadata.") if not identifier_values: logging.warning("No identifier is extracted.") - identifier = None + newsletter_metadata['identifier'] = None else: - identifier = identifier_values[0] + newsletter_metadata['identifier'] = identifier_values[0] temporal_values = [element.text for element in root.iter() if element.tag.endswith('temporal')] if len(temporal_values)>1: logging.warning("More than one temporal are extracted from metadata.") if not temporal_values: logging.warning("No temporal is extracted.") - temporal = None + newsletter_metadata['temporal'] = None else: - temporal = temporal_values[0] + newsletter_metadata['temporal'] = temporal_values[0] recordRights_values = [element.text for element in root.iter() if element.tag.endswith('recordRights')] if len(recordRights_values)>1: logging.warning("More than one recordRights are extracted from metadata.") if not recordRights_values: logging.warning("No recordRights is extracted.") - recordRights = None + newsletter_metadata['recordRights'] = None else: - recordRights = recordRights_values[0] + newsletter_metadata['recordRights'] = recordRights_values[0] publisher_values = [element.text for element in root.iter() if element.tag.endswith('publisher')] if len(publisher_values)>1: logging.warning("More than one publisher are extracted from metadata.") if not publisher_values: logging.warning("No publisher is extracted.") - publisher = None + newsletter_metadata['publisher'] = None else: - publisher = publisher_values[0] + newsletter_metadata['publisher'] = publisher_values[0] spatial_values = [element.text for element in root.iter() if element.tag.endswith('spatial')] if len(spatial_values)>1: logging.warning("More than one spatial are extracted from metadata.") if not spatial_values: logging.warning("No spatial is extracted.") - spatial_1 = None - spatial_2 = None + newsletter_metadata['spatial_1'] = None + newsletter_metadata['spatial_2'] = None else: - spatial_1 = spatial_values[0] - spatial_2 = spatial_values[1] + newsletter_metadata['spatial_1'] = spatial_values[0] + newsletter_metadata['spatial_2'] = spatial_values[1] source_values = [element.text for element in root.iter() if element.tag.endswith('source')] if len(source_values)>1: logging.warning("More than one source are extracted from metadata.") if not source_values: logging.warning("No source is extracted.") - source = None + newsletter_metadata['source'] = None else: - source = source_values[1] + newsletter_metadata['source'] = source_values[1] recordIdentifier_values = [element.text for element in root.iter() if element.tag.endswith('recordIdentifier')] if len(recordIdentifier_values)>1: logging.warning("More than one recordIdentifier are extracted from metadata.") if not recordIdentifier_values: logging.warning("No recordIdentifier is extracted.") - recordIdentifier = None + newsletter_metadata['recordIdentifier'] = None else: - recordIdentifier = recordIdentifier_values[0] + newsletter_metadata['recordIdentifier'] = recordIdentifier_values[0] type_values = [element.text for element in root.iter() if element.tag.endswith('type')] if len(type_values)>1: logging.warning("More than one type are extracted from metadata.") 
if not type_values: logging.warning("No type is extracted.") - type = None + newsletter_metadata['type'] = None else: - type = type_values[0] + newsletter_metadata['type'] = type_values[0] isPartOf_values = [element.text for element in root.iter() if element.tag.endswith('isPartOf')] if len(isPartOf_values)>1: logging.warning("More than one isPartOf are extracted from metadata.") if not isPartOf_values: logging.warning("No isPartOf is extracted.") - isPartOf_1 = None - isPartOf_2 = None + newsletter_metadata['isPartOf_1'] = None + newsletter_metadata['isPartOf_2'] = None else: - isPartOf_1 = isPartOf_values[0] - isPartOf_2 = isPartOf_values[1] - - - newsletter_metadata.append({ - "title": title, - "language":language, - "issue_number":issuenumber, - "date": date, - "identifier": identifier, - "temporal": temporal, - "recordRights": recordRights, - "publisher": publisher, - "spatial_1": spatial_1, - "spatial_2": spatial_2, - "source": source, - "recordIdentifier": recordIdentifier, - "type": type, - "isPartOf_1":isPartOf_1, - "isPartOf_2":isPartOf_2 - }) - - return newsletter_metadata + newsletter_metadata['isPartOf_1'] = isPartOf_values[0] + newsletter_metadata['isPartOf_2'] = isPartOf_values[1] + return newsletter_metadata - +# Configure logging +logging.basicConfig(filename='extractor.log', level=logging.DEBUG) +# Example usage if __name__ == "__main__": - x = NewsletterFile(input_dir = '../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100', output_dir=' ') - h = x.parse_all_articles() - # print(h.keys()) - print(h['articles'][1]) - # print(h['newsletter_metadata']) - # print(h['articles'][38]['title']) - - # print(x.input_dir) - - # # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) - # # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - # # print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - # x = parse_all_articles("../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100") - # print(x) - - + extractor = XMLExtractor("../../data/news/gg", "../../data/news/gg-json") + extractor.extract_xml_string() From 112be91d026708d5a07f2a5a3cddb4e23d3348a6 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 27 Feb 2024 13:00:42 +0100 Subject: [PATCH 08/22] Add documentation --- interest/preprocessor/parser.py | 53 +++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 6daaa39..2c14a77 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -9,11 +9,22 @@ class XMLExtractor: + """Class for extracting XML content and metadata from nested .tgz files.""" def __init__(self, root_dir: str, output_dir: str): + """ + Initializes the XMLExtractor object. + + Parameters: + root_dir (str): The root directory containing .tgz files. + output_dir (str): The output directory for saving extracted JSON files. + """ self.root_dir = root_dir self.output_dir = output_dir def extract_xml_string(self) -> None: + """ + Extracts XML content and metadata from .tgz files in the root directory. 
+ """ for folder_name in os.listdir(self.root_dir): folder_path = os.path.join(self.root_dir, folder_name) if not os.path.isdir(folder_path): @@ -23,6 +34,13 @@ def extract_xml_string(self) -> None: self.process_folder(folder_name, folder_path) def process_folder(self, folder_name: str, folder_path: str) -> None: + """ + Processes .tgz files within a folder. + + Parameters: + folder_name (str): Name of the folder being processed. + folder_path (str): Path to the folder being processed. + """ for tgz_filename in os.listdir(folder_path): if not tgz_filename.endswith('.tgz'): continue @@ -40,6 +58,15 @@ def process_folder(self, folder_name: str, folder_path: str) -> None: self.save_as_json(news_dict, output_file) def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: + """ + Processes a .tgz file and extracts XML content and metadata. + + Parameters: + outer_tar (tarfile.TarFile): The .tgz file being processed. + + Returns: + Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A dictionary containing extracted content and metadata. + """ news_dict = {"newsletter_metadata": {}, "articles": {}} articles: Dict[int, Dict[str, str]] = {} id = 0 @@ -69,6 +96,13 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s @staticmethod def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: + """ + Saves data as JSON to a specified file. + + Parameters: + data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. + output_file (str): Path to the output JSON file. + """ try: with open(output_file, 'w') as json_file: json.dump(data, json_file, indent=4) @@ -77,6 +111,16 @@ def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]] @staticmethod def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: + """ + Extracts article title and body from XML content. + + Parameters: + xml_content (str): XML content of the article. + file_name (str): Name of the XML file. + + Returns: + Dict[str, str]: A dictionary containing the extracted title and body of the article. + """ try: root = ET.fromstring(xml_content) except ET.ParseError: @@ -106,6 +150,15 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: @staticmethod def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: + """ + Extracts metadata from XML string. + + Parameters: + xml_string (str): XML string containing metadata. + + Returns: + Dict[str, Union[str, None]]: A dictionary containing the extracted metadata. 
+ """ newsletter_metadata: Dict[str, Union[str, None]] = {} try: From 6a8aca2fea27515331ec7e0a285d6114e03b569f Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 27 Feb 2024 13:31:24 +0100 Subject: [PATCH 09/22] Compress json files while saving --- interest/preprocessor/parser.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 2c14a77..14a67c5 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -8,6 +8,7 @@ import logging + class XMLExtractor: """Class for extracting XML content and metadata from nested .tgz files.""" def __init__(self, root_dir: str, output_dir: str): @@ -55,7 +56,8 @@ def process_folder(self, folder_name: str, folder_path: str) -> None: logging.error(f"Error extracting {tgz_filename}: {e}") continue output_file = os.path.join(output_folder, f"{base_name}.json") - self.save_as_json(news_dict, output_file) + self.save_as_json_compressed(news_dict, output_file) + # self.save_as_json(news_dict, output_file) def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: """ @@ -93,21 +95,37 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s except Exception as e: logging.error(f"Error processing file {entry.name}: {e}") return news_dict - + @staticmethod - def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: + def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: """ - Saves data as JSON to a specified file. + Saves data as compressed JSON using gzip. Parameters: data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. output_file (str): Path to the output JSON file. """ try: - with open(output_file, 'w') as json_file: + with gzip.open(output_file, 'wt') as json_file: json.dump(data, json_file, indent=4) except Exception as e: - logging.error(f"Error saving JSON to {output_file}: {e}") + logging.error(f"Error saving compressed JSON to {output_file}: {e}") + + + # @staticmethod + # def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: + # """ + # Saves data as JSON to a specified file. + + # Parameters: + # data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. + # output_file (str): Path to the output JSON file. 
+ # """ + # try: + # with open(output_file, 'w') as json_file: + # json.dump(data, json_file, indent=4) + # except Exception as e: + # logging.error(f"Error saving JSON to {output_file}: {e}") @staticmethod def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: @@ -298,5 +316,5 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: # Example usage if __name__ == "__main__": - extractor = XMLExtractor("../../data/news/gg", "../../data/news/gg-json") + extractor = XMLExtractor("../../data/news/gg", "../../data/news/gg-json-compress") extractor.extract_xml_string() From c61cf3e700c246cea5ff60860ca2db5dd0bc676e Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 27 Feb 2024 15:25:54 +0100 Subject: [PATCH 10/22] Shorten extract_meta method --- interest/preprocessor/parser.py | 138 ++++---------------------------- 1 file changed, 16 insertions(+), 122 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 14a67c5..56845b0 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -186,131 +186,25 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: return newsletter_metadata # Extracting metadata - title_values = [element.text for element in root.iter() if element.tag.endswith('title')] - if len(title_values)>1: - logging.warning("More than one titles are extracted from metadata.") - if not title_values: - logging.warning("No title is extracted.") - newsletter_metadata['title'] = None - else: - newsletter_metadata['title'] = title_values[0] - - language_values = [element.text for element in root.iter() if element.tag.endswith('language')] - if len(language_values)>1: - logging.warning("More than one language are extracted from metadata.") - if not language_values: - logging.warning("No language is extracted.") - newsletter_metadata['language'] = None - else: - newsletter_metadata['language'] = language_values[0] - - - issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] - if len(issuenumber_values)>1: - logging.warning("More than one issuenumbers are extracted from metadata.") - if not issuenumber_values: - logging.warning("No issuenumber is extracted.") - newsletter_metadata['issuenumber'] = None - else: - newsletter_metadata['issuenumber'] = issuenumber_values[0] - - - date_values = [element.text for element in root.iter() if element.tag.endswith('date')] - if len(date_values)>1: - logging.warning("More than one dates are extracted from metadata.") - if not date_values: - logging.warning("No date is extracted.") - newsletter_metadata['date'] = None - else: - newsletter_metadata['date'] = date_values[0] - - identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] - if len(identifier_values)>1: - logging.warning("More than one identifiers are extracted from metadata.") - if not identifier_values: - logging.warning("No identifier is extracted.") - newsletter_metadata['identifier'] = None - else: - newsletter_metadata['identifier'] = identifier_values[0] - - temporal_values = [element.text for element in root.iter() if element.tag.endswith('temporal')] - if len(temporal_values)>1: - logging.warning("More than one temporal are extracted from metadata.") - if not temporal_values: - logging.warning("No temporal is extracted.") - newsletter_metadata['temporal'] = None - else: - newsletter_metadata['temporal'] = temporal_values[0] - - recordRights_values = [element.text for element in 
root.iter() if element.tag.endswith('recordRights')] - if len(recordRights_values)>1: - logging.warning("More than one recordRights are extracted from metadata.") - if not recordRights_values: - logging.warning("No recordRights is extracted.") - newsletter_metadata['recordRights'] = None - else: - newsletter_metadata['recordRights'] = recordRights_values[0] - - publisher_values = [element.text for element in root.iter() if element.tag.endswith('publisher')] - if len(publisher_values)>1: - logging.warning("More than one publisher are extracted from metadata.") - if not publisher_values: - logging.warning("No publisher is extracted.") - newsletter_metadata['publisher'] = None - else: - newsletter_metadata['publisher'] = publisher_values[0] - - spatial_values = [element.text for element in root.iter() if element.tag.endswith('spatial')] - if len(spatial_values)>1: - logging.warning("More than one spatial are extracted from metadata.") - if not spatial_values: - logging.warning("No spatial is extracted.") - newsletter_metadata['spatial_1'] = None - newsletter_metadata['spatial_2'] = None - else: - newsletter_metadata['spatial_1'] = spatial_values[0] - newsletter_metadata['spatial_2'] = spatial_values[1] - - source_values = [element.text for element in root.iter() if element.tag.endswith('source')] - if len(source_values)>1: - logging.warning("More than one source are extracted from metadata.") - if not source_values: - logging.warning("No source is extracted.") - newsletter_metadata['source'] = None - else: - newsletter_metadata['source'] = source_values[1] - - recordIdentifier_values = [element.text for element in root.iter() if element.tag.endswith('recordIdentifier')] - if len(recordIdentifier_values)>1: - logging.warning("More than one recordIdentifier are extracted from metadata.") - if not recordIdentifier_values: - logging.warning("No recordIdentifier is extracted.") - newsletter_metadata['recordIdentifier'] = None - else: - newsletter_metadata['recordIdentifier'] = recordIdentifier_values[0] - - type_values = [element.text for element in root.iter() if element.tag.endswith('type')] - if len(type_values)>1: - logging.warning("More than one type are extracted from metadata.") - if not type_values: - logging.warning("No type is extracted.") - newsletter_metadata['type'] = None - else: - newsletter_metadata['type'] = type_values[0] - - isPartOf_values = [element.text for element in root.iter() if element.tag.endswith('isPartOf')] - if len(isPartOf_values)>1: - logging.warning("More than one isPartOf are extracted from metadata.") - if not isPartOf_values: - logging.warning("No isPartOf is extracted.") - newsletter_metadata['isPartOf_1'] = None - newsletter_metadata['isPartOf_2'] = None - else: - newsletter_metadata['isPartOf_1'] = isPartOf_values[0] - newsletter_metadata['isPartOf_2'] = isPartOf_values[1] + fields = [ + "title", "language", "issuenumber", "date", "identifier", + "temporal", "recordRights", "publisher", "spatial", "source", + "recordIdentifier", "type", "isPartOf" + ] + + for field in fields: + field_values = [element.text for element in root.iter() if element.tag.endswith(field)] + if len(field_values) > 1: + logging.warning(f"More than one {field}s are extracted from metadata.") + if not field_values: + logging.warning(f"No {field} is extracted.") + newsletter_metadata[field] = None + else: + newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) return newsletter_metadata + # Configure logging 
logging.basicConfig(filename='extractor.log', level=logging.DEBUG) From f278030b9464b90d7e971639658df7caff1eb84a Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 27 Feb 2024 15:45:15 +0100 Subject: [PATCH 11/22] Fix flake8 issues --- interest/preprocessor/parser.py | 53 ++++++++++++++++----------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 56845b0..61599b3 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -8,29 +8,28 @@ import logging - class XMLExtractor: - """Class for extracting XML content and metadata from nested .tgz files.""" + """Class for extracting XML content and metadata from nested .tgz files.""" # noqa: E501 def __init__(self, root_dir: str, output_dir: str): """ Initializes the XMLExtractor object. Parameters: root_dir (str): The root directory containing .tgz files. - output_dir (str): The output directory for saving extracted JSON files. + output_dir (str): The output directory for saving extracted JSON files. # noqa: E501 """ self.root_dir = root_dir self.output_dir = output_dir def extract_xml_string(self) -> None: """ - Extracts XML content and metadata from .tgz files in the root directory. + Extracts XML content and metadata from .tgz files in the root directory. # noqa: E501 """ for folder_name in os.listdir(self.root_dir): folder_path = os.path.join(self.root_dir, folder_name) if not os.path.isdir(folder_path): continue - if not folder_name.isdigit(): # Exclude in_progress, manifests, and ocr_complete folders and log files + if not folder_name.isdigit(): # Exclude in_progress, manifests, and ocr_complete folders and log files. # noqa: E501 continue self.process_folder(folder_name, folder_path) @@ -59,7 +58,7 @@ def process_folder(self, folder_name: str, folder_path: str) -> None: self.save_as_json_compressed(news_dict, output_file) # self.save_as_json(news_dict, output_file) - def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: + def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: # noqa: E501 """ Processes a .tgz file and extracts XML content and metadata. @@ -67,10 +66,9 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s outer_tar (tarfile.TarFile): The .tgz file being processed. Returns: - Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A dictionary containing extracted content and metadata. + Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A dictionary containing extracted content and metadata. 
# noqa: E501 """ news_dict = {"newsletter_metadata": {}, "articles": {}} - articles: Dict[int, Dict[str, str]] = {} id = 0 for entry in outer_tar: try: @@ -84,41 +82,40 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s news_dict["articles"][id] = article elif entry.name.endswith(".gz"): - gz_member = next(member for member in outer_tar.getmembers() if member.name.endswith('.gz')) + gz_member = next(member for member in outer_tar.getmembers() if member.name.endswith('.gz')) # noqa: E501 with outer_tar.extractfile(gz_member) as gz_file: with gzip.open(gz_file, 'rt') as xml_file: xml_string = xml_file.read() newsletter_metadata = self.extract_meta(xml_string) - news_dict["newsletter_metadata"] = newsletter_metadata + news_dict["newsletter_metadata"] = newsletter_metadata # noqa: E501 else: continue except Exception as e: logging.error(f"Error processing file {entry.name}: {e}") return news_dict - + @staticmethod - def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: + def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: # noqa: E501 """ Saves data as compressed JSON using gzip. Parameters: - data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. + data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. # noqa: E501 output_file (str): Path to the output JSON file. """ try: with gzip.open(output_file, 'wt') as json_file: json.dump(data, json_file, indent=4) except Exception as e: - logging.error(f"Error saving compressed JSON to {output_file}: {e}") - + logging.error(f"Error saving compressed JSON to {output_file}: {e}") # noqa: E501 # @staticmethod - # def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: + # def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: # noqa: E501 # """ # Saves data as JSON to a specified file. # Parameters: - # data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. + # data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. # noqa: E501 # output_file (str): Path to the output JSON file. # """ # try: @@ -137,7 +134,7 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: file_name (str): Name of the XML file. Returns: - Dict[str, str]: A dictionary containing the extracted title and body of the article. + Dict[str, str]: A dictionary containing the extracted title and body of the article. 
# noqa: E501 """ try: root = ET.fromstring(xml_content) @@ -145,21 +142,21 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: logging.error(f"Failed to parse XML from file: {file_name}") return {} - title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + title_values = [element.text for element in root.iter() if element.tag.endswith('title')] # noqa: E501 if len(title_values) > 1: - logging.warning("More than one titles are extracted for the article.") + logging.warning("More than one titles are extracted for the article.") # noqa: E501 if not title_values: logging.warning("No title is extracted for the article.") title = None else: title = title_values[0] - body_values = [element.text for element in root.iter() if element.tag.endswith('p')] + body_values = [element.text for element in root.iter() if element.tag.endswith('p')] # noqa: E501 if not body_values: logging.warning("No body is extracted.") body = None elif len(body_values) > 1: - logging.warning("There are more than one paragraphs in the article.") + logging.warning("There are more than one paragraphs in the article.") # noqa: E501 body = ' '.join(body_values) else: body = body_values[0] @@ -175,7 +172,7 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: xml_string (str): XML string containing metadata. Returns: - Dict[str, Union[str, None]]: A dictionary containing the extracted metadata. + Dict[str, Union[str, None]]: A dictionary containing the extracted metadata. # noqa: E501 """ newsletter_metadata: Dict[str, Union[str, None]] = {} @@ -193,14 +190,14 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: ] for field in fields: - field_values = [element.text for element in root.iter() if element.tag.endswith(field)] + field_values = [element.text for element in root.iter() if element.tag.endswith(field)] # noqa: E501 if len(field_values) > 1: - logging.warning(f"More than one {field}s are extracted from metadata.") + logging.warning(f"More than one {field}s are extracted from metadata.") # noqa: E501 if not field_values: logging.warning(f"No {field} is extracted.") newsletter_metadata[field] = None else: - newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) + newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) # noqa: E501 return newsletter_metadata @@ -210,5 +207,7 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: # Example usage if __name__ == "__main__": - extractor = XMLExtractor("../../data/news/gg", "../../data/news/gg-json-compress") + input_dir = "../../data/news/gg" + output_dir = "../../data/news/gg-json-compress" + extractor = XMLExtractor(input_dir, output_dir) extractor.extract_xml_string() From 7f323ef86c6a617cdcf956631adc507bc1b121cd Mon Sep 17 00:00:00 2001 From: Shiva Nadi <44059592+ShNadi@users.noreply.github.com> Date: Tue, 27 Feb 2024 15:49:20 +0100 Subject: [PATCH 12/22] Update python-package.yml comment pylon --- .github/workflows/python-package.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 29821d3..ad47b95 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -50,10 +50,10 @@ jobs: run: | python -m pip install flake8 flake8 $PACKAGE_NAME - - name: Lint with pylint - run: | - python -m pip install pylint - pylint $PACKAGE_NAME + # - name: Lint with pylint 
+ # run: | + # python -m pip install pylint + # pylint $PACKAGE_NAME # - name: Check docstrings with pydocstyle # run: | # python -m pip install pydocstyle From 6682cd20da26eeca62d7f5d5ed095cac364c1f96 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Thu, 29 Feb 2024 13:17:28 +0100 Subject: [PATCH 13/22] Change body to list of paragraphs. --- interest/preprocessor/parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 61599b3..c705f15 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -7,7 +7,6 @@ from typing import Dict, Union import logging - class XMLExtractor: """Class for extracting XML content and metadata from nested .tgz files.""" # noqa: E501 def __init__(self, root_dir: str, output_dir: str): @@ -155,11 +154,12 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: if not body_values: logging.warning("No body is extracted.") body = None - elif len(body_values) > 1: - logging.warning("There are more than one paragraphs in the article.") # noqa: E501 - body = ' '.join(body_values) + # elif len(body_values) > 1: + # logging.warning("There are more than one paragraphs in the article.") # noqa: E501 + # body = ' '.join(body_values) else: - body = body_values[0] + # body = body_values[0] + body = body_values return {"title": title, "body": body} From 7b791fbedd3c4d8ed2360252164af2eac781a748 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 15:51:01 +0100 Subject: [PATCH 14/22] Add import parser --- interest/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/interest/__init__.py b/interest/__init__.py index e69de29..6e6ff7e 100644 --- a/interest/__init__.py +++ b/interest/__init__.py @@ -0,0 +1 @@ +from interest.preprocessor.parser import XMLExtractor \ No newline at end of file From 3a1372ec542ae163ebf6aae2aae693ce0d8894a2 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 15:51:45 +0100 Subject: [PATCH 15/22] Add import parser --- interest/preprocessor/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/interest/preprocessor/__init__.py b/interest/preprocessor/__init__.py index e69de29..6e6ff7e 100644 --- a/interest/preprocessor/__init__.py +++ b/interest/preprocessor/__init__.py @@ -0,0 +1 @@ +from interest.preprocessor.parser import XMLExtractor \ No newline at end of file From 18d85dde9faddf4c17d5ddf44a7fb213fca92c63 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 15:52:25 +0100 Subject: [PATCH 16/22] Remove main --- interest/preprocessor/parser.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index c705f15..99168cd 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -7,6 +7,7 @@ from typing import Dict, Union import logging + class XMLExtractor: """Class for extracting XML content and metadata from nested .tgz files.""" # noqa: E501 def __init__(self, root_dir: str, output_dir: str): @@ -200,14 +201,4 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) # noqa: E501 return newsletter_metadata - - -# Configure logging -logging.basicConfig(filename='extractor.log', level=logging.DEBUG) - -# Example usage -if __name__ == "__main__": - input_dir = "../../data/news/gg" - output_dir = "../../data/news/gg-json-compress" - 
extractor = XMLExtractor(input_dir, output_dir) - extractor.extract_xml_string() + \ No newline at end of file From 59c300815f6ce91bd62da38b30ed3eae6fa075fe Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 15:55:52 +0100 Subject: [PATCH 17/22] Add convert_input_files.py --- pyproject.toml | 3 +-- scripts/.DS_Store | Bin 0 -> 6148 bytes scripts/convert_input_files.py | 22 ++++++++++++++++++++++ scripts/logs/.DS_Store | Bin 0 -> 6148 bytes 4 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 scripts/.DS_Store create mode 100644 scripts/convert_input_files.py create mode 100644 scripts/logs/.DS_Store diff --git a/pyproject.toml b/pyproject.toml index d6befd8..07dc037 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ description = "A package to extract hystorical news sentiments" authors = [ {name = "Shiva Nadi", email = "s.nadi@uu.nl"}, {name = "Parisa Zahedi", email = "p.zahedi@uu.nl"}, + {name = "Matty Vermet", email = "m.s.vermet@uu.nl"} ] readme = "README.md" requires-python = ">=3.8" @@ -23,8 +24,6 @@ classifiers = [ ] dynamic = ["version"] dependencies = [ - # "numpy ~= 1.23.4", - # "scikit-learn ~= 0.19.1", ] [project.optional-dependencies] diff --git a/scripts/.DS_Store b/scripts/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f54d7c8ad083fb73e6aaf03bea7c6ae0f1a19ee1 GIT binary patch literal 6148 zcmeHKO-my|5Uq|;Cy0=PipPPyipCEV@iIm|c-;{_>`Kg-sDtBl$V?0p0&~_Mv)BD4 z{+-=d-5pV*R|!%D)vvm`s%QFj(;q~n>b=%FQI&{1C}XLD;sfD!)-fqLPZ_98A8qPV zP7R7F9g7ynBr?Erw?vnuF=O}idj2}pGjn-`*-X%q(JFQ5;W*KkQKBQ{@uJy>uKqIu zi>QoPP1lsYi~I9gT-47xuD`!gCyBGXR(n>Zsq*y9tT*e;dFSDc?uU6Z?`8F7_ZQbr zv`(Ut-HcA-cE3?wIM8X{jMH}4B*d*YLVlmcX-oI(x|g;xlN;Iwui{l2)#brpW4pHE zZ*J`jSNy^52JEfP?cuQEE&f>B``Nfn?o$0|Mn(#2+{nDc5j??o8TWnIO%t77!#9c_ z#R&=n!hkR^DGa#%$(x&$ACmtN284l$VSx7sAIcawEG(L>1BH140Qnke1U6p_ImdSx zI4mrp1)^*!(55Q)#85UJe&59f4hxGmos@fgDEDRMUMR|Z9sT@N?|}4c+0?) 
ztycN`Kl=Rr|29c#!hkUFtr$?{!|+2~;ep6Sz|bItFz}@e{0Fp&XKw%i literal 0 HcmV?d00001 diff --git a/scripts/convert_input_files.py b/scripts/convert_input_files.py new file mode 100644 index 0000000..22e6dd7 --- /dev/null +++ b/scripts/convert_input_files.py @@ -0,0 +1,22 @@ +from interest.preprocessor.parser import XMLExtractor +from argparse import ArgumentParser +from pathlib import Path +import logging + + +logging.basicConfig(filename='logs/extractor.log', level=logging.DEBUG) + + + +def parse_arguments(): + parser = ArgumentParser( + prog="convert_input_files.py", + description="Convert nested gzip files to compressed json") + parser.add_argument("--input_dir", required=True) + parser.add_argument("--output_dir", required=True) + return parser.parse_args() + +if __name__=="__main__": + args = parse_arguments() + extractor = XMLExtractor(Path(args.input_dir), Path(args.output_dir)) + extractor.extract_xml_string() diff --git a/scripts/logs/.DS_Store b/scripts/logs/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Fri, 1 Mar 2024 17:47:09 +0100 Subject: [PATCH 18/22] Remove DS_Store --- scripts/logs/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 scripts/logs/.DS_Store diff --git a/scripts/logs/.DS_Store b/scripts/logs/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Fri, 1 Mar 2024 17:48:43 +0100 Subject: [PATCH 19/22] Remove DS_Store file --- scripts/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 scripts/.DS_Store diff --git a/scripts/.DS_Store b/scripts/.DS_Store deleted file mode 100644 index f54d7c8ad083fb73e6aaf03bea7c6ae0f1a19ee1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKO-my|5Uq|;Cy0=PipPPyipCEV@iIm|c-;{_>`Kg-sDtBl$V?0p0&~_Mv)BD4 z{+-=d-5pV*R|!%D)vvm`s%QFj(;q~n>b=%FQI&{1C}XLD;sfD!)-fqLPZ_98A8qPV zP7R7F9g7ynBr?Erw?vnuF=O}idj2}pGjn-`*-X%q(JFQ5;W*KkQKBQ{@uJy>uKqIu zi>QoPP1lsYi~I9gT-47xuD`!gCyBGXR(n>Zsq*y9tT*e;dFSDc?uU6Z?`8F7_ZQbr zv`(Ut-HcA-cE3?wIM8X{jMH}4B*d*YLVlmcX-oI(x|g;xlN;Iwui{l2)#brpW4pHE zZ*J`jSNy^52JEfP?cuQEE&f>B``Nfn?o$0|Mn(#2+{nDc5j??o8TWnIO%t77!#9c_ z#R&=n!hkR^DGa#%$(x&$ACmtN284l$VSx7sAIcawEG(L>1BH140Qnke1U6p_ImdSx zI4mrp1)^*!(55Q)#85UJe&59f4hxGmos@fgDEDRMUMR|Z9sT@N?|}4c+0?) 
ztycN`Kl=Rr|29c#!hkUFtr$?{!|+2~;ep6Sz|bItFz}@e{0Fp&XKw%i From b5dccf69e7f863f090a3375f60d01cc10663e78e Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 17:54:34 +0100 Subject: [PATCH 20/22] Change log path --- scripts/convert_input_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_input_files.py b/scripts/convert_input_files.py index 22e6dd7..b6d2dea 100644 --- a/scripts/convert_input_files.py +++ b/scripts/convert_input_files.py @@ -4,7 +4,7 @@ import logging -logging.basicConfig(filename='logs/extractor.log', level=logging.DEBUG) +logging.basicConfig(filename='extractor.log', level=logging.DEBUG) From 8fde5fd46267446c9c0b6eff83a8d98c188f86eb Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 18:02:55 +0100 Subject: [PATCH 21/22] Move field list from metadat method to instructor --- interest/preprocessor/parser.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 99168cd..bbaf724 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -20,6 +20,11 @@ def __init__(self, root_dir: str, output_dir: str): """ self.root_dir = root_dir self.output_dir = output_dir + self.fields = [ + "title", "language", "issuenumber", "date", "identifier", + "temporal", "recordRights", "publisher", "spatial", "source", + "recordIdentifier", "type", "isPartOf" + ] def extract_xml_string(self) -> None: """ @@ -164,8 +169,7 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: return {"title": title, "body": body} - @staticmethod - def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: + def extract_meta(self,xml_string: str) -> Dict[str, Union[str, None]]: """ Extracts metadata from XML string. 
@@ -183,14 +187,7 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: logging.error("Failed to parse XML from file") return newsletter_metadata - # Extracting metadata - fields = [ - "title", "language", "issuenumber", "date", "identifier", - "temporal", "recordRights", "publisher", "spatial", "source", - "recordIdentifier", "type", "isPartOf" - ] - - for field in fields: + for field in self.fields: field_values = [element.text for element in root.iter() if element.tag.endswith(field)] # noqa: E501 if len(field_values) > 1: logging.warning(f"More than one {field}s are extracted from metadata.") # noqa: E501 From 57235ab488abe7d2becf65e28d7395752699d564 Mon Sep 17 00:00:00 2001 From: parisa-zahedi Date: Thu, 7 Mar 2024 12:51:29 +0100 Subject: [PATCH 22/22] Define input file (#3) * define input file classes * define document class * apply pylint * apply flake8 * replace relative with absolute import * remove extra getter functions that are not used * fix variable types in Article class * fix mypy errors * fix flake8 errors * fix flake8 errors * Fix mypy and flake8 issues * Comment parser * Fix flake8 issues * Comment import parser --------- Co-authored-by: parisa-zahedi Co-authored-by: Shiva Nadi --- interest/__init__.py | 8 +- interest/delpher_kranten.py | 118 +++++++++++++++++++++++++++ interest/document.py | 131 ++++++++++++++++++++++++++++++ interest/input_file.py | 119 +++++++++++++++++++++++++++ interest/preprocessor/__init__.py | 2 +- interest/preprocessor/parser.py | 28 ++++--- 6 files changed, 393 insertions(+), 13 deletions(-) create mode 100644 interest/delpher_kranten.py create mode 100644 interest/document.py create mode 100644 interest/input_file.py diff --git a/interest/__init__.py b/interest/__init__.py index 6e6ff7e..5170041 100644 --- a/interest/__init__.py +++ b/interest/__init__.py @@ -1 +1,7 @@ -from interest.preprocessor.parser import XMLExtractor \ No newline at end of file +# from interest.preprocessor.parser import XMLExtractor +from interest.delpher_kranten import KrantenFile + +INPUT_FILE_TYPES = { + "delpher_kranten": KrantenFile + +} diff --git a/interest/delpher_kranten.py b/interest/delpher_kranten.py new file mode 100644 index 0000000..2e77575 --- /dev/null +++ b/interest/delpher_kranten.py @@ -0,0 +1,118 @@ +""" +Delpher Kranten Module + +This module provides classes and functions for handling Delpher Kranten files. +""" + +import json +import logging +import os +from typing import Optional +from interest.document import Document, Article +from interest.input_file import InputFile + + +class KrantenFile(InputFile): + """ + An InputFile implementation for Delpher Kranten. + + Input is a zip file which includes one JSON file. The JSON file contains + metadata and articles from one issue of a newspaper. + + Attributes: + METADATA_FIELD (str): The key for metadata field in JSON data. + TITLE_FIELD (str): The key for title field in metadata. + DATE_FIELD (str): The key for date field in metadata. + LANGUAGE_FIELD (str): The key for language field in metadata. + ARTICLES_FIELD (str): The key for articles field in JSON data. + ARTICLE_TITLE_FIELD (str): The key for title field in an article. + ARTICLE_BODY_FIELD (str): The key for body field in an article. + ENCODING (str): The encoding format for reading the file. + + Methods: + read_json(json_file): Read JSON data from a file and parse it into + a Document object. + base_file_name(): Extract the base file name without extension from + the filepath. 
+ doc(): Read the directory and parse the JSON file into a Document + object. + """ + + METADATA_FIELD = "newsletter_metadata" + TITLE_FIELD = "title" + DATE_FIELD = "date" + LANGUAGE_FIELD = "language" + ARTICLES_FIELD = "articles" + ARTICLE_TITLE_FIELD = "title" + ARTICLE_BODY_FIELD = "body" + ENCODING = "utf-8" + + def read_json(self, json_file) -> Optional[Document]: + """ + Read JSON data from a file and parse it into a Document object. + + Args: + json_file: A file object containing JSON data. + + Returns: + Optional[Document]: A Document object parsed from + the JSON data, or None if parsing fails. + """ + try: + json_data = json.load(json_file) + metadata = json_data[self.METADATA_FIELD] + document_title = metadata[self.TITLE_FIELD] + publish_date = metadata[self.DATE_FIELD] + language = metadata[self.LANGUAGE_FIELD] + + articles_data = json_data[self.ARTICLES_FIELD] + + articles = [] + for article_id, article in articles_data.items(): + article_title = article[self.ARTICLE_TITLE_FIELD] + article_body = article[self.ARTICLE_BODY_FIELD] + article = Article(article_id=article_id, title=article_title, + body=article_body) + articles.append(article) + + document = Document(title=document_title, + publish_date=publish_date, + language=language, + articles=articles) + return document + + except (json.JSONDecodeError, KeyError) as e: + logging.error("Error parsing JSON data: %s", e) + return None + + def base_file_name(self) -> str: + """ + Extract the base file name without extension from the filepath. + + Returns: + str: The base file name without extension. + """ + file_name_json = os.path.splitext(os.path.basename(self.filepath))[0] + base_file_name = os.path.splitext(file_name_json)[0] + return base_file_name + + def doc(self) -> Optional[Document]: + """ + Read the directory and parse the JSON file into a Document + object. + + Returns: + Optional[Document]: A Document object parsed from the + JSON data, or None if parsing fails. + """ + try: + logging.info("Reading directory '%s'...", self._filepath) + fh = self.open(encoding=self.ENCODING) + document = self.read_json(fh) + fh.close() + return document + + except OSError as e: + logging.error("Error processing gzip file '%s': %s", + self._filepath, e) + return None diff --git a/interest/document.py b/interest/document.py new file mode 100644 index 0000000..5984d1b --- /dev/null +++ b/interest/document.py @@ -0,0 +1,131 @@ +# pylint: disable=too-few-public-methods +""" +This module defines the Document class, which represents a document +containing articles. +""" +from typing import Optional, List, Union +from datetime import datetime + + +class Article: + """A class representing an article. + + This class represents an article with an ID, title, and body text. + The body text can be provided as a list + of paragraphs, which will be joined into a single string. + + Attributes: + id (str): The unique identifier of the article. + title (str): The title of the article. + body (str): The body text of the article, represented as + a single string. + """ + def __init__(self, article_id: str, title: str, + body: Union[str, List[str]]) -> None: + """Initialize an Article object with the given ID, title, and body. + + Args: + id (str): The unique identifier of the article. + title (str): The title of the article. + body (Union[str, List[str]): The body text of the article, + provided as a list of paragraphs. 
+ """ + self.id = article_id + self.title = title + if isinstance(body, list): + article_body = '\n'.join(body) + self.text = article_body + else: + self.text = body + + +class Document: + """ + Represents a document containing articles. + + Args: + title (str): The title of the document. + publish_date (str): The publication date of the document in + the format 'YYYY-MM-DD'. + language (str): The language of the document. + articles (List[Article]): A list of articles included in + the document. + + Attributes: + _title (str): The title of the document. + _publish_date (str): The publication date of the document in + the format 'YYYY-MM-DD'. + _year (Optional[int]): The year of publication, extracted from + publish_date. + _language (str): The language of the document. + _articles (List[Article]): A list of articles included in the + document. + + Properties: + title (str): Getter for the title of the document. + publish_date (str): Getter for the publication date of the + document. + year (Optional[int]): Getter for the year of publication. + decade (Optional[int]): Getter for the decade of publication. + language (str): Getter for the language of the document. + articles (List[Article]): Getter for the list of articles + included in the document. + """ + def __init__(self, title: str, publish_date: str, language: str, + articles: List[Article]) -> None: + self._year: Optional[int] = None + self._articles = articles + self._title = title + self._publish_date = publish_date + self._language = language + + @property + def title(self) -> str: + """ + Getter for the title of the document. + + Returns: + str: The title of the document. + """ + return self._title + + @property + def year(self) -> Optional[int]: + """ + Getter for the year of publication. + + Returns: + Optional[int]: The year of publication extracted + from publish_date, or None if it cannot be determined. + """ + if self._year is not None: + return self._year + try: + date_obj = datetime.strptime(self._publish_date, '%Y-%m-%d') + self._year = date_obj.year + return self._year + except ValueError: + return None + + @property + def decade(self) -> Optional[int]: + """ + Getter for the decade of publication. + + Returns: + Optional[int]: The decade of publication extracted from + publish_date, + or None if it cannot be determined. + """ + _ = self.year + return int(self._year / 10) * 10 if self._year is not None else None + + @property + def articles(self) -> List[Article]: + """ + Getter for the list of articles included in the document. + + Returns: + List[Article]: The list of articles included in the document. + """ + return self._articles diff --git a/interest/input_file.py b/interest/input_file.py new file mode 100644 index 0000000..72156f1 --- /dev/null +++ b/interest/input_file.py @@ -0,0 +1,119 @@ +""" +Input File Module +This module provides an abstract class for representing various input files. +""" + +import abc +import gzip +from pathlib import Path +from typing import Iterable, TextIO, cast, Optional +from interest.document import Document, Article +import logging + +# from .document_filter import DocumentFilter + + +class InputFile(abc.ABC): + """ + Abstract class for representing various input files. + + Attributes: + _filepath (Path): The file path of the input file. + + Methods: + __init__(filepath): Initialize the InputFile with a file path. + filepath(): Get the file path of the input file. + base_file_name(): Output a list of documents in the input file. 
+ open(mode, encoding): Open the input file for reading. + articles(): Return all articles for the document found in the + input file. + doc(): Output a list of documents in the input file. + """ + + def __init__(self, filepath: Path) -> None: + """ + Initialize the InputFile with a file path. + + Args: + filepath (Path): The file path of the input file. + """ + self._filepath = filepath + + @property + def filepath(self) -> Path: + """ + Get the file path of the input file. + + Returns: + Path: The file path of the input file. + """ + return self._filepath + + @abc.abstractmethod + def base_file_name(self) -> str: + """ + Output a list of documents in the input file. + + This can be a singleton list if an input file contains only + one document. + + Returns: + str: The base file name without extension. + """ + return NotImplemented + + def open(self, mode: str = "rt", encoding=None) -> TextIO: + """ + Open the input file for reading. + + Args: + mode (str): The file open mode. + encoding: The encoding format. + + Returns: + TextIO: A file object for reading the input file. + """ + if self._filepath.suffix.startswith(".gz"): + return cast(TextIO, gzip.open(self._filepath, mode=mode, + encoding=encoding)) + + # Default to text file + return cast(TextIO, open(self._filepath, + mode=mode, encoding=encoding)) + + # pylint: disable=no-member + def articles(self) -> Iterable[Article]: + """ + Return all articles for the document found in the input file. + + Yields: + Article: An article object. + """ + doc = self.doc() + if doc is not None: + yield from doc.articles + else: + logging.error("Document not found or is None for filepath: %s", + self.filepath) + return + + @abc.abstractmethod + def doc(self) -> Optional[Document]: + """ + Output a list of documents in the input file. + + This can be a singleton list if an input file contains only + one document. + + Returns: + Document: A document object. + """ + return NotImplemented + + # def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]: + # document = self.doc() + # if filter.filter_document(document): + # if document.articles() is not None: + # for article in document.articles(): + # if filter.filter_article(article): + # yield article diff --git a/interest/preprocessor/__init__.py b/interest/preprocessor/__init__.py index 6e6ff7e..3cec932 100644 --- a/interest/preprocessor/__init__.py +++ b/interest/preprocessor/__init__.py @@ -1 +1 @@ -from interest.preprocessor.parser import XMLExtractor \ No newline at end of file +# from interest.preprocessor.parser import XMLExtractor diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index bbaf724..25ac1d7 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -4,7 +4,7 @@ import gzip import json import xml.etree.ElementTree as ET -from typing import Dict, Union +from typing import Dict, Union, Any, Optional, List import logging @@ -73,7 +73,7 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s Returns: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A dictionary containing extracted content and metadata. 
# noqa: E501 """ - news_dict = {"newsletter_metadata": {}, "articles": {}} + news_dict: Dict[str, Any] = {"newsletter_metadata": {}, "articles": {}} id = 0 for entry in outer_tar: try: @@ -88,9 +88,11 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s elif entry.name.endswith(".gz"): gz_member = next(member for member in outer_tar.getmembers() if member.name.endswith('.gz')) # noqa: E501 - with outer_tar.extractfile(gz_member) as gz_file: + with outer_tar.extractfile(gz_member) as gz_file: # type: ignore # noqa: E501 with gzip.open(gz_file, 'rt') as xml_file: xml_string = xml_file.read() + if isinstance(xml_string, bytes): + xml_string = xml_string.decode('utf-8') newsletter_metadata = self.extract_meta(xml_string) news_dict["newsletter_metadata"] = newsletter_metadata # noqa: E501 else: @@ -130,7 +132,7 @@ def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict # logging.error(f"Error saving JSON to {output_file}: {e}") @staticmethod - def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: + def extract_article(xml_content: str, file_name: str) -> Dict[str, Union[str, List[Optional[str]]]]: # noqa: E501 """ Extracts article title and body from XML content. @@ -139,7 +141,8 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: file_name (str): Name of the XML file. Returns: - Dict[str, str]: A dictionary containing the extracted title and body of the article. # noqa: E501 + Dict[Optional[str], list[str]]: A dictionary containing the extracted title and body of the article. + body contains a list of paragraphs. # noqa: E501 """ try: root = ET.fromstring(xml_content) @@ -152,14 +155,15 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: logging.warning("More than one titles are extracted for the article.") # noqa: E501 if not title_values: logging.warning("No title is extracted for the article.") - title = None + title = "" else: - title = title_values[0] + title = title_values[0] if title_values[0] is not None else "" + # title = title_values[0] body_values = [element.text for element in root.iter() if element.tag.endswith('p')] # noqa: E501 if not body_values: logging.warning("No body is extracted.") - body = None + body = [] # elif len(body_values) > 1: # logging.warning("There are more than one paragraphs in the article.") # noqa: E501 # body = ' '.join(body_values) @@ -169,7 +173,7 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: return {"title": title, "body": body} - def extract_meta(self,xml_string: str) -> Dict[str, Union[str, None]]: + def extract_meta(self, xml_string: str) -> Dict[str, Union[str, None]]: """ Extracts metadata from XML string. @@ -195,7 +199,9 @@ def extract_meta(self,xml_string: str) -> Dict[str, Union[str, None]]: logging.warning(f"No {field} is extracted.") newsletter_metadata[field] = None else: - newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) # noqa: E501 + filtered_field_values = [value for value in field_values if value is not None] # noqa: E501 + newsletter_metadata[field] = filtered_field_values[0] if field != "spatial" else ", ".join(filtered_field_values) # noqa: E501 + + # newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) # noqa: E501 return newsletter_metadata - \ No newline at end of file
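
Taken together, the series leaves two entry points: XMLExtractor (interest/preprocessor/parser.py, driven by scripts/convert_input_files.py) converts the nested Delpher .tgz harvest into gzip-compressed JSON issues, and the KrantenFile / Document / Article classes from the final patch read such an issue back for downstream processing. The sketch below is a minimal illustration of how the two halves might be chained, not code from the patches themselves: it assumes the package is importable as interest, that the data directories exist locally, and that the converted issue file carries a .gz suffix so that KrantenFile opens it through gzip. The ISSUE_0001 file name and the data paths are placeholders.

    # The conversion can also be run from the command line with the script
    # added in PATCH 17 (paths are illustrative):
    #   python scripts/convert_input_files.py --input_dir data/news/gg \
    #       --output_dir data/news/gg-json-compress
    #
    # The same conversion, followed by reading one converted issue, from Python:
    from pathlib import Path

    from interest.preprocessor.parser import XMLExtractor
    from interest.delpher_kranten import KrantenFile

    # Unpack the nested .tgz files into compressed JSON, one file per issue,
    # mirroring the digit-named subfolders of the harvest.
    extractor = XMLExtractor("data/news/gg", "data/news/gg-json-compress")
    extractor.extract_xml_string()

    # Read a single converted issue back. KrantenFile.open() only switches to
    # gzip for a .gz suffix, so the (hypothetical) file name assumes one here.
    issue = KrantenFile(Path("data/news/gg-json-compress/00/ISSUE_0001.json.gz"))
    document = issue.doc()
    if document is not None:
        print(document.title, document.year)

    # Iterate the articles of the issue; Article joins a list of paragraphs
    # into a single text string.
    for article in issue.articles():
        print(article.id, article.title, article.text[:80])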