From ed6ec18dc15f1341576be806112abd0e0ed1b4f2 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 13 Feb 2024 11:01:23 +0100 Subject: [PATCH 01/22] Add parser --- interest/preprocessor/__init__.py | 0 interest/preprocessor/parser.py | 55 +++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 interest/preprocessor/__init__.py create mode 100644 interest/preprocessor/parser.py diff --git a/interest/preprocessor/__init__.py b/interest/preprocessor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py new file mode 100644 index 0000000..fbe4148 --- /dev/null +++ b/interest/preprocessor/parser.py @@ -0,0 +1,55 @@ +import json +import lzma +import re +from collections import Counter, defaultdict +from pathlib import Path +from typing import List, Union, Dict + +import xml.etree.cElementTree as et + +def parse_raw_article(article_input_fp: Union[Path, str]) -> Dict: + """Parse a raw article file into a structured list + + Arguments + --------- + article_input_fp: + Input file to process. + + Returns + -------- + articles: List[Dict] + A list of dictionaries, where each item is for one article and includes + the title and the body of article. + + """ + if article_input_fp !=None: + tree = et.parse(article_input_fp) + root = tree.getroot() + for title_item in root.findall('./title'): + title = title_item.text + for article_item in root.findall('./p'): + body = article_item.text + + return title, body + + +def parse_journal_articles(input_dir: Union[Path, str]) -> Dict: + input_dir = Path(input_dir) + file_list = list(input_dir.glob("*.xml")) + meta_file_list = list(input_dir.glob("*.didl.xml")) + file_list = [item for item in file_list if item not in meta_file_list] + articles: List[Dict] = [] + for file in file_list: + title, body = parse_raw_article(file) + articles.append({"title": title, "body":body}) + return articles + + + + + + +if __name__ == "__main__": + # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) + print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + From bf1f4d840bab468d017b3af3cfd0c18dbe3d944d Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 16 Feb 2024 09:25:39 +0100 Subject: [PATCH 02/22] Add parse_meta_data function --- interest/preprocessor/parser.py | 46 ++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index fbe4148..e71f687 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -39,17 +39,61 @@ def parse_journal_articles(input_dir: Union[Path, str]) -> Dict: meta_file_list = list(input_dir.glob("*.didl.xml")) file_list = [item for item in file_list if item not in meta_file_list] articles: List[Dict] = [] + for file in file_list: title, body = parse_raw_article(file) articles.append({"title": title, "body":body}) return articles +def parse_meta_file(input_dir: Union[Path, str]) -> Dict: + input_dir = Path(input_dir) + meta_file_list = list(input_dir.glob("*.didl.xml")) + + tree=et.parse(meta_file_list[0]) + root=tree.getroot() + + title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + language_values = [element.text for element in root.iter() if element.tag.endswith('language')] + issuenumber_values = [element.text for element in root.iter() if 
element.tag.endswith('issuenumber')] + date_values = [element.text for element in root.iter() if element.tag.endswith('date')] + identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] + + print(title_values[0], '*') + print(language_values[0], '*') + print(issuenumber_values[0], '*') + print(date_values[0], '*') + print(identifier_values[0], '*') + + + + + + + # for item in root.findall('.//{urn:mpeg:mpeg21:2002:02-DIDL-NS}Item'): + # for x in item.iter(): + # for t in x.findall('{http://purl.org/dc/elements/1.1/}title'): + # title = t.text + # for l in x.findall('./{http://purl.org/dc/elements/1.1/}language'): + # language = l.text + # for issuenumber in x.findall('./{http://krait.kb.nl/coop/tel/handbook/telterms.html}issuenumber'): + # issue_number = issuenumber.text + # for d in x.findall('./{http://purl.org/dc/elements/1.1/}date'): + # date = d.text + # for i in x.findall('./{http://purl.org/dc/elements/1.1/}identifier'): + # identifier = i.text + + + + + + if __name__ == "__main__": # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) - print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100') From 43467d8b70ef8792daa4343138ab69ad1253dfb5 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 16 Feb 2024 12:18:53 +0100 Subject: [PATCH 03/22] Extract metadata --- interest/preprocessor/parser.py | 179 ++++++++++++++++++++++++++------ 1 file changed, 147 insertions(+), 32 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index e71f687..94830f1 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -4,6 +4,7 @@ from collections import Counter, defaultdict from pathlib import Path from typing import List, Union, Dict +import logging import xml.etree.cElementTree as et @@ -40,6 +41,7 @@ def parse_journal_articles(input_dir: Union[Path, str]) -> Dict: file_list = [item for item in file_list if item not in meta_file_list] articles: List[Dict] = [] + for file in file_list: title, body = parse_raw_article(file) articles.append({"title": title, "body":body}) @@ -48,46 +50,159 @@ def parse_journal_articles(input_dir: Union[Path, str]) -> Dict: def parse_meta_file(input_dir: Union[Path, str]) -> Dict: input_dir = Path(input_dir) meta_file_list = list(input_dir.glob("*.didl.xml")) - - tree=et.parse(meta_file_list[0]) - root=tree.getroot() - - title_values = [element.text for element in root.iter() if element.tag.endswith('title')] - language_values = [element.text for element in root.iter() if element.tag.endswith('language')] - issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] - date_values = [element.text for element in root.iter() if element.tag.endswith('date')] - identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] - - print(title_values[0], '*') - print(language_values[0], '*') - print(issuenumber_values[0], '*') - print(date_values[0], '*') - print(identifier_values[0], '*') + newsletter_metadata: List[Dict] = [] + try: + tree=et.parse(meta_file_list[0]) + root=tree.getroot() + except et.ParseError as e: + logging.error("Failed to parse the xml file:%s", e) - - + + 
title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + if len(title_values)>1: + logging.warning("More than one titles are extracted from metadata.") + if not title_values: + logging.warning("No title is extracted.") + title = None + else: + title = title_values[0] + language_values = [element.text for element in root.iter() if element.tag.endswith('language')] + if len(language_values)>1: + logging.warning("More than one language are extracted from metadata.") + if not language_values: + logging.warning("No language is extracted.") + language = None + else: + language = language_values[0] - - # for item in root.findall('.//{urn:mpeg:mpeg21:2002:02-DIDL-NS}Item'): - # for x in item.iter(): - # for t in x.findall('{http://purl.org/dc/elements/1.1/}title'): - # title = t.text - # for l in x.findall('./{http://purl.org/dc/elements/1.1/}language'): - # language = l.text - # for issuenumber in x.findall('./{http://krait.kb.nl/coop/tel/handbook/telterms.html}issuenumber'): - # issue_number = issuenumber.text - # for d in x.findall('./{http://purl.org/dc/elements/1.1/}date'): - # date = d.text - # for i in x.findall('./{http://purl.org/dc/elements/1.1/}identifier'): - # identifier = i.text - + issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] + if len(issuenumber_values)>1: + logging.warning("More than one issuenumbers are extracted from metadata.") + if not issuenumber_values: + logging.warning("No issuenumber is extracted.") + issuenumber = None + else: + issuenumber = issuenumber_values[0] + date_values = [element.text for element in root.iter() if element.tag.endswith('date')] + if len(date_values)>1: + logging.warning("More than one dates are extracted from metadata.") + if not date_values: + logging.warning("No date is extracted.") + date = None + else: + date = date_values[0] + identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] + if len(identifier_values)>1: + logging.warning("More than one identifiers are extracted from metadata.") + if not identifier_values: + logging.warning("No identifier is extracted.") + identifier = None + else: + identifier = identifier_values[0] + + temporal_values = [element.text for element in root.iter() if element.tag.endswith('temporal')] + if len(temporal_values)>1: + logging.warning("More than one temporal are extracted from metadata.") + if not temporal_values: + logging.warning("No temporal is extracted.") + temporal = None + else: + temporal = temporal_values[0] + + recordRights_values = [element.text for element in root.iter() if element.tag.endswith('recordRights')] + if len(recordRights_values)>1: + logging.warning("More than one recordRights are extracted from metadata.") + if not recordRights_values: + logging.warning("No recordRights is extracted.") + recordRights = None + else: + recordRights = recordRights_values[0] + + publisher_values = [element.text for element in root.iter() if element.tag.endswith('publisher')] + if len(publisher_values)>1: + logging.warning("More than one publisher are extracted from metadata.") + if not publisher_values: + logging.warning("No publisher is extracted.") + publisher = None + else: + publisher = publisher_values[0] + + spatial_values = [element.text for element in root.iter() if element.tag.endswith('spatial')] + if len(spatial_values)>1: + logging.warning("More than one spatial are extracted from metadata.") + if not spatial_values: + logging.warning("No spatial is 
extracted.") + spatial_1 = None + spatial_2 = None + else: + spatial_1 = spatial_values[0] + spatial_2 = spatial_values[1] + + source_values = [element.text for element in root.iter() if element.tag.endswith('source')] + if len(source_values)>1: + logging.warning("More than one source are extracted from metadata.") + if not source_values: + logging.warning("No source is extracted.") + source = None + else: + source = source_values[1] + + recordIdentifier_values = [element.text for element in root.iter() if element.tag.endswith('recordIdentifier')] + if len(recordIdentifier_values)>1: + logging.warning("More than one recordIdentifier are extracted from metadata.") + if not recordIdentifier_values: + logging.warning("No recordIdentifier is extracted.") + recordIdentifier = None + else: + recordIdentifier = recordIdentifier_values[0] + + type_values = [element.text for element in root.iter() if element.tag.endswith('type')] + if len(type_values)>1: + logging.warning("More than one type are extracted from metadata.") + if not type_values: + logging.warning("No type is extracted.") + type = None + else: + type = type_values[0] + + isPartOf_values = [element.text for element in root.iter() if element.tag.endswith('isPartOf')] + if len(isPartOf_values)>1: + logging.warning("More than one isPartOf are extracted from metadata.") + if not isPartOf_values: + logging.warning("No isPartOf is extracted.") + isPartOf_1 = None + isPartOf_2 = None + else: + isPartOf_1 = isPartOf_values[0] + isPartOf_2 = isPartOf_values[1] + + + newsletter_metadata.append({ + "title": title, + "language":language, + "issue_number":issuenumber, + "date": date, + "identifier": identifier, + "temporal": temporal, + "recordRights": recordRights, + "publisher": publisher, + "spatial_1": spatial_1, + "spatial_2": spatial_2, + "source": source, + "recordIdentifier": recordIdentifier, + "type": type, + "isPartOf_1":isPartOf_1, + "isPartOf_2":isPartOf_2 + }) + + return newsletter_metadata @@ -95,5 +210,5 @@ def parse_meta_file(input_dir: Union[Path, str]) -> Dict: if __name__ == "__main__": # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100') + print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) From 215f20953b3d88743699bc2712812a2b15025732 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 16 Feb 2024 15:22:57 +0100 Subject: [PATCH 04/22] Add parse_all_articles --- interest/preprocessor/parser.py | 87 +++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 94830f1..9acdfe8 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -3,17 +3,38 @@ import re from collections import Counter, defaultdict from pathlib import Path -from typing import List, Union, Dict +from typing import List, Union, Dict, Optional import logging import xml.etree.cElementTree as et + +def parse_all_articles(input_dir: Union[Path, str]) -> Dict: + + input_dir = Path(input_dir) + file_list = list(input_dir.glob("*.xml")) + # List of meta files + meta_file_list = list(input_dir.glob("*.didl.xml")) + # List of xml files excluded meta file + article_list = [item for item in file_list if item not in meta_file_list] + + 
articles: List[Dict] = [] + -def parse_raw_article(article_input_fp: Union[Path, str]) -> Dict: + for file in article_list: + article = parse_raw_article(file) + articles.append(article) + + newsletter_metadata= parse_meta_file(meta_file_list[0]) + + news_dict = {"newsletter_metadata": newsletter_metadata, "articles": articles} + return news_dict + +def parse_raw_article(article_fp: Union[Path, str]) -> Dict: """Parse a raw article file into a structured list Arguments --------- - article_input_fp: + article_input_fp: Union[Path, str] Input file to process. Returns @@ -23,41 +44,45 @@ def parse_raw_article(article_input_fp: Union[Path, str]) -> Dict: the title and the body of article. """ - if article_input_fp !=None: - tree = et.parse(article_input_fp) + try: + tree = et.parse(article_fp) root = tree.getroot() - for title_item in root.findall('./title'): - title = title_item.text - for article_item in root.findall('./p'): - body = article_item.text - - return title, body - + except et.ParseError as e: + logging.error("Failed to parse the article file:%s", e) -def parse_journal_articles(input_dir: Union[Path, str]) -> Dict: - input_dir = Path(input_dir) - file_list = list(input_dir.glob("*.xml")) - meta_file_list = list(input_dir.glob("*.didl.xml")) - file_list = [item for item in file_list if item not in meta_file_list] - articles: List[Dict] = [] + title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + if len(title_values)>1: + logging.warning("More than one titles are extracted for the article.") + if not title_values: + logging.warning("No title is extracted for the article.") + title = None + else: + title = title_values[0] + body_values = [element.text for element in root.iter() if element.tag.endswith('p')] + if not body_values: + logging.warning("No body is extracted.") + body = None + if len(body_values)>1: + logging.warning("There are more than on paragraphs in the article.") + body = ' '.join(body_values) + else: + body = body_values[0] - for file in file_list: - title, body = parse_raw_article(file) - articles.append({"title": title, "body":body}) - return articles + return {"title": title, "body":body} -def parse_meta_file(input_dir: Union[Path, str]) -> Dict: - input_dir = Path(input_dir) - meta_file_list = list(input_dir.glob("*.didl.xml")) + +def parse_meta_file(meta_fp: Union[Path, str]) -> Dict: + # input_dir = Path(input_dir) + # meta_file_list = list(input_dir.glob("*.didl.xml")) newsletter_metadata: List[Dict] = [] try: - tree=et.parse(meta_file_list[0]) + tree=et.parse(meta_fp) root=tree.getroot() except et.ParseError as e: - logging.error("Failed to parse the xml file:%s", e) + logging.error("Failed to parse the meta file:%s", e) title_values = [element.text for element in root.iter() if element.tag.endswith('title')] @@ -185,8 +210,8 @@ def parse_meta_file(input_dir: Union[Path, str]) -> Dict: newsletter_metadata.append({ - "title": title, - "language":language, + "title": title, + "language":language, "issue_number":issuenumber, "date": date, "identifier": identifier, @@ -210,5 +235,7 @@ def parse_meta_file(input_dir: Union[Path, str]) -> Dict: if __name__ == "__main__": # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + # 
print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + x = parse_all_articles("../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100") + print(x) From 00573dbefe222e846bfc8a68b12de9b3e127dc8c Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 16 Feb 2024 16:07:38 +0100 Subject: [PATCH 05/22] Convert to class --- interest/preprocessor/parser.py | 457 +++++++++++++++++--------------- 1 file changed, 237 insertions(+), 220 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 9acdfe8..d61049f 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -7,235 +7,252 @@ import logging import xml.etree.cElementTree as et - -def parse_all_articles(input_dir: Union[Path, str]) -> Dict: - - input_dir = Path(input_dir) - file_list = list(input_dir.glob("*.xml")) - # List of meta files - meta_file_list = list(input_dir.glob("*.didl.xml")) - # List of xml files excluded meta file - article_list = [item for item in file_list if item not in meta_file_list] - - articles: List[Dict] = [] - - - for file in article_list: - article = parse_raw_article(file) - articles.append(article) - - newsletter_metadata= parse_meta_file(meta_file_list[0]) - - news_dict = {"newsletter_metadata": newsletter_metadata, "articles": articles} - return news_dict - -def parse_raw_article(article_fp: Union[Path, str]) -> Dict: - """Parse a raw article file into a structured list - - Arguments - --------- - article_input_fp: Union[Path, str] - Input file to process. - - Returns - -------- - articles: List[Dict] - A list of dictionaries, where each item is for one article and includes - the title and the body of article. - - """ - try: - tree = et.parse(article_fp) - root = tree.getroot() - except et.ParseError as e: - logging.error("Failed to parse the article file:%s", e) - - title_values = [element.text for element in root.iter() if element.tag.endswith('title')] - if len(title_values)>1: - logging.warning("More than one titles are extracted for the article.") - if not title_values: - logging.warning("No title is extracted for the article.") - title = None - else: - title = title_values[0] - - body_values = [element.text for element in root.iter() if element.tag.endswith('p')] - if not body_values: - logging.warning("No body is extracted.") - body = None - if len(body_values)>1: - logging.warning("There are more than on paragraphs in the article.") - body = ' '.join(body_values) - else: - body = body_values[0] - - return {"title": title, "body":body} - -def parse_meta_file(meta_fp: Union[Path, str]) -> Dict: - # input_dir = Path(input_dir) - # meta_file_list = list(input_dir.glob("*.didl.xml")) - newsletter_metadata: List[Dict] = [] +class NewsletterFile: + """ Class for parsing xml files to json """ + + def __init__( + self, + input_dir: Union[Path, str], + output_dir: Union[Path, str] + ): + + self.input_dir = Path(input_dir) + self.output_dir = Path(output_dir) - try: - tree=et.parse(meta_fp) - root=tree.getroot() - except et.ParseError as e: - logging.error("Failed to parse the meta file:%s", e) + def parse_all_articles(self) -> Dict: - title_values = [element.text for element in root.iter() if element.tag.endswith('title')] - if len(title_values)>1: - logging.warning("More than one titles are extracted from metadata.") - if not title_values: - logging.warning("No title is extracted.") - title = None - else: - title = title_values[0] - - language_values = [element.text for element in root.iter() if 
element.tag.endswith('language')] - if len(language_values)>1: - logging.warning("More than one language are extracted from metadata.") - if not language_values: - logging.warning("No language is extracted.") - language = None - else: - language = language_values[0] + file_list = list(self.input_dir.glob("*.xml")) + # List of meta files + meta_file_list = list(self.input_dir.glob("*.didl.xml")) + # List of xml files excluded meta file + article_file_list = [item for item in file_list if item not in meta_file_list] - - issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] - if len(issuenumber_values)>1: - logging.warning("More than one issuenumbers are extracted from metadata.") - if not issuenumber_values: - logging.warning("No issuenumber is extracted.") - issuenumber = None - else: - issuenumber = issuenumber_values[0] - - - date_values = [element.text for element in root.iter() if element.tag.endswith('date')] - if len(date_values)>1: - logging.warning("More than one dates are extracted from metadata.") - if not date_values: - logging.warning("No date is extracted.") - date = None - else: - date = date_values[0] - - identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] - if len(identifier_values)>1: - logging.warning("More than one identifiers are extracted from metadata.") - if not identifier_values: - logging.warning("No identifier is extracted.") - identifier = None - else: - identifier = identifier_values[0] - - temporal_values = [element.text for element in root.iter() if element.tag.endswith('temporal')] - if len(temporal_values)>1: - logging.warning("More than one temporal are extracted from metadata.") - if not temporal_values: - logging.warning("No temporal is extracted.") - temporal = None - else: - temporal = temporal_values[0] - - recordRights_values = [element.text for element in root.iter() if element.tag.endswith('recordRights')] - if len(recordRights_values)>1: - logging.warning("More than one recordRights are extracted from metadata.") - if not recordRights_values: - logging.warning("No recordRights is extracted.") - recordRights = None - else: - recordRights = recordRights_values[0] - - publisher_values = [element.text for element in root.iter() if element.tag.endswith('publisher')] - if len(publisher_values)>1: - logging.warning("More than one publisher are extracted from metadata.") - if not publisher_values: - logging.warning("No publisher is extracted.") - publisher = None - else: - publisher = publisher_values[0] - - spatial_values = [element.text for element in root.iter() if element.tag.endswith('spatial')] - if len(spatial_values)>1: - logging.warning("More than one spatial are extracted from metadata.") - if not spatial_values: - logging.warning("No spatial is extracted.") - spatial_1 = None - spatial_2 = None - else: - spatial_1 = spatial_values[0] - spatial_2 = spatial_values[1] - - source_values = [element.text for element in root.iter() if element.tag.endswith('source')] - if len(source_values)>1: - logging.warning("More than one source are extracted from metadata.") - if not source_values: - logging.warning("No source is extracted.") - source = None - else: - source = source_values[1] - - recordIdentifier_values = [element.text for element in root.iter() if element.tag.endswith('recordIdentifier')] - if len(recordIdentifier_values)>1: - logging.warning("More than one recordIdentifier are extracted from metadata.") - if not recordIdentifier_values: - 
logging.warning("No recordIdentifier is extracted.") - recordIdentifier = None - else: - recordIdentifier = recordIdentifier_values[0] - - type_values = [element.text for element in root.iter() if element.tag.endswith('type')] - if len(type_values)>1: - logging.warning("More than one type are extracted from metadata.") - if not type_values: - logging.warning("No type is extracted.") - type = None - else: - type = type_values[0] - - isPartOf_values = [element.text for element in root.iter() if element.tag.endswith('isPartOf')] - if len(isPartOf_values)>1: - logging.warning("More than one isPartOf are extracted from metadata.") - if not isPartOf_values: - logging.warning("No isPartOf is extracted.") - isPartOf_1 = None - isPartOf_2 = None - else: - isPartOf_1 = isPartOf_values[0] - isPartOf_2 = isPartOf_values[1] - - - newsletter_metadata.append({ - "title": title, - "language":language, - "issue_number":issuenumber, - "date": date, - "identifier": identifier, - "temporal": temporal, - "recordRights": recordRights, - "publisher": publisher, - "spatial_1": spatial_1, - "spatial_2": spatial_2, - "source": source, - "recordIdentifier": recordIdentifier, - "type": type, - "isPartOf_1":isPartOf_1, - "isPartOf_2":isPartOf_2 - }) - - return newsletter_metadata + articles: List[Dict] = [] + + + for file in article_file_list: + article = self._parse_raw_article(file) + articles.append(article) + + newsletter_metadata= self._parse_meta_file(meta_file_list[0]) + + news_dict = {"newsletter_metadata": newsletter_metadata, "articles": articles} + return news_dict + + def _parse_raw_article(self, article_fp: Union[Path, str]) -> Dict: + """Parse a raw article file into a structured list + + Arguments + --------- + article_input_fp: Union[Path, str] + Input file to process. + + Returns + -------- + articles: List[Dict] + A list of dictionaries, where each item is for one article and includes + the title and the body of article. 
+ + """ + try: + tree = et.parse(article_fp) + root = tree.getroot() + except et.ParseError as e: + logging.error("Failed to parse the article file:%s", e) + + title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + if len(title_values)>1: + logging.warning("More than one titles are extracted for the article.") + if not title_values: + logging.warning("No title is extracted for the article.") + title = None + else: + title = title_values[0] + + body_values = [element.text for element in root.iter() if element.tag.endswith('p')] + if not body_values: + logging.warning("No body is extracted.") + body = None + if len(body_values)>1: + logging.warning("There are more than on paragraphs in the article.") + body = ' '.join(body_values) + else: + body = body_values[0] + + return {"title": title, "body":body} + + + def _parse_meta_file(self, meta_fp: Union[Path, str]) -> Dict: + + newsletter_metadata: List[Dict] = [] + + + try: + tree=et.parse(meta_fp) + root=tree.getroot() + except et.ParseError as e: + logging.error("Failed to parse the meta file:%s", e) + + + title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + if len(title_values)>1: + logging.warning("More than one titles are extracted from metadata.") + if not title_values: + logging.warning("No title is extracted.") + title = None + else: + title = title_values[0] + + language_values = [element.text for element in root.iter() if element.tag.endswith('language')] + if len(language_values)>1: + logging.warning("More than one language are extracted from metadata.") + if not language_values: + logging.warning("No language is extracted.") + language = None + else: + language = language_values[0] + + + issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] + if len(issuenumber_values)>1: + logging.warning("More than one issuenumbers are extracted from metadata.") + if not issuenumber_values: + logging.warning("No issuenumber is extracted.") + issuenumber = None + else: + issuenumber = issuenumber_values[0] + + + date_values = [element.text for element in root.iter() if element.tag.endswith('date')] + if len(date_values)>1: + logging.warning("More than one dates are extracted from metadata.") + if not date_values: + logging.warning("No date is extracted.") + date = None + else: + date = date_values[0] + + identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] + if len(identifier_values)>1: + logging.warning("More than one identifiers are extracted from metadata.") + if not identifier_values: + logging.warning("No identifier is extracted.") + identifier = None + else: + identifier = identifier_values[0] + + temporal_values = [element.text for element in root.iter() if element.tag.endswith('temporal')] + if len(temporal_values)>1: + logging.warning("More than one temporal are extracted from metadata.") + if not temporal_values: + logging.warning("No temporal is extracted.") + temporal = None + else: + temporal = temporal_values[0] + + recordRights_values = [element.text for element in root.iter() if element.tag.endswith('recordRights')] + if len(recordRights_values)>1: + logging.warning("More than one recordRights are extracted from metadata.") + if not recordRights_values: + logging.warning("No recordRights is extracted.") + recordRights = None + else: + recordRights = recordRights_values[0] + + publisher_values = [element.text for element in root.iter() if 
element.tag.endswith('publisher')] + if len(publisher_values)>1: + logging.warning("More than one publisher are extracted from metadata.") + if not publisher_values: + logging.warning("No publisher is extracted.") + publisher = None + else: + publisher = publisher_values[0] + + spatial_values = [element.text for element in root.iter() if element.tag.endswith('spatial')] + if len(spatial_values)>1: + logging.warning("More than one spatial are extracted from metadata.") + if not spatial_values: + logging.warning("No spatial is extracted.") + spatial_1 = None + spatial_2 = None + else: + spatial_1 = spatial_values[0] + spatial_2 = spatial_values[1] + + source_values = [element.text for element in root.iter() if element.tag.endswith('source')] + if len(source_values)>1: + logging.warning("More than one source are extracted from metadata.") + if not source_values: + logging.warning("No source is extracted.") + source = None + else: + source = source_values[1] + + recordIdentifier_values = [element.text for element in root.iter() if element.tag.endswith('recordIdentifier')] + if len(recordIdentifier_values)>1: + logging.warning("More than one recordIdentifier are extracted from metadata.") + if not recordIdentifier_values: + logging.warning("No recordIdentifier is extracted.") + recordIdentifier = None + else: + recordIdentifier = recordIdentifier_values[0] + + type_values = [element.text for element in root.iter() if element.tag.endswith('type')] + if len(type_values)>1: + logging.warning("More than one type are extracted from metadata.") + if not type_values: + logging.warning("No type is extracted.") + type = None + else: + type = type_values[0] + + isPartOf_values = [element.text for element in root.iter() if element.tag.endswith('isPartOf')] + if len(isPartOf_values)>1: + logging.warning("More than one isPartOf are extracted from metadata.") + if not isPartOf_values: + logging.warning("No isPartOf is extracted.") + isPartOf_1 = None + isPartOf_2 = None + else: + isPartOf_1 = isPartOf_values[0] + isPartOf_2 = isPartOf_values[1] + + + newsletter_metadata.append({ + "title": title, + "language":language, + "issue_number":issuenumber, + "date": date, + "identifier": identifier, + "temporal": temporal, + "recordRights": recordRights, + "publisher": publisher, + "spatial_1": spatial_1, + "spatial_2": spatial_2, + "source": source, + "recordIdentifier": recordIdentifier, + "type": type, + "isPartOf_1":isPartOf_1, + "isPartOf_2":isPartOf_2 + }) + + return newsletter_metadata if __name__ == "__main__": - # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) - # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - # print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - x = parse_all_articles("../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100") - print(x) + x = NewsletterFile(input_dir = '../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100', output_dir=' ') + print(x.parse_all_articles()) + # print(x.input_dir) + + # # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) + # # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + # # print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) + # x = 
parse_all_articles("../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100") + # print(x) + From be5a852d77de85410afa7902dfca2c1652b43b43 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Mon, 19 Feb 2024 10:04:25 +0100 Subject: [PATCH 06/22] Add id to the articles --- interest/preprocessor/parser.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index d61049f..64dcf3d 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -30,12 +30,17 @@ def parse_all_articles(self) -> Dict: # List of xml files excluded meta file article_file_list = [item for item in file_list if item not in meta_file_list] - articles: List[Dict] = [] + # articles: List[Dict] = [] + articles: dict[Dict] = {} + id = 0 + for file in article_file_list: article = self._parse_raw_article(file) - articles.append(article) + id += 1 + articles[id] = article + # articles.append(article) newsletter_metadata= self._parse_meta_file(meta_file_list[0]) @@ -246,7 +251,12 @@ def _parse_meta_file(self, meta_fp: Union[Path, str]) -> Dict: if __name__ == "__main__": x = NewsletterFile(input_dir = '../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100', output_dir=' ') - print(x.parse_all_articles()) + h = x.parse_all_articles() + # print(h.keys()) + print(h['articles'][1]) + # print(h['newsletter_metadata']) + # print(h['articles'][38]['title']) + # print(x.input_dir) # # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) From 55855c7db56b790520ec6d5ded90b67045c4be79 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 27 Feb 2024 12:52:50 +0100 Subject: [PATCH 07/22] Add new layout --- interest/preprocessor/parser.py | 269 +++++++++++++++----------------- 1 file changed, 125 insertions(+), 144 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 64dcf3d..6daaa39 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -1,75 +1,90 @@ + +import os +import tarfile +import gzip import json -import lzma -import re -from collections import Counter, defaultdict -from pathlib import Path -from typing import List, Union, Dict, Optional +import xml.etree.ElementTree as ET +from typing import Dict, Union import logging -import xml.etree.cElementTree as et - - -class NewsletterFile: - """ Class for parsing xml files to json """ - - def __init__( - self, - input_dir: Union[Path, str], - output_dir: Union[Path, str] - ): - - self.input_dir = Path(input_dir) - self.output_dir = Path(output_dir) - - - def parse_all_articles(self) -> Dict: - - file_list = list(self.input_dir.glob("*.xml")) - # List of meta files - meta_file_list = list(self.input_dir.glob("*.didl.xml")) - # List of xml files excluded meta file - article_file_list = [item for item in file_list if item not in meta_file_list] - # articles: List[Dict] = [] - articles: dict[Dict] = {} +class XMLExtractor: + def __init__(self, root_dir: str, output_dir: str): + self.root_dir = root_dir + self.output_dir = output_dir + + def extract_xml_string(self) -> None: + for folder_name in os.listdir(self.root_dir): + folder_path = os.path.join(self.root_dir, folder_name) + if not os.path.isdir(folder_path): + continue + if not folder_name.isdigit(): # Exclude in_progress, manifests, and ocr_complete folders and log files + continue + self.process_folder(folder_name, folder_path) + + def 
process_folder(self, folder_name: str, folder_path: str) -> None: + for tgz_filename in os.listdir(folder_path): + if not tgz_filename.endswith('.tgz'): + continue + tgz_file_path = os.path.join(folder_path, tgz_filename) + base_name = os.path.splitext(tgz_filename)[0] + output_folder = os.path.join(self.output_dir, folder_name) + os.makedirs(output_folder, exist_ok=True) + try: + with tarfile.open(tgz_file_path, "r:gz") as outer_tar: + news_dict = self.process_tar(outer_tar) + except tarfile.TarError as e: + logging.error(f"Error extracting {tgz_filename}: {e}") + continue + output_file = os.path.join(output_folder, f"{base_name}.json") + self.save_as_json(news_dict, output_file) + + def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: + news_dict = {"newsletter_metadata": {}, "articles": {}} + articles: Dict[int, Dict[str, str]] = {} id = 0 - - - - for file in article_file_list: - article = self._parse_raw_article(file) - id += 1 - articles[id] = article - # articles.append(article) - - newsletter_metadata= self._parse_meta_file(meta_file_list[0]) - - news_dict = {"newsletter_metadata": newsletter_metadata, "articles": articles} + for entry in outer_tar: + try: + if entry.name.endswith(".xml"): + file = outer_tar.extractfile(entry) + if file is not None: + content = file.read() + xml_content = content.decode('utf-8', 'ignore') + article = self.extract_article(xml_content, entry.name) + id += 1 + news_dict["articles"][id] = article + + elif entry.name.endswith(".gz"): + gz_member = next(member for member in outer_tar.getmembers() if member.name.endswith('.gz')) + with outer_tar.extractfile(gz_member) as gz_file: + with gzip.open(gz_file, 'rt') as xml_file: + xml_string = xml_file.read() + newsletter_metadata = self.extract_meta(xml_string) + news_dict["newsletter_metadata"] = newsletter_metadata + else: + continue + except Exception as e: + logging.error(f"Error processing file {entry.name}: {e}") return news_dict - def _parse_raw_article(self, article_fp: Union[Path, str]) -> Dict: - """Parse a raw article file into a structured list - - Arguments - --------- - article_input_fp: Union[Path, str] - Input file to process. - - Returns - -------- - articles: List[Dict] - A list of dictionaries, where each item is for one article and includes - the title and the body of article. 
- - """ + @staticmethod + def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: try: - tree = et.parse(article_fp) - root = tree.getroot() - except et.ParseError as e: - logging.error("Failed to parse the article file:%s", e) + with open(output_file, 'w') as json_file: + json.dump(data, json_file, indent=4) + except Exception as e: + logging.error(f"Error saving JSON to {output_file}: {e}") + + @staticmethod + def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: + try: + root = ET.fromstring(xml_content) + except ET.ParseError: + logging.error(f"Failed to parse XML from file: {file_name}") + return {} title_values = [element.text for element in root.iter() if element.tag.endswith('title')] - if len(title_values)>1: + if len(title_values) > 1: logging.warning("More than one titles are extracted for the article.") if not title_values: logging.warning("No title is extracted for the article.") @@ -81,54 +96,52 @@ def _parse_raw_article(self, article_fp: Union[Path, str]) -> Dict: if not body_values: logging.warning("No body is extracted.") body = None - if len(body_values)>1: - logging.warning("There are more than on paragraphs in the article.") + elif len(body_values) > 1: + logging.warning("There are more than one paragraphs in the article.") body = ' '.join(body_values) else: body = body_values[0] - return {"title": title, "body":body} - - - def _parse_meta_file(self, meta_fp: Union[Path, str]) -> Dict: - - newsletter_metadata: List[Dict] = [] + return {"title": title, "body": body} + @staticmethod + def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: + newsletter_metadata: Dict[str, Union[str, None]] = {} try: - tree=et.parse(meta_fp) - root=tree.getroot() - except et.ParseError as e: - logging.error("Failed to parse the meta file:%s", e) + root = ET.fromstring(xml_string) + except ET.ParseError: + logging.error("Failed to parse XML from file") + return newsletter_metadata - + # Extracting metadata title_values = [element.text for element in root.iter() if element.tag.endswith('title')] if len(title_values)>1: logging.warning("More than one titles are extracted from metadata.") if not title_values: logging.warning("No title is extracted.") - title = None + newsletter_metadata['title'] = None else: - title = title_values[0] + newsletter_metadata['title'] = title_values[0] language_values = [element.text for element in root.iter() if element.tag.endswith('language')] if len(language_values)>1: logging.warning("More than one language are extracted from metadata.") if not language_values: logging.warning("No language is extracted.") - language = None + newsletter_metadata['language'] = None else: - language = language_values[0] + newsletter_metadata['language'] = language_values[0] + - issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] if len(issuenumber_values)>1: logging.warning("More than one issuenumbers are extracted from metadata.") if not issuenumber_values: logging.warning("No issuenumber is extracted.") - issuenumber = None + newsletter_metadata['issuenumber'] = None else: - issuenumber = issuenumber_values[0] + newsletter_metadata['issuenumber'] = issuenumber_values[0] date_values = [element.text for element in root.iter() if element.tag.endswith('date')] @@ -136,133 +149,101 @@ def _parse_meta_file(self, meta_fp: Union[Path, str]) -> Dict: logging.warning("More than one dates are extracted from metadata.") if not date_values: 
logging.warning("No date is extracted.") - date = None + newsletter_metadata['date'] = None else: - date = date_values[0] + newsletter_metadata['date'] = date_values[0] identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] if len(identifier_values)>1: logging.warning("More than one identifiers are extracted from metadata.") if not identifier_values: logging.warning("No identifier is extracted.") - identifier = None + newsletter_metadata['identifier'] = None else: - identifier = identifier_values[0] + newsletter_metadata['identifier'] = identifier_values[0] temporal_values = [element.text for element in root.iter() if element.tag.endswith('temporal')] if len(temporal_values)>1: logging.warning("More than one temporal are extracted from metadata.") if not temporal_values: logging.warning("No temporal is extracted.") - temporal = None + newsletter_metadata['temporal'] = None else: - temporal = temporal_values[0] + newsletter_metadata['temporal'] = temporal_values[0] recordRights_values = [element.text for element in root.iter() if element.tag.endswith('recordRights')] if len(recordRights_values)>1: logging.warning("More than one recordRights are extracted from metadata.") if not recordRights_values: logging.warning("No recordRights is extracted.") - recordRights = None + newsletter_metadata['recordRights'] = None else: - recordRights = recordRights_values[0] + newsletter_metadata['recordRights'] = recordRights_values[0] publisher_values = [element.text for element in root.iter() if element.tag.endswith('publisher')] if len(publisher_values)>1: logging.warning("More than one publisher are extracted from metadata.") if not publisher_values: logging.warning("No publisher is extracted.") - publisher = None + newsletter_metadata['publisher'] = None else: - publisher = publisher_values[0] + newsletter_metadata['publisher'] = publisher_values[0] spatial_values = [element.text for element in root.iter() if element.tag.endswith('spatial')] if len(spatial_values)>1: logging.warning("More than one spatial are extracted from metadata.") if not spatial_values: logging.warning("No spatial is extracted.") - spatial_1 = None - spatial_2 = None + newsletter_metadata['spatial_1'] = None + newsletter_metadata['spatial_2'] = None else: - spatial_1 = spatial_values[0] - spatial_2 = spatial_values[1] + newsletter_metadata['spatial_1'] = spatial_values[0] + newsletter_metadata['spatial_2'] = spatial_values[1] source_values = [element.text for element in root.iter() if element.tag.endswith('source')] if len(source_values)>1: logging.warning("More than one source are extracted from metadata.") if not source_values: logging.warning("No source is extracted.") - source = None + newsletter_metadata['source'] = None else: - source = source_values[1] + newsletter_metadata['source'] = source_values[1] recordIdentifier_values = [element.text for element in root.iter() if element.tag.endswith('recordIdentifier')] if len(recordIdentifier_values)>1: logging.warning("More than one recordIdentifier are extracted from metadata.") if not recordIdentifier_values: logging.warning("No recordIdentifier is extracted.") - recordIdentifier = None + newsletter_metadata['recordIdentifier'] = None else: - recordIdentifier = recordIdentifier_values[0] + newsletter_metadata['recordIdentifier'] = recordIdentifier_values[0] type_values = [element.text for element in root.iter() if element.tag.endswith('type')] if len(type_values)>1: logging.warning("More than one type are extracted from metadata.") 
if not type_values: logging.warning("No type is extracted.") - type = None + newsletter_metadata['type'] = None else: - type = type_values[0] + newsletter_metadata['type'] = type_values[0] isPartOf_values = [element.text for element in root.iter() if element.tag.endswith('isPartOf')] if len(isPartOf_values)>1: logging.warning("More than one isPartOf are extracted from metadata.") if not isPartOf_values: logging.warning("No isPartOf is extracted.") - isPartOf_1 = None - isPartOf_2 = None + newsletter_metadata['isPartOf_1'] = None + newsletter_metadata['isPartOf_2'] = None else: - isPartOf_1 = isPartOf_values[0] - isPartOf_2 = isPartOf_values[1] - - - newsletter_metadata.append({ - "title": title, - "language":language, - "issue_number":issuenumber, - "date": date, - "identifier": identifier, - "temporal": temporal, - "recordRights": recordRights, - "publisher": publisher, - "spatial_1": spatial_1, - "spatial_2": spatial_2, - "source": source, - "recordIdentifier": recordIdentifier, - "type": type, - "isPartOf_1":isPartOf_1, - "isPartOf_2":isPartOf_2 - }) - - return newsletter_metadata + newsletter_metadata['isPartOf_1'] = isPartOf_values[0] + newsletter_metadata['isPartOf_2'] = isPartOf_values[1] + return newsletter_metadata - +# Configure logging +logging.basicConfig(filename='extractor.log', level=logging.DEBUG) +# Example usage if __name__ == "__main__": - x = NewsletterFile(input_dir = '../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100', output_dir=' ') - h = x.parse_all_articles() - # print(h.keys()) - print(h['articles'][1]) - # print(h['newsletter_metadata']) - # print(h['articles'][38]['title']) - - # print(x.input_dir) - - # # print(parse_raw_article('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100/MMKB12_000002100_00022_text.xml')) - # # print(parse_journal_articles('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - # # print(parse_meta_file('../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100')) - # x = parse_all_articles("../../data/news/2022_harvest_KRANTEN/00/KRANTEN_KBPERS01_000002100") - # print(x) - - + extractor = XMLExtractor("../../data/news/gg", "../../data/news/gg-json") + extractor.extract_xml_string() From 112be91d026708d5a07f2a5a3cddb4e23d3348a6 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 27 Feb 2024 13:00:42 +0100 Subject: [PATCH 08/22] Add documentation --- interest/preprocessor/parser.py | 53 +++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 6daaa39..2c14a77 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -9,11 +9,22 @@ class XMLExtractor: + """Class for extracting XML content and metadata from nested .tgz files.""" def __init__(self, root_dir: str, output_dir: str): + """ + Initializes the XMLExtractor object. + + Parameters: + root_dir (str): The root directory containing .tgz files. + output_dir (str): The output directory for saving extracted JSON files. + """ self.root_dir = root_dir self.output_dir = output_dir def extract_xml_string(self) -> None: + """ + Extracts XML content and metadata from .tgz files in the root directory. 
+ """ for folder_name in os.listdir(self.root_dir): folder_path = os.path.join(self.root_dir, folder_name) if not os.path.isdir(folder_path): @@ -23,6 +34,13 @@ def extract_xml_string(self) -> None: self.process_folder(folder_name, folder_path) def process_folder(self, folder_name: str, folder_path: str) -> None: + """ + Processes .tgz files within a folder. + + Parameters: + folder_name (str): Name of the folder being processed. + folder_path (str): Path to the folder being processed. + """ for tgz_filename in os.listdir(folder_path): if not tgz_filename.endswith('.tgz'): continue @@ -40,6 +58,15 @@ def process_folder(self, folder_name: str, folder_path: str) -> None: self.save_as_json(news_dict, output_file) def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: + """ + Processes a .tgz file and extracts XML content and metadata. + + Parameters: + outer_tar (tarfile.TarFile): The .tgz file being processed. + + Returns: + Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A dictionary containing extracted content and metadata. + """ news_dict = {"newsletter_metadata": {}, "articles": {}} articles: Dict[int, Dict[str, str]] = {} id = 0 @@ -69,6 +96,13 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s @staticmethod def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: + """ + Saves data as JSON to a specified file. + + Parameters: + data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. + output_file (str): Path to the output JSON file. + """ try: with open(output_file, 'w') as json_file: json.dump(data, json_file, indent=4) @@ -77,6 +111,16 @@ def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]] @staticmethod def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: + """ + Extracts article title and body from XML content. + + Parameters: + xml_content (str): XML content of the article. + file_name (str): Name of the XML file. + + Returns: + Dict[str, str]: A dictionary containing the extracted title and body of the article. + """ try: root = ET.fromstring(xml_content) except ET.ParseError: @@ -106,6 +150,15 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: @staticmethod def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: + """ + Extracts metadata from XML string. + + Parameters: + xml_string (str): XML string containing metadata. + + Returns: + Dict[str, Union[str, None]]: A dictionary containing the extracted metadata. 
+ """ newsletter_metadata: Dict[str, Union[str, None]] = {} try: From 6a8aca2fea27515331ec7e0a285d6114e03b569f Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 27 Feb 2024 13:31:24 +0100 Subject: [PATCH 09/22] Compress json files while saving --- interest/preprocessor/parser.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 2c14a77..14a67c5 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -8,6 +8,7 @@ import logging + class XMLExtractor: """Class for extracting XML content and metadata from nested .tgz files.""" def __init__(self, root_dir: str, output_dir: str): @@ -55,7 +56,8 @@ def process_folder(self, folder_name: str, folder_path: str) -> None: logging.error(f"Error extracting {tgz_filename}: {e}") continue output_file = os.path.join(output_folder, f"{base_name}.json") - self.save_as_json(news_dict, output_file) + self.save_as_json_compressed(news_dict, output_file) + # self.save_as_json(news_dict, output_file) def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: """ @@ -93,21 +95,37 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s except Exception as e: logging.error(f"Error processing file {entry.name}: {e}") return news_dict - + @staticmethod - def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: + def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: """ - Saves data as JSON to a specified file. + Saves data as compressed JSON using gzip. Parameters: data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. output_file (str): Path to the output JSON file. """ try: - with open(output_file, 'w') as json_file: + with gzip.open(output_file, 'wt') as json_file: json.dump(data, json_file, indent=4) except Exception as e: - logging.error(f"Error saving JSON to {output_file}: {e}") + logging.error(f"Error saving compressed JSON to {output_file}: {e}") + + + # @staticmethod + # def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: + # """ + # Saves data as JSON to a specified file. + + # Parameters: + # data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. + # output_file (str): Path to the output JSON file. 
+ # """ + # try: + # with open(output_file, 'w') as json_file: + # json.dump(data, json_file, indent=4) + # except Exception as e: + # logging.error(f"Error saving JSON to {output_file}: {e}") @staticmethod def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: @@ -298,5 +316,5 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: # Example usage if __name__ == "__main__": - extractor = XMLExtractor("../../data/news/gg", "../../data/news/gg-json") + extractor = XMLExtractor("../../data/news/gg", "../../data/news/gg-json-compress") extractor.extract_xml_string() From c61cf3e700c246cea5ff60860ca2db5dd0bc676e Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 27 Feb 2024 15:25:54 +0100 Subject: [PATCH 10/22] Shorten extract_meta method --- interest/preprocessor/parser.py | 138 ++++---------------------------- 1 file changed, 16 insertions(+), 122 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 14a67c5..56845b0 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -186,131 +186,25 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: return newsletter_metadata # Extracting metadata - title_values = [element.text for element in root.iter() if element.tag.endswith('title')] - if len(title_values)>1: - logging.warning("More than one titles are extracted from metadata.") - if not title_values: - logging.warning("No title is extracted.") - newsletter_metadata['title'] = None - else: - newsletter_metadata['title'] = title_values[0] - - language_values = [element.text for element in root.iter() if element.tag.endswith('language')] - if len(language_values)>1: - logging.warning("More than one language are extracted from metadata.") - if not language_values: - logging.warning("No language is extracted.") - newsletter_metadata['language'] = None - else: - newsletter_metadata['language'] = language_values[0] - - - issuenumber_values = [element.text for element in root.iter() if element.tag.endswith('issuenumber')] - if len(issuenumber_values)>1: - logging.warning("More than one issuenumbers are extracted from metadata.") - if not issuenumber_values: - logging.warning("No issuenumber is extracted.") - newsletter_metadata['issuenumber'] = None - else: - newsletter_metadata['issuenumber'] = issuenumber_values[0] - - - date_values = [element.text for element in root.iter() if element.tag.endswith('date')] - if len(date_values)>1: - logging.warning("More than one dates are extracted from metadata.") - if not date_values: - logging.warning("No date is extracted.") - newsletter_metadata['date'] = None - else: - newsletter_metadata['date'] = date_values[0] - - identifier_values = [element.text for element in root.iter() if element.tag.endswith('identifier')] - if len(identifier_values)>1: - logging.warning("More than one identifiers are extracted from metadata.") - if not identifier_values: - logging.warning("No identifier is extracted.") - newsletter_metadata['identifier'] = None - else: - newsletter_metadata['identifier'] = identifier_values[0] - - temporal_values = [element.text for element in root.iter() if element.tag.endswith('temporal')] - if len(temporal_values)>1: - logging.warning("More than one temporal are extracted from metadata.") - if not temporal_values: - logging.warning("No temporal is extracted.") - newsletter_metadata['temporal'] = None - else: - newsletter_metadata['temporal'] = temporal_values[0] - - recordRights_values = [element.text for element in 
root.iter() if element.tag.endswith('recordRights')] - if len(recordRights_values)>1: - logging.warning("More than one recordRights are extracted from metadata.") - if not recordRights_values: - logging.warning("No recordRights is extracted.") - newsletter_metadata['recordRights'] = None - else: - newsletter_metadata['recordRights'] = recordRights_values[0] - - publisher_values = [element.text for element in root.iter() if element.tag.endswith('publisher')] - if len(publisher_values)>1: - logging.warning("More than one publisher are extracted from metadata.") - if not publisher_values: - logging.warning("No publisher is extracted.") - newsletter_metadata['publisher'] = None - else: - newsletter_metadata['publisher'] = publisher_values[0] - - spatial_values = [element.text for element in root.iter() if element.tag.endswith('spatial')] - if len(spatial_values)>1: - logging.warning("More than one spatial are extracted from metadata.") - if not spatial_values: - logging.warning("No spatial is extracted.") - newsletter_metadata['spatial_1'] = None - newsletter_metadata['spatial_2'] = None - else: - newsletter_metadata['spatial_1'] = spatial_values[0] - newsletter_metadata['spatial_2'] = spatial_values[1] - - source_values = [element.text for element in root.iter() if element.tag.endswith('source')] - if len(source_values)>1: - logging.warning("More than one source are extracted from metadata.") - if not source_values: - logging.warning("No source is extracted.") - newsletter_metadata['source'] = None - else: - newsletter_metadata['source'] = source_values[1] - - recordIdentifier_values = [element.text for element in root.iter() if element.tag.endswith('recordIdentifier')] - if len(recordIdentifier_values)>1: - logging.warning("More than one recordIdentifier are extracted from metadata.") - if not recordIdentifier_values: - logging.warning("No recordIdentifier is extracted.") - newsletter_metadata['recordIdentifier'] = None - else: - newsletter_metadata['recordIdentifier'] = recordIdentifier_values[0] - - type_values = [element.text for element in root.iter() if element.tag.endswith('type')] - if len(type_values)>1: - logging.warning("More than one type are extracted from metadata.") - if not type_values: - logging.warning("No type is extracted.") - newsletter_metadata['type'] = None - else: - newsletter_metadata['type'] = type_values[0] - - isPartOf_values = [element.text for element in root.iter() if element.tag.endswith('isPartOf')] - if len(isPartOf_values)>1: - logging.warning("More than one isPartOf are extracted from metadata.") - if not isPartOf_values: - logging.warning("No isPartOf is extracted.") - newsletter_metadata['isPartOf_1'] = None - newsletter_metadata['isPartOf_2'] = None - else: - newsletter_metadata['isPartOf_1'] = isPartOf_values[0] - newsletter_metadata['isPartOf_2'] = isPartOf_values[1] + fields = [ + "title", "language", "issuenumber", "date", "identifier", + "temporal", "recordRights", "publisher", "spatial", "source", + "recordIdentifier", "type", "isPartOf" + ] + + for field in fields: + field_values = [element.text for element in root.iter() if element.tag.endswith(field)] + if len(field_values) > 1: + logging.warning(f"More than one {field}s are extracted from metadata.") + if not field_values: + logging.warning(f"No {field} is extracted.") + newsletter_metadata[field] = None + else: + newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) return newsletter_metadata + # Configure logging 
logging.basicConfig(filename='extractor.log', level=logging.DEBUG) From f278030b9464b90d7e971639658df7caff1eb84a Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Tue, 27 Feb 2024 15:45:15 +0100 Subject: [PATCH 11/22] Fix flake8 issues --- interest/preprocessor/parser.py | 53 ++++++++++++++++----------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 56845b0..61599b3 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -8,29 +8,28 @@ import logging - class XMLExtractor: - """Class for extracting XML content and metadata from nested .tgz files.""" + """Class for extracting XML content and metadata from nested .tgz files.""" # noqa: E501 def __init__(self, root_dir: str, output_dir: str): """ Initializes the XMLExtractor object. Parameters: root_dir (str): The root directory containing .tgz files. - output_dir (str): The output directory for saving extracted JSON files. + output_dir (str): The output directory for saving extracted JSON files. # noqa: E501 """ self.root_dir = root_dir self.output_dir = output_dir def extract_xml_string(self) -> None: """ - Extracts XML content and metadata from .tgz files in the root directory. + Extracts XML content and metadata from .tgz files in the root directory. # noqa: E501 """ for folder_name in os.listdir(self.root_dir): folder_path = os.path.join(self.root_dir, folder_name) if not os.path.isdir(folder_path): continue - if not folder_name.isdigit(): # Exclude in_progress, manifests, and ocr_complete folders and log files + if not folder_name.isdigit(): # Exclude in_progress, manifests, and ocr_complete folders and log files. # noqa: E501 continue self.process_folder(folder_name, folder_path) @@ -59,7 +58,7 @@ def process_folder(self, folder_name: str, folder_path: str) -> None: self.save_as_json_compressed(news_dict, output_file) # self.save_as_json(news_dict, output_file) - def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: + def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: # noqa: E501 """ Processes a .tgz file and extracts XML content and metadata. @@ -67,10 +66,9 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s outer_tar (tarfile.TarFile): The .tgz file being processed. Returns: - Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A dictionary containing extracted content and metadata. + Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A dictionary containing extracted content and metadata. 
# noqa: E501 """ news_dict = {"newsletter_metadata": {}, "articles": {}} - articles: Dict[int, Dict[str, str]] = {} id = 0 for entry in outer_tar: try: @@ -84,41 +82,40 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s news_dict["articles"][id] = article elif entry.name.endswith(".gz"): - gz_member = next(member for member in outer_tar.getmembers() if member.name.endswith('.gz')) + gz_member = next(member for member in outer_tar.getmembers() if member.name.endswith('.gz')) # noqa: E501 with outer_tar.extractfile(gz_member) as gz_file: with gzip.open(gz_file, 'rt') as xml_file: xml_string = xml_file.read() newsletter_metadata = self.extract_meta(xml_string) - news_dict["newsletter_metadata"] = newsletter_metadata + news_dict["newsletter_metadata"] = newsletter_metadata # noqa: E501 else: continue except Exception as e: logging.error(f"Error processing file {entry.name}: {e}") return news_dict - + @staticmethod - def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: + def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: # noqa: E501 """ Saves data as compressed JSON using gzip. Parameters: - data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. + data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. # noqa: E501 output_file (str): Path to the output JSON file. """ try: with gzip.open(output_file, 'wt') as json_file: json.dump(data, json_file, indent=4) except Exception as e: - logging.error(f"Error saving compressed JSON to {output_file}: {e}") - + logging.error(f"Error saving compressed JSON to {output_file}: {e}") # noqa: E501 # @staticmethod - # def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: + # def save_as_json(data: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]], output_file: str) -> None: # noqa: E501 # """ # Saves data as JSON to a specified file. # Parameters: - # data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. + # data (Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]): Data to be saved as JSON. # noqa: E501 # output_file (str): Path to the output JSON file. # """ # try: @@ -137,7 +134,7 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: file_name (str): Name of the XML file. Returns: - Dict[str, str]: A dictionary containing the extracted title and body of the article. + Dict[str, str]: A dictionary containing the extracted title and body of the article. 
# noqa: E501 """ try: root = ET.fromstring(xml_content) @@ -145,21 +142,21 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: logging.error(f"Failed to parse XML from file: {file_name}") return {} - title_values = [element.text for element in root.iter() if element.tag.endswith('title')] + title_values = [element.text for element in root.iter() if element.tag.endswith('title')] # noqa: E501 if len(title_values) > 1: - logging.warning("More than one titles are extracted for the article.") + logging.warning("More than one titles are extracted for the article.") # noqa: E501 if not title_values: logging.warning("No title is extracted for the article.") title = None else: title = title_values[0] - body_values = [element.text for element in root.iter() if element.tag.endswith('p')] + body_values = [element.text for element in root.iter() if element.tag.endswith('p')] # noqa: E501 if not body_values: logging.warning("No body is extracted.") body = None elif len(body_values) > 1: - logging.warning("There are more than one paragraphs in the article.") + logging.warning("There are more than one paragraphs in the article.") # noqa: E501 body = ' '.join(body_values) else: body = body_values[0] @@ -175,7 +172,7 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: xml_string (str): XML string containing metadata. Returns: - Dict[str, Union[str, None]]: A dictionary containing the extracted metadata. + Dict[str, Union[str, None]]: A dictionary containing the extracted metadata. # noqa: E501 """ newsletter_metadata: Dict[str, Union[str, None]] = {} @@ -193,14 +190,14 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: ] for field in fields: - field_values = [element.text for element in root.iter() if element.tag.endswith(field)] + field_values = [element.text for element in root.iter() if element.tag.endswith(field)] # noqa: E501 if len(field_values) > 1: - logging.warning(f"More than one {field}s are extracted from metadata.") + logging.warning(f"More than one {field}s are extracted from metadata.") # noqa: E501 if not field_values: logging.warning(f"No {field} is extracted.") newsletter_metadata[field] = None else: - newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) + newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) # noqa: E501 return newsletter_metadata @@ -210,5 +207,7 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: # Example usage if __name__ == "__main__": - extractor = XMLExtractor("../../data/news/gg", "../../data/news/gg-json-compress") + input_dir = "../../data/news/gg" + output_dir = "../../data/news/gg-json-compress" + extractor = XMLExtractor(input_dir, output_dir) extractor.extract_xml_string() From 7f323ef86c6a617cdcf956631adc507bc1b121cd Mon Sep 17 00:00:00 2001 From: Shiva Nadi <44059592+ShNadi@users.noreply.github.com> Date: Tue, 27 Feb 2024 15:49:20 +0100 Subject: [PATCH 12/22] Update python-package.yml comment pylon --- .github/workflows/python-package.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 29821d3..ad47b95 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -50,10 +50,10 @@ jobs: run: | python -m pip install flake8 flake8 $PACKAGE_NAME - - name: Lint with pylint - run: | - python -m pip install pylint - pylint $PACKAGE_NAME + # - name: Lint with pylint 
+ # run: | + # python -m pip install pylint + # pylint $PACKAGE_NAME # - name: Check docstrings with pydocstyle # run: | # python -m pip install pydocstyle From 6682cd20da26eeca62d7f5d5ed095cac364c1f96 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Thu, 29 Feb 2024 13:17:28 +0100 Subject: [PATCH 13/22] Change body to list of paragraphs. --- interest/preprocessor/parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 61599b3..c705f15 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -7,7 +7,6 @@ from typing import Dict, Union import logging - class XMLExtractor: """Class for extracting XML content and metadata from nested .tgz files.""" # noqa: E501 def __init__(self, root_dir: str, output_dir: str): @@ -155,11 +154,12 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: if not body_values: logging.warning("No body is extracted.") body = None - elif len(body_values) > 1: - logging.warning("There are more than one paragraphs in the article.") # noqa: E501 - body = ' '.join(body_values) + # elif len(body_values) > 1: + # logging.warning("There are more than one paragraphs in the article.") # noqa: E501 + # body = ' '.join(body_values) else: - body = body_values[0] + # body = body_values[0] + body = body_values return {"title": title, "body": body} From 7b791fbedd3c4d8ed2360252164af2eac781a748 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 15:51:01 +0100 Subject: [PATCH 14/22] Add import parser --- interest/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/interest/__init__.py b/interest/__init__.py index e69de29..6e6ff7e 100644 --- a/interest/__init__.py +++ b/interest/__init__.py @@ -0,0 +1 @@ +from interest.preprocessor.parser import XMLExtractor \ No newline at end of file From 3a1372ec542ae163ebf6aae2aae693ce0d8894a2 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 15:51:45 +0100 Subject: [PATCH 15/22] Add import parser --- interest/preprocessor/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/interest/preprocessor/__init__.py b/interest/preprocessor/__init__.py index e69de29..6e6ff7e 100644 --- a/interest/preprocessor/__init__.py +++ b/interest/preprocessor/__init__.py @@ -0,0 +1 @@ +from interest.preprocessor.parser import XMLExtractor \ No newline at end of file From 18d85dde9faddf4c17d5ddf44a7fb213fca92c63 Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 15:52:25 +0100 Subject: [PATCH 16/22] Remove main --- interest/preprocessor/parser.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index c705f15..99168cd 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -7,6 +7,7 @@ from typing import Dict, Union import logging + class XMLExtractor: """Class for extracting XML content and metadata from nested .tgz files.""" # noqa: E501 def __init__(self, root_dir: str, output_dir: str): @@ -200,14 +201,4 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) # noqa: E501 return newsletter_metadata - - -# Configure logging -logging.basicConfig(filename='extractor.log', level=logging.DEBUG) - -# Example usage -if __name__ == "__main__": - input_dir = "../../data/news/gg" - output_dir = "../../data/news/gg-json-compress" - 
extractor = XMLExtractor(input_dir, output_dir) - extractor.extract_xml_string() + \ No newline at end of file From 59c300815f6ce91bd62da38b30ed3eae6fa075fe Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 15:55:52 +0100 Subject: [PATCH 17/22] Add convert_input_files.py --- pyproject.toml | 3 +-- scripts/.DS_Store | Bin 0 -> 6148 bytes scripts/convert_input_files.py | 22 ++++++++++++++++++++++ scripts/logs/.DS_Store | Bin 0 -> 6148 bytes 4 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 scripts/.DS_Store create mode 100644 scripts/convert_input_files.py create mode 100644 scripts/logs/.DS_Store diff --git a/pyproject.toml b/pyproject.toml index d6befd8..07dc037 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ description = "A package to extract hystorical news sentiments" authors = [ {name = "Shiva Nadi", email = "s.nadi@uu.nl"}, {name = "Parisa Zahedi", email = "p.zahedi@uu.nl"}, + {name = "Matty Vermet", email = "m.s.vermet@uu.nl"} ] readme = "README.md" requires-python = ">=3.8" @@ -23,8 +24,6 @@ classifiers = [ ] dynamic = ["version"] dependencies = [ - # "numpy ~= 1.23.4", - # "scikit-learn ~= 0.19.1", ] [project.optional-dependencies] diff --git a/scripts/.DS_Store b/scripts/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f54d7c8ad083fb73e6aaf03bea7c6ae0f1a19ee1 GIT binary patch literal 6148 zcmeHKO-my|5Uq|;Cy0=PipPPyipCEV@iIm|c-;{_>`Kg-sDtBl$V?0p0&~_Mv)BD4 z{+-=d-5pV*R|!%D)vvm`s%QFj(;q~n>b=%FQI&{1C}XLD;sfD!)-fqLPZ_98A8qPV zP7R7F9g7ynBr?Erw?vnuF=O}idj2}pGjn-`*-X%q(JFQ5;W*KkQKBQ{@uJy>uKqIu zi>QoPP1lsYi~I9gT-47xuD`!gCyBGXR(n>Zsq*y9tT*e;dFSDc?uU6Z?`8F7_ZQbr zv`(Ut-HcA-cE3?wIM8X{jMH}4B*d*YLVlmcX-oI(x|g;xlN;Iwui{l2)#brpW4pHE zZ*J`jSNy^52JEfP?cuQEE&f>B``Nfn?o$0|Mn(#2+{nDc5j??o8TWnIO%t77!#9c_ z#R&=n!hkR^DGa#%$(x&$ACmtN284l$VSx7sAIcawEG(L>1BH140Qnke1U6p_ImdSx zI4mrp1)^*!(55Q)#85UJe&59f4hxGmos@fgDEDRMUMR|Z9sT@N?|}4c+0?) 
ztycN`Kl=Rr|29c#!hkUFtr$?{!|+2~;ep6Sz|bItFz}@e{0Fp&XKw%i literal 0 HcmV?d00001 diff --git a/scripts/convert_input_files.py b/scripts/convert_input_files.py new file mode 100644 index 0000000..22e6dd7 --- /dev/null +++ b/scripts/convert_input_files.py @@ -0,0 +1,22 @@ +from interest.preprocessor.parser import XMLExtractor +from argparse import ArgumentParser +from pathlib import Path +import logging + + +logging.basicConfig(filename='logs/extractor.log', level=logging.DEBUG) + + + +def parse_arguments(): + parser = ArgumentParser( + prog="convert_input_files.py", + description="Convert nested gzip files to compressed json") + parser.add_argument("--input_dir", required=True) + parser.add_argument("--output_dir", required=True) + return parser.parse_args() + +if __name__=="__main__": + args = parse_arguments() + extractor = XMLExtractor(Path(args.input_dir), Path(args.output_dir)) + extractor.extract_xml_string() diff --git a/scripts/logs/.DS_Store b/scripts/logs/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Fri, 1 Mar 2024 17:47:09 +0100 Subject: [PATCH 18/22] Remove DS_Store --- scripts/logs/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 scripts/logs/.DS_Store diff --git a/scripts/logs/.DS_Store b/scripts/logs/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Fri, 1 Mar 2024 17:48:43 +0100 Subject: [PATCH 19/22] Remove DS_Store file --- scripts/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 scripts/.DS_Store diff --git a/scripts/.DS_Store b/scripts/.DS_Store deleted file mode 100644 index f54d7c8ad083fb73e6aaf03bea7c6ae0f1a19ee1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKO-my|5Uq|;Cy0=PipPPyipCEV@iIm|c-;{_>`Kg-sDtBl$V?0p0&~_Mv)BD4 z{+-=d-5pV*R|!%D)vvm`s%QFj(;q~n>b=%FQI&{1C}XLD;sfD!)-fqLPZ_98A8qPV zP7R7F9g7ynBr?Erw?vnuF=O}idj2}pGjn-`*-X%q(JFQ5;W*KkQKBQ{@uJy>uKqIu zi>QoPP1lsYi~I9gT-47xuD`!gCyBGXR(n>Zsq*y9tT*e;dFSDc?uU6Z?`8F7_ZQbr zv`(Ut-HcA-cE3?wIM8X{jMH}4B*d*YLVlmcX-oI(x|g;xlN;Iwui{l2)#brpW4pHE zZ*J`jSNy^52JEfP?cuQEE&f>B``Nfn?o$0|Mn(#2+{nDc5j??o8TWnIO%t77!#9c_ z#R&=n!hkR^DGa#%$(x&$ACmtN284l$VSx7sAIcawEG(L>1BH140Qnke1U6p_ImdSx zI4mrp1)^*!(55Q)#85UJe&59f4hxGmos@fgDEDRMUMR|Z9sT@N?|}4c+0?) 
ztycN`Kl=Rr|29c#!hkUFtr$?{!|+2~;ep6Sz|bItFz}@e{0Fp&XKw%i From b5dccf69e7f863f090a3375f60d01cc10663e78e Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 17:54:34 +0100 Subject: [PATCH 20/22] Change log path --- scripts/convert_input_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_input_files.py b/scripts/convert_input_files.py index 22e6dd7..b6d2dea 100644 --- a/scripts/convert_input_files.py +++ b/scripts/convert_input_files.py @@ -4,7 +4,7 @@ import logging -logging.basicConfig(filename='logs/extractor.log', level=logging.DEBUG) +logging.basicConfig(filename='extractor.log', level=logging.DEBUG) From 8fde5fd46267446c9c0b6eff83a8d98c188f86eb Mon Sep 17 00:00:00 2001 From: Shiva Nadi Date: Fri, 1 Mar 2024 18:02:55 +0100 Subject: [PATCH 21/22] Move field list from metadat method to instructor --- interest/preprocessor/parser.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index 99168cd..bbaf724 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -20,6 +20,11 @@ def __init__(self, root_dir: str, output_dir: str): """ self.root_dir = root_dir self.output_dir = output_dir + self.fields = [ + "title", "language", "issuenumber", "date", "identifier", + "temporal", "recordRights", "publisher", "spatial", "source", + "recordIdentifier", "type", "isPartOf" + ] def extract_xml_string(self) -> None: """ @@ -164,8 +169,7 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: return {"title": title, "body": body} - @staticmethod - def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: + def extract_meta(self,xml_string: str) -> Dict[str, Union[str, None]]: """ Extracts metadata from XML string. 
@@ -183,14 +187,7 @@ def extract_meta(xml_string: str) -> Dict[str, Union[str, None]]: logging.error("Failed to parse XML from file") return newsletter_metadata - # Extracting metadata - fields = [ - "title", "language", "issuenumber", "date", "identifier", - "temporal", "recordRights", "publisher", "spatial", "source", - "recordIdentifier", "type", "isPartOf" - ] - - for field in fields: + for field in self.fields: field_values = [element.text for element in root.iter() if element.tag.endswith(field)] # noqa: E501 if len(field_values) > 1: logging.warning(f"More than one {field}s are extracted from metadata.") # noqa: E501 From 57235ab488abe7d2becf65e28d7395752699d564 Mon Sep 17 00:00:00 2001 From: parisa-zahedi Date: Thu, 7 Mar 2024 12:51:29 +0100 Subject: [PATCH 22/22] Define input file (#3) * define input file classes * define document class * apply pylint * apply flake8 * replace relative with absolute import * remove extra getter functions that are not used * fix variable types in Article class * fix mypy errors * fix flake8 errors * fix flake8 errors * Fix mypy and flake8 issues * Comment parser * Fix flake8 issues * Comment import parser --------- Co-authored-by: parisa-zahedi Co-authored-by: Shiva Nadi --- interest/__init__.py | 8 +- interest/delpher_kranten.py | 118 +++++++++++++++++++++++++++ interest/document.py | 131 ++++++++++++++++++++++++++++++ interest/input_file.py | 119 +++++++++++++++++++++++++++ interest/preprocessor/__init__.py | 2 +- interest/preprocessor/parser.py | 28 ++++--- 6 files changed, 393 insertions(+), 13 deletions(-) create mode 100644 interest/delpher_kranten.py create mode 100644 interest/document.py create mode 100644 interest/input_file.py diff --git a/interest/__init__.py b/interest/__init__.py index 6e6ff7e..5170041 100644 --- a/interest/__init__.py +++ b/interest/__init__.py @@ -1 +1,7 @@ -from interest.preprocessor.parser import XMLExtractor \ No newline at end of file +# from interest.preprocessor.parser import XMLExtractor +from interest.delpher_kranten import KrantenFile + +INPUT_FILE_TYPES = { + "delpher_kranten": KrantenFile + +} diff --git a/interest/delpher_kranten.py b/interest/delpher_kranten.py new file mode 100644 index 0000000..2e77575 --- /dev/null +++ b/interest/delpher_kranten.py @@ -0,0 +1,118 @@ +""" +Delpher Kranten Module + +This module provides classes and functions for handling Delpher Kranten files. +""" + +import json +import logging +import os +from typing import Optional +from interest.document import Document, Article +from interest.input_file import InputFile + + +class KrantenFile(InputFile): + """ + An InputFile implementation for Delpher Kranten. + + Input is a zip file which includes one JSON file. The JSON file contains + metadata and articles from one issue of a newspaper. + + Attributes: + METADATA_FIELD (str): The key for metadata field in JSON data. + TITLE_FIELD (str): The key for title field in metadata. + DATE_FIELD (str): The key for date field in metadata. + LANGUAGE_FIELD (str): The key for language field in metadata. + ARTICLES_FIELD (str): The key for articles field in JSON data. + ARTICLE_TITLE_FIELD (str): The key for title field in an article. + ARTICLE_BODY_FIELD (str): The key for body field in an article. + ENCODING (str): The encoding format for reading the file. + + Methods: + read_json(json_file): Read JSON data from a file and parse it into + a Document object. + base_file_name(): Extract the base file name without extension from + the filepath. 
+ doc(): Read the directory and parse the JSON file into a Document + object. + """ + + METADATA_FIELD = "newsletter_metadata" + TITLE_FIELD = "title" + DATE_FIELD = "date" + LANGUAGE_FIELD = "language" + ARTICLES_FIELD = "articles" + ARTICLE_TITLE_FIELD = "title" + ARTICLE_BODY_FIELD = "body" + ENCODING = "utf-8" + + def read_json(self, json_file) -> Optional[Document]: + """ + Read JSON data from a file and parse it into a Document object. + + Args: + json_file: A file object containing JSON data. + + Returns: + Optional[Document]: A Document object parsed from + the JSON data, or None if parsing fails. + """ + try: + json_data = json.load(json_file) + metadata = json_data[self.METADATA_FIELD] + document_title = metadata[self.TITLE_FIELD] + publish_date = metadata[self.DATE_FIELD] + language = metadata[self.LANGUAGE_FIELD] + + articles_data = json_data[self.ARTICLES_FIELD] + + articles = [] + for article_id, article in articles_data.items(): + article_title = article[self.ARTICLE_TITLE_FIELD] + article_body = article[self.ARTICLE_BODY_FIELD] + article = Article(article_id=article_id, title=article_title, + body=article_body) + articles.append(article) + + document = Document(title=document_title, + publish_date=publish_date, + language=language, + articles=articles) + return document + + except (json.JSONDecodeError, KeyError) as e: + logging.error("Error parsing JSON data: %s", e) + return None + + def base_file_name(self) -> str: + """ + Extract the base file name without extension from the filepath. + + Returns: + str: The base file name without extension. + """ + file_name_json = os.path.splitext(os.path.basename(self.filepath))[0] + base_file_name = os.path.splitext(file_name_json)[0] + return base_file_name + + def doc(self) -> Optional[Document]: + """ + Read the directory and parse the JSON file into a Document + object. + + Returns: + Optional[Document]: A Document object parsed from the + JSON data, or None if parsing fails. + """ + try: + logging.info("Reading directory '%s'...", self._filepath) + fh = self.open(encoding=self.ENCODING) + document = self.read_json(fh) + fh.close() + return document + + except OSError as e: + logging.error("Error processing gzip file '%s': %s", + self._filepath, e) + return None diff --git a/interest/document.py b/interest/document.py new file mode 100644 index 0000000..5984d1b --- /dev/null +++ b/interest/document.py @@ -0,0 +1,131 @@ +# pylint: disable=too-few-public-methods +""" +This module defines the Document class, which represents a document +containing articles. +""" +from typing import Optional, List, Union +from datetime import datetime + + +class Article: + """A class representing an article. + + This class represents an article with an ID, title, and body text. + The body text can be provided as a list + of paragraphs, which will be joined into a single string. + + Attributes: + id (str): The unique identifier of the article. + title (str): The title of the article. + body (str): The body text of the article, represented as + a single string. + """ + def __init__(self, article_id: str, title: str, + body: Union[str, List[str]]) -> None: + """Initialize an Article object with the given ID, title, and body. + + Args: + id (str): The unique identifier of the article. + title (str): The title of the article. + body (Union[str, List[str]): The body text of the article, + provided as a list of paragraphs. 
+ """ + self.id = article_id + self.title = title + if isinstance(body, list): + article_body = '\n'.join(body) + self.text = article_body + else: + self.text = body + + +class Document: + """ + Represents a document containing articles. + + Args: + title (str): The title of the document. + publish_date (str): The publication date of the document in + the format 'YYYY-MM-DD'. + language (str): The language of the document. + articles (List[Article]): A list of articles included in + the document. + + Attributes: + _title (str): The title of the document. + _publish_date (str): The publication date of the document in + the format 'YYYY-MM-DD'. + _year (Optional[int]): The year of publication, extracted from + publish_date. + _language (str): The language of the document. + _articles (List[Article]): A list of articles included in the + document. + + Properties: + title (str): Getter for the title of the document. + publish_date (str): Getter for the publication date of the + document. + year (Optional[int]): Getter for the year of publication. + decade (Optional[int]): Getter for the decade of publication. + language (str): Getter for the language of the document. + articles (List[Article]): Getter for the list of articles + included in the document. + """ + def __init__(self, title: str, publish_date: str, language: str, + articles: List[Article]) -> None: + self._year: Optional[int] = None + self._articles = articles + self._title = title + self._publish_date = publish_date + self._language = language + + @property + def title(self) -> str: + """ + Getter for the title of the document. + + Returns: + str: The title of the document. + """ + return self._title + + @property + def year(self) -> Optional[int]: + """ + Getter for the year of publication. + + Returns: + Optional[int]: The year of publication extracted + from publish_date, or None if it cannot be determined. + """ + if self._year is not None: + return self._year + try: + date_obj = datetime.strptime(self._publish_date, '%Y-%m-%d') + self._year = date_obj.year + return self._year + except ValueError: + return None + + @property + def decade(self) -> Optional[int]: + """ + Getter for the decade of publication. + + Returns: + Optional[int]: The decade of publication extracted from + publish_date, + or None if it cannot be determined. + """ + _ = self.year + return int(self._year / 10) * 10 if self._year is not None else None + + @property + def articles(self) -> List[Article]: + """ + Getter for the list of articles included in the document. + + Returns: + List[Article]: The list of articles included in the document. + """ + return self._articles diff --git a/interest/input_file.py b/interest/input_file.py new file mode 100644 index 0000000..72156f1 --- /dev/null +++ b/interest/input_file.py @@ -0,0 +1,119 @@ +""" +Input File Module +This module provides an abstract class for representing various input files. +""" + +import abc +import gzip +from pathlib import Path +from typing import Iterable, TextIO, cast, Optional +from interest.document import Document, Article +import logging + +# from .document_filter import DocumentFilter + + +class InputFile(abc.ABC): + """ + Abstract class for representing various input files. + + Attributes: + _filepath (Path): The file path of the input file. + + Methods: + __init__(filepath): Initialize the InputFile with a file path. + filepath(): Get the file path of the input file. + base_file_name(): Output a list of documents in the input file. 
+ open(mode, encoding): Open the input file for reading. + articles(): Return all articles for the document found in the + input file. + doc(): Output a list of documents in the input file. + """ + + def __init__(self, filepath: Path) -> None: + """ + Initialize the InputFile with a file path. + + Args: + filepath (Path): The file path of the input file. + """ + self._filepath = filepath + + @property + def filepath(self) -> Path: + """ + Get the file path of the input file. + + Returns: + Path: The file path of the input file. + """ + return self._filepath + + @abc.abstractmethod + def base_file_name(self) -> str: + """ + Output a list of documents in the input file. + + This can be a singleton list if an input file contains only + one document. + + Returns: + str: The base file name without extension. + """ + return NotImplemented + + def open(self, mode: str = "rt", encoding=None) -> TextIO: + """ + Open the input file for reading. + + Args: + mode (str): The file open mode. + encoding: The encoding format. + + Returns: + TextIO: A file object for reading the input file. + """ + if self._filepath.suffix.startswith(".gz"): + return cast(TextIO, gzip.open(self._filepath, mode=mode, + encoding=encoding)) + + # Default to text file + return cast(TextIO, open(self._filepath, + mode=mode, encoding=encoding)) + + # pylint: disable=no-member + def articles(self) -> Iterable[Article]: + """ + Return all articles for the document found in the input file. + + Yields: + Article: An article object. + """ + doc = self.doc() + if doc is not None: + yield from doc.articles + else: + logging.error("Document not found or is None for filepath: %s", + self.filepath) + return + + @abc.abstractmethod + def doc(self) -> Optional[Document]: + """ + Output a list of documents in the input file. + + This can be a singleton list if an input file contains only + one document. + + Returns: + Document: A document object. + """ + return NotImplemented + + # def selected_articles(self, filter: DocumentFilter) -> Iterable[Article]: + # document = self.doc() + # if filter.filter_document(document): + # if document.articles() is not None: + # for article in document.articles(): + # if filter.filter_article(article): + # yield article diff --git a/interest/preprocessor/__init__.py b/interest/preprocessor/__init__.py index 6e6ff7e..3cec932 100644 --- a/interest/preprocessor/__init__.py +++ b/interest/preprocessor/__init__.py @@ -1 +1 @@ -from interest.preprocessor.parser import XMLExtractor \ No newline at end of file +# from interest.preprocessor.parser import XMLExtractor diff --git a/interest/preprocessor/parser.py b/interest/preprocessor/parser.py index bbaf724..25ac1d7 100644 --- a/interest/preprocessor/parser.py +++ b/interest/preprocessor/parser.py @@ -4,7 +4,7 @@ import gzip import json import xml.etree.ElementTree as ET -from typing import Dict, Union +from typing import Dict, Union, Any, Optional, List import logging @@ -73,7 +73,7 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s Returns: Dict[str, Union[Dict[str, str], Dict[int, Dict[str, str]]]]: A dictionary containing extracted content and metadata. 
# noqa: E501 """ - news_dict = {"newsletter_metadata": {}, "articles": {}} + news_dict: Dict[str, Any] = {"newsletter_metadata": {}, "articles": {}} id = 0 for entry in outer_tar: try: @@ -88,9 +88,11 @@ def process_tar(self, outer_tar: tarfile.TarFile) -> Dict[str, Union[Dict[str, s elif entry.name.endswith(".gz"): gz_member = next(member for member in outer_tar.getmembers() if member.name.endswith('.gz')) # noqa: E501 - with outer_tar.extractfile(gz_member) as gz_file: + with outer_tar.extractfile(gz_member) as gz_file: # type: ignore # noqa: E501 with gzip.open(gz_file, 'rt') as xml_file: xml_string = xml_file.read() + if isinstance(xml_string, bytes): + xml_string = xml_string.decode('utf-8') newsletter_metadata = self.extract_meta(xml_string) news_dict["newsletter_metadata"] = newsletter_metadata # noqa: E501 else: @@ -130,7 +132,7 @@ def save_as_json_compressed(data: Dict[str, Union[Dict[str, str], Dict[int, Dict # logging.error(f"Error saving JSON to {output_file}: {e}") @staticmethod - def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: + def extract_article(xml_content: str, file_name: str) -> Dict[str, Union[str, List[Optional[str]]]]: # noqa: E501 """ Extracts article title and body from XML content. @@ -139,7 +141,8 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: file_name (str): Name of the XML file. Returns: - Dict[str, str]: A dictionary containing the extracted title and body of the article. # noqa: E501 + Dict[Optional[str], list[str]]: A dictionary containing the extracted title and body of the article. + body contains a list of paragraphs. # noqa: E501 """ try: root = ET.fromstring(xml_content) @@ -152,14 +155,15 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: logging.warning("More than one titles are extracted for the article.") # noqa: E501 if not title_values: logging.warning("No title is extracted for the article.") - title = None + title = "" else: - title = title_values[0] + title = title_values[0] if title_values[0] is not None else "" + # title = title_values[0] body_values = [element.text for element in root.iter() if element.tag.endswith('p')] # noqa: E501 if not body_values: logging.warning("No body is extracted.") - body = None + body = [] # elif len(body_values) > 1: # logging.warning("There are more than one paragraphs in the article.") # noqa: E501 # body = ' '.join(body_values) @@ -169,7 +173,7 @@ def extract_article(xml_content: str, file_name: str) -> Dict[str, str]: return {"title": title, "body": body} - def extract_meta(self,xml_string: str) -> Dict[str, Union[str, None]]: + def extract_meta(self, xml_string: str) -> Dict[str, Union[str, None]]: """ Extracts metadata from XML string. @@ -195,7 +199,9 @@ def extract_meta(self,xml_string: str) -> Dict[str, Union[str, None]]: logging.warning(f"No {field} is extracted.") newsletter_metadata[field] = None else: - newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) # noqa: E501 + filtered_field_values = [value for value in field_values if value is not None] # noqa: E501 + newsletter_metadata[field] = filtered_field_values[0] if field != "spatial" else ", ".join(filtered_field_values) # noqa: E501 + + # newsletter_metadata[field] = field_values[0] if field != "spatial" else ", ".join(field_values) # noqa: E501 return newsletter_metadata - \ No newline at end of file
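
Taken together, the series leaves two entry points: XMLExtractor (interest/preprocessor/parser.py, driven by scripts/convert_input_files.py) converts the nested Delpher .tgz harvest into gzip-compressed JSON issues, and the KrantenFile / Document / Article classes from the final patch read such an issue back for downstream processing. The sketch below is a minimal illustration of how the two halves might be chained, not code from the patches themselves: it assumes the package is importable as interest, that the data directories exist locally, and that the converted issue file carries a .gz suffix so that KrantenFile opens it through gzip. The ISSUE_0001 file name and the data paths are placeholders.

    # The conversion can also be run from the command line with the script
    # added in PATCH 17 (paths are illustrative):
    #   python scripts/convert_input_files.py --input_dir data/news/gg \
    #       --output_dir data/news/gg-json-compress
    #
    # The same conversion, followed by reading one converted issue, from Python:
    from pathlib import Path

    from interest.preprocessor.parser import XMLExtractor
    from interest.delpher_kranten import KrantenFile

    # Unpack the nested .tgz files into compressed JSON, one file per issue,
    # mirroring the digit-named subfolders of the harvest.
    extractor = XMLExtractor("data/news/gg", "data/news/gg-json-compress")
    extractor.extract_xml_string()

    # Read a single converted issue back. KrantenFile.open() only switches to
    # gzip for a .gz suffix, so the (hypothetical) file name assumes one here.
    issue = KrantenFile(Path("data/news/gg-json-compress/00/ISSUE_0001.json.gz"))
    document = issue.doc()
    if document is not None:
        print(document.title, document.year)

    # Iterate the articles of the issue; Article joins a list of paragraphs
    # into a single text string.
    for article in issue.articles():
        print(article.id, article.title, article.text[:80])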