Merge pull request #82 from PickwickSoft/feature/#72/data-loader-for-xml

✨ Create data loader for XML
PickwickSoft · Dec 30, 2023 · 92bda09 · 92bda09
2 parents 198d9c2 + 9e76342
commit 92bda09
Show file tree

Hide file tree

Showing 11 changed files with 354 additions and 121 deletions.
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
@@ -64,14 +64,14 @@ jobs:
  # Install dependencies. `--no-root` means "install all dependencies but not the project
  # itself", which is what you want to avoid caching _your_ code. The `if` statement
  # ensures this only runs on a cache miss.
- - run: poetry install --no-root
+ - run: poetry install --no-root --extras "all"
  if: steps.cache-deps.outputs.cache-hit != 'true'
 
  # Now install _your_ project. This isn't necessary for many types of projects -- particularly
  # things like Django apps don't need this. But it's a good idea since it fully-exercises the
  # pyproject.toml and makes sure that if you add things like console-scripts at some point that
  # they'll be installed and working.
- - run: poetry install
+ - run: poetry install --extras "all"
 
  # Runs a single command using the runners shell
  - name: Run Unittests

diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ Now you might be wondering why another library when there are already a few impl
 * The implementation achieves 100% test coverage.
 * It follows Pythonic principles, resulting in clean and readable code.
 * It adds some cool innovative features such as conditions or error handling and an even more declarative look.
-* It provides loaders for various data sources such as CSV
+* It provides loaders for various data sources such as CSV, JSON and XML files.
 
 Let's take a look at a small example:
 
@@ -213,14 +213,15 @@ Stream.concat(Stream.of([1, 2]), Stream.of([3, 4]))
 
 Creates a new Stream from multiple Streams. Order doesn't change.
 
-## Use loaders: Load data from CSV and JSON files in just one line
+## Use loaders: Load data from CSV, JSON and XML files in just one line
 
-PyStreamAPI offers a convenient way to load data from CSV and JSON files. Like that you can start processing your files right away without having to worry about reading and parsing the files.
+PyStreamAPI offers a convenient way to load data from CSV, JSON and XML files. This way, you can start processing your
+files right away without having to worry about reading and parsing the files.
 
 You can import the loaders with:
 
 ```python
-from pystreamapi.loaders import csv, json
+from pystreamapi.loaders import csv, json, xml
 ```
 Now you can use the loaders directly when creating your Stream:
 
@@ -241,6 +242,25 @@ Stream.of(json("data.json")) \
 
 You can access the attributes of the data structures directly like you would do with a normal object.
 
+For XML:
+
+In order to use the XML loader, you need to install the optional xml dependency:
+
+```bash
+pip install streams.py[xml_loader]
+```
+
+Afterward, you can use the XML loader like this:
+
+```python
+Stream.of(xml("data.xml")) \
+ .map(lambda x: x.attr1) \
+ .for_each(print)
+```
+
+Attributes are accessed using a node path syntax. For more details on how to use the node path syntax, please
+refer to the [documentation](https://pystreamapi.pickwicksoft.org/reference/data-loaders).
+
 ## API Reference
 For more detailed documentation, view the docs on GitBook: [PyStreamAPI Docs](https://pystreamapi.pickwicksoft.org/)
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "streams.py"
-version = "1.1.0"
+version = "1.2.0"
 authors = ["Stefan Garlonta <stefan@pickwicksoft.org>"]
 description = "A stream library for Python inspired by Java Stream API"
 keywords = ["streams", "parallel", "data"]
@@ -15,6 +15,11 @@ packages = [
 [tool.poetry.dependencies]
 python = ">=3.7,<4.0"
 joblib = ">=1.2,<1.4"
+defusedxml = { version = ">=0.7,<0.8", optional = true }
+
+[tool.poetry.extras]
+xml_loader = ["defusedxml"]
+all = ["defusedxml"]
 
 [tool.poetry.group.test.dependencies]
 parameterized = "*"

diff --git a/pystreamapi/__init__.py b/pystreamapi/__init__.py
@@ -1,5 +1,5 @@
 from pystreamapi.__stream import Stream
 from pystreamapi._streams.error.__levels import ErrorLevel
 
-__version__ = "1.1.0"
+__version__ = "1.2.0"
 __all__ = ["Stream", "ErrorLevel"]
diff --git a/pystreamapi/loaders/__init__.py b/pystreamapi/loaders/__init__.py
@@ -1,7 +1,9 @@
 from pystreamapi.loaders.__csv.__csv_loader import csv
 from pystreamapi.loaders.__json.__json_loader import json
+from pystreamapi.loaders.__xml.__xml_loader import xml
 
 __all__ = [
  'csv',
- 'json'
+ 'json',
+ 'xml'
 ]
diff --git a/pystreamapi/loaders/__xml/__init__.py b/pystreamapi/loaders/__xml/__init__.py
diff --git a/pystreamapi/loaders/__xml/__xml_loader.py b/pystreamapi/loaders/__xml/__xml_loader.py
@@ -0,0 +1,117 @@
+try:
+ from defusedxml import ElementTree
+except ImportError as exc:
+ raise ImportError(
+ "Please install the xml_loader extra dependency to use the xml loader."
+ ) from exc
+from collections import namedtuple
+from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
+from pystreamapi.loaders.__loader_utils import LoaderUtils
+
+
+class __XmlLoaderUtil:
+ """Utility class for the XML loader."""
+
+ def __init__(self):
+ self.cast_types = True
+ self.retrieve_children = True
+
+
+config = __XmlLoaderUtil()
+
+
+def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True,
+ encoding="utf-8") -> LazyFileIterable:
+ """
+ Loads XML data from either a path or a string and converts it into a list of namedtuples.
+ Warning: This method isn't safe against malicious XML trees. Parse only safe XML from sources
+ you trust.
+
+ Returns:
+ LazyFileIterable: A list of namedtuples, where each namedtuple represents an XML element.
+ :param retrieve_children: If true, the children of the root element are used as stream
+ elements.
+ :param encoding: The encoding of the XML file.
+ :param src: Either the path to an XML file or an XML string.
+ :param read_from_src: If True, src is treated as an XML string. If False, src is treated as
+ a path to an XML file.
+ :param cast_types: Set as False to disable casting of values to int, bool or float.
+ """
+ config.cast_types = cast_types
+ config.retrieve_children = retrieve_children
+ if read_from_src:
+ return LazyFileIterable(lambda: __load_xml_string(src))
+ path = LoaderUtils.validate_path(src)
+ return LazyFileIterable(lambda: __load_xml_file(path, encoding))
+
+
+def __load_xml_file(file_path, encoding):
+ """Load an XML file and convert it into a list of namedtuples."""
+ # skipcq: PTC-W6004
+ with open(file_path, mode='r', encoding=encoding) as xmlfile:
+ src = xmlfile.read()
+ if src:
+ return __parse_xml_string(src)
+ return []
+
+
+def __load_xml_string(xml_string):
+ """Load XML data from a string and convert it into a list of namedtuples."""
+ return __parse_xml_string(xml_string)
+
+
+def __parse_xml_string(xml_string):
+ """Parse XML string and convert it into a list of namedtuples."""
+ root = ElementTree.fromstring(xml_string)
+ parsed_xml = __parse_xml(root)
+ return __flatten(parsed_xml) if config.retrieve_children else [parsed_xml]
+
+
+def __parse_xml(element):
+ """Parse XML element and convert it into a namedtuple."""
+ if len(element) == 0:
+ return __parse_empty_element(element)
+ if len(element) == 1:
+ return __parse_single_element(element)
+ return __parse_multiple_elements(element)
+
+
+def __parse_empty_element(element):
+ """Parse XML element without children and convert it into a namedtuple."""
+ return LoaderUtils.try_cast(element.text) if config.cast_types else element.text
+
+
+def __parse_single_element(element):
+ """Parse XML element with a single child and convert it into a namedtuple."""
+ sub_element = element[0]
+ sub_item = __parse_xml(sub_element)
+ Item = namedtuple(element.tag, [sub_element.tag])
+ return Item(sub_item)
+
+
+def __parse_multiple_elements(element):
+ """Parse XML element with multiple children and convert it into a namedtuple."""
+ tag_dict = {}
+ for e in element:
+ if e.tag not in tag_dict:
+ tag_dict[e.tag] = []
+ tag_dict[e.tag].append(__parse_xml(e))
+ filtered_dict = __filter_single_items(tag_dict)
+ Item = namedtuple(element.tag, filtered_dict.keys())
+ return Item(*filtered_dict.values())
+
+
+def __filter_single_items(tag_dict):
+ """Filter out single-item lists from a dictionary."""
+ return {key: value[0] if len(value) == 1 else value for key, value in tag_dict.items()}
+
+
+def __flatten(data):
+ """Flatten a list of lists."""
+ res = []
+ for item in data:
+ if isinstance(item, list):
+ res.extend(item)
+ else:
+ res.append(item)
+ return res
diff --git a/setup.cfg b/setup.cfg
diff --git a/tests/test_xml_loader.py b/tests/test_xml_loader.py
@@ -0,0 +1,107 @@
+# pylint: disable=not-context-manager
+from unittest import TestCase
+from unittest.mock import patch, mock_open
+from xml.etree.ElementTree import ParseError
+
+from file_test import OPEN, PATH_EXISTS, PATH_ISFILE
+from pystreamapi.loaders import xml
+
+file_content = """
+<employees>
+ <employee>
+ <name>John Doe</name>
+ <salary>80000</salary>
+ </employee>
+ <employee>
+ <name>Alice Smith</name>
+ <child>
+ <name>Frank</name>
+ </child>
+ </employee>
+ <founder>
+ <cars>
+ <car>Bugatti</car>
+ <car>Mercedes</car>
+ </cars>
+ </founder>
+</employees>
+"""
+file_path = 'path/to/data.xml'
+
+
+class TestXmlLoader(TestCase):
+
+ def test_xml_loader_from_file_children(self):
+ with (patch(OPEN, mock_open(read_data=file_content)),
+ patch(PATH_EXISTS, return_value=True),
+ patch(PATH_ISFILE, return_value=True)):
+ data = xml(file_path)
+ self.assertEqual(len(data), 3)
+ self.assertEqual(data[0].salary, 80000)
+ self.assertIsInstance(data[0].salary, int)
+ self.assertEqual(data[1].child.name, "Frank")
+ self.assertIsInstance(data[1].child.name, str)
+ self.assertEqual(data[2].cars.car[0], 'Bugatti')
+ self.assertIsInstance(data[2].cars.car[0], str)
+
+ def test_xml_loader_from_file_no_children_false(self):
+ with (patch(OPEN, mock_open(read_data=file_content)),
+ patch(PATH_EXISTS, return_value=True),
+ patch(PATH_ISFILE, return_value=True)):
+ data = xml(file_path, retrieve_children=False)
+ self.assertEqual(len(data), 1)
+ self.assertEqual(data[0].employee[0].salary, 80000)
+ self.assertIsInstance(data[0].employee[0].salary, int)
+ self.assertEqual(data[0].employee[1].child.name, "Frank")
+ self.assertIsInstance(data[0].employee[1].child.name, str)
+ self.assertEqual(data[0].founder.cars.car[0], 'Bugatti')
+ self.assertIsInstance(data[0].founder.cars.car[0], str)
+
+ def test_xml_loader_no_casting(self):
+ with (patch(OPEN, mock_open(read_data=file_content)),
+ patch(PATH_EXISTS, return_value=True),
+ patch(PATH_ISFILE, return_value=True)):
+ data = xml(file_path, cast_types=False)
+ self.assertEqual(len(data), 3)
+ self.assertEqual(data[0].salary, '80000')
+ self.assertIsInstance(data[0].salary, str)
+ self.assertEqual(data[1].child.name, "Frank")
+ self.assertIsInstance(data[1].child.name, str)
+ self.assertEqual(data[2].cars.car[0], 'Bugatti')
+ self.assertIsInstance(data[2].cars.car[0], str)
+
+ def test_xml_loader_is_iterable(self):
+ with (patch(OPEN, mock_open(read_data=file_content)),
+ patch(PATH_EXISTS, return_value=True),
+ patch(PATH_ISFILE, return_value=True)):
+ data = xml(file_path)
+ self.assertEqual(len(list(iter(data))), 3)
+
+ def test_xml_loader_with_empty_file(self):
+ with (patch(OPEN, mock_open(read_data="")),
+ patch(PATH_EXISTS, return_value=True),
+ patch(PATH_ISFILE, return_value=True)):
+ data = xml(file_path)
+ self.assertEqual(len(data), 0)
+
+ def test_xml_loader_with_invalid_path(self):
+ with self.assertRaises(FileNotFoundError):
+ xml('path/to/invalid.xml')
+
+ def test_xml_loader_with_no_file(self):
+ with self.assertRaises(ValueError):
+ xml('./')
+
+ def test_xml_loader_from_string(self):
+ data = xml(file_content, read_from_src=True)
+ self.assertEqual(len(data), 3)
+ self.assertEqual(data[0].salary, 80000)
+ self.assertIsInstance(data[0].salary, int)
+ self.assertEqual(data[1].child.name, "Frank")
+ self.assertIsInstance(data[1].child.name, str)
+ self.assertEqual(data[2].cars.car[0], 'Bugatti')
+ self.assertIsInstance(data[2].cars.car[0], str)
+
+ def test_xml_loader_from_empty_string(self):
+ with self.assertRaises(ParseError):
+ len(xml('', read_from_src=True))
diff --git a/tox.ini b/tox.ini
@@ -8,11 +8,12 @@ deps =
  optional.py
  joblib
  parameterized
+ defusedxml
 commands =
  coverage run -m unittest discover -s tests -t tests --pattern 'test_*.py'
  coverage xml
 
 [coverage:run]
 relative_files = True
 source = pystreamapi/
-branch = True
+branch = True