Skip to content

Commit

Permalink
Merge pull request #82 from PickwickSoft/feature/#72/data-loader-for-xml
Browse files Browse the repository at this point in the history
✨ Create data loader for XML
  • Loading branch information
garlontas committed Dec 30, 2023
2 parents 198d9c2 + 9e76342 commit 92bda09
Show file tree
Hide file tree
Showing 11 changed files with 354 additions and 121 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,14 @@ jobs:
# Install dependencies. `--no-root` means "install all dependencies but not the project
# itself", which is what you want to avoid caching _your_ code. The `if` statement
# ensures this only runs on a cache miss.
- run: poetry install --no-root
- run: poetry install --no-root --extras "all"
if: steps.cache-deps.outputs.cache-hit != 'true'

# Now install _your_ project. This isn't necessary for many types of projects -- particularly
# things like Django apps don't need this. But it's a good idea since it fully-exercises the
# pyproject.toml and makes sure that if you add things like console-scripts at some point,
# they'll be installed and working.
- run: poetry install
- run: poetry install --extras "all"

# Runs a single command using the runners shell
- name: Run Unittests
Expand Down
28 changes: 24 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Now you might be wondering why another library when there are already a few impl
* The implementation achieves 100% test coverage.
* It follows Pythonic principles, resulting in clean and readable code.
* It adds some cool innovative features such as conditions or error handling and an even more declarative look.
* It provides loaders for various data sources such as CSV
* It provides loaders for various data sources such as CSV, JSON and XML files.

Let's take a look at a small example:

Expand Down Expand Up @@ -213,14 +213,15 @@ Stream.concat(Stream.of([1, 2]), Stream.of([3, 4]))

Creates a new Stream from multiple Streams. Order doesn't change.

## Use loaders: Load data from CSV and JSON files in just one line
## Use loaders: Load data from CSV, JSON and XML files in just one line

PyStreamAPI offers a convenient way to load data from CSV and JSON files. Like that you can start processing your files right away without having to worry about reading and parsing the files.
PyStreamAPI offers a convenient way to load data from CSV, JSON and XML files. This way, you can start processing your
files right away without having to worry about reading and parsing them.

You can import the loaders with:

```python
from pystreamapi.loaders import csv, json
from pystreamapi.loaders import csv, json, xml
```
Now you can use the loaders directly when creating your Stream:

Expand All @@ -241,6 +242,25 @@ Stream.of(json("data.json")) \

You can access the attributes of the data structures directly like you would do with a normal object.

For XML:

In order to use the XML loader, you need to install the optional xml dependency:

```bash
pip install streams.py[xml_loader]
```

Afterward, you can use the XML loader like this:

```python
Stream.of(xml("data.xml")) \
    .map(lambda x: x.attr1) \
    .for_each(print)
```

Attributes are accessed using a node path syntax. For more details on how to use the node path syntax, please
refer to the [documentation](https://pystreamapi.pickwicksoft.org/reference/data-loaders).

## API Reference
For more detailed documentation, view the docs on GitBook: [PyStreamAPI Docs](https://pystreamapi.pickwicksoft.org/)

Expand Down
173 changes: 92 additions & 81 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "streams.py"
version = "1.1.0"
version = "1.2.0"
authors = ["Stefan Garlonta <stefan@pickwicksoft.org>"]
description = "A stream library for Python inspired by Java Stream API"
keywords = ["streams", "parallel", "data"]
Expand All @@ -15,6 +15,11 @@ packages = [
[tool.poetry.dependencies]
python = ">=3.7,<4.0"
joblib = ">=1.2,<1.4"
defusedxml = { version = ">=0.7,<0.8", optional = true }

[tool.poetry.extras]
xml_loader = ["defusedxml"]
all = ["defusedxml"]

[tool.poetry.group.test.dependencies]
parameterized = "*"
Expand Down
2 changes: 1 addition & 1 deletion pystreamapi/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pystreamapi.__stream import Stream
from pystreamapi._streams.error.__levels import ErrorLevel

__version__ = "1.1.0"
__version__ = "1.2.0"
__all__ = ["Stream", "ErrorLevel"]
4 changes: 3 additions & 1 deletion pystreamapi/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from pystreamapi.loaders.__csv.__csv_loader import csv
from pystreamapi.loaders.__json.__json_loader import json
from pystreamapi.loaders.__xml.__xml_loader import xml

__all__ = [
'csv',
'json'
'json',
'xml'
]
Empty file.
117 changes: 117 additions & 0 deletions pystreamapi/loaders/__xml/__xml_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
try:
from defusedxml import ElementTree
except ImportError as exc:
raise ImportError(
"Please install the xml_loader extra dependency to use the xml loader."
) from exc
from collections import namedtuple
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
from pystreamapi.loaders.__loader_utils import LoaderUtils


class __XmlLoaderUtil:
    """Mutable holder for the settings the XML parsing helpers consult."""

    def __init__(self):
        # Both switches start enabled; xml() overwrites them with its arguments.
        self.retrieve_children = True
        self.cast_types = True


# Single module-wide settings instance read by the parsing helpers below.
config = __XmlLoaderUtil()


def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True,
        encoding="utf-8") -> LazyFileIterable:
    """
    Loads XML data from either a path or a string and converts it into a list of namedtuples.
    Warning: This method isn't safe against malicious XML trees. Parse only safe XML from sources
    you trust.

    The parsing helpers read their settings from the module-level ``config`` object, and the
    loader callable handed to ``LazyFileIterable`` may run long after this function has
    returned. The settings of this call are therefore re-applied right before parsing, so that
    another ``xml()`` call made in between cannot change how this source is parsed.

    Returns:
        LazyFileIterable: A list of namedtuples, where each namedtuple represents an XML element.
    :param src: Either the path to an XML file or an XML string.
    :param read_from_src: If True, src is treated as an XML string. If False, src is treated as
        a path to an XML file.
    :param retrieve_children: If true, the children of the root element are used as stream
        elements.
    :param cast_types: Set as False to disable casting of values to int, bool or float.
    :param encoding: The encoding of the XML file.
    """
    def apply_settings():
        """Publish this call's options to the shared config used by the parsing helpers."""
        config.cast_types = cast_types
        config.retrieve_children = retrieve_children

    def load_from_string():
        apply_settings()
        return __load_xml_string(src)

    def load_from_file():
        apply_settings()
        return __load_xml_file(path, encoding)

    # Kept for backward compatibility: the shared config reflects the latest call immediately.
    apply_settings()
    if read_from_src:
        return LazyFileIterable(load_from_string)
    # The path is validated eagerly so an invalid path fails at call time, not on first access.
    path = LoaderUtils.validate_path(src)
    return LazyFileIterable(load_from_file)


def __load_xml_file(file_path, encoding):
    """Read the XML file at ``file_path`` and parse its content into a list of namedtuples.

    An empty file produces an empty list instead of being handed to the parser.
    """
    # skipcq: PTC-W6004
    with open(file_path, mode='r', encoding=encoding) as handle:
        content = handle.read()
        if not content:
            return []
        return __parse_xml_string(content)


def __load_xml_string(xml_string):
    """Parse XML text that was passed in directly and return the resulting list."""
    parsed_items = __parse_xml_string(xml_string)
    return parsed_items


def __parse_xml_string(xml_string):
    """Parse XML string and convert it into a list of namedtuples.

    With ``config.retrieve_children`` enabled, the values of the root's children become the
    list items. Otherwise the parsed root itself is the only item.

    A root element without children is parsed to a plain value rather than a namedtuple.
    Flattening such a value would split a string into characters or fail on a non-iterable
    value, so in that case the root value is returned as the single item as well.
    """
    root = ElementTree.fromstring(xml_string)
    parsed_xml = __parse_xml(root)
    # Only unwrap when there actually are child elements to unwrap.
    if config.retrieve_children and len(root) > 0:
        return __flatten(parsed_xml)
    return [parsed_xml]


def __parse_xml(element):
    """Convert an XML element by dispatching on how many child elements it has."""
    child_count = len(element)
    if child_count > 1:
        return __parse_multiple_elements(element)
    if child_count == 1:
        return __parse_single_element(element)
    return __parse_empty_element(element)


def __parse_empty_element(element):
    """Return the text of a childless element, cast via LoaderUtils when casting is enabled."""
    text = element.text
    if config.cast_types:
        return LoaderUtils.try_cast(text)
    return text


def __parse_single_element(element):
    """Wrap the only child of ``element`` in a one-field namedtuple named after the element."""
    only_child = element[0]
    # Parse the child first, then build the record type for this level.
    child_value = __parse_xml(only_child)
    record_type = namedtuple(element.tag, [only_child.tag])
    return record_type(child_value)


def __parse_multiple_elements(element):
    """Build a namedtuple from an element with several children.

    Children are grouped by tag in order of first appearance. A tag that occurs once maps to
    its parsed value, a repeated tag maps to the list of its parsed values.
    """
    grouped = {}
    for child in element:
        grouped.setdefault(child.tag, []).append(__parse_xml(child))
    fields = __filter_single_items(grouped)
    record_type = namedtuple(element.tag, fields.keys())
    return record_type(*fields.values())


def __filter_single_items(tag_dict):
    """Return a copy of ``tag_dict`` where every one-element list is replaced by its element."""
    collapsed = {}
    for tag, values in tag_dict.items():
        collapsed[tag] = values if len(values) != 1 else values[0]
    return collapsed


def __flatten(data):
    """Flatten one level: list items are spliced in, every other item is kept as is."""
    flat = []
    for entry in data:
        flat += entry if isinstance(entry, list) else [entry]
    return flat
30 changes: 0 additions & 30 deletions setup.cfg

This file was deleted.

107 changes: 107 additions & 0 deletions tests/test_xml_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# pylint: disable=not-context-manager
from unittest import TestCase
from unittest.mock import patch, mock_open
from xml.etree.ElementTree import ParseError

from file_test import OPEN, PATH_EXISTS, PATH_ISFILE
from pystreamapi.loaders import xml

# Sample XML document used both as the mocked file content and as an inline source string:
# an <employees> root with two <employee> children and one <founder> child.
file_content = """
<employees>
<employee>
<name>John Doe</name>
<salary>80000</salary>
</employee>
<employee>
<name>Alice Smith</name>
<child>
<name>Frank</name>
</child>
</employee>
<founder>
<cars>
<car>Bugatti</car>
<car>Mercedes</car>
</cars>
</founder>
</employees>
"""
# Path passed to the loader; the file-based tests patch the existence checks and open().
file_path = 'path/to/data.xml'


class TestXmlLoader(TestCase):
    """Tests for the ``xml`` data loader, covering file-based and string-based sources."""

    def _mock_file(self, content):
        """Patch file access so that ``file_path`` appears to be a file holding ``content``.

        The patchers are started explicitly and stopped through ``addCleanup`` instead of a
        parenthesized multi-item ``with`` statement: that syntax is only official from
        Python 3.10 on, while older interpreters treat the parentheses as a tuple, which is
        not a context manager.
        """
        patchers = (
            patch(OPEN, mock_open(read_data=content)),
            patch(PATH_EXISTS, return_value=True),
            patch(PATH_ISFILE, return_value=True),
        )
        for patcher in patchers:
            patcher.start()
            self.addCleanup(patcher.stop)

    def _assert_child_records(self, data, salary):
        """Check the three records produced from the children of the sample document.

        :param data: The loaded records.
        :param salary: The expected salary of the first record; its type is checked as well.
        """
        self.assertEqual(len(data), 3)
        self.assertEqual(data[0].salary, salary)
        self.assertIsInstance(data[0].salary, type(salary))
        self.assertEqual(data[1].child.name, "Frank")
        self.assertIsInstance(data[1].child.name, str)
        self.assertEqual(data[2].cars.car[0], 'Bugatti')
        self.assertIsInstance(data[2].cars.car[0], str)

    def test_xml_loader_from_file_children(self):
        self._mock_file(file_content)
        data = xml(file_path)
        self._assert_child_records(data, 80000)

    def test_xml_loader_from_file_no_children_false(self):
        self._mock_file(file_content)
        data = xml(file_path, retrieve_children=False)
        # Without child retrieval the root itself is the only record.
        self.assertEqual(len(data), 1)
        self.assertEqual(data[0].employee[0].salary, 80000)
        self.assertIsInstance(data[0].employee[0].salary, int)
        self.assertEqual(data[0].employee[1].child.name, "Frank")
        self.assertIsInstance(data[0].employee[1].child.name, str)
        self.assertEqual(data[0].founder.cars.car[0], 'Bugatti')
        self.assertIsInstance(data[0].founder.cars.car[0], str)

    def test_xml_loader_no_casting(self):
        self._mock_file(file_content)
        data = xml(file_path, cast_types=False)
        # With casting disabled the salary stays a string.
        self._assert_child_records(data, '80000')

    def test_xml_loader_is_iterable(self):
        self._mock_file(file_content)
        data = xml(file_path)
        self.assertEqual(len(list(iter(data))), 3)

    def test_xml_loader_with_empty_file(self):
        self._mock_file("")
        data = xml(file_path)
        self.assertEqual(len(data), 0)

    def test_xml_loader_with_invalid_path(self):
        with self.assertRaises(FileNotFoundError):
            xml('path/to/invalid.xml')

    def test_xml_loader_with_no_file(self):
        with self.assertRaises(ValueError):
            xml('./')

    def test_xml_loader_from_string(self):
        data = xml(file_content, read_from_src=True)
        self._assert_child_records(data, 80000)

    def test_xml_loader_from_empty_string(self):
        # Parsing is lazy, so the error only surfaces once the data is accessed.
        with self.assertRaises(ParseError):
            len(xml('', read_from_src=True))
3 changes: 2 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ deps =
optional.py
joblib
parameterized
defusedxml
commands =
coverage run -m unittest discover -s tests -t tests --pattern 'test_*.py'
coverage xml

[coverage:run]
relative_files = True
source = pystreamapi/
branch = True
branch = True

0 comments on commit 92bda09

Please sign in to comment.