Feature/#70/data loader for json #77

Merged 6 commits on Sep 28, 2023

Changes from all commits
22 changes: 16 additions & 6 deletions README.md
@@ -213,23 +213,33 @@ Stream.concat(Stream.of([1, 2]), Stream.of([3, 4]))
 
 Creates a new Stream from multiple Streams. Order doesn't change.
 
-## Use loaders: Load data from CSV files in just one line
+## Use loaders: Load data from CSV and JSON files in just one line
 
-PyStreamAPI offers a convenient way to load data from CSV files. Like that you can start processing your CSV right away without having to worry about reading and parsing the file.
+PyStreamAPI offers a convenient way to load data from CSV and JSON files. That way you can start processing your files right away without having to worry about reading and parsing them.
 
-You can import the loader with:
+You can import the loaders with:
 
 ```python
-from pystreamapi.loaders import csv
+from pystreamapi.loaders import csv, json
 ```
 
-Now you can use the loader directly when creating your Stream:
+Now you can use the loaders directly when creating your Stream:
+
+For CSV:
 
 ```python
 Stream.of(csv("data.csv", delimiter=";")) \
     .map(lambda x: x.attr1) \
     .for_each(print)
 ```
-You can access the attributes of the CSV rows directly like you would with a normal object.
+
+For JSON:
+
+```python
+Stream.of(json("data.json")) \
+    .map(lambda x: x.attr1) \
+    .for_each(print)
+```
+
+You can access the attributes of the data structures directly, just as you would with a normal object.
 
 ## API Reference
 For more detailed documentation, view the docs on GitBook: [PyStreamAPI Docs](https://pystreamapi.pickwicksoft.org/)
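Since the JSON loader converts nested objects into namedtuples recursively (see `__dict_to_namedtuple` below), dotted attribute access also chains through nested fields. A minimal sketch of that, assuming the usual `from pystreamapi import Stream` import; the file name `users.json` and its `user`/`name` fields are made up:

```python
from pystreamapi import Stream
from pystreamapi.loaders import json

# users.json (hypothetical contents):
# [{"user": {"name": "Ada"}}, {"user": {"name": "Grace"}}]
Stream.of(json("users.json")) \
    .map(lambda x: x.user.name) \
    .for_each(print)  # prints: Ada, then Grace
```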
pystreamapi/loaders/__csv/__csv_loader.py

@@ -1,8 +1,7 @@
-import contextlib
-import os
 from collections import namedtuple
 from csv import reader
 
+from pystreamapi.loaders.__loader_utils import LoaderUtils
 from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
 
 
@@ -17,7 +16,7 @@ def csv(file_path: str, cast_types=True, delimiter=',', encoding="utf-8") -> LazyFileIterable:
     :param file_path: The path to the CSV file.
     :param delimiter: The delimiter used in the CSV file.
     """
-    file_path = __validate_path(file_path)
+    file_path = LoaderUtils.validate_path(file_path)
     return LazyFileIterable(lambda: __load_csv(file_path, cast_types, delimiter, encoding))
 
 
@@ -28,28 +27,24 @@ def __load_csv(file_path, cast, delimiter, encoding):
         csvreader = reader(csvfile, delimiter=delimiter)
 
         # Create a namedtuple type, casting the header values to int or float if possible
-        Row = namedtuple('Row', list(next(csvreader, [])))
+        header = __get_csv_header(csvreader)
 
-        mapper = __try_cast if cast else lambda x: x
+        Row = namedtuple('Row', list(header))
+
+        mapper = LoaderUtils.try_cast if cast else lambda x: x
 
         # Process the data, casting values to int or float if possible
         data = [Row(*[mapper(value) for value in row]) for row in csvreader]
     return data
 
 
-def __validate_path(file_path: str):
-    """Validate the path to the CSV file"""
-    if not os.path.exists(file_path):
-        raise FileNotFoundError("The specified file does not exist.")
-    if not os.path.isfile(file_path):
-        raise ValueError("The specified path is not a file.")
-    return file_path
-
-
-def __try_cast(value):
-    """Try to cast value to primary data types from python (int, float, bool)"""
-    for cast in (int, float):
-        with contextlib.suppress(ValueError):
-            return cast(value)
-    # Try to cast to bool
-    return value.lower() == 'true' if value.lower() in ('true', 'false') else value
+def __get_csv_header(csvreader):
+    """Get the header of a CSV file. If the header is empty, return an empty list"""
+    while True:
+        try:
+            header = next(csvreader)
+            if header:
+                break
+        except StopIteration:
+            return []
+    return header
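The new `__get_csv_header` helper is what lets the loader tolerate blank lines before the header row (the test fixture below starts with one). A behaviorally equivalent standalone sketch using only the standard library:

```python
from csv import reader
from io import StringIO

def get_csv_header(csvreader):
    """Return the first non-empty row, or [] if the input has none."""
    for row in csvreader:
        if row:
            return row
    return []

# csv.reader yields [] for blank lines, so the leading newline is skipped.
sample = StringIO("\nattr1,attr2\n1,2.0\n")
print(get_csv_header(reader(sample)))  # ['attr1', 'attr2']
```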
File renamed without changes.
6 changes: 4 additions & 2 deletions pystreamapi/loaders/__init__.py
@@ -1,5 +1,7 @@
-from pystreamapi.loaders.__csv_loader import csv
+from pystreamapi.loaders.__csv.__csv_loader import csv
+from pystreamapi.loaders.__json.__json_loader import json
 
 __all__ = [
-    'csv'
+    'csv',
+    'json'
 ]
Empty file.
46 changes: 46 additions & 0 deletions pystreamapi/loaders/__json/__json_loader.py
@@ -0,0 +1,46 @@
import json as jsonlib
from collections import namedtuple

from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
from pystreamapi.loaders.__loader_utils import LoaderUtils


def json(src: str, read_from_src=False) -> LazyFileIterable:
    """
    Loads JSON data from either a path or a string and converts it into a list of namedtuples.

    :param src: Either the path to a JSON file or a JSON string.
    :param read_from_src: If True, src is treated as a JSON string. If False, src is treated
        as a path to a JSON file.
    :return: A LazyFileIterable of namedtuples, where each namedtuple represents an object
        in the JSON.
    """
    if read_from_src:
        return LazyFileIterable(lambda: __load_json_string(src))
    path = LoaderUtils.validate_path(src)
    return LazyFileIterable(lambda: __load_json_file(path))


def __load_json_file(file_path):
    """Load a JSON file and convert it into a list of namedtuples"""
    # skipcq: PTC-W6004
    with open(file_path, mode='r', encoding='utf-8') as jsonfile:
        src = jsonfile.read()
    if src == '':
        return []
    data = jsonlib.loads(src, object_hook=__dict_to_namedtuple)
    return data


def __load_json_string(json_string):
    """Load JSON data from a string and convert it into a list of namedtuples"""
    return jsonlib.loads(json_string, object_hook=__dict_to_namedtuple)


def __dict_to_namedtuple(d, name='Item'):
    """Convert a dictionary recursively to a namedtuple; non-dict values pass through unchanged"""
    if isinstance(d, dict):
        fields = list(d.keys())
        Item = namedtuple(name, fields)
        return Item(**{k: __dict_to_namedtuple(v, k) for k, v in d.items()})
    return d
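Because `__dict_to_namedtuple` is passed to `json.loads` as `object_hook`, it runs on every decoded object from the innermost out, so nested dictionaries become nested namedtuples. A standalone sketch of the same hook (the sample JSON string is invented for illustration):

```python
import json as jsonlib
from collections import namedtuple

def dict_to_namedtuple(d, name='Item'):
    """Public twin of the private hook above."""
    if isinstance(d, dict):
        Item = namedtuple(name, list(d.keys()))
        return Item(**{k: dict_to_namedtuple(v, k) for k, v in d.items()})
    return d

data = jsonlib.loads('{"user": {"name": "Ada", "age": 36}}',
                     object_hook=dict_to_namedtuple)
print(data.user.name, data.user.age)  # Ada 36
```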
24 changes: 24 additions & 0 deletions pystreamapi/loaders/__loader_utils.py
@@ -0,0 +1,24 @@
import contextlib
import os


class LoaderUtils:
    """Utility class for loaders to validate paths and cast data"""

    @staticmethod
    def try_cast(value):
        """Try to cast value to Python's primitive data types (int, float, bool)"""
        for cast in (int, float):
            with contextlib.suppress(ValueError):
                return cast(value)
        # Fall back to bool for 'true'/'false'; otherwise return the string unchanged
        return value.lower() == 'true' if value.lower() in ('true', 'false') else value

    @staticmethod
    def validate_path(file_path: str):
        """Validate that the path exists and points to a file"""
        if not os.path.exists(file_path):
            raise FileNotFoundError("The specified file does not exist.")
        if not os.path.isfile(file_path):
            raise ValueError("The specified path is not a file.")
        return file_path
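`try_cast` tries int first, then float, then falls back to bool for the literals 'true'/'false', and otherwise returns the string unchanged. A quick demonstration, importing the private module directly the same way the loaders do:

```python
from pystreamapi.loaders.__loader_utils import LoaderUtils

for raw in ('42', '2.5', 'true', 'False', 'hello'):
    print(repr(LoaderUtils.try_cast(raw)))
# -> 42, 2.5, True, False, 'hello'
```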
3 changes: 0 additions & 3 deletions tests/assets/data.csv

This file was deleted.

2 changes: 0 additions & 2 deletions tests/assets/data2.csv

This file was deleted.

71 changes: 71 additions & 0 deletions tests/test_csv_loader.py
@@ -0,0 +1,71 @@
# pylint: disable=not-context-manager
from unittest import TestCase
from unittest.mock import patch, mock_open

from pystreamapi.loaders import csv

file_content = """
attr1,attr2
1,2.0
a,b
"""


class TestCSVLoader(TestCase):

    def test_csv_loader(self):
        with (patch('builtins.open', mock_open(read_data=file_content)),
              patch('os.path.exists', return_value=True),
              patch('os.path.isfile', return_value=True)):
            data = csv('path/to/data.csv')
            self.assertEqual(len(data), 2)
            self.assertEqual(data[0].attr1, 1)
            self.assertIsInstance(data[0].attr1, int)
            self.assertEqual(data[0].attr2, 2.0)
            self.assertIsInstance(data[0].attr2, float)
            self.assertEqual(data[1].attr1, 'a')
            self.assertIsInstance(data[1].attr1, str)

    def test_csv_loader_with_casting_disabled(self):
        with (patch('builtins.open', mock_open(read_data=file_content)),
              patch('os.path.exists', return_value=True),
              patch('os.path.isfile', return_value=True)):
            data = csv('path/to/data.csv', cast_types=False)
            self.assertEqual(len(data), 2)
            self.assertEqual(data[0].attr1, '1')
            self.assertIsInstance(data[0].attr1, str)
            self.assertEqual(data[0].attr2, '2.0')
            self.assertIsInstance(data[0].attr2, str)
            self.assertEqual(data[1].attr1, 'a')
            self.assertIsInstance(data[1].attr1, str)

    def test_csv_loader_is_iterable(self):
        with (patch('builtins.open', mock_open(read_data=file_content)),
              patch('os.path.exists', return_value=True),
              patch('os.path.isfile', return_value=True)):
            data = csv('path/to/data.csv')
            self.assertEqual(len(list(iter(data))), 2)

    def test_csv_loader_with_custom_delimiter(self):
        with (patch('builtins.open', mock_open(read_data=file_content.replace(",", ";"))),
              patch('os.path.exists', return_value=True),
              patch('os.path.isfile', return_value=True)):
            data = csv('path/to/data.csv', delimiter=';')
            self.assertEqual(len(data), 2)
            self.assertEqual(data[0].attr1, 1)
            self.assertIsInstance(data[0].attr1, int)

    def test_csv_loader_with_empty_file(self):
        with (patch('builtins.open', mock_open(read_data="")),
              patch('os.path.exists', return_value=True),
              patch('os.path.isfile', return_value=True)):
            data = csv('path/to/data.csv')
            self.assertEqual(len(data), 0)

    def test_csv_loader_with_invalid_path(self):
        with self.assertRaises(FileNotFoundError):
            csv('path/to/invalid.csv')

    def test_csv_loader_with_no_file(self):
        with self.assertRaises(ValueError):
            csv('./')
71 changes: 71 additions & 0 deletions tests/test_json_loader.py
@@ -0,0 +1,71 @@
# pylint: disable=not-context-manager
from json import JSONDecodeError
from unittest import TestCase
from unittest.mock import patch, mock_open

from pystreamapi.loaders import json

file_content = """
[
    {
        "attr1": 1,
        "attr2": 2.0
    },
    {
        "attr1": "a",
        "attr2": "b"
    }
]
"""


class TestJsonLoader(TestCase):

    def test_json_loader_from_file(self):
        with (patch('builtins.open', mock_open(read_data=file_content)),
              patch('os.path.exists', return_value=True),
              patch('os.path.isfile', return_value=True)):
            data = json('path/to/data.json')
            self.assertEqual(len(data), 2)
            self.assertEqual(data[0].attr1, 1)
            self.assertIsInstance(data[0].attr1, int)
            self.assertEqual(data[0].attr2, 2.0)
            self.assertIsInstance(data[0].attr2, float)
            self.assertEqual(data[1].attr1, 'a')
            self.assertIsInstance(data[1].attr1, str)

    def test_json_loader_is_iterable(self):
        with (patch('builtins.open', mock_open(read_data=file_content)),
              patch('os.path.exists', return_value=True),
              patch('os.path.isfile', return_value=True)):
            data = json('path/to/data.json')
            self.assertEqual(len(list(iter(data))), 2)

    def test_json_loader_with_empty_file(self):
        with (patch('builtins.open', mock_open(read_data="")),
              patch('os.path.exists', return_value=True),
              patch('os.path.isfile', return_value=True)):
            data = json('path/to/data.json')
            self.assertEqual(len(data), 0)

    def test_json_loader_with_invalid_path(self):
        with self.assertRaises(FileNotFoundError):
            json('path/to/invalid.json')

    def test_json_loader_with_no_file(self):
        with self.assertRaises(ValueError):
            json('./')

    def test_json_loader_from_string(self):
        data = json(file_content, read_from_src=True)
        self.assertEqual(len(data), 2)
        self.assertEqual(data[0].attr1, 1)
        self.assertIsInstance(data[0].attr1, int)
        self.assertEqual(data[0].attr2, 2.0)
        self.assertIsInstance(data[0].attr2, float)
        self.assertEqual(data[1].attr1, 'a')
        self.assertIsInstance(data[1].attr1, str)

    def test_json_loader_from_empty_string(self):
        with self.assertRaises(JSONDecodeError):
            self.assertEqual(len(json('', read_from_src=True)), 0)
53 changes: 0 additions & 53 deletions tests/test_loaders.py

This file was deleted.
