Skip to content

Commit

Permalink
Merge pull request #77 from PickwickSoft/feature/#70/data-loader-for-…
Browse files Browse the repository at this point in the history
…json

Feature/#70/data loader for json
  • Loading branch information
garlontas authored Sep 28, 2023
2 parents 775b687 + 561ac17 commit 31c8dea
Show file tree
Hide file tree
Showing 12 changed files with 248 additions and 87 deletions.
22 changes: 16 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,23 +213,33 @@ Stream.concat(Stream.of([1, 2]), Stream.of([3, 4]))

Creates a new Stream from multiple Streams. Order doesn't change.

## Use loaders: Load data from CSV files in just one line
## Use loaders: Load data from CSV and JSON files in just one line

PyStreamAPI offers a convenient way to load data from CSV files. Like that you can start processing your CSV right away without having to worry about reading and parsing the file.
PyStreamAPI offers a convenient way to load data from CSV and JSON files. This way, you can start processing your files right away without having to worry about reading and parsing them.

You can import the loader with:
You can import the loaders with:

```python
from pystreamapi.loaders import csv
from pystreamapi.loaders import csv, json
```
Now you can use the loader directly when creating your Stream:
Now you can use the loaders directly when creating your Stream:

For CSV:

```python
Stream.of(csv("data.csv", delimiter=";")) \
.map(lambda x: x.attr1) \
.for_each(print)
```
You can access the attributes of the CSV rows directly like you would with a normal object.

For JSON:
```python
Stream.of(json("data.json")) \
.map(lambda x: x.attr1) \
.for_each(print)
```

You can access the attributes of the data structures directly, just as you would with a normal object.

## API Reference
For a more detailed documentation view the docs on GitBook: [PyStreamAPI Docs](https://pystreamapi.pickwicksoft.org/)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import contextlib
import os
from collections import namedtuple
from csv import reader

from pystreamapi.loaders.__loader_utils import LoaderUtils
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable


Expand All @@ -17,7 +16,7 @@ def csv(file_path: str, cast_types=True, delimiter=',', encoding="utf-8") -> Laz
:param file_path: The path to the CSV file.
:param delimiter: The delimiter used in the CSV file.
"""
file_path = __validate_path(file_path)
file_path = LoaderUtils.validate_path(file_path)
return LazyFileIterable(lambda: __load_csv(file_path, cast_types, delimiter, encoding))


Expand All @@ -28,28 +27,24 @@ def __load_csv(file_path, cast, delimiter, encoding):
csvreader = reader(csvfile, delimiter=delimiter)

# Create a namedtuple type, casting the header values to int or float if possible
Row = namedtuple('Row', list(next(csvreader, [])))
header = __get_csv_header(csvreader)

mapper = __try_cast if cast else lambda x: x
Row = namedtuple('Row', list(header))

mapper = LoaderUtils.try_cast if cast else lambda x: x

# Process the data, casting values to int or float if possible
data = [Row(*[mapper(value) for value in row]) for row in csvreader]
return data


def __validate_path(file_path: str):
"""Validate the path to the CSV file"""
if not os.path.exists(file_path):
raise FileNotFoundError("The specified file does not exist.")
if not os.path.isfile(file_path):
raise ValueError("The specified path is not a file.")
return file_path


def __try_cast(value):
"""Try to cast value to primary data types from python (int, float, bool)"""
for cast in (int, float):
with contextlib.suppress(ValueError):
return cast(value)
# Try to cast to bool
return value.lower() == 'true' if value.lower() in ('true', 'false') else value
def __get_csv_header(csvreader):
    """Return the first non-empty row of a CSV reader, or [] if none exists.

    Leading blank rows are skipped so files that start with empty lines
    still yield a usable header.
    """
    for row in csvreader:
        if row:
            return row
    return []
File renamed without changes.
6 changes: 4 additions & 2 deletions pystreamapi/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from pystreamapi.loaders.__csv_loader import csv
from pystreamapi.loaders.__csv.__csv_loader import csv
from pystreamapi.loaders.__json.__json_loader import json

__all__ = [
'csv'
'csv',
'json'
]
Empty file.
46 changes: 46 additions & 0 deletions pystreamapi/loaders/__json/__json_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json as jsonlib
from collections import namedtuple

from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
from pystreamapi.loaders.__loader_utils import LoaderUtils


def json(src: str, read_from_src=False) -> LazyFileIterable:
    """
    Lazily load JSON data from a file path or a raw JSON string.

    :param src: Either the path to a JSON file or a JSON string.
    :param read_from_src: If True, src is treated as a JSON string. If False,
        src is treated as a path to a JSON file.
    :return: A LazyFileIterable producing namedtuples that mirror the JSON
        structure; loading is deferred until the iterable is consumed.
    """
    if not read_from_src:
        # Fail fast on bad paths even though the actual read is lazy.
        valid_path = LoaderUtils.validate_path(src)
        return LazyFileIterable(lambda: __load_json_file(valid_path))
    return LazyFileIterable(lambda: __load_json_string(src))


def __load_json_file(file_path):
    """Load a JSON file and convert it into a structure of namedtuples.

    An empty or whitespace-only file yields an empty list instead of raising
    JSONDecodeError, so blank data files behave like empty streams.
    (The original check ``src == ''`` missed whitespace-only files.)
    """
    # skipcq: PTC-W6004
    with open(file_path, mode='r', encoding='utf-8') as jsonfile:
        src = jsonfile.read()
    if not src.strip():
        return []
    return jsonlib.loads(src, object_hook=__dict_to_namedtuple)


def __load_json_string(json_string):
    """Parse a JSON string into a structure of nested namedtuples."""
    parsed = jsonlib.loads(json_string, object_hook=__dict_to_namedtuple)
    return parsed


def __dict_to_namedtuple(d, name='Item'):
    """Recursively convert a dict into a namedtuple; non-dicts pass through.

    Nested dict values are converted too, each named after its parent key.
    """
    if not isinstance(d, dict):
        return d
    Item = namedtuple(name, d.keys())
    # Field order follows dict insertion order, so positional args line up.
    return Item(*(__dict_to_namedtuple(value, key) for key, value in d.items()))
24 changes: 24 additions & 0 deletions pystreamapi/loaders/__loader_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import contextlib
import os


class LoaderUtils:
    """Shared helpers for data loaders: path validation and type casting."""

    @staticmethod
    def try_cast(value):
        """Cast a string to int, float, or bool if possible, else return it unchanged."""
        for converter in (int, float):
            try:
                return converter(value)
            except ValueError:
                continue
        # Fall back to a case-insensitive boolean check
        lowered = value.lower()
        if lowered in ('true', 'false'):
            return lowered == 'true'
        return value

    @staticmethod
    def validate_path(file_path: str):
        """Ensure file_path names an existing regular file and return it.

        :raises FileNotFoundError: if the path does not exist.
        :raises ValueError: if the path exists but is not a file.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError("The specified file does not exist.")
        if not os.path.isfile(file_path):
            raise ValueError("The specified path is not a file.")
        return file_path
3 changes: 0 additions & 3 deletions tests/assets/data.csv

This file was deleted.

2 changes: 0 additions & 2 deletions tests/assets/data2.csv

This file was deleted.

71 changes: 71 additions & 0 deletions tests/test_csv_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# pylint: disable=not-context-manager
from unittest import TestCase
from unittest.mock import patch, mock_open

from pystreamapi.loaders import csv

# Sample CSV content shared by all tests: a header row followed by
# one fully numeric row and one purely textual row.
file_content = """
attr1,attr2
1,2.0
a,b
"""


class TestCSVLoader(TestCase):
    """Tests for the csv loader with file-system access fully mocked."""

    def test_csv_loader(self):
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv')
            self.assertEqual(len(rows), 2)
            self.assertEqual(rows[0].attr1, 1)
            self.assertIsInstance(rows[0].attr1, int)
            self.assertEqual(rows[0].attr2, 2.0)
            self.assertIsInstance(rows[0].attr2, float)
            self.assertEqual(rows[1].attr1, 'a')
            self.assertIsInstance(rows[1].attr1, str)

    def test_csv_loader_with_casting_disabled(self):
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv', cast_types=False)
            self.assertEqual(len(rows), 2)
            # Everything should stay a raw string when casting is off
            self.assertEqual(rows[0].attr1, '1')
            self.assertIsInstance(rows[0].attr1, str)
            self.assertEqual(rows[0].attr2, '2.0')
            self.assertIsInstance(rows[0].attr2, str)
            self.assertEqual(rows[1].attr1, 'a')
            self.assertIsInstance(rows[1].attr1, str)

    def test_csv_loader_is_iterable(self):
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv')
            self.assertEqual(len(list(iter(rows))), 2)

    def test_csv_loader_with_custom_delimiter(self):
        semicolon_content = file_content.replace(",", ";")
        with patch('builtins.open', mock_open(read_data=semicolon_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv', delimiter=';')
            self.assertEqual(len(rows), 2)
            self.assertEqual(rows[0].attr1, 1)
            self.assertIsInstance(rows[0].attr1, int)

    def test_csv_loader_with_empty_file(self):
        with patch('builtins.open', mock_open(read_data="")), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv')
            self.assertEqual(len(rows), 0)

    def test_csv_loader_with_invalid_path(self):
        # Path validation is eager, so the error is raised at call time
        self.assertRaises(FileNotFoundError, csv, 'path/to/invalid.csv')

    def test_csv_loader_with_no_file(self):
        self.assertRaises(ValueError, csv, './')
71 changes: 71 additions & 0 deletions tests/test_json_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# pylint: disable=not-context-manager
from json import JSONDecodeError
from unittest import TestCase
from unittest.mock import patch, mock_open

from pystreamapi.loaders import json

# Sample JSON payload shared by all tests: an array of two objects,
# the first with numeric attributes and the second with string attributes.
file_content = """
[
{
"attr1": 1,
"attr2": 2.0
},
{
"attr1": "a",
"attr2": "b"
}
]
"""


class TestJsonLoader(TestCase):
    """Tests for the json loader with file-system access fully mocked."""

    def test_json_loader_from_file(self):
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            items = json('path/to/data.json')
            self.assertEqual(len(items), 2)
            self.assertEqual(items[0].attr1, 1)
            self.assertIsInstance(items[0].attr1, int)
            self.assertEqual(items[0].attr2, 2.0)
            self.assertIsInstance(items[0].attr2, float)
            self.assertEqual(items[1].attr1, 'a')
            self.assertIsInstance(items[1].attr1, str)

    def test_json_loader_is_iterable(self):
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            items = json('path/to/data.json')
            self.assertEqual(len(list(iter(items))), 2)

    def test_json_loader_with_empty_file(self):
        with patch('builtins.open', mock_open(read_data="")), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            items = json('path/to/data.json')
            self.assertEqual(len(items), 0)

    def test_json_loader_with_invalid_path(self):
        # Path validation is eager, so the error is raised at call time
        self.assertRaises(FileNotFoundError, json, 'path/to/invalid.json')

    def test_json_loader_with_no_file(self):
        self.assertRaises(ValueError, json, './')

    def test_json_loader_from_string(self):
        items = json(file_content, read_from_src=True)
        self.assertEqual(len(items), 2)
        self.assertEqual(items[0].attr1, 1)
        self.assertIsInstance(items[0].attr1, int)
        self.assertEqual(items[0].attr2, 2.0)
        self.assertIsInstance(items[0].attr2, float)
        self.assertEqual(items[1].attr1, 'a')
        self.assertIsInstance(items[1].attr1, str)

    def test_json_loader_from_empty_string(self):
        # Loading is lazy: the decode error surfaces only when the
        # iterable is materialized via len()
        with self.assertRaises(JSONDecodeError):
            self.assertEqual(len(json('', read_from_src=True)), 0)
53 changes: 0 additions & 53 deletions tests/test_loaders.py

This file was deleted.

0 comments on commit 31c8dea

Please sign in to comment.