Skip to content

Commit

Permalink
Merge pull request #77 from PickwickSoft/feature/#70/data-loader-for-…
Browse files Browse the repository at this point in the history
…json

Feature/#70/data loader for json
  • Loading branch information
garlontas authored Sep 28, 2023
2 parents 775b687 + 561ac17 commit 31c8dea
Show file tree
Hide file tree
Showing 12 changed files with 248 additions and 87 deletions.
22 changes: 16 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,23 +213,33 @@ Stream.concat(Stream.of([1, 2]), Stream.of([3, 4]))

Creates a new Stream from multiple Streams. Order doesn't change.

## Use loaders: Load data from CSV files in just one line
## Use loaders: Load data from CSV and JSON files in just one line

PyStreamAPI offers a convenient way to load data from CSV files. Like that you can start processing your CSV right away without having to worry about reading and parsing the file.
PyStreamAPI offers a convenient way to load data from CSV and JSON files. This way, you can start processing your files right away without having to worry about reading and parsing them.

You can import the loader with:
You can import the loaders with:

```python
from pystreamapi.loaders import csv
from pystreamapi.loaders import csv, json
```
Now you can use the loader directly when creating your Stream:
Now you can use the loaders directly when creating your Stream:

For CSV:

```python
Stream.of(csv("data.csv", delimiter=";")) \
.map(lambda x: x.attr1) \
.for_each(print)
```
You can access the attributes of the CSV rows directly like you would with a normal object.

For JSON:
```python
Stream.of(json("data.json")) \
.map(lambda x: x.attr1) \
.for_each(print)
```

You can access the attributes of the data structures directly, just as you would with a normal object.

## API Reference
For a more detailed documentation view the docs on GitBook: [PyStreamAPI Docs](https://pystreamapi.pickwicksoft.org/)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import contextlib
import os
from collections import namedtuple
from csv import reader

from pystreamapi.loaders.__loader_utils import LoaderUtils
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable


Expand All @@ -17,7 +16,7 @@ def csv(file_path: str, cast_types=True, delimiter=',', encoding="utf-8") -> Laz
:param file_path: The path to the CSV file.
:param delimiter: The delimiter used in the CSV file.
"""
file_path = __validate_path(file_path)
file_path = LoaderUtils.validate_path(file_path)
return LazyFileIterable(lambda: __load_csv(file_path, cast_types, delimiter, encoding))


Expand All @@ -28,28 +27,24 @@ def __load_csv(file_path, cast, delimiter, encoding):
csvreader = reader(csvfile, delimiter=delimiter)

# Create a namedtuple type, casting the header values to int or float if possible
Row = namedtuple('Row', list(next(csvreader, [])))
header = __get_csv_header(csvreader)

mapper = __try_cast if cast else lambda x: x
Row = namedtuple('Row', list(header))

mapper = LoaderUtils.try_cast if cast else lambda x: x

# Process the data, casting values to int or float if possible
data = [Row(*[mapper(value) for value in row]) for row in csvreader]
return data


def __validate_path(file_path: str):
"""Validate the path to the CSV file"""
if not os.path.exists(file_path):
raise FileNotFoundError("The specified file does not exist.")
if not os.path.isfile(file_path):
raise ValueError("The specified path is not a file.")
return file_path


def __try_cast(value):
"""Try to cast value to primary data types from python (int, float, bool)"""
for cast in (int, float):
with contextlib.suppress(ValueError):
return cast(value)
# Try to cast to bool
return value.lower() == 'true' if value.lower() in ('true', 'false') else value
def __get_csv_header(csvreader):
    """Return the first non-empty row of a CSV reader, or [] if none exists.

    Leading blank rows are skipped so files that start with empty lines
    still yield a usable header.
    """
    for row in csvreader:
        if row:
            return row
    return []
File renamed without changes.
6 changes: 4 additions & 2 deletions pystreamapi/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from pystreamapi.loaders.__csv_loader import csv
from pystreamapi.loaders.__csv.__csv_loader import csv
from pystreamapi.loaders.__json.__json_loader import json

__all__ = [
'csv'
'csv',
'json'
]
Empty file.
46 changes: 46 additions & 0 deletions pystreamapi/loaders/__json/__json_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json as jsonlib
from collections import namedtuple

from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
from pystreamapi.loaders.__loader_utils import LoaderUtils


def json(src: str, read_from_src=False) -> LazyFileIterable:
    """
    Lazily load JSON data from a file path or a raw JSON string.

    :param src: Either the path to a JSON file or a JSON string.
    :param read_from_src: If True, src is treated as a JSON string. If False,
        src is treated as a path to a JSON file.
    :return: A LazyFileIterable producing namedtuples that mirror the JSON
        structure; loading is deferred until the iterable is consumed.
    """
    if not read_from_src:
        # Fail fast on bad paths even though the actual read is lazy.
        valid_path = LoaderUtils.validate_path(src)
        return LazyFileIterable(lambda: __load_json_file(valid_path))
    return LazyFileIterable(lambda: __load_json_string(src))


def __load_json_file(file_path):
    """Load a JSON file and convert it into a structure of namedtuples.

    An empty or whitespace-only file yields an empty list instead of raising
    JSONDecodeError, so blank data files behave like empty streams.
    (The original check ``src == ''`` missed whitespace-only files.)
    """
    # skipcq: PTC-W6004
    with open(file_path, mode='r', encoding='utf-8') as jsonfile:
        src = jsonfile.read()
    if not src.strip():
        return []
    return jsonlib.loads(src, object_hook=__dict_to_namedtuple)


def __load_json_string(json_string):
    """Parse a JSON string into a structure of nested namedtuples."""
    parsed = jsonlib.loads(json_string, object_hook=__dict_to_namedtuple)
    return parsed


def __dict_to_namedtuple(d, name='Item'):
    """Recursively convert a dict into a namedtuple; non-dicts pass through.

    Nested dict values are converted too, each named after its parent key.
    """
    if not isinstance(d, dict):
        return d
    Item = namedtuple(name, d.keys())
    # Field order follows dict insertion order, so positional args line up.
    return Item(*(__dict_to_namedtuple(value, key) for key, value in d.items()))
24 changes: 24 additions & 0 deletions pystreamapi/loaders/__loader_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import contextlib
import os


class LoaderUtils:
    """Shared helpers for data loaders: path validation and type casting."""

    @staticmethod
    def try_cast(value):
        """Cast a string to int, float, or bool if possible, else return it unchanged."""
        for converter in (int, float):
            try:
                return converter(value)
            except ValueError:
                continue
        # Fall back to a case-insensitive boolean check
        lowered = value.lower()
        if lowered in ('true', 'false'):
            return lowered == 'true'
        return value

    @staticmethod
    def validate_path(file_path: str):
        """Ensure file_path names an existing regular file and return it.

        :raises FileNotFoundError: if the path does not exist.
        :raises ValueError: if the path exists but is not a file.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError("The specified file does not exist.")
        if not os.path.isfile(file_path):
            raise ValueError("The specified path is not a file.")
        return file_path
3 changes: 0 additions & 3 deletions tests/assets/data.csv

This file was deleted.

2 changes: 0 additions & 2 deletions tests/assets/data2.csv

This file was deleted.

71 changes: 71 additions & 0 deletions tests/test_csv_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# pylint: disable=not-context-manager
from unittest import TestCase
from unittest.mock import patch, mock_open

from pystreamapi.loaders import csv

# Sample CSV content shared by all tests: a header row followed by
# one fully numeric row and one purely textual row.
file_content = """
attr1,attr2
1,2.0
a,b
"""


class TestCSVLoader(TestCase):
    """Tests for the csv loader with file-system access fully mocked."""

    def test_csv_loader(self):
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv')
            self.assertEqual(len(rows), 2)
            self.assertEqual(rows[0].attr1, 1)
            self.assertIsInstance(rows[0].attr1, int)
            self.assertEqual(rows[0].attr2, 2.0)
            self.assertIsInstance(rows[0].attr2, float)
            self.assertEqual(rows[1].attr1, 'a')
            self.assertIsInstance(rows[1].attr1, str)

    def test_csv_loader_with_casting_disabled(self):
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv', cast_types=False)
            self.assertEqual(len(rows), 2)
            # Everything should stay a raw string when casting is off
            self.assertEqual(rows[0].attr1, '1')
            self.assertIsInstance(rows[0].attr1, str)
            self.assertEqual(rows[0].attr2, '2.0')
            self.assertIsInstance(rows[0].attr2, str)
            self.assertEqual(rows[1].attr1, 'a')
            self.assertIsInstance(rows[1].attr1, str)

    def test_csv_loader_is_iterable(self):
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv')
            self.assertEqual(len(list(iter(rows))), 2)

    def test_csv_loader_with_custom_delimiter(self):
        semicolon_content = file_content.replace(",", ";")
        with patch('builtins.open', mock_open(read_data=semicolon_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv', delimiter=';')
            self.assertEqual(len(rows), 2)
            self.assertEqual(rows[0].attr1, 1)
            self.assertIsInstance(rows[0].attr1, int)

    def test_csv_loader_with_empty_file(self):
        with patch('builtins.open', mock_open(read_data="")), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv')
            self.assertEqual(len(rows), 0)

    def test_csv_loader_with_invalid_path(self):
        # Path validation is eager, so the error is raised at call time
        self.assertRaises(FileNotFoundError, csv, 'path/to/invalid.csv')

    def test_csv_loader_with_no_file(self):
        self.assertRaises(ValueError, csv, './')
71 changes: 71 additions & 0 deletions tests/test_json_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# pylint: disable=not-context-manager
from json import JSONDecodeError
from unittest import TestCase
from unittest.mock import patch, mock_open

from pystreamapi.loaders import json

# Sample JSON payload shared by all tests: an array of two objects,
# the first with numeric attributes and the second with string attributes.
file_content = """
[
{
"attr1": 1,
"attr2": 2.0
},
{
"attr1": "a",
"attr2": "b"
}
]
"""


class TestJsonLoader(TestCase):
    """Tests for the json loader with file-system access fully mocked."""

    def test_json_loader_from_file(self):
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            items = json('path/to/data.json')
            self.assertEqual(len(items), 2)
            self.assertEqual(items[0].attr1, 1)
            self.assertIsInstance(items[0].attr1, int)
            self.assertEqual(items[0].attr2, 2.0)
            self.assertIsInstance(items[0].attr2, float)
            self.assertEqual(items[1].attr1, 'a')
            self.assertIsInstance(items[1].attr1, str)

    def test_json_loader_is_iterable(self):
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            items = json('path/to/data.json')
            self.assertEqual(len(list(iter(items))), 2)

    def test_json_loader_with_empty_file(self):
        with patch('builtins.open', mock_open(read_data="")), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            items = json('path/to/data.json')
            self.assertEqual(len(items), 0)

    def test_json_loader_with_invalid_path(self):
        # Path validation is eager, so the error is raised at call time
        self.assertRaises(FileNotFoundError, json, 'path/to/invalid.json')

    def test_json_loader_with_no_file(self):
        self.assertRaises(ValueError, json, './')

    def test_json_loader_from_string(self):
        items = json(file_content, read_from_src=True)
        self.assertEqual(len(items), 2)
        self.assertEqual(items[0].attr1, 1)
        self.assertIsInstance(items[0].attr1, int)
        self.assertEqual(items[0].attr2, 2.0)
        self.assertIsInstance(items[0].attr2, float)
        self.assertEqual(items[1].attr1, 'a')
        self.assertIsInstance(items[1].attr1, str)

    def test_json_loader_from_empty_string(self):
        # Loading is lazy: the decode error surfaces only when the
        # iterable is materialized via len()
        with self.assertRaises(JSONDecodeError):
            self.assertEqual(len(json('', read_from_src=True)), 0)
53 changes: 0 additions & 53 deletions tests/test_loaders.py

This file was deleted.

0 comments on commit 31c8dea

Please sign in to comment.