Skip to content

Commit 31c8dea

Browse files
authored
Merge pull request #77 from PickwickSoft/feature/#70/data-loader-for-json
Feature/#70/data loader for json
2 parents 775b687 + 561ac17 commit 31c8dea

12 files changed

Lines changed: 248 additions & 87 deletions

File tree

README.md

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -213,23 +213,33 @@ Stream.concat(Stream.of([1, 2]), Stream.of([3, 4]))
213213

214214
Creates a new Stream from multiple Streams. Order doesn't change.
215215

216-
## Use loaders: Load data from CSV files in just one line
216+
## Use loaders: Load data from CSV and JSON files in just one line
217217

218-
PyStreamAPI offers a convenient way to load data from CSV files. Like that you can start processing your CSV right away without having to worry about reading and parsing the file.
218+
PyStreamAPI offers a convenient way to load data from CSV and JSON files. That way you can start processing your files right away, without having to worry about reading and parsing them.
219219

220-
You can import the loader with:
220+
You can import the loaders with:
221221

222222
```python
223-
from pystreamapi.loaders import csv
223+
from pystreamapi.loaders import csv, json
224224
```
225-
Now you can use the loader directly when creating your Stream:
225+
Now you can use the loaders directly when creating your Stream:
226+
227+
For CSV:
226228

227229
```python
228230
Stream.of(csv("data.csv", delimiter=";")) \
229231
.map(lambda x: x.attr1) \
230232
.for_each(print)
231233
```
232-
You can access the attributes of the CSV rows directly like you would with a normal object.
234+
235+
For JSON:
236+
```python
237+
Stream.of(json("data.json")) \
238+
.map(lambda x: x.attr1) \
239+
.for_each(print)
240+
```
241+
242+
You can access the attributes of the loaded data structures directly, just as you would with a normal object.
233243

234244
## API Reference
235245
For a more detailed documentation view the docs on GitBook: [PyStreamAPI Docs](https://pystreamapi.pickwicksoft.org/)
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
import contextlib
2-
import os
31
from collections import namedtuple
42
from csv import reader
53

4+
from pystreamapi.loaders.__loader_utils import LoaderUtils
65
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
76

87

@@ -17,7 +16,7 @@ def csv(file_path: str, cast_types=True, delimiter=',', encoding="utf-8") -> Laz
1716
:param file_path: The path to the CSV file.
1817
:param delimiter: The delimiter used in the CSV file.
1918
"""
20-
file_path = __validate_path(file_path)
19+
file_path = LoaderUtils.validate_path(file_path)
2120
return LazyFileIterable(lambda: __load_csv(file_path, cast_types, delimiter, encoding))
2221

2322

@@ -28,28 +27,24 @@ def __load_csv(file_path, cast, delimiter, encoding):
2827
csvreader = reader(csvfile, delimiter=delimiter)
2928

3029
# Create a namedtuple type, casting the header values to int or float if possible
31-
Row = namedtuple('Row', list(next(csvreader, [])))
30+
header = __get_csv_header(csvreader)
3231

33-
mapper = __try_cast if cast else lambda x: x
32+
Row = namedtuple('Row', list(header))
33+
34+
mapper = LoaderUtils.try_cast if cast else lambda x: x
3435

3536
# Process the data, casting values to int or float if possible
3637
data = [Row(*[mapper(value) for value in row]) for row in csvreader]
3738
return data
3839

3940

40-
def __validate_path(file_path: str):
41-
"""Validate the path to the CSV file"""
42-
if not os.path.exists(file_path):
43-
raise FileNotFoundError("The specified file does not exist.")
44-
if not os.path.isfile(file_path):
45-
raise ValueError("The specified path is not a file.")
46-
return file_path
47-
48-
49-
def __try_cast(value):
50-
"""Try to cast value to primary data types from python (int, float, bool)"""
51-
for cast in (int, float):
52-
with contextlib.suppress(ValueError):
53-
return cast(value)
54-
# Try to cast to bool
55-
return value.lower() == 'true' if value.lower() in ('true', 'false') else value
41+
def __get_csv_header(csvreader):
    """Return the first non-empty row of the CSV reader, or [] if there is none."""
    # Blank leading rows are skipped so an empty file yields no header at all.
    for row in csvreader:
        if row:
            return row
    return []

pystreamapi/loaders/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
from pystreamapi.loaders.__csv_loader import csv
1+
from pystreamapi.loaders.__csv.__csv_loader import csv
2+
from pystreamapi.loaders.__json.__json_loader import json
23

34
__all__ = [
4-
'csv'
5+
'csv',
6+
'json'
57
]

pystreamapi/loaders/__json/__init__.py

Whitespace-only changes.
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import json as jsonlib
2+
from collections import namedtuple
3+
4+
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
5+
from pystreamapi.loaders.__loader_utils import LoaderUtils
6+
7+
8+
def json(src: str, read_from_src=False) -> LazyFileIterable:
    """
    Load JSON data lazily, either from a file path or directly from a string,
    converting each JSON object into a namedtuple.

    :param src: Either the path to a JSON file or a JSON string.
    :param read_from_src: If True, src is treated as a JSON string. If False,
        src is treated as a path to a JSON file.
    :return: A LazyFileIterable producing namedtuples, one per JSON object.
    """
    if not read_from_src:
        # Treat src as a path: fail fast on a bad path, parse lazily later.
        file_path = LoaderUtils.validate_path(src)
        return LazyFileIterable(lambda: __load_json_file(file_path))
    return LazyFileIterable(lambda: __load_json_string(src))
22+
23+
24+
def __load_json_file(file_path):
    """Load a JSON file and convert it into a list of namedtuples.

    An empty file yields an empty list instead of raising a decode error.
    """
    # skipcq: PTC-W6004
    with open(file_path, mode='r', encoding='utf-8') as jsonfile:
        src = jsonfile.read()
        if src == '':
            return []
        return jsonlib.loads(src, object_hook=__dict_to_namedtuple)


def __load_json_string(json_string):
    """Load JSON data from a string and convert it into a list of namedtuples.

    Fix: an empty string now yields an empty list, matching the behavior of
    __load_json_file for an empty file (previously raised JSONDecodeError).
    """
    if json_string == '':
        return []
    return jsonlib.loads(json_string, object_hook=__dict_to_namedtuple)


def __dict_to_namedtuple(d, name='Item'):
    """Convert a dict to a namedtuple named *name*; non-dict values pass through.

    NOTE: as a json object_hook this runs bottom-up, so nested dicts are
    already converted when it sees the enclosing dict; the explicit recursion
    additionally covers direct calls with nested plain dicts.
    """
    if isinstance(d, dict):
        fields = list(d.keys())
        Item = namedtuple(name, fields)
        return Item(**{k: __dict_to_namedtuple(v, k) for k, v in d.items()})
    return d
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import contextlib
2+
import os
3+
4+
5+
class LoaderUtils:
    """Shared helpers for the data loaders: path validation and type casting."""

    @staticmethod
    def try_cast(value):
        """Cast a string to int, float, or bool where possible; otherwise return it unchanged."""
        for converter in (int, float):
            try:
                return converter(value)
            except ValueError:
                continue
        # Fall back to a boolean for the literals 'true'/'false' (any casing)
        lowered = value.lower()
        if lowered in ('true', 'false'):
            return lowered == 'true'
        return value

    @staticmethod
    def validate_path(file_path: str):
        """Ensure *file_path* names an existing regular file and return it.

        :raises FileNotFoundError: if nothing exists at file_path.
        :raises ValueError: if file_path exists but is not a regular file.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError("The specified file does not exist.")
        if not os.path.isfile(file_path):
            raise ValueError("The specified path is not a file.")
        return file_path

tests/assets/data.csv

Lines changed: 0 additions & 3 deletions
This file was deleted.

tests/assets/data2.csv

Lines changed: 0 additions & 2 deletions
This file was deleted.

tests/test_csv_loader.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# pylint: disable=not-context-manager
2+
from unittest import TestCase
3+
from unittest.mock import patch, mock_open
4+
5+
from pystreamapi.loaders import csv
6+
7+
file_content = """
8+
attr1,attr2
9+
1,2.0
10+
a,b
11+
"""
12+
13+
14+
class TestCSVLoader(TestCase):
    """Tests for the CSV loader: type casting, delimiters, and path validation."""

    def test_csv_loader(self):
        # Values are cast to int/float where possible, left as str otherwise
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv')
            self.assertEqual(len(rows), 2)
            self.assertEqual(rows[0].attr1, 1)
            self.assertIsInstance(rows[0].attr1, int)
            self.assertEqual(rows[0].attr2, 2.0)
            self.assertIsInstance(rows[0].attr2, float)
            self.assertEqual(rows[1].attr1, 'a')
            self.assertIsInstance(rows[1].attr1, str)

    def test_csv_loader_with_casting_disabled(self):
        # With cast_types=False every cell stays a raw string
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv', cast_types=False)
            self.assertEqual(len(rows), 2)
            self.assertEqual(rows[0].attr1, '1')
            self.assertIsInstance(rows[0].attr1, str)
            self.assertEqual(rows[0].attr2, '2.0')
            self.assertIsInstance(rows[0].attr2, str)
            self.assertEqual(rows[1].attr1, 'a')
            self.assertIsInstance(rows[1].attr1, str)

    def test_csv_loader_is_iterable(self):
        # The lazy result must support the iterator protocol
        with patch('builtins.open', mock_open(read_data=file_content)), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv')
            self.assertEqual(len(list(iter(rows))), 2)

    def test_csv_loader_with_custom_delimiter(self):
        # Same fixture, but semicolon-separated
        with patch('builtins.open', mock_open(read_data=file_content.replace(",", ";"))), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv', delimiter=';')
            self.assertEqual(len(rows), 2)
            self.assertEqual(rows[0].attr1, 1)
            self.assertIsInstance(rows[0].attr1, int)

    def test_csv_loader_with_empty_file(self):
        # An empty file produces an empty result, not an error
        with patch('builtins.open', mock_open(read_data="")), \
                patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True):
            rows = csv('path/to/data.csv')
            self.assertEqual(len(rows), 0)

    def test_csv_loader_with_invalid_path(self):
        # A nonexistent path is rejected eagerly
        with self.assertRaises(FileNotFoundError):
            csv('path/to/invalid.csv')

    def test_csv_loader_with_no_file(self):
        # A directory path is rejected with ValueError
        with self.assertRaises(ValueError):
            csv('./')

0 commit comments

Comments
 (0)