add format() and format_iterate()

ashafer01 · ashafer01 · commit f6025e4850d9 · 2024-04-09T21:19:57.000-07:00
diff --git a/REFERENCE.md b/REFERENCE.md
@@ -30,10 +30,19 @@ obtain the value at the path
 * if default is passed, return it if the leaf value was not found
 * if default is not passed and the leaf value is not found, propagate the LookupError
 
+### function `format()`
+
+```
+format(obj: datapath.types.Collection, format_string: str) -> str
+```
+
+Given a standard Python format string with {} notation, interpret the identifiers
+as a datapath within `obj`, and apply standard formatting language to the result.
+
 ### function `iterate()`
 
 ```
-iterate(obj: datapath.types.Collection, path: str, default: Any = NO_DEFAULT) -> Generator[tuple[str, Any], NoneType, NoneType]
+iterate(obj: datapath.types.Collection, path: str, default: Any = NO_DEFAULT) -> Generator[datapath._base.iterate_result[str, Any], NoneType, NoneType]
 ```
 
 yield entries from a collection using an iterable path -- that is, one containing one or more
@@ -60,6 +69,65 @@ Examples:
 * `test1.*test*`    # "test1" in a root dict must be a dict, yield each key that contains "test"
 * `test1[].*`       # combining dict and list iteration works
 
+### function `format_iterate()`
+
+```
+format_iterate(obj: datapath.types.Collection, format_string: str, default: Any = NO_DEFAULT, iter_func: Callable = <class 'zip'>) -> Generator[str, NoneType, NoneType]
+```
+
+Given a standard Python format string with {} notation, interpret the identifiers as iterable datapaths within `obj`.
+One value will be consumed from each iterable path and formatted using the standard language.
+
+`default` is passed through to all `iterate()` calls, which in turn passes it through to the leaf `get()` calls.
+There is no way to use a different default value for different iterable datapaths in replacement fields.
+
+By default, the values from the iterators will be obtained with the
+[`zip()` builtin](https://docs.python.org/3/library/functions.html#zip) with `strict=False`, meaning if the different
+iterable format strings produce a different number of results, iteration will stop when the shortest one stops, and
+the values will all correspond to the same index from each `iterate()` result.
+
+Example:
+
+```
+>>> test_obj = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}]
+>>> for text in format_iterate(test_obj, 'a {[].a} b {[].b}'):
+...     print(text)
+...
+a 1 b 2
+a 3 b 4
+a 5 b 6
+
+```
+
+If different behavior is desired, a different function can be passed:
+
+`iter_func` must have approximately the same basic signature as `builtins.zip()`,
+[`itertools.product()`](https://docs.python.org/3/library/itertools.html#itertools.product),
+and [`itertools.zip_longest()`](https://docs.python.org/3/library/itertools.html#itertools.zip_longest).
+
+More specifically, it must accept an arbitrary number of Iterables (specifically the Generator
+returned by `datapath.iterate()`), and yield a Sequence with a value from each one in order when the return
+value is iterated.
+
+You can supply extra keyword arguments to any function with this signature by utilizing
+[`functools.partial()`](https://docs.python.org/3/library/functools.html#functools.partial). Passing positional
+arguments to a partial will probably not work as expected, and is not recommended.
+
+Example with a partial and `itertools.zip_longest()`:
+
+```
+>>> import functools, itertools
+>>> test_obj = {'a': list('123'), 'b': list('4567')}
+>>> for text in format_iterate(test_obj, 'a {a[]} b {b[]}',
+...                            iter_func=functools.partial(itertools.zip_longest, fillvalue='X')):
+...     print(text)
+a 1 b 4
+a 2 b 5
+a 3 b 6
+a X b 7
+
+```
+
 ### function `put()`
 
 ```
@@ -144,6 +212,7 @@ Example:
 ```
 >>> join(['a', 'b', 5])
 'a.b[5]'
+
 ```
 
 ### function `leaf()`
diff --git a/build.sh b/build.sh
@@ -15,6 +15,8 @@ python3 -m unittest -v
 pylint -E datapath setup.py docs.py
 python3 docs.py
 
+rm -rf dist/*
+
 python3 setup.py sdist
 python3 setup.py bdist_wheel
 
diff --git a/datapath/__init__.py b/datapath/__init__.py
@@ -19,6 +19,7 @@
     complete_collection,
     UnfoldProcessor,
 )
+from .format import format, format_iterate
 from .types import (
     DatapathError,
     ValidationError,
@@ -28,7 +29,9 @@
 
 __all__ = [
     'get',
+    'format',
     'iterate',
+    'format_iterate',
     'put',
     'delete',
     'discard',
diff --git a/datapath/_base.py b/datapath/_base.py
@@ -123,6 +123,7 @@ def join(split_path: Iterable[Key]) -> str:
     ```
     >>> join(['a', 'b', 5])
     'a.b[5]'
+
     ```
     """
     path = ''
@@ -187,9 +188,13 @@ def _get(obj: Collection, split_path: SplitPath, default: Any = NO_DEFAULT) -> A
         return default
 
 
+class iterate_result(tuple):
+    pass
+
+
 def iterate(obj: Collection,
             path: str,
-            default: Any = NO_DEFAULT) -> Generator[tuple[str, Any], None, None]:
+            default: Any = NO_DEFAULT) -> Generator[iterate_result[str, Any], None, None]:
     """
     yield entries from a collection using an iterable path -- that is, one containing one or more
     sets of empty square brackets (`[]`) or a key with a `*` (`*`/`wild*cards*`/etc.)
@@ -222,7 +227,7 @@ def iterate(obj: Collection,
 def _iterate(obj: Collection,
              split_path: SplitPath,
              base_path: SplitPath,
-             default: Any) -> Generator[tuple[str, Any], None, None]:
+             default: Any) -> Generator[iterate_result[str, Any], None, None]:
     """recursive core of iterate()"""
     if not isinstance(obj, _collection_types):
         raise ValidationError(f'{join(base_path + split_path)}: must be list/dict')
@@ -238,7 +243,7 @@ def _iterate(obj: Collection,
 
     if iter_index is None:
         # no iteration points found, just need to get()
-        yield join(base_path + split_path), _get(obj, split_path, default)
+        yield iterate_result((join(base_path + split_path), _get(obj, split_path, default)))
         return
 
     # find the collection referred to by the portion of the path before the first iteration point
@@ -264,7 +269,7 @@ def _iterate(obj: Collection,
             yield from _iterate(element, after_split_path, key_split_path, default)
         else:
             # if there is no path after, then this element is what we're after
-            yield join(key_split_path), element
+            yield iterate_result((join(key_split_path), element))
 
 
 def put(obj: Collection, path: str, value: Any) -> None:
diff --git a/datapath/format.py b/datapath/format.py
@@ -0,0 +1,121 @@
+import builtins
+import string
+from typing import Any, Callable, Generator
+
+from ._base import get, iterate, iterate_result
+from .types import Collection, NO_DEFAULT
+
+
+class _Format(string.Formatter):
+    """implements the `format()` function"""
+
+    def __init__(self, obj: Collection):
+        string.Formatter.__init__(self)
+        self._datapath_obj = obj
+
+    def get_field(self, field_name, args, kwargs):
+        return get(self._datapath_obj, field_name), None
+
+
+def format(obj: Collection, format_string: str) -> str:
+    """
+    Given a standard Python format string with {} notation, interpret the identifiers
+    as a datapath within `obj`, and apply standard formatting language to the result.
+    """
+    return _Format(obj).format(format_string)
+
+
+def _do_format(value: Any, format_spec: str, conversion: str) -> str:
+    """do the standard !r / !s / !a format string conversions, followed by builtins.format"""
+    if not conversion:
+        pass
+    elif conversion == 'r':
+        value = repr(value)
+    elif conversion == 's':
+        value = str(value)
+    elif conversion == 'a':
+        value = ascii(value)
+    else:
+        raise ValueError(f'unhandled conversion flag {conversion!r}')
+    return builtins.format(value, format_spec)
+
+
+def format_iterate(obj: Collection,
+                   format_string: str,
+                   default: Any = NO_DEFAULT,
+                   iter_func: Callable = zip) -> Generator[str, None, None]:
+    """
+    Given a standard Python format string with {} notation, interpret the identifiers as iterable datapaths within `obj`.
+    One value will be consumed from each iterable path and formatted using the standard language.
+
+    `default` is passed through to all `iterate()` calls, which in turn passes it through to the leaf `get()` calls.
+    There is no way to use a different default value for different iterable datapaths in replacement fields.
+
+    By default, the values from the iterators will be obtained with the
+    [`zip()` builtin](https://docs.python.org/3/library/functions.html#zip) with `strict=False`, meaning if the different
+    iterable format strings produce a different number of results, iteration will stop when the shortest one stops, and
+    the values will all correspond to the same index from each `iterate()` result.
+
+    Example:
+
+    ```
+    >>> test_obj = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}]
+    >>> for text in format_iterate(test_obj, 'a {[].a} b {[].b}'):
+    ...     print(text)
+    ...
+    a 1 b 2
+    a 3 b 4
+    a 5 b 6
+
+    ```
+
+    If different behavior is desired, a different function can be passed:
+
+    `iter_func` must have approximately the same basic signature as `builtins.zip()`,
+    [`itertools.product()`](https://docs.python.org/3/library/itertools.html#itertools.product),
+    and [`itertools.zip_longest()`](https://docs.python.org/3/library/itertools.html#itertools.zip_longest).
+
+    More specifically, it must accept an arbitrary number of Iterables (specifically the Generator
+    returned by `datapath.iterate()`), and yield a Sequence with a value from each one in order when the return
+    value is iterated.
+
+    You can supply extra keyword arguments to any function with this signature by utilizing
+    [`functools.partial()`](https://docs.python.org/3/library/functools.html#functools.partial). Passing positional
+    arguments to a partial will probably not work as expected, and is not recommended.
+
+    Example with a partial and `itertools.zip_longest()`:
+
+    ```
+    >>> import functools, itertools
+    >>> test_obj = {'a': list('123'), 'b': list('4567')}
+    >>> for text in format_iterate(test_obj, 'a {a[]} b {b[]}',
+    ...                            iter_func=functools.partial(itertools.zip_longest, fillvalue='X')):
+    ...     print(text)
+    a 1 b 4
+    a 2 b 5
+    a 3 b 6
+    a X b 7
+
+    ```
+    """
+    iterators = []
+    path_formats = []
+    plain_format_string = ''
+    for literal_text, field_name, format_spec, conversion in string.Formatter().parse(format_string):
+        plain_format_string += literal_text
+        if not field_name:
+            continue
+        plain_format_string += '{}'
+        iterators.append(iterate(obj, field_name, default))
+        path_formats.append((format_spec, conversion))
+
+    for results in iter_func(*iterators):
+        values = []
+        for index, result in enumerate(results):
+            if isinstance(result, iterate_result):
+                _, value = result
+            else:
+                value = result
+            format_spec, conversion = path_formats[index]
+            values.append(_do_format(value, format_spec, conversion))
+        yield plain_format_string.format(*values)
diff --git a/pre_deploy.sh b/pre_deploy.sh
@@ -5,8 +5,10 @@ set -xeo pipefail
 rm -rf build dist *.egg-info
 
 build_version='3.10'
+docker pull "python:$build_version"
 docker run -it --rm -v "$PWD:/repo" -w /repo "python:$build_version" '/repo/build.sh'
 
 for version in '3.10' '3.11' '3.12'; do
+    docker pull "python:$version"
     docker run -it --rm -v "$PWD/dist:/dist" -v "$PWD/test:/repo/test" -w /repo "python:$version" '/repo/test/docker_test.sh'
 done
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='python-datapath',
-    version='0.1.2',
+    version='0.2.0',
 
     author='Alex Shafer',
     author_email='ashafer@pm.me',
diff --git a/test/test_datapath.py b/test/test_datapath.py
@@ -1,3 +1,4 @@
+import doctest
 import unittest
 
 import datapath
@@ -42,6 +43,11 @@
 )
 
 
+def load_tests(loader, tests, ignore):
+    tests.addTests(doctest.DocTestSuite(datapath._base))
+    return tests
+
+
 class TestDatapath(unittest.TestCase):
     def test_validate_path_valid_cases_iterable_false(self):
         for valid_path in valid_paths:
diff --git a/test/test_format.py b/test/test_format.py
@@ -0,0 +1,52 @@
+import doctest
+import unittest
+from importlib import import_module
+
+import datapath
+
+
+def load_tests(loader, tests, ignore):
+    tests.addTests(doctest.DocTestSuite(import_module('datapath.format')))
+    return tests
+
+
+class TestFormat(unittest.TestCase):
+    def test_format_simple(self):
+        format_strings = (
+            ('one {} two {} three', 2),
+            ('{} one', 1),
+            ('one {}', 1),
+            ('{}{}{}', 3),
+            ('{}', 1),
+        )
+        test_obj = {
+            'a': list('123'),
+            'b': [{'c': list('456')}, 7],
+        }
+        paths = (
+            ('{a[0]}', test_obj['a'][0]),
+            ('{b[1]}', test_obj['b'][1]),
+            ('{b[0].c[2]}', test_obj['b'][0]['c'][2]),
+        )
+        for index, (format_string, num_paths) in enumerate(format_strings):
+            with self.subTest(msg=f'index {index}'):
+                my_paths = (path[0] for path in paths[:num_paths])
+                values = (path[1] for path in paths[:num_paths])
+                real_format_string = format_string.format(*my_paths)
+                expected = format_string.format(*values)
+                actual = datapath.format(test_obj, real_format_string)
+                self.assertEqual(expected, actual)
+
+
+class TestFormatIterate(unittest.TestCase):
+    def test_format_iterate_no_literal(self):
+        test_obj = {'a': list('1234')}
+        expected = '1234'
+        for index, value in enumerate(datapath.format_iterate(test_obj, '{a[]}')):
+            self.assertEqual(value, expected[index])
+
+    def test_format_iterate_trailing_literal(self):
+        test_obj = {'a': list('1234')}
+        expected = '1234'
+        for index, value in enumerate(datapath.format_iterate(test_obj, '{a[]} x')):
+            self.assertEqual(value, expected[index] + ' x')