From 14af1803e2d016b8f82e52d6e7073a279fee12d7 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 12:52:58 -0400 Subject: [PATCH 1/6] Add map functions (make_map, map_keys, map_values, map_extract, map_entries, element_at) Closes #1448 Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/core/src/functions.rs | 26 +++++++ python/datafusion/functions.py | 121 +++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/crates/core/src/functions.rs b/crates/core/src/functions.rs index c32134054..4f2b3ccbc 100644 --- a/crates/core/src/functions.rs +++ b/crates/core/src/functions.rs @@ -93,6 +93,18 @@ fn array_cat(exprs: Vec) -> PyExpr { array_concat(exprs) } +#[pyfunction] +fn make_map(keys: Vec, values: Vec) -> PyExpr { + let keys = keys.into_iter().map(|x| x.into()).collect(); + let values = values.into_iter().map(|x| x.into()).collect(); + datafusion::functions_nested::map::map(keys, values).into() +} + +#[pyfunction] +fn element_at(map: PyExpr, key: PyExpr) -> PyExpr { + datafusion::functions_nested::expr_fn::map_extract(map.into(), key.into()).into() +} + #[pyfunction] #[pyo3(signature = (array, element, index=None))] fn array_position(array: PyExpr, element: PyExpr, index: Option) -> PyExpr { @@ -665,6 +677,12 @@ array_fn!(cardinality, array); array_fn!(flatten, array); array_fn!(range, start stop step); +// Map Functions +array_fn!(map_keys, map); +array_fn!(map_values, map); +array_fn!(map_extract, map key); +array_fn!(map_entries, map); + aggregate_function!(array_agg); aggregate_function!(max); aggregate_function!(min); @@ -1124,6 +1142,14 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(flatten))?; m.add_wrapped(wrap_pyfunction!(cardinality))?; + // Map Functions + m.add_wrapped(wrap_pyfunction!(make_map))?; + m.add_wrapped(wrap_pyfunction!(map_keys))?; + m.add_wrapped(wrap_pyfunction!(map_values))?; + m.add_wrapped(wrap_pyfunction!(map_extract))?; + 
m.add_wrapped(wrap_pyfunction!(map_entries))?; + m.add_wrapped(wrap_pyfunction!(element_at))?; + // Window Functions m.add_wrapped(wrap_pyfunction!(lead))?; m.add_wrapped(wrap_pyfunction!(lag))?; diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index f062cbfce..e15cc15c1 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -18,6 +18,7 @@ from __future__ import annotations +import builtins from typing import TYPE_CHECKING, Any import pyarrow as pa @@ -137,6 +138,7 @@ "degrees", "dense_rank", "digest", + "element_at", "empty", "encode", "ends_with", @@ -200,6 +202,11 @@ "make_array", "make_date", "make_list", + "make_map", + "map_entries", + "map_extract", + "map_keys", + "map_values", "max", "md5", "mean", @@ -3338,6 +3345,120 @@ def empty(array: Expr) -> Expr: return array_empty(array) +# map functions + + +def make_map(*args: Expr) -> Expr: + """Returns a map created from key and value expressions. + + Accepts an even number of arguments, alternating between keys and values. + For example, ``make_map(k1, v1, k2, v2)`` creates a map ``{k1: v1, k2: v2}``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.make_map( + ... dfn.lit("a"), dfn.lit(1), + ... dfn.lit("b"), dfn.lit(2), + ... ).alias("map")) + >>> result.collect_column("map")[0].as_py() + [('a', 1), ('b', 2)] + """ + if len(args) % 2 != 0: + msg = "make_map requires an even number of arguments" + raise ValueError(msg) + keys = [args[i].expr for i in builtins.range(0, len(args), 2)] + values = [args[i].expr for i in builtins.range(1, len(args), 2)] + return Expr(f.make_map(keys, values)) + + +def map_keys(map: Expr) -> Expr: + """Returns a list of all keys in the map. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.map_keys( + ... dfn.functions.make_map( + ... 
dfn.lit("x"), dfn.lit(1), + ... dfn.lit("y"), dfn.lit(2), + ... ) + ... ).alias("keys")) + >>> result.collect_column("keys")[0].as_py() + ['x', 'y'] + """ + return Expr(f.map_keys(map.expr)) + + +def map_values(map: Expr) -> Expr: + """Returns a list of all values in the map. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.map_values( + ... dfn.functions.make_map( + ... dfn.lit("x"), dfn.lit(1), + ... dfn.lit("y"), dfn.lit(2), + ... ) + ... ).alias("vals")) + >>> result.collect_column("vals")[0].as_py() + [1, 2] + """ + return Expr(f.map_values(map.expr)) + + +def map_extract(map: Expr, key: Expr) -> Expr: + """Returns the value for the given key wrapped in a list, or ``[None]`` if absent. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.map_extract( + ... dfn.functions.make_map( + ... dfn.lit("x"), dfn.lit(1), + ... dfn.lit("y"), dfn.lit(2), + ... ), + ... dfn.lit("x"), + ... ).alias("val")) + >>> result.collect_column("val")[0].as_py() + [1] + """ + return Expr(f.map_extract(map.expr, key.expr)) + + +def map_entries(map: Expr) -> Expr: + """Returns a list of all entries (key-value struct pairs) in the map. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.map_entries( + ... dfn.functions.make_map( + ... dfn.lit("x"), dfn.lit(1), + ... dfn.lit("y"), dfn.lit(2), + ... ) + ... ).alias("entries")) + >>> result.collect_column("entries")[0].as_py() + [{'key': 'x', 'value': 1}, {'key': 'y', 'value': 2}] + """ + return Expr(f.map_entries(map.expr)) + + +def element_at(map: Expr, key: Expr) -> Expr: + """Returns the value for the given key wrapped in a list, or ``[None]`` if absent. + + See Also: + This is an alias for :py:func:`map_extract`.
+ """ + return map_extract(map, key) + + # aggregate functions def approx_distinct( expression: Expr, From cce130514373eee8b818160718a30ba97274456e Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 13:22:14 -0400 Subject: [PATCH 2/6] Add unit tests for map functions Co-Authored-By: Claude Opus 4.6 (1M context) --- python/tests/test_functions.py | 115 +++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 37d349c58..ccbb62f86 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -668,6 +668,121 @@ def test_array_function_obj_tests(stmt, py_expr): assert a == b +def test_make_map(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) + df = ctx.create_dataframe([[batch]]) + + result = ( + df.select( + f.make_map( + literal("x"), + literal(1), + literal("y"), + literal(2), + ).alias("map") + ) + .collect()[0] + .column(0) + ) + assert result[0].as_py() == [("x", 1), ("y", 2)] + + +def test_make_map_odd_args(): + with pytest.raises(ValueError, match="even number of arguments"): + f.make_map(literal("x"), literal(1), literal("y")) + + +def test_map_keys(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) + df = ctx.create_dataframe([[batch]]) + + m = f.make_map(literal("x"), literal(1), literal("y"), literal(2)) + result = df.select(f.map_keys(m).alias("keys")).collect()[0].column(0) + assert result[0].as_py() == ["x", "y"] + + +def test_map_values(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) + df = ctx.create_dataframe([[batch]]) + + m = f.make_map(literal("x"), literal(1), literal("y"), literal(2)) + result = df.select(f.map_values(m).alias("vals")).collect()[0].column(0) + assert result[0].as_py() == [1, 2] + + +def test_map_extract(): + ctx = SessionContext() + batch = 
pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) + df = ctx.create_dataframe([[batch]]) + + m = f.make_map(literal("x"), literal(1), literal("y"), literal(2)) + result = ( + df.select(f.map_extract(m, literal("x")).alias("val")).collect()[0].column(0) + ) + assert result[0].as_py() == [1] + + +def test_map_extract_missing_key(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) + df = ctx.create_dataframe([[batch]]) + + m = f.make_map(literal("x"), literal(1)) + result = ( + df.select(f.map_extract(m, literal("z")).alias("val")).collect()[0].column(0) + ) + assert result[0].as_py() == [None] + + +def test_map_entries(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) + df = ctx.create_dataframe([[batch]]) + + m = f.make_map(literal("x"), literal(1), literal("y"), literal(2)) + result = df.select(f.map_entries(m).alias("entries")).collect()[0].column(0) + assert result[0].as_py() == [ + {"key": "x", "value": 1}, + {"key": "y", "value": 2}, + ] + + +def test_element_at(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) + df = ctx.create_dataframe([[batch]]) + + m = f.make_map(literal("a"), literal(10), literal("b"), literal(20)) + result = ( + df.select(f.element_at(m, literal("b")).alias("val")).collect()[0].column(0) + ) + assert result[0].as_py() == [20] + + +def test_map_functions_with_column_data(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [ + pa.array(["k1", "k2", "k3"]), + pa.array([10, 20, 30]), + ], + names=["keys", "vals"], + ) + df = ctx.create_dataframe([[batch]]) + + m = f.make_map(column("keys"), column("vals")) + result = df.select(f.map_keys(m).alias("k")).collect()[0].column(0) + for i, expected in enumerate(["k1", "k2", "k3"]): + assert result[i].as_py() == [expected] + + result = df.select(f.map_values(m).alias("v")).collect()[0].column(0) + for i, expected in enumerate([10, 20, 30]): + 
assert result[i].as_py() == [expected] + + @pytest.mark.parametrize( ("function", "expected_result"), [ From 48275286aeff187a2436441506e128ec963a9954 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 16:51:15 -0400 Subject: [PATCH 3/6] Remove redundant pyo3 element_at function element_at is already a Python-only alias for map_extract, so the Rust binding is unnecessary. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/core/src/functions.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/crates/core/src/functions.rs b/crates/core/src/functions.rs index 4f2b3ccbc..44135a56b 100644 --- a/crates/core/src/functions.rs +++ b/crates/core/src/functions.rs @@ -100,11 +100,6 @@ fn make_map(keys: Vec, values: Vec) -> PyExpr { datafusion::functions_nested::map::map(keys, values).into() } -#[pyfunction] -fn element_at(map: PyExpr, key: PyExpr) -> PyExpr { - datafusion::functions_nested::expr_fn::map_extract(map.into(), key.into()).into() -} - #[pyfunction] #[pyo3(signature = (array, element, index=None))] fn array_position(array: PyExpr, element: PyExpr, index: Option) -> PyExpr { @@ -1148,7 +1143,6 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(map_values))?; m.add_wrapped(wrap_pyfunction!(map_extract))?; m.add_wrapped(wrap_pyfunction!(map_entries))?; - m.add_wrapped(wrap_pyfunction!(element_at))?; // Window Functions m.add_wrapped(wrap_pyfunction!(lead))?; From fac2c246f41972479f945d0b494355b9317f9b4e Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 16:57:02 -0400 Subject: [PATCH 4/6] Change make_map to accept a Python dictionary make_map now takes a dict for the common case and also supports separate keys/values lists for column expressions. Non-Expr keys and values are automatically converted to literals. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- python/datafusion/functions.py | 65 ++++++++++++++++-------------- python/tests/test_functions.py | 73 ++++++++++++++++------------------ 2 files changed, 70 insertions(+), 68 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index e15cc15c1..014ec5806 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -18,7 +18,6 @@ from __future__ import annotations -import builtins from typing import TYPE_CHECKING, Any import pyarrow as pa @@ -3348,29 +3347,47 @@ def empty(array: Expr) -> Expr: # map functions -def make_map(*args: Expr) -> Expr: - """Returns a map created from key and value expressions. +def make_map( + data: dict[Any, Any] | None = None, + keys: list[Any] | None = None, + values: list[Any] | None = None, +) -> Expr: + """Returns a map expression. + + Can be called with either a Python dictionary or separate ``keys`` + and ``values`` lists. Keys and values that are not already + :py:class:`~datafusion.expr.Expr` are automatically converted to + literal expressions. - Accepts an even number of arguments, alternating between keys and values. - For example, ``make_map(k1, v1, k2, v2)`` creates a map ``{k1: v1, k2: v2}``. + Args: + data: A Python dictionary of key-value pairs. + keys: A list of keys (use with ``values`` for column expressions). + values: A list of values (use with ``keys``). Examples: >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1]}) >>> result = df.select( - ... dfn.functions.make_map( - ... dfn.lit("a"), dfn.lit(1), - ... dfn.lit("b"), dfn.lit(2), - ... ).alias("map")) + ... 
dfn.functions.make_map({"a": 1, "b": 2}).alias("map")) >>> result.collect_column("map")[0].as_py() [('a', 1), ('b', 2)] """ - if len(args) % 2 != 0: - msg = "make_map requires an even number of arguments" + if data is not None: + if keys is not None or values is not None: + msg = "Cannot specify both data and keys/values" + raise ValueError(msg) + key_list = list(data.keys()) + value_list = list(data.values()) + elif keys is not None and values is not None: + key_list = keys + value_list = values + else: + msg = "Must specify either data or both keys and values" raise ValueError(msg) - keys = [args[i].expr for i in builtins.range(0, len(args), 2)] - values = [args[i].expr for i in builtins.range(1, len(args), 2)] - return Expr(f.make_map(keys, values)) + + key_exprs = [k if isinstance(k, Expr) else Expr.literal(k) for k in key_list] + val_exprs = [v if isinstance(v, Expr) else Expr.literal(v) for v in value_list] + return Expr(f.make_map([k.expr for k in key_exprs], [v.expr for v in val_exprs])) def map_keys(map: Expr) -> Expr: @@ -3381,10 +3398,7 @@ def map_keys(map: Expr) -> Expr: >>> df = ctx.from_pydict({"a": [1]}) >>> result = df.select( ... dfn.functions.map_keys( - ... dfn.functions.make_map( - ... dfn.lit("x"), dfn.lit(1), - ... dfn.lit("y"), dfn.lit(2), - ... ) + ... dfn.functions.make_map({"x": 1, "y": 2}) ... ).alias("keys")) >>> result.collect_column("keys")[0].as_py() ['x', 'y'] @@ -3400,10 +3414,7 @@ def map_values(map: Expr) -> Expr: >>> df = ctx.from_pydict({"a": [1]}) >>> result = df.select( ... dfn.functions.map_values( - ... dfn.functions.make_map( - ... dfn.lit("x"), dfn.lit(1), - ... dfn.lit("y"), dfn.lit(2), - ... ) + ... dfn.functions.make_map({"x": 1, "y": 2}) ... ).alias("vals")) >>> result.collect_column("vals")[0].as_py() [1, 2] @@ -3419,10 +3430,7 @@ def map_extract(map: Expr, key: Expr) -> Expr: >>> df = ctx.from_pydict({"a": [1]}) >>> result = df.select( ... dfn.functions.map_extract( - ... dfn.functions.make_map( - ... 
dfn.lit("x"), dfn.lit(1), - ... dfn.lit("y"), dfn.lit(2), - ... ), + ... dfn.functions.make_map({"x": 1, "y": 2}), ... dfn.lit("x"), ... ).alias("val")) >>> result.collect_column("val")[0].as_py() @@ -3439,10 +3447,7 @@ def map_entries(map: Expr) -> Expr: >>> df = ctx.from_pydict({"a": [1]}) >>> result = df.select( ... dfn.functions.map_entries( - ... dfn.functions.make_map( - ... dfn.lit("x"), dfn.lit(1), - ... dfn.lit("y"), dfn.lit(2), - ... ) + ... dfn.functions.make_map({"x": 1, "y": 2}) ... ).alias("entries")) >>> result.collect_column("entries")[0].as_py() [{'key': 'x', 'value': 1}, {'key': 'y', 'value': 2}] diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index ccbb62f86..4f917f6a4 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -673,24 +673,42 @@ def test_make_map(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) + result = df.select(f.make_map({"x": 1, "y": 2}).alias("map")).collect()[0].column(0) + assert result[0].as_py() == [("x", 1), ("y", 2)] + + +def test_make_map_with_expr_values(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) + df = ctx.create_dataframe([[batch]]) + result = ( - df.select( - f.make_map( - literal("x"), - literal(1), - literal("y"), - literal(2), - ).alias("map") - ) + df.select(f.make_map({"x": literal(1), "y": literal(2)}).alias("map")) .collect()[0] .column(0) ) assert result[0].as_py() == [("x", 1), ("y", 2)] -def test_make_map_odd_args(): - with pytest.raises(ValueError, match="even number of arguments"): - f.make_map(literal("x"), literal(1), literal("y")) +def test_make_map_with_column_data(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [ + pa.array(["k1", "k2", "k3"]), + pa.array([10, 20, 30]), + ], + names=["keys", "vals"], + ) + df = ctx.create_dataframe([[batch]]) + + m = f.make_map(keys=[column("keys")], values=[column("vals")]) + result 
= df.select(f.map_keys(m).alias("k")).collect()[0].column(0) + for i, expected in enumerate(["k1", "k2", "k3"]): + assert result[i].as_py() == [expected] + + result = df.select(f.map_values(m).alias("v")).collect()[0].column(0) + for i, expected in enumerate([10, 20, 30]): + assert result[i].as_py() == [expected] def test_map_keys(): @@ -698,7 +716,7 @@ def test_map_keys(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - m = f.make_map(literal("x"), literal(1), literal("y"), literal(2)) + m = f.make_map({"x": 1, "y": 2}) result = df.select(f.map_keys(m).alias("keys")).collect()[0].column(0) assert result[0].as_py() == ["x", "y"] @@ -708,7 +726,7 @@ def test_map_values(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - m = f.make_map(literal("x"), literal(1), literal("y"), literal(2)) + m = f.make_map({"x": 1, "y": 2}) result = df.select(f.map_values(m).alias("vals")).collect()[0].column(0) assert result[0].as_py() == [1, 2] @@ -718,7 +736,7 @@ def test_map_extract(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - m = f.make_map(literal("x"), literal(1), literal("y"), literal(2)) + m = f.make_map({"x": 1, "y": 2}) result = ( df.select(f.map_extract(m, literal("x")).alias("val")).collect()[0].column(0) ) @@ -730,7 +748,7 @@ def test_map_extract_missing_key(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - m = f.make_map(literal("x"), literal(1)) + m = f.make_map({"x": 1}) result = ( df.select(f.map_extract(m, literal("z")).alias("val")).collect()[0].column(0) ) @@ -742,7 +760,7 @@ def test_map_entries(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - m = f.make_map(literal("x"), literal(1), literal("y"), literal(2)) + m = f.make_map({"x": 1, "y": 2}) result = 
df.select(f.map_entries(m).alias("entries")).collect()[0].column(0) assert result[0].as_py() == [ {"key": "x", "value": 1}, @@ -755,34 +773,13 @@ def test_element_at(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - m = f.make_map(literal("a"), literal(10), literal("b"), literal(20)) + m = f.make_map({"a": 10, "b": 20}) result = ( df.select(f.element_at(m, literal("b")).alias("val")).collect()[0].column(0) ) assert result[0].as_py() == [20] -def test_map_functions_with_column_data(): - ctx = SessionContext() - batch = pa.RecordBatch.from_arrays( - [ - pa.array(["k1", "k2", "k3"]), - pa.array([10, 20, 30]), - ], - names=["keys", "vals"], - ) - df = ctx.create_dataframe([[batch]]) - - m = f.make_map(column("keys"), column("vals")) - result = df.select(f.map_keys(m).alias("k")).collect()[0].column(0) - for i, expected in enumerate(["k1", "k2", "k3"]): - assert result[i].as_py() == [expected] - - result = df.select(f.map_values(m).alias("v")).collect()[0].column(0) - for i, expected in enumerate([10, 20, 30]): - assert result[i].as_py() == [expected] - - @pytest.mark.parametrize( ("function", "expected_result"), [ From 8f3d167600ee0bd053dc2dae5ee6fc104f3a851e Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 18:22:56 -0400 Subject: [PATCH 5/6] Make map the primary function with make_map as alias MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit map() now supports three calling conventions matching upstream: - map({"a": 1, "b": 2}) — from a Python dictionary - map([keys], [values]) — two lists that get zipped - map(k1, v1, k2, v2, ...) — variadic key-value pairs Non-Expr keys and values are automatically converted to literals. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- python/datafusion/functions.py | 67 +++++++++++++++++++--------------- python/tests/test_functions.py | 60 ++++++++++++++++++++++++------ 2 files changed, 86 insertions(+), 41 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 014ec5806..7efe6b1b9 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -202,6 +202,7 @@ "make_date", "make_list", "make_map", + "map", "map_entries", "map_extract", "map_keys", @@ -3347,42 +3348,41 @@ def empty(array: Expr) -> Expr: # map functions -def make_map( - data: dict[Any, Any] | None = None, - keys: list[Any] | None = None, - values: list[Any] | None = None, -) -> Expr: +def map(*args: Any) -> Expr: """Returns a map expression. - Can be called with either a Python dictionary or separate ``keys`` - and ``values`` lists. Keys and values that are not already - :py:class:`~datafusion.expr.Expr` are automatically converted to - literal expressions. + Supports three calling conventions: - Args: - data: A Python dictionary of key-value pairs. - keys: A list of keys (use with ``values`` for column expressions). - values: A list of values (use with ``keys``). + - ``map({"a": 1, "b": 2})`` — from a Python dictionary. + - ``map([keys], [values])`` — two lists that get zipped. + - ``map(k1, v1, k2, v2, ...)`` — variadic key-value pairs. + + Keys and values that are not already :py:class:`~datafusion.expr.Expr` + are automatically converted to literal expressions. Examples: >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1]}) >>> result = df.select( - ... dfn.functions.make_map({"a": 1, "b": 2}).alias("map")) - >>> result.collect_column("map")[0].as_py() + ... 
dfn.functions.map({"a": 1, "b": 2}).alias("m")) + >>> result.collect_column("m")[0].as_py() [('a', 1), ('b', 2)] """ - if data is not None: - if keys is not None or values is not None: - msg = "Cannot specify both data and keys/values" - raise ValueError(msg) - key_list = list(data.keys()) - value_list = list(data.values()) - elif keys is not None and values is not None: - key_list = keys - value_list = values + if len(args) == 1 and isinstance(args[0], dict): + key_list = list(args[0].keys()) + value_list = list(args[0].values()) + elif ( + len(args) == 2 # noqa: PLR2004 + and isinstance(args[0], list) + and isinstance(args[1], list) + ): + key_list = args[0] + value_list = args[1] + elif len(args) >= 2 and len(args) % 2 == 0: # noqa: PLR2004 + key_list = list(args[0::2]) + value_list = list(args[1::2]) else: - msg = "Must specify either data or both keys and values" + msg = "map expects a dict, two lists, or an even number of key-value arguments" raise ValueError(msg) key_exprs = [k if isinstance(k, Expr) else Expr.literal(k) for k in key_list] @@ -3390,6 +3390,15 @@ def make_map( return Expr(f.make_map([k.expr for k in key_exprs], [v.expr for v in val_exprs])) +def make_map(*args: Any) -> Expr: + """Returns a map expression. + + See Also: + This is an alias for :py:func:`map`. + """ + return map(*args) + + def map_keys(map: Expr) -> Expr: """Returns a list of all keys in the map. @@ -3398,7 +3407,7 @@ def map_keys(map: Expr) -> Expr: >>> df = ctx.from_pydict({"a": [1]}) >>> result = df.select( ... dfn.functions.map_keys( - ... dfn.functions.make_map({"x": 1, "y": 2}) + ... dfn.functions.map({"x": 1, "y": 2}) ... ).alias("keys")) >>> result.collect_column("keys")[0].as_py() ['x', 'y'] @@ -3414,7 +3423,7 @@ def map_values(map: Expr) -> Expr: >>> df = ctx.from_pydict({"a": [1]}) >>> result = df.select( ... dfn.functions.map_values( - ... dfn.functions.make_map({"x": 1, "y": 2}) + ... dfn.functions.map({"x": 1, "y": 2}) ... 
).alias("vals")) >>> result.collect_column("vals")[0].as_py() [1, 2] @@ -3430,7 +3439,7 @@ def map_extract(map: Expr, key: Expr) -> Expr: >>> df = ctx.from_pydict({"a": [1]}) >>> result = df.select( ... dfn.functions.map_extract( - ... dfn.functions.make_map({"x": 1, "y": 2}), + ... dfn.functions.map({"x": 1, "y": 2}), ... dfn.lit("x"), ... ).alias("val")) >>> result.collect_column("val")[0].as_py() @@ -3447,7 +3456,7 @@ def map_entries(map: Expr) -> Expr: >>> df = ctx.from_pydict({"a": [1]}) >>> result = df.select( ... dfn.functions.map_entries( - ... dfn.functions.make_map({"x": 1, "y": 2}) + ... dfn.functions.map({"x": 1, "y": 2}) ... ).alias("entries")) >>> result.collect_column("entries")[0].as_py() [{'key': 'x', 'value': 1}, {'key': 'y', 'value': 2}] diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 4f917f6a4..8dec52c1a 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -668,29 +668,29 @@ def test_array_function_obj_tests(stmt, py_expr): assert a == b -def test_make_map(): +def test_map_from_dict(): ctx = SessionContext() batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - result = df.select(f.make_map({"x": 1, "y": 2}).alias("map")).collect()[0].column(0) + result = df.select(f.map({"x": 1, "y": 2}).alias("m")).collect()[0].column(0) assert result[0].as_py() == [("x", 1), ("y", 2)] -def test_make_map_with_expr_values(): +def test_map_from_dict_with_expr_values(): ctx = SessionContext() batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) result = ( - df.select(f.make_map({"x": literal(1), "y": literal(2)}).alias("map")) + df.select(f.map({"x": literal(1), "y": literal(2)}).alias("m")) .collect()[0] .column(0) ) assert result[0].as_py() == [("x", 1), ("y", 2)] -def test_make_map_with_column_data(): +def test_map_from_two_lists(): ctx = SessionContext() batch = pa.RecordBatch.from_arrays( [ @@ 
-701,7 +701,7 @@ def test_make_map_with_column_data(): ) df = ctx.create_dataframe([[batch]]) - m = f.make_map(keys=[column("keys")], values=[column("vals")]) + m = f.map([column("keys")], [column("vals")]) result = df.select(f.map_keys(m).alias("k")).collect()[0].column(0) for i, expected in enumerate(["k1", "k2", "k3"]): assert result[i].as_py() == [expected] @@ -711,12 +711,48 @@ def test_make_map_with_column_data(): assert result[i].as_py() == [expected] +def test_map_from_variadic_pairs(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) + df = ctx.create_dataframe([[batch]]) + + result = df.select(f.map("x", 1, "y", 2).alias("m")).collect()[0].column(0) + assert result[0].as_py() == [("x", 1), ("y", 2)] + + +def test_map_variadic_with_exprs(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) + df = ctx.create_dataframe([[batch]]) + + result = ( + df.select(f.map(literal("x"), literal(1), literal("y"), literal(2)).alias("m")) + .collect()[0] + .column(0) + ) + assert result[0].as_py() == [("x", 1), ("y", 2)] + + +def test_map_odd_args_raises(): + with pytest.raises(ValueError, match="map expects"): + f.map("x", 1, "y") + + +def test_make_map_is_alias(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) + df = ctx.create_dataframe([[batch]]) + + result = df.select(f.make_map({"x": 1, "y": 2}).alias("m")).collect()[0].column(0) + assert result[0].as_py() == [("x", 1), ("y", 2)] + + def test_map_keys(): ctx = SessionContext() batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - m = f.make_map({"x": 1, "y": 2}) + m = f.map({"x": 1, "y": 2}) result = df.select(f.map_keys(m).alias("keys")).collect()[0].column(0) assert result[0].as_py() == ["x", "y"] @@ -726,7 +762,7 @@ def test_map_values(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = 
ctx.create_dataframe([[batch]]) - m = f.make_map({"x": 1, "y": 2}) + m = f.map({"x": 1, "y": 2}) result = df.select(f.map_values(m).alias("vals")).collect()[0].column(0) assert result[0].as_py() == [1, 2] @@ -736,7 +772,7 @@ def test_map_extract(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - m = f.make_map({"x": 1, "y": 2}) + m = f.map({"x": 1, "y": 2}) result = ( df.select(f.map_extract(m, literal("x")).alias("val")).collect()[0].column(0) ) @@ -748,7 +784,7 @@ def test_map_extract_missing_key(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - m = f.make_map({"x": 1}) + m = f.map({"x": 1}) result = ( df.select(f.map_extract(m, literal("z")).alias("val")).collect()[0].column(0) ) @@ -760,7 +796,7 @@ def test_map_entries(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - m = f.make_map({"x": 1, "y": 2}) + m = f.map({"x": 1, "y": 2}) result = df.select(f.map_entries(m).alias("entries")).collect()[0].column(0) assert result[0].as_py() == [ {"key": "x", "value": 1}, @@ -773,7 +809,7 @@ def test_element_at(): batch = pa.RecordBatch.from_arrays([pa.array([1])], names=["a"]) df = ctx.create_dataframe([[batch]]) - m = f.make_map({"a": 10, "b": 20}) + m = f.map({"a": 10, "b": 20}) result = ( df.select(f.element_at(m, literal("b")).alias("val")).collect()[0].column(0) ) From 3b5513763dbe9e48613f37d8aea1fd5689892fd0 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 29 Mar 2026 18:58:39 -0400 Subject: [PATCH 6/6] Improve map function docstrings - Add examples for all three map() calling conventions - Use clearer descriptions instead of jargon (no "zipped" or "variadic") - Break map_keys/map_values/map_extract/map_entries examples into two steps: create the map column first, then call the function Co-Authored-By: Claude Opus 4.6 (1M context) --- python/datafusion/functions.py | 49 
+++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 7efe6b1b9..71a6140d1 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -3354,19 +3354,41 @@ def map(*args: Any) -> Expr: Supports three calling conventions: - ``map({"a": 1, "b": 2})`` — from a Python dictionary. - - ``map([keys], [values])`` — two lists that get zipped. - - ``map(k1, v1, k2, v2, ...)`` — variadic key-value pairs. + - ``map([keys], [values])`` — from a list of keys and a list of + their associated values. Both lists must be the same length. + - ``map(k1, v1, k2, v2, ...)`` — from alternating keys and their + associated values. Keys and values that are not already :py:class:`~datafusion.expr.Expr` are automatically converted to literal expressions. Examples: + From a dictionary: + >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1]}) >>> result = df.select( ... dfn.functions.map({"a": 1, "b": 2}).alias("m")) >>> result.collect_column("m")[0].as_py() [('a', 1), ('b', 2)] + + From two lists: + + >>> df = ctx.from_pydict({"key": ["x", "y"], "val": [10, 20]}) + >>> df = df.select( + ... dfn.functions.map( + ... [dfn.col("key")], [dfn.col("val")] + ... ).alias("m")) + >>> df.collect_column("m")[0].as_py() + [('x', 10)] + + From alternating keys and values: + + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.map("x", 1, "y", 2).alias("m")) + >>> result.collect_column("m")[0].as_py() + [('x', 1), ('y', 2)] """ if len(args) == 1 and isinstance(args[0], dict): key_list = list(args[0].keys()) @@ -3405,10 +3427,10 @@ def map_keys(map: Expr) -> Expr: Examples: >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1]}) + >>> df = df.select( + ... dfn.functions.map({"x": 1, "y": 2}).alias("m")) >>> result = df.select( - ... dfn.functions.map_keys( - ... dfn.functions.map({"x": 1, "y": 2}) - ... 
).alias("keys")) + ... dfn.functions.map_keys(dfn.col("m")).alias("keys")) >>> result.collect_column("keys")[0].as_py() ['x', 'y'] """ @@ -3421,10 +3443,10 @@ def map_values(map: Expr) -> Expr: Examples: >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1]}) + >>> df = df.select( + ... dfn.functions.map({"x": 1, "y": 2}).alias("m")) >>> result = df.select( - ... dfn.functions.map_values( - ... dfn.functions.map({"x": 1, "y": 2}) - ... ).alias("vals")) + ... dfn.functions.map_values(dfn.col("m")).alias("vals")) >>> result.collect_column("vals")[0].as_py() [1, 2] """ @@ -3437,10 +3459,11 @@ def map_extract(map: Expr, key: Expr) -> Expr: Examples: >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1]}) + >>> df = df.select( + ... dfn.functions.map({"x": 1, "y": 2}).alias("m")) >>> result = df.select( ... dfn.functions.map_extract( - ... dfn.functions.map({"x": 1, "y": 2}), - ... dfn.lit("x"), + ... dfn.col("m"), dfn.lit("x") ... ).alias("val")) >>> result.collect_column("val")[0].as_py() [1] @@ -3454,10 +3477,10 @@ def map_entries(map: Expr) -> Expr: Examples: >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1]}) + >>> df = df.select( + ... dfn.functions.map({"x": 1, "y": 2}).alias("m")) >>> result = df.select( - ... dfn.functions.map_entries( - ... dfn.functions.map({"x": 1, "y": 2}) - ... ).alias("entries")) + ... dfn.functions.map_entries(dfn.col("m")).alias("entries")) >>> result.collect_column("entries")[0].as_py() [{'key': 'x', 'value': 1}, {'key': 'y', 'value': 2}] """