From 07c274192d2949b20e8febd3832f0abef5280d18 Mon Sep 17 00:00:00 2001 From: renato2099 Date: Mon, 13 Apr 2026 20:40:41 +0200 Subject: [PATCH 1/6] Allow plain Python literals in regexp function wrappers --- python/datafusion/expr.py | 17 ++++++++ python/datafusion/functions.py | 71 +++++++++++++++++++++------------- python/tests/test_functions.py | 24 ++++++++++++ 3 files changed, 85 insertions(+), 27 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 7cd74ecd5..8a3c50839 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -276,6 +276,23 @@ def _iter( return list(_iter(exprs)) +def _to_raw_literal_expr(value: Expr | Any) -> expr_internal.Expr: + """Convert an expression or Python literal to its raw variant. + + Args: + value: Candidate expression or Python literal value. + + Returns: + The internal :class:`~datafusion._internal.expr.Expr` representation. + + Examples: + >>> expr = Expr(_to_raw_literal_expr(1)) + >>> isinstance(expr, Expr) + True + """ + if isinstance(value, Expr): + return value.expr + return Expr.literal(value).expr def _to_raw_expr(value: Expr | str) -> expr_internal.Expr: """Convert a Python expression or column name to its raw variant. diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 841cd9c0b..f18de3ef9 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -32,6 +32,7 @@ expr_list_to_raw_expr_list, sort_list_to_raw_sort_list, sort_or_default, + _to_raw_literal_expr, ) __all__ = [ @@ -1440,7 +1441,7 @@ def radians(arg: Expr) -> Expr: return Expr(f.radians(arg.expr)) -def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: +def regexp_like(string: Expr, regex: Expr | Any, flags: Expr | Any | None = None) -> Expr: r"""Find if any regular expression (regex) matches exist. 
Tests a string using a regular expression returning true if at least one match, @@ -1468,12 +1469,14 @@ def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: >>> result.collect_column("m")[0].as_py() True """ - if flags is not None: - flags = flags.expr - return Expr(f.regexp_like(string.expr, regex.expr, flags)) + # if flags is not None: + # flags = flags.expr + # return Expr(f.regexp_like(string.expr, regex.expr, flags)) + flags = _to_raw_literal_expr(flags) if flags is not None else None + return Expr(f.regexp_like(string.expr, _to_raw_literal_expr(regex), flags)) -def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: +def regexp_match(string: Expr, regex: Expr | Any, flags: Expr | Any | None = None) -> Expr: r"""Perform regular expression (regex) matching. Returns an array with each element containing the leftmost-first match of the @@ -1501,13 +1504,15 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: >>> result.collect_column("m")[0].as_py() ['hello'] """ - if flags is not None: - flags = flags.expr - return Expr(f.regexp_match(string.expr, regex.expr, flags)) + # if flags is not None: + # flags = flags.expr + # return Expr(f.regexp_match(string.expr, regex.expr, flags)) + flags = _to_raw_literal_expr(flags) if flags is not None else None + return Expr(f.regexp_match(string.expr, _to_raw_literal_expr(regex), flags)) def regexp_replace( - string: Expr, pattern: Expr, replacement: Expr, flags: Expr | None = None + string: Expr, pattern: Expr | Any, replacement: Expr | Any, flags: Expr | Any | None = None ) -> Expr: r"""Replaces substring(s) matching a PCRE-like regular expression. 
@@ -1541,13 +1546,17 @@ def regexp_replace( >>> result.collect_column("r")[0].as_py() 'aX bX cX' """ - if flags is not None: - flags = flags.expr - return Expr(f.regexp_replace(string.expr, pattern.expr, replacement.expr, flags)) + # if flags is not None: + # flags = flags.expr + # return Expr(f.regexp_replace(string.expr, pattern.expr, replacement.expr, flags)) + flags = _to_raw_literal_expr(flags) if flags is not None else None + pattern = _to_raw_literal_expr(pattern) + replacement = _to_raw_literal_expr(replacement) + return Expr(f.regexp_replace(string.expr, pattern, replacement, flags)) def regexp_count( - string: Expr, pattern: Expr, start: Expr | None = None, flags: Expr | None = None + string: Expr, pattern: Expr | Any, start: Expr | Any | None = None, flags: Expr | Any | None = None ) -> Expr: """Returns the number of matches in a string. @@ -1575,19 +1584,22 @@ def regexp_count( >>> result.collect_column("c")[0].as_py() 1 """ - if flags is not None: - flags = flags.expr - start = start.expr if start is not None else start - return Expr(f.regexp_count(string.expr, pattern.expr, start, flags)) + # if flags is not None: + # flags = flags.expr + # start = start.expr if start is not None else start + # return Expr(f.regexp_count(string.expr, pattern.expr, start, flags)) + flags = _to_raw_literal_expr(flags) if flags is not None else None + start = _to_raw_literal_expr(start) if start is not None else None + return Expr(f.regexp_count(string.expr, _to_raw_literal_expr(pattern), start, flags)) def regexp_instr( values: Expr, - regex: Expr, - start: Expr | None = None, - n: Expr | None = None, - flags: Expr | None = None, - sub_expr: Expr | None = None, + regex: Expr | Any, + start: Expr | Any | None = None, + n: Expr | Any | None = None, + flags: Expr | Any | None = None, + sub_expr: Expr | Any | None = None, ) -> Expr: r"""Returns the position of a regular expression match in a string. 
@@ -1635,15 +1647,20 @@ def regexp_instr( >>> result.collect_column("pos")[0].as_py() 1 """ - start = start.expr if start is not None else None - n = n.expr if n is not None else None - flags = flags.expr if flags is not None else None - sub_expr = sub_expr.expr if sub_expr is not None else None + # start = start.expr if start is not None else None + # n = n.expr if n is not None else None + # flags = flags.expr if flags is not None else None + # sub_expr = sub_expr.expr if sub_expr is not None else None + regex = _to_raw_literal_expr(regex) + start = _to_raw_literal_expr(start) if start is not None else None + n = _to_raw_literal_expr(n) if n is not None else None + flags = _to_raw_literal_expr(flags) if flags is not None else None + sub_expr = _to_raw_literal_expr(sub_expr) if sub_expr is not None else None return Expr( f.regexp_instr( values.expr, - regex.expr, + regex, start, n, flags, diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 11e94af1c..30c92768d 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -932,6 +932,30 @@ def test_map_functions(func, expected): f.regexp_count(column("a"), literal("(ell|orl)")), pa.array([1, 1, 0], type=pa.int64()), ), + ( + f.regexp_like(column("a"), "(ell|orl)"), + pa.array([True, True, False]), + ), + ( + f.regexp_match(column("a"), "(ell|orl)"), + pa.array([["ell"], ["orl"], None], type=pa.list_(pa.string_view())), + ), + ( + f.regexp_replace(column("a"), "(ell|orl)", "-"), + pa.array(["H-o", "W-d", "!"], type=pa.string_view()), + ), + ( + f.regexp_count(column("a"), "(ell|orl)", start=1), + pa.array([1, 1, 0], type=pa.int64()), + ), + ( + f.regexp_count(column("a"), "(ELL|ORL)", flags="i"), + pa.array([1, 1, 0], type=pa.int64()), + ), + ( + f.regexp_instr(column("a"), "([lr])", n=2), + pa.array([4, 4, 0], type=pa.int64()), + ), ( f.regexp_instr(column("a"), literal("(ell|orl)")), pa.array([2, 2, 0], type=pa.int64()), From 
7fbe14857db9d49d2be3ae6ba2cb3ba52330f4da Mon Sep 17 00:00:00 2001 From: renato2099 Date: Mon, 13 Apr 2026 20:55:21 +0200 Subject: [PATCH 2/6] Making ruff happy --- python/datafusion/expr.py | 2 ++ python/datafusion/functions.py | 22 +++++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 8a3c50839..74d70d6ee 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -276,6 +276,7 @@ def _iter( return list(_iter(exprs)) + def _to_raw_literal_expr(value: Expr | Any) -> expr_internal.Expr: """Convert an expression or Python literal to its raw variant. @@ -294,6 +295,7 @@ def _to_raw_literal_expr(value: Expr | Any) -> expr_internal.Expr: return value.expr return Expr.literal(value).expr + def _to_raw_expr(value: Expr | str) -> expr_internal.Expr: """Convert a Python expression or column name to its raw variant. diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index f18de3ef9..652f7e2ba 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1441,7 +1441,9 @@ def radians(arg: Expr) -> Expr: return Expr(f.radians(arg.expr)) -def regexp_like(string: Expr, regex: Expr | Any, flags: Expr | Any | None = None) -> Expr: +def regexp_like( + string: Expr, regex: Expr | Any, flags: Expr | Any | None = None +) -> Expr: r"""Find if any regular expression (regex) matches exist. Tests a string using a regular expression returning true if at least one match, @@ -1476,7 +1478,9 @@ def regexp_like(string: Expr, regex: Expr | Any, flags: Expr | Any | None = None return Expr(f.regexp_like(string.expr, _to_raw_literal_expr(regex), flags)) -def regexp_match(string: Expr, regex: Expr | Any, flags: Expr | Any | None = None) -> Expr: +def regexp_match( + string: Expr, regex: Expr | Any, flags: Expr | Any | None = None +) -> Expr: r"""Perform regular expression (regex) matching. 
Returns an array with each element containing the leftmost-first match of the @@ -1512,7 +1516,10 @@ def regexp_match(string: Expr, regex: Expr | Any, flags: Expr | Any | None = Non def regexp_replace( - string: Expr, pattern: Expr | Any, replacement: Expr | Any, flags: Expr | Any | None = None + string: Expr, + pattern: Expr | Any, + replacement: Expr | Any, + flags: Expr | Any | None = None, ) -> Expr: r"""Replaces substring(s) matching a PCRE-like regular expression. @@ -1556,7 +1563,10 @@ def regexp_replace( def regexp_count( - string: Expr, pattern: Expr | Any, start: Expr | Any | None = None, flags: Expr | Any | None = None + string: Expr, + pattern: Expr | Any, + start: Expr | Any | None = None, + flags: Expr | Any | None = None, ) -> Expr: """Returns the number of matches in a string. @@ -1590,7 +1600,9 @@ def regexp_count( # return Expr(f.regexp_count(string.expr, pattern.expr, start, flags)) flags = _to_raw_literal_expr(flags) if flags is not None else None start = _to_raw_literal_expr(start) if start is not None else None - return Expr(f.regexp_count(string.expr, _to_raw_literal_expr(pattern), start, flags)) + return Expr( + f.regexp_count(string.expr, _to_raw_literal_expr(pattern), start, flags) + ) def regexp_instr( From 4d758bb07d24d8386d7b57c5c9d74ad094e1afec Mon Sep 17 00:00:00 2001 From: renato2099 Date: Mon, 13 Apr 2026 21:00:40 +0200 Subject: [PATCH 3/6] Making ruff happy Making ruff happy --- python/datafusion/functions.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 652f7e2ba..3926bc5a0 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -29,10 +29,10 @@ Expr, SortExpr, SortKey, + _to_raw_literal_expr, expr_list_to_raw_expr_list, sort_list_to_raw_sort_list, sort_or_default, - _to_raw_literal_expr, ) __all__ = [ @@ -1471,9 +1471,6 @@ def regexp_like( >>> result.collect_column("m")[0].as_py() True """ - # 
if flags is not None: - # flags = flags.expr - # return Expr(f.regexp_like(string.expr, regex.expr, flags)) flags = _to_raw_literal_expr(flags) if flags is not None else None return Expr(f.regexp_like(string.expr, _to_raw_literal_expr(regex), flags)) @@ -1508,9 +1505,6 @@ def regexp_match( >>> result.collect_column("m")[0].as_py() ['hello'] """ - # if flags is not None: - # flags = flags.expr - # return Expr(f.regexp_match(string.expr, regex.expr, flags)) flags = _to_raw_literal_expr(flags) if flags is not None else None return Expr(f.regexp_match(string.expr, _to_raw_literal_expr(regex), flags)) @@ -1553,9 +1547,6 @@ def regexp_replace( >>> result.collect_column("r")[0].as_py() 'aX bX cX' """ - # if flags is not None: - # flags = flags.expr - # return Expr(f.regexp_replace(string.expr, pattern.expr, replacement.expr, flags)) flags = _to_raw_literal_expr(flags) if flags is not None else None pattern = _to_raw_literal_expr(pattern) replacement = _to_raw_literal_expr(replacement) @@ -1594,10 +1585,6 @@ def regexp_count( >>> result.collect_column("c")[0].as_py() 1 """ - # if flags is not None: - # flags = flags.expr - # start = start.expr if start is not None else start - # return Expr(f.regexp_count(string.expr, pattern.expr, start, flags)) flags = _to_raw_literal_expr(flags) if flags is not None else None start = _to_raw_literal_expr(start) if start is not None else None return Expr( @@ -1659,10 +1646,6 @@ def regexp_instr( >>> result.collect_column("pos")[0].as_py() 1 """ - # start = start.expr if start is not None else None - # n = n.expr if n is not None else None - # flags = flags.expr if flags is not None else None - # sub_expr = sub_expr.expr if sub_expr is not None else None regex = _to_raw_literal_expr(regex) start = _to_raw_literal_expr(start) if start is not None else None n = _to_raw_literal_expr(n) if n is not None else None From e61040fcf7264ae802e59be41b3de52e90953fbc Mon Sep 17 00:00:00 2001 From: renato2099 Date: Tue, 14 Apr 2026 12:25:48 +0200 
Subject: [PATCH 4/6] Narrow public interface of methods --- python/datafusion/functions.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 3926bc5a0..b61b1add8 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1442,7 +1442,7 @@ def radians(arg: Expr) -> Expr: def regexp_like( - string: Expr, regex: Expr | Any, flags: Expr | Any | None = None + string: Expr, regex: Expr | str, flags: Expr | str | None = None ) -> Expr: r"""Find if any regular expression (regex) matches exist. @@ -1476,7 +1476,7 @@ def regexp_like( def regexp_match( - string: Expr, regex: Expr | Any, flags: Expr | Any | None = None + string: Expr, regex: Expr | str, flags: Expr | str | None = None ) -> Expr: r"""Perform regular expression (regex) matching. @@ -1511,9 +1511,9 @@ def regexp_match( def regexp_replace( string: Expr, - pattern: Expr | Any, - replacement: Expr | Any, - flags: Expr | Any | None = None, + pattern: Expr | str, + replacement: Expr | str, + flags: Expr | str | None = None, ) -> Expr: r"""Replaces substring(s) matching a PCRE-like regular expression. @@ -1555,9 +1555,9 @@ def regexp_replace( def regexp_count( string: Expr, - pattern: Expr | Any, - start: Expr | Any | None = None, - flags: Expr | Any | None = None, + pattern: Expr | str, + start: Expr | str | None = None, + flags: Expr | str | None = None, ) -> Expr: """Returns the number of matches in a string. @@ -1594,11 +1594,11 @@ def regexp_count( def regexp_instr( values: Expr, - regex: Expr | Any, - start: Expr | Any | None = None, - n: Expr | Any | None = None, - flags: Expr | Any | None = None, - sub_expr: Expr | Any | None = None, + regex: Expr | str, + start: Expr | str | None = None, + n: Expr | str | None = None, + flags: Expr | str | None = None, + sub_expr: Expr | str | None = None, ) -> Expr: r"""Returns the position of a regular expression match in a string. 
From b22166eeff0125302c399a50a16bf7b92caad5a5 Mon Sep 17 00:00:00 2001 From: renato2099 Date: Tue, 14 Apr 2026 21:34:21 +0200 Subject: [PATCH 5/6] Narrow public interface of methods --- python/datafusion/functions.py | 50 ++++++++++++++++------------------ 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index b61b1add8..fb762c028 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1454,8 +1454,7 @@ def regexp_like( >>> df = ctx.from_pydict({"a": ["hello123"]}) >>> result = df.select( ... dfn.functions.regexp_like( - ... dfn.col("a"), dfn.lit("\\d+") - ... ).alias("m") + ... dfn.col("a"), "\\d+").alias("m") ... ) >>> result.collect_column("m")[0].as_py() True @@ -1464,8 +1463,8 @@ def regexp_like( >>> result = df.select( ... dfn.functions.regexp_like( - ... dfn.col("a"), dfn.lit("HELLO"), - ... flags=dfn.lit("i"), + ... dfn.col("a"), "HELLO", + ... flags="i", ... ).alias("m") ... ) >>> result.collect_column("m")[0].as_py() @@ -1488,8 +1487,7 @@ def regexp_match( >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) >>> result = df.select( ... dfn.functions.regexp_match( - ... dfn.col("a"), dfn.lit("(\\d+)") - ... ).alias("m") + ... dfn.col("a"), "(\\d+)").alias("m") ... ) >>> result.collect_column("m")[0].as_py() ['42'] @@ -1498,8 +1496,8 @@ def regexp_match( >>> result = df.select( ... dfn.functions.regexp_match( - ... dfn.col("a"), dfn.lit("(HELLO)"), - ... flags=dfn.lit("i"), + ... dfn.col("a"), "(HELLO)", + ... flags="i", ... ).alias("m") ... ) >>> result.collect_column("m")[0].as_py() @@ -1528,8 +1526,8 @@ def regexp_replace( >>> df = ctx.from_pydict({"a": ["hello 42"]}) >>> result = df.select( ... dfn.functions.regexp_replace( - ... dfn.col("a"), dfn.lit("\\d+"), - ... dfn.lit("XX") + ... dfn.col("a"), "\\d+", + ... "XX" ... ).alias("r") ... 
) >>> result.collect_column("r")[0].as_py() @@ -1540,8 +1538,8 @@ def regexp_replace( >>> df = ctx.from_pydict({"a": ["a1 b2 c3"]}) >>> result = df.select( ... dfn.functions.regexp_replace( - ... dfn.col("a"), dfn.lit("\\d+"), - ... dfn.lit("X"), flags=dfn.lit("g"), + ... dfn.col("a"), "\\d+", + ... "X", flags="g", ... ).alias("r") ... ) >>> result.collect_column("r")[0].as_py() @@ -1556,7 +1554,7 @@ def regexp_replace( def regexp_count( string: Expr, pattern: Expr | str, - start: Expr | str | None = None, + start: Expr | int | None = None, flags: Expr | str | None = None, ) -> Expr: """Returns the number of matches in a string. @@ -1569,8 +1567,7 @@ def regexp_count( >>> df = ctx.from_pydict({"a": ["abcabc"]}) >>> result = df.select( ... dfn.functions.regexp_count( - ... dfn.col("a"), dfn.lit("abc") - ... ).alias("c")) + ... dfn.col("a"), "abc").alias("c")) >>> result.collect_column("c")[0].as_py() 2 @@ -1579,8 +1576,8 @@ def regexp_count( >>> result = df.select( ... dfn.functions.regexp_count( - ... dfn.col("a"), dfn.lit("ABC"), - ... start=dfn.lit(4), flags=dfn.lit("i"), + ... dfn.col("a"), "ABC", + ... start=4, flags="i", ... ).alias("c")) >>> result.collect_column("c")[0].as_py() 1 @@ -1595,10 +1592,10 @@ def regexp_count( def regexp_instr( values: Expr, regex: Expr | str, - start: Expr | str | None = None, - n: Expr | str | None = None, + start: Expr | int | None = None, + n: Expr | int | None = None, flags: Expr | str | None = None, - sub_expr: Expr | str | None = None, + sub_expr: Expr | int | None = None, ) -> Expr: r"""Returns the position of a regular expression match in a string. @@ -1615,8 +1612,7 @@ def regexp_instr( >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) >>> result = df.select( ... dfn.functions.regexp_instr( - ... dfn.col("a"), dfn.lit("\\d+") - ... ).alias("pos") + ... dfn.col("a"), "\\d+").alias("pos") ... 
) >>> result.collect_column("pos")[0].as_py() 7 @@ -1627,9 +1623,9 @@ def regexp_instr( >>> df = ctx.from_pydict({"a": ["abc ABC abc"]}) >>> result = df.select( ... dfn.functions.regexp_instr( - ... dfn.col("a"), dfn.lit("abc"), - ... start=dfn.lit(2), n=dfn.lit(1), - ... flags=dfn.lit("i"), + ... dfn.col("a"), "abc", + ... start=2, n=1, + ... flags="i", ... ).alias("pos") ... ) >>> result.collect_column("pos")[0].as_py() @@ -1639,8 +1635,8 @@ def regexp_instr( >>> result = df.select( ... dfn.functions.regexp_instr( - ... dfn.col("a"), dfn.lit("(abc)"), - ... sub_expr=dfn.lit(1), + ... dfn.col("a"), "(abc)", + ... sub_expr=1, ... ).alias("pos") ... ) >>> result.collect_column("pos")[0].as_py() From ad0b3d6e8e4640fa491e6e9b40908e6fe18d008b Mon Sep 17 00:00:00 2001 From: renato2099 Date: Tue, 14 Apr 2026 22:29:14 +0200 Subject: [PATCH 6/6] Fix bug when handling 'flags' as only parameter --- python/datafusion/functions.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index fb762c028..3ca46d602 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1582,11 +1582,19 @@ def regexp_count( >>> result.collect_column("c")[0].as_py() 1 """ + pattern = _to_raw_literal_expr(pattern) flags = _to_raw_literal_expr(flags) if flags is not None else None start = _to_raw_literal_expr(start) if start is not None else None - return Expr( - f.regexp_count(string.expr, _to_raw_literal_expr(pattern), start, flags) - ) + + # If Python callers pass only flags, supply the default start=1. + # Accepted call forms in DataFusion include: + # two arguments: string + pattern + # three arguments: string + pattern + start + # four arguments: string + pattern + start + flags + if start is None and flags is not None: + start = _to_raw_literal_expr(1) + + return Expr(f.regexp_count(string.expr, pattern, start, flags)) def regexp_instr(