diff --git a/AGENTS.md b/AGENTS.md index 9f2627b..fe60a9b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -28,7 +28,10 @@ Redis search results come out as a `QueryResult(rows, count)`. translator. - Writes. There is no `INSERT`, `UPDATE`, or `DELETE`. Write through `redis-py`. - Index creation. Use `FT.CREATE` directly via `redis-py` first. -- Cross-index joins, subqueries, `HAVING`, `DISTINCT`. Not implemented. +- Cross-index joins, subqueries, `HAVING`. Not implemented. +- `SELECT DISTINCT` is supported for bare column projections (routed to + `FT.AGGREGATE` with `GROUPBY`). `SELECT DISTINCT *` and `DISTINCT` mixed + with aggregates raise `ValueError`. ## The minimum useful snippet @@ -80,8 +83,10 @@ Full reference, generated from docstrings, is at `docs/api/`. 6. **Lazy schema loading is the default.** The first query that touches an index issues one `FT.INFO`. Pass `schema_cache_strategy="load_all"` to `create_executor` if you want to fail fast on missing indexes at startup. -7. **No JOIN, subquery, HAVING, or DISTINCT.** The translator raises - `ValueError`; do not retry with rephrasing. +7. **No JOIN, subquery, or HAVING.** The translator raises `ValueError`; + do not retry with rephrasing. `SELECT DISTINCT col1, col2, ...` is + supported for column projections; `DISTINCT *` and `DISTINCT` mixed with + aggregates still raise. 8. **GEO uses `POINT(lon, lat)` order.** Longitude first, matching Redis. ## Error model an agent can expect diff --git a/README.md b/README.md index 2328f57..940ba9c 100644 --- a/README.md +++ b/README.md @@ -72,13 +72,14 @@ Pass `decode_responses=True` to the `Redis` client if you want string keys inste - [x] Date functions: `YEAR()`, `MONTH()`, `DAY()`, `DATE_FORMAT()`, etc. - [x] `IS NULL` / `IS NOT NULL` via `ismissing()` (requires Redis 7.4+) - [x] `exists()` function for field presence checks +- [x] `SELECT DISTINCT col1, col2, ...` on bare column projections (routed to `FT.AGGREGATE` with `GROUPBY`) ## What's not implemented (yet) - [ ] JOINs (Redis doesn't support cross-index joins) - [ ] Subqueries - [ ] HAVING clause -- [ ] DISTINCT +- [ ] `SELECT DISTINCT *` and `DISTINCT` mixed with aggregate functions (both raise `ValueError`) - [ ] Index creation from SQL (`CREATE INDEX`) The translator raises `ValueError` for unsupported clauses; do not retry with rephrasing. diff --git a/docs/api/sql-syntax.md b/docs/api/sql-syntax.md index f6d7f24..5e9f51b 100644 --- a/docs/api/sql-syntax.md +++ b/docs/api/sql-syntax.md @@ -24,6 +24,7 @@ The complete catalog of SQL clauses, operators, and functions sql-redis recognis | Date functions (`YEAR`, `MONTH`, `DAY`, `DATE_FORMAT`, ...) | yes | | `IS NULL` / `IS NOT NULL` (Redis 7.4+) | yes | | `exists()` for field presence | yes | +| `SELECT DISTINCT col1, col2, ...` | yes (column projections only; routed to `FT.AGGREGATE` with `GROUPBY`) | ## Not supported @@ -32,7 +33,8 @@ The complete catalog of SQL clauses, operators, and functions sql-redis recognis | `JOIN` | Redis has no cross-index join. | | Subqueries | Out of scope for the POC. | | `HAVING` | Out of scope (use `WHERE` plus `GROUP BY` where possible). | -| `DISTINCT` | Out of scope. | +| `SELECT DISTINCT *` | Cannot group by an unspecified set of columns; list them explicitly. | +| `DISTINCT` with aggregate functions | Use `GROUP BY` explicitly, or `COUNT(DISTINCT col)` for cardinality. | | `CREATE INDEX` | sql-redis does not create schemas. Use `FT.CREATE`. | ## TEXT search diff --git a/sql_redis/analyzer.py b/sql_redis/analyzer.py index 342f612..a6eeb3a 100644 --- a/sql_redis/analyzer.py +++ b/sql_redis/analyzer.py @@ -125,15 +125,34 @@ def analyze(self, parsed: ParsedQuery) -> AnalyzedQuery: for date_func in parsed.date_functions: referenced_fields.add(date_func.field) - # Collect aliases from date functions and computed fields (for GROUP BY) + # Collect aliases from date functions, computed fields, the score() + # alias, and aggregation aliases. These are computed in the query + # pipeline rather than loaded from documents, so GROUP BY / ORDER BY + # references to them must not be looked up in the schema. alias_names = {df.alias for df in parsed.date_functions} alias_names.update(cf.alias for cf in parsed.computed_fields) + if parsed.scoring is not None: + alias_names.add(parsed.scoring.alias) + for agg in parsed.aggregations: + if agg.alias: + alias_names.add(agg.alias) + if parsed.vector_search is not None and parsed.vector_search.alias: + # ORDER BY is the canonical way to sort by + # KNN similarity; the alias is a computed column, not an indexed + # field, so it must not be looked up in the schema. + alias_names.add(parsed.vector_search.alias) # Fields from GROUP BY (exclude aliases since they're computed) for field_name in parsed.groupby_fields: if field_name not in alias_names: referenced_fields.add(field_name) + # Fields from ORDER BY (so they are validated against the schema + # and available for LOAD in the FT.AGGREGATE path) + for field_name, _ in parsed.orderby_fields: + if field_name not in alias_names: + referenced_fields.add(field_name) + # Resolve field types for field_name in referenced_fields: if field_name not in schema: diff --git a/sql_redis/parser.py b/sql_redis/parser.py index 3aff647..61ba228 100644 --- a/sql_redis/parser.py +++ b/sql_redis/parser.py @@ -264,6 +264,7 @@ class ParsedQuery: offset: int | None = None filters: list[str] = dataclasses.field(default_factory=list) scoring: ScoringSpec | None = None # Relevance scoring config + distinct: bool = False # SELECT DISTINCT was specified class SQLParser: @@ -291,8 +292,30 @@ def parse(self, sql: str) -> ParsedQuery: # Extract SELECT fields and aggregations select = ast.find(exp.Select) if select: + if select.args.get("distinct") is not None: + result.distinct = True for expression in select.expressions: self._process_select_expression(expression, result) + if result.distinct: + # Validate DISTINCT shape here; the groupby_fields promotion + # happens after the GROUP BY clause is parsed (below) so an + # explicit GROUP BY does not duplicate the projected columns. + if "*" in result.fields: + raise ValueError( + "SELECT DISTINCT * is not supported; " + "list the columns to deduplicate by explicitly." + ) + if result.aggregations: + # AGG(DISTINCT ...) is handled per-aggregate; mixing + # top-level DISTINCT with aggregations has no clean Redis + # mapping. Reject so users do not silently get one or the + # other applied. + raise ValueError( + "SELECT DISTINCT combined with aggregate functions " + "is not supported; use GROUP BY explicitly." + ) + if not result.fields: + raise ValueError("SELECT DISTINCT requires at least one column.") # Extract WHERE clause conditions where = ast.find(exp.Where) @@ -311,6 +334,12 @@ def parse(self, sql: str) -> ParsedQuery: if isinstance(expr, exp.Column): result.groupby_fields.append(expr.name) + # SELECT DISTINCT: promote the projected columns to GROUP BY so the + # query routes to FT.AGGREGATE and emits GROUPBY @col1 @col2 ... + # An explicit GROUP BY takes precedence so we do not duplicate keys. + if result.distinct and not result.groupby_fields: + result.groupby_fields = list(result.fields) + # Extract HAVING clause — exists() in HAVING → FILTER having = ast.find(exp.Having) if having: @@ -1281,6 +1310,16 @@ def _extract_literal_value(self, expression, convert_dates: bool = False): inner_value = self._extract_literal_value(expression.this) if inner_value is not None: return -inner_value + elif isinstance(expression, exp.Column): + # SQL with ANSI quoting parses "active" as an identifier + # (exp.Column(Identifier(quoted=True))), not a string literal. + # Users who write `status = "active"` clearly intend a value + # comparison; silently turning it into None produced + # `@status:{None}` and crashed on NUMERIC fields. Treat a quoted + # identifier in value position as its string contents. + ident = expression.this + if isinstance(ident, exp.Identifier) and ident.args.get("quoted"): + return ident.this return None def _validate_geo_unit(self, unit_val: object) -> str: diff --git a/sql_redis/query_builder.py b/sql_redis/query_builder.py index 94c2361..1291349 100644 --- a/sql_redis/query_builder.py +++ b/sql_redis/query_builder.py @@ -49,8 +49,11 @@ class QueryBuilder: """Builds RediSearch query syntax from conditions.""" - # Characters that need escaping in TAG values - TAG_SPECIAL_CHARS = r".,<>{}[]\"':;!@#$%^&*()-+=~" + # Characters that need escaping in TAG values. + # `|` separates values inside an IN list at the syntax level, so a literal + # pipe inside a value must be escaped; otherwise `status = 'a|b'` parses + # as `status IN ('a', 'b')`. + TAG_SPECIAL_CHARS = r".,<>{}[]\"':;!@#$%^&*()-+=~|" # Characters that have special meaning in RediSearch free-text queries # (outside double-quoted phrases). Must be escaped with backslash. @@ -139,6 +142,13 @@ def build_text_condition( # Build search_value based on operator — shared by single- and multi-field paths if operator == "LIKE": + # LIKE '' produces no token and no wildcard, so RediSearch emits a + # bare `@field:` which is a syntax error at runtime. Reject early. + if value == "": + raise ValueError( + "LIKE pattern must not be empty. Use IS NULL / IS NOT NULL " + "to test for field absence, or '%' to match any value." + ) # Escape special chars in the non-wildcard portion, then convert % → * # Split on %, escape each segment, rejoin with * parts = value.split("%") @@ -344,6 +354,7 @@ def build_tag_condition( field: str, operator: str, value: str | list[str], + negated: bool = False, ) -> str: """Build query syntax for TAG field conditions. @@ -351,11 +362,13 @@ def build_tag_condition( field: Field name. operator: One of =, !=, IN. value: Tag value or list of values for IN. + negated: If True, prefix with `-` for negation. Covers NOT IN + and NOT field = .... The `!=` operator is also honored. Returns: RediSearch query syntax like @field:{value} or @field:{v1|v2}. """ - prefix = "-" if operator == "!=" else "" + prefix = "-" if negated or operator == "!=" else "" if isinstance(value, list): # IN clause - join with | @@ -371,6 +384,7 @@ def build_numeric_condition( field: str, operator: str, value: int | float | tuple[int | float, int | float], + negated: bool = False, ) -> str: """Build query syntax for NUMERIC field conditions. @@ -378,11 +392,14 @@ def build_numeric_condition( field: Field name. operator: One of =, !=, <, <=, >, >=, BETWEEN. value: Numeric value or (min, max) tuple for BETWEEN. + negated: If True, prefix the resulting range with - so that + NOT field > x, NOT BETWEEN, etc. are honored. Without this, + NOT on comparison operators was silently dropped. Returns: RediSearch query syntax like @field:[min max]. """ - prefix = "-" if operator == "!=" else "" + prefix = "-" if negated or operator == "!=" else "" if operator == "BETWEEN": if isinstance(value, tuple): @@ -390,17 +407,17 @@ def build_numeric_condition( return f"{prefix}@{field}:[{min_val} {max_val}]" raise ValueError("BETWEEN operator requires a tuple (min, max)") elif operator == "=": - return f"@{field}:[{value} {value}]" + return f"{prefix}@{field}:[{value} {value}]" elif operator == "!=": - return f"-@{field}:[{value} {value}]" + return f"{prefix}@{field}:[{value} {value}]" elif operator == ">": - return f"@{field}:[({value} +inf]" + return f"{prefix}@{field}:[({value} +inf]" elif operator == ">=": - return f"@{field}:[{value} +inf]" + return f"{prefix}@{field}:[{value} +inf]" elif operator == "<": - return f"@{field}:[-inf ({value}]" + return f"{prefix}@{field}:[-inf ({value}]" elif operator == "<=": - return f"@{field}:[-inf {value}]" + return f"{prefix}@{field}:[-inf {value}]" else: raise ValueError(f"Unknown numeric operator: {operator}") diff --git a/sql_redis/translator.py b/sql_redis/translator.py index 0751a37..ac7673f 100644 --- a/sql_redis/translator.py +++ b/sql_redis/translator.py @@ -156,6 +156,10 @@ def _build_command(self, analyzed: AnalyzedQuery) -> TranslatedQuery: ) # Determine if we need FT.AGGREGATE + # Multi-key ORDER BY also requires FT.AGGREGATE: FT.SEARCH SORTBY + # accepts a single key, while FT.AGGREGATE SORTBY accepts multiple. + # Routing automatically prevents the silent drop of trailing keys + # that used to happen on the FT.SEARCH path. use_aggregate = ( len(analyzed.aggregations) > 0 or len(analyzed.groupby_fields) > 0 @@ -165,6 +169,7 @@ def _build_command(self, analyzed: AnalyzedQuery) -> TranslatedQuery: or len(analyzed.date_functions) > 0 or has_date_func_conditions or len(parsed.filters) > 0 # exists() in HAVING → FILTER + or len(parsed.orderby_fields) > 1 # multi-key ORDER BY ) # Build query string from conditions @@ -310,6 +315,15 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str: inorder=condition.inorder, ) elif field_type == "TAG": + # BETWEEN is meaningless for TAG values; RediSearch tags have no + # ordering, so 'a' <= status <= 'z' has no defined semantics. + # Previously the parser fell through and the builder emitted + # @status:{\('a'\, 'z'\)} (invalid). Surface the limitation. + if operator == "BETWEEN": + raise ValueError( + f"BETWEEN is not supported on TAG fields ('{condition.field}'); " + "TAG values are unordered. Use IN (...) for a set match." + ) # Keep list value for IN clauses, convert scalar to string value = ( condition.value @@ -320,8 +334,35 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str: condition.field, operator, value, + negated=is_negated, ) elif field_type == "NUMERIC": + # IN (...) on a NUMERIC field was previously handed a list value + # to build_numeric_condition, which then tried float([1,2,3]) and + # crashed. RediSearch has no native IN for NUMERIC; expand to a + # union of equality ranges (negated → AND of NOT-equals). + if operator == "IN": + if not isinstance(condition.value, list) or not condition.value: + raise ValueError( + f"IN on NUMERIC field '{condition.field}' requires a " + "non-empty list of values." + ) + parts: list[str] = [] + for item in condition.value: + item_num = self._convert_to_numeric(item) + parts.append( + self._query_builder.build_numeric_condition( + condition.field, "=", item_num, negated=is_negated + ) + ) + if len(parts) == 1: + return parts[0] + # NOT IN (...) → AND of negated equalities (De Morgan) + # IN (...) → OR of equalities + joiner = " " if is_negated else "|" + joined = joiner.join(parts) + return f"({joined})" + # Cast value to expected type for numeric conditions numeric_value: int | float | tuple[int | float, int | float] if isinstance(condition.value, tuple): @@ -345,6 +386,7 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str: condition.field, operator, numeric_value, + negated=is_negated, ) else: # GEO, VECTOR, and unknown field types - default to text search @@ -425,6 +467,9 @@ def _build_search( # SORTBY — skip if the ORDER BY field is a score() alias, because # WITHSCORES already returns results in relevance order and the alias # is not a sortable indexed field. + # Multi-key ORDER BY is routed to FT.AGGREGATE upstream (see + # use_aggregate in _build_command), so by the time we reach this + # branch parsed.orderby_fields has at most one entry. score_alias_name = parsed.scoring.alias if parsed.scoring else None if parsed.orderby_fields: field_name, direction = parsed.orderby_fields[0] @@ -522,6 +567,18 @@ def _build_aggregate( # Load fields referenced in exists() computed fields (SELECT) for computed in analyzed.computed_fields: self._extract_exists_fields(computed.expression, load_fields) + # Load ORDER BY fields so multi-key SORTBY works on non-SORTABLE + # columns. (SORTABLE fields are already in scope; loading them + # again is harmless.) Skip computed/derived aliases. + computed_aliases = {cf.alias for cf in analyzed.computed_fields} + computed_aliases.update(df.alias for df in analyzed.date_functions) + computed_aliases.update(gs.alias for gs in parsed.geo_distance_selects) + for field_name, _ in parsed.orderby_fields: + if ( + field_name in analyzed.field_types + and field_name not in computed_aliases + ): + load_fields.add(field_name) if load_all: args.extend(["LOAD", "*"]) diff --git a/tests/test_query_builder.py b/tests/test_query_builder.py index a98d40e..ac4d325 100644 --- a/tests/test_query_builder.py +++ b/tests/test_query_builder.py @@ -801,6 +801,99 @@ def test_or_preserves_optional_prefix(self): assert result == "@title:(laptop|~gaming)" +class TestQueryBuilderNegationBug: + """Regression tests for the v0.5.0 trial-user NOT-handling bug. + + `build_text_condition` already accepts negated; `build_tag_condition` and + `build_numeric_condition` historically only honored != via operator, so + `NOT field > x`, `NOT field IN (...)`, `NOT field BETWEEN ...` etc. were + silently dropped. These tests assert the builder applies the `-` prefix + when `negated=True` for every comparison operator. + """ + + def test_tag_in_negated(self): + """NOT IN on TAG → -@field:{a|b}.""" + builder = QueryBuilder() + result = builder.build_tag_condition("status", "IN", ["a", "b"], negated=True) + assert result == "-@status:{a|b}" + + def test_tag_equality_negated(self): + """NOT field = 'x' on TAG → -@field:{x}.""" + builder = QueryBuilder() + result = builder.build_tag_condition("status", "=", "a", negated=True) + assert result == "-@status:{a}" + + def test_tag_negated_plus_neq_single_dash(self): + """negated=True combined with operator='!=' yields one `-`, not two. + + The translator collapses double negation upstream, so the builder + normally never sees this state. The contract guards against accidental + double-prefixing if a caller passes both signals. + """ + builder = QueryBuilder() + result = builder.build_tag_condition("status", "!=", "a", negated=True) + assert result == "-@status:{a}" + + def test_numeric_gt_negated(self): + """NOT field > 5 on NUMERIC → -@field:[(5 +inf].""" + builder = QueryBuilder() + result = builder.build_numeric_condition("age", ">", 5, negated=True) + assert result == "-@age:[(5 +inf]" + + def test_numeric_gte_negated(self): + """NOT field >= 5 on NUMERIC → -@field:[5 +inf].""" + builder = QueryBuilder() + result = builder.build_numeric_condition("age", ">=", 5, negated=True) + assert result == "-@age:[5 +inf]" + + def test_numeric_lt_negated(self): + """NOT field < 5 on NUMERIC → -@field:[-inf (5].""" + builder = QueryBuilder() + result = builder.build_numeric_condition("age", "<", 5, negated=True) + assert result == "-@age:[-inf (5]" + + def test_numeric_lte_negated(self): + """NOT field <= 5 on NUMERIC → -@field:[-inf 5].""" + builder = QueryBuilder() + result = builder.build_numeric_condition("age", "<=", 5, negated=True) + assert result == "-@age:[-inf 5]" + + def test_numeric_between_negated(self): + """NOT BETWEEN on NUMERIC → -@field:[10 20].""" + builder = QueryBuilder() + result = builder.build_numeric_condition( + "age", "BETWEEN", (10, 20), negated=True + ) + assert result == "-@age:[10 20]" + + def test_numeric_equality_negated(self): + """NOT field = 5 on NUMERIC → -@field:[5 5].""" + builder = QueryBuilder() + result = builder.build_numeric_condition("age", "=", 5, negated=True) + assert result == "-@age:[5 5]" + + +class TestQueryBuilderTagPipeEscape: + """Regression test for unescaped | in TAG values. + + Without escaping, `status = 'a|b'` produces `@status:{a|b}` which + RediSearch reads as `a OR b`. The literal value `'a|b'` is lost. + """ + + def test_tag_pipe_is_escaped(self): + """TAG value with literal pipe must be escaped.""" + builder = QueryBuilder() + result = builder.build_tag_condition("status", "=", "a|b") + assert result == r"@status:{a\|b}" + + def test_tag_in_pipe_is_escaped(self): + """TAG IN with literal pipe inside a value escapes only the inner pipe.""" + builder = QueryBuilder() + result = builder.build_tag_condition("status", "IN", ["x|y", "z"]) + # Inner | must be escaped; the IN-separator | between values stays raw. + assert result == r"@status:{x\|y|z}" + + class TestQueryBuilderSlopValidation: """Tests for slop validation at the QueryBuilder level.""" diff --git a/tests/test_translator.py b/tests/test_translator.py index da85595..19c2f50 100644 --- a/tests/test_translator.py +++ b/tests/test_translator.py @@ -579,6 +579,213 @@ def test_double_negation_cancels(self, translator: Translator, basic_index: str) assert result.query_string == "@title:good" +class TestTranslatorTrialUserBug: + """Regression tests for the v0.5.0 trial-user bug report (error_spec.md). + + Critical: + - NOT is silently dropped for >, >=, <, <=, BETWEEN, IN on tag/numeric. + - TAG pipe | is not escaped, so 'a|b' is interpreted as 'a' OR 'b'. + + High: + - IN on NUMERIC crashes. + - LIKE '' produces invalid syntax. + - BETWEEN on TAG produces invalid syntax. + - Double-quoted value (identifier) becomes None. + + Medium: + - SELECT DISTINCT is silently ignored. + - Multi-column ORDER BY drops trailing keys. + """ + + def test_not_in_on_tag(self, translator: Translator, basic_index: str): + """NOT IN on TAG → -@status:{a|b} (excludes), not @status:{a|b} (matches).""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE status NOT IN ('a', 'b')" + ) + assert result.query_string == "-@status:{a|b}" + + def test_not_between_on_numeric(self, translator: Translator, basic_index: str): + """NOT BETWEEN on NUMERIC → -@price:[10 20] (outside range).""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE price NOT BETWEEN 10 AND 20" + ) + assert result.query_string == "-@price:[10 20]" + + def test_not_greater_than_on_numeric( + self, translator: Translator, basic_index: str + ): + """NOT field > 5 on NUMERIC → -@price:[(5 +inf] (i.e. price <= 5).""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE NOT price > 5" + ) + # Both shapes are semantically equivalent (NOT (price>5) == price<=5). + # Accept either the literal -[(5 +inf] form or the rewritten [-inf 5] form. + assert result.query_string in ("-@price:[(5 +inf]", "@price:[-inf 5]") + + def test_mixed_not_text_and_not_numeric( + self, translator: Translator, basic_index: str + ): + """The crystallizing example: only the = negation worked before; > was dropped. + + NOT title = 'x' AND NOT price > 50 must negate both sides. + """ + result = translator.translate( + f"SELECT * FROM {basic_index} " "WHERE NOT title = 'x' AND NOT price > 50" + ) + assert "-@title:x" in result.query_string + # The numeric NOT must also be applied (was previously dropped). + assert ( + "-@price:[(50 +inf]" in result.query_string + or "@price:[-inf 50]" in result.query_string + ) + + def test_tag_value_with_pipe_is_escaped( + self, translator: Translator, basic_index: str + ): + """status = 'a|b' must match the literal value 'a|b', not 'a' OR 'b'.""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE status = 'a|b'" + ) + assert result.query_string == r"@status:{a\|b}" + + def test_tag_in_with_pipe_inside_value( + self, translator: Translator, basic_index: str + ): + """IN ('x|y', 'z') must yield two values (x|y, z), not three.""" + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE status IN ('x|y', 'z')" + ) + assert result.query_string == r"@status:{x\|y|z}" + + def test_in_on_numeric_does_not_crash( + self, translator: Translator, basic_index: str + ): + """price IN (1, 2, 3) must not raise TypeError. + + Either a clear ValueError (rejecting IN on numeric) or a valid query + such as the union of equality ranges is acceptable; the crash on a + list-to-float coercion is not. + """ + try: + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE price IN (1, 2, 3)" + ) + except ValueError: + # Surfacing the limitation is acceptable + return + # If it succeeds, must not be the broken `[1 [1, 2, 3]]` shape. + assert "[1, 2, 3]" not in result.query_string + + def test_empty_like_does_not_produce_bare_colon( + self, translator: Translator, basic_index: str + ): + """LIKE '' must not emit a bare `@title:` (invalid RediSearch syntax). + + A clear ValueError is preferred over silent invalid output. + """ + try: + result = translator.translate( + f"SELECT * FROM {basic_index} WHERE title LIKE ''" + ) + except ValueError: + return + # If accepted, the query string must not contain the bare-colon shape. + assert not result.query_string.rstrip().endswith("@title:") + assert "@title: " not in result.query_string + + def test_between_on_tag_is_rejected(self, translator: Translator, basic_index: str): + """BETWEEN on a TAG field is meaningless and must raise. + + Previously produced @status:{\\('a'\\, 'z'\\)}, which is invalid. + """ + with pytest.raises(ValueError): + translator.translate( + f"SELECT * FROM {basic_index} " "WHERE status BETWEEN 'a' AND 'z'" + ) + + def test_double_quoted_value_does_not_become_none( + self, translator: Translator, basic_index: str + ): + """status = "active" (double-quoted SQL identifier) must not become None. + + sqlglot parses "active" as an identifier (exp.Column). The translator + previously called _extract_literal_value which returned None, yielding + @status:{None}. Either accept it as a string literal or raise; never + emit the literal token 'None' into the query. + """ + try: + result = translator.translate( + f'SELECT * FROM {basic_index} WHERE status = "active"' + ) + except ValueError: + return + assert "None" not in result.query_string + # If accepted, expect the value treated as a string literal: + assert "@status:{active}" in result.query_string + + def test_select_distinct_is_not_silently_ignored( + self, translator: Translator, basic_index: str + ): + """SELECT DISTINCT must not silently behave like a plain SELECT. + + Either: produce a query that deduplicates (e.g. GROUPBY on the + selected columns), or raise to surface that DISTINCT is unsupported. + Silently returning duplicate rows is the bug. + """ + try: + result = translator.translate(f"SELECT DISTINCT status FROM {basic_index}") + except ValueError: + return + # FT.SEARCH cannot dedupe; an FT.AGGREGATE with GROUPBY @status is the + # only sane way to honor DISTINCT here. + assert result.command == "FT.AGGREGATE" + assert "GROUPBY" in result.args + gb_idx = result.args.index("GROUPBY") + # Must group by @status + assert "@status" in result.args[gb_idx : gb_idx + 4] + + def test_multi_column_order_by_preserved( + self, translator: Translator, basic_index: str + ): + """ORDER BY a ASC, b DESC must not drop the trailing key. + + FT.SEARCH SORTBY only accepts one key; FT.AGGREGATE SORTBY accepts + multiple. The translator auto-routes multi-key ORDER BY to + FT.AGGREGATE so both keys survive. + """ + result = translator.translate( + f"SELECT * FROM {basic_index} ORDER BY price ASC, title DESC" + ) + # Multi-key ORDER BY triggers FT.AGGREGATE + assert result.command == "FT.AGGREGATE" + assert "SORTBY" in result.args + sb_idx = result.args.index("SORTBY") + # FT.AGGREGATE SORTBY format: SORTBY nargs @field dir @field dir + assert result.args[sb_idx + 1] == "4" # 2 keys * 2 (field+dir) + assert result.args[sb_idx + 2] == "@price" + assert result.args[sb_idx + 3] == "ASC" + assert result.args[sb_idx + 4] == "@title" + assert result.args[sb_idx + 5] == "DESC" + + def test_multi_column_order_by_loads_non_select_field( + self, translator: Translator, basic_index: str + ): + """When ORDER BY references a column outside SELECT, LOAD must + include it so the sort works on non-SORTABLE columns.""" + result = translator.translate( + f"SELECT title FROM {basic_index} " "ORDER BY price ASC, content DESC" + ) + assert result.command == "FT.AGGREGATE" + assert "LOAD" in result.args + load_idx = result.args.index("LOAD") + load_count = int(result.args[load_idx + 1]) + loaded = result.args[load_idx + 2 : load_idx + 2 + load_count] + # The ORDER BY columns must be loaded along with the SELECT column + assert "@price" in loaded + assert "@content" in loaded + assert "@title" in loaded + + class TestTranslatorOutput: """Tests for output format methods."""