[Vector Sets] IN operator for string/string operands (#14122)

This PR introduces "IN" overloading for strings in Vector Sets VSIM
FILTER expressions.
Now it is possible to do something like:

    "foo" IN "foobar"

IN continues to work as usual if the second operand is an array,
checking for membership of the left operand.

Ping @rowantrollope, who requested this feature. I'm evaluating whether
to add glob matching functionality via the `=~` operator, but I will
probably need to do an optimization round on our glob matching function
first. Glob matching can be slower; at the same time, the complexity of
the greedy search in the graph remains unchanged, so it may be a good
idea to have it.

Case-insensitive search will likely not be added, however, since this
would require handling Unicode, which is somewhat outside the scope of
Redis filters. The user is still able to perform `"foo" in "foobar" ||
"FOO" in "foobar"` at least.
This commit is contained in:
Salvatore Sanfilippo 2025-06-26 04:13:54 +02:00 committed by GitHub
parent a25f0a715e
commit 8948a5d2b2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 120 additions and 26 deletions

View file

@ -365,6 +365,18 @@ JSON attributes are converted in this way:
Any other type is ignored, and accessing it will make the expression evaluate to false.
### The IN operator
The `IN` operator works in two ways. It can test for membership in an array, as in:
5 in [1, 2, 3]
"foo" in [1, "foo", "bar"]
But it can also check for substrings, when the A and B operands are both strings:
"foo" in "barfoobar" # Will evaluate to true
"zap" in "foobar" # Will evaluate to false
### Examples
```

View file

@ -20,7 +20,6 @@
#define RedisModule_Assert assert
#define _DEFAULT_SOURCE
#define _USE_MATH_DEFINES
#define _POSIX_C_SOURCE 200809L
#include <assert.h>
#include <math.h>
#endif
@ -620,11 +619,12 @@ exprstate *exprCompile(char *expr, int *errpos) {
if (token->token_type == EXPR_TOKEN_EOF) break;
/* Handle values (numbers, strings, selectors). */
/* Handle values (numbers, strings, selectors, null). */
if (token->token_type == EXPR_TOKEN_NUM ||
token->token_type == EXPR_TOKEN_STR ||
token->token_type == EXPR_TOKEN_TUPLE ||
token->token_type == EXPR_TOKEN_SELECTOR)
token->token_type == EXPR_TOKEN_SELECTOR ||
token->token_type == EXPR_TOKEN_NULL)
{
exprStackPush(&es->program, token);
exprTokenRetain(token);
@ -734,6 +734,17 @@ int exprTokensEqual(exprtoken *a, exprtoken *b) {
return exprTokenToNum(a) == exprTokenToNum(b);
}
/* Substring test backing the string form of the IN operator: returns 1
 * when the bytes of token 'a' occur contiguously somewhere inside token
 * 'b', and 0 otherwise. Both tokens must be of type EXPR_TOKEN_STR (this
 * is asserted). The comparison is byte-wise via memcmp(), so it is case
 * sensitive, and a zero-length 'a' matches any 'b'. */
int exprTokensStringIn(exprtoken *a, exprtoken *b) {
    RedisModule_Assert(a->token_type == EXPR_TOKEN_STR &&
                       b->token_type == EXPR_TOKEN_STR);

    size_t needle_len = a->str.len;
    size_t hay_len = b->str.len;

    /* A needle longer than the haystack can never match. Rejecting it
     * here also guarantees the subtraction below cannot wrap around. */
    if (needle_len > hay_len) return 0;

    /* Highest starting offset at which the needle still fits in 'b'. */
    size_t last_start = hay_len - needle_len;

    size_t pos = 0;
    while (pos <= last_start) {
        if (!memcmp(b->str.start + pos, a->str.start, needle_len)) return 1;
        pos++;
    }
    return 0;
}
#include "fastjson.c" // JSON parser implementation used by exprRun().
/* Execute the compiled expression program. Returns 1 if the final stack value
@ -823,7 +834,9 @@ int exprRun(exprstate *es, char *json, size_t json_len) {
result->num = !exprTokensEqual(a, b) ? 1 : 0;
break;
case EXPR_OP_IN: {
// For 'in' operator, b must be a tuple.
/* For 'in' operator, b must be a tuple, and we check for
* membership. Otherwise both a and b must be strings, and
* in this case we check if a is a substring of b. */
result->num = 0; // Default to false.
if (b->token_type == EXPR_TOKEN_TUPLE) {
for (size_t j = 0; j < b->tuple.len; j++) {
@ -832,6 +845,10 @@ int exprRun(exprstate *es, char *json, size_t json_len) {
break;
}
}
} else if (a->token_type == EXPR_TOKEN_STR &&
b->token_type == EXPR_TOKEN_STR)
{
result->num = exprTokensStringIn(a,b);
}
break;
}

View file

@ -39,124 +39,189 @@ class VSIMFilterExpressions(TestCase):
self.redis.execute_command('VSETATTR', self.test_key, f'{self.test_key}:item:5',
'invalid json') # Intentionally malformed JSON
# Test 1: Basic equality with numbers
# Basic equality with numbers
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age == 25')
assert len(result) == 1, "Expected 1 result for age == 25"
assert result[0].decode() == f'{self.test_key}:item:1', "Expected item:1 for age == 25"
# Test 2: Greater than
# Greater than
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age > 25')
assert len(result) == 2, "Expected 2 results for age > 25"
# Test 3: Less than or equal
# Less than or equal
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age <= 30')
assert len(result) == 2, "Expected 2 results for age <= 30"
# Test 4: String equality
# String equality
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.name == "Alice"')
assert len(result) == 1, "Expected 1 result for name == Alice"
assert result[0].decode() == f'{self.test_key}:item:1', "Expected item:1 for name == Alice"
# Test 5: String inequality
# String inequality
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.name != "Alice"')
assert len(result) == 2, "Expected 2 results for name != Alice"
# Test 6: Boolean value
# Boolean value
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.active')
assert len(result) == 1, "Expected 1 result for .active being true"
# Test 7: Logical AND
# Logical AND
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age > 20 and .age < 30')
assert len(result) == 1, "Expected 1 result for 20 < age < 30"
assert result[0].decode() == f'{self.test_key}:item:1', "Expected item:1 for 20 < age < 30"
# Test 8: Logical OR
# Logical OR
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age < 30 or .age > 35')
assert len(result) == 1, "Expected 1 result for age < 30 or age > 35"
# Test 9: Logical NOT
# Logical NOT
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '!(.age == 25)')
assert len(result) == 2, "Expected 2 results for NOT(age == 25)"
# Test 10: The "in" operator with array
# The "in" operator with array
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age in [25, 35]')
assert len(result) == 2, "Expected 2 results for age in [25, 35]"
# Test 11: The "in" operator with strings in array
# The "in" operator with strings in array
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.name in ["Alice", "David"]')
assert len(result) == 1, "Expected 1 result for name in [Alice, David]"
# Test 12: Arithmetic operations - addition
# The "in" operator for substring matching
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '"lic" in .name')
assert len(result) == 1, "Expected 1 result for 'lic' in name"
assert result[0].decode() == f'{self.test_key}:item:1', "Expected item:1 (Alice)"
# The "in" operator with city substring
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '"ork" in .city')
assert len(result) == 1, "Expected 1 result for 'ork' in city"
assert result[0].decode() == f'{self.test_key}:item:1', "Expected item:1 (New York)"
# The "in" operator with no matches
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '"xyz" in .name')
assert len(result) == 0, "Expected 0 results for 'xyz' in name"
# Off-by-one tests - substring at the beginning
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '"Ali" in .name')
assert len(result) == 1, "Expected 1 result for 'Ali' at beginning of 'Alice'"
assert result[0].decode() == f'{self.test_key}:item:1', "Expected item:1"
# Off-by-one tests - substring at the end
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '"ice" in .name')
assert len(result) == 1, "Expected 1 result for 'ice' at end of 'Alice'"
assert result[0].decode() == f'{self.test_key}:item:1', "Expected item:1"
# Off-by-one tests - exact match (entire string)
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '"Alice" in .name')
assert len(result) == 1, "Expected 1 result for exact match 'Alice' in 'Alice'"
assert result[0].decode() == f'{self.test_key}:item:1', "Expected item:1"
# Off-by-one tests - single character
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '"A" in .name')
assert len(result) == 1, "Expected 1 result for single char 'A' in 'Alice'"
# Off-by-one tests - empty string (should match all strings)
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '"" in .name')
assert len(result) == 3, "Expected 3 results for empty string (matches all strings)"
# Off-by-one tests - non-empty strings are never substrings of ""
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.name in ""')
assert len(result) == 0, "Expected 0 results for empty string on the right of IN operator"
# Off-by-one tests - empty string matches empty string.
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '"" in .name && "" in ""')
assert len(result) == 3, "Expected empty string matching empty string"
# Arithmetic operations - addition
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age + 10 > 40')
assert len(result) == 1, "Expected 1 result for age + 10 > 40"
# Test 13: Arithmetic operations - multiplication
# Arithmetic operations - multiplication
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age * 2 > 60')
assert len(result) == 1, "Expected 1 result for age * 2 > 60"
# Test 14: Arithmetic operations - division
# Arithmetic operations - division
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age / 5 == 5')
assert len(result) == 1, "Expected 1 result for age / 5 == 5"
# Test 15: Arithmetic operations - modulo
# Arithmetic operations - modulo
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age % 2 == 0')
assert len(result) == 1, "Expected 1 result for age % 2 == 0"
# Test 16: Power operator
# Power operator
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age ** 2 > 900')
assert len(result) == 1, "Expected 1 result for age^2 > 900"
# Test 17: Missing attribute (should exclude items missing that attribute)
# Missing attribute (should exclude items missing that attribute)
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.missing_field == "value"')
assert len(result) == 0, "Expected 0 results for missing_field == value"
# Test 18: No attribute set at all
# No attribute set at all
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.any_field')
assert f'{self.test_key}:item:4' not in [item.decode() for item in result], "Item with no attribute should be excluded"
# Test 19: Malformed JSON
# Malformed JSON
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.any_field')
assert f'{self.test_key}:item:5' not in [item.decode() for item in result], "Item with malformed JSON should be excluded"
# Test 20: Complex expression combining multiple operators
# Complex expression combining multiple operators
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '(.age > 20 and .age < 40) and (.city == "Boston" or .city == "New York")')
@ -164,13 +229,13 @@ class VSIMFilterExpressions(TestCase):
expected_items = [f'{self.test_key}:item:1', f'{self.test_key}:item:2']
assert set([item.decode() for item in result]) == set(expected_items), "Expected item:1 and item:2 for the complex expression"
# Test 21: Parentheses to control operator precedence
# Parentheses to control operator precedence
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.age > (20 + 10)')
assert len(result) == 1, "Expected 1 result for age > (20 + 10)"
# Test 22: Array access (arrays evaluate to true)
# Array access (arrays evaluate to true)
result = self.redis.execute_command('VSIM', self.test_key, 'VALUES', 4,
*[str(x) for x in vec1],
'FILTER', '.scores')