diff --git a/redisvl/query/filter.py b/redisvl/query/filter.py index ced52520..d489d935 100644 --- a/redisvl/query/filter.py +++ b/redisvl/query/filter.py @@ -73,6 +73,24 @@ def _set_value( self._value = val self._operator = operator + def is_missing(self) -> "FilterExpression": + """Create a filter expression for documents missing this field. + + Returns: + FilterExpression: A filter expression that matches documents where the field is missing. + + .. code-block:: python + + from redisvl.query.filter import Tag, Text, Num, Geo, Timestamp + + f = Tag("brand").is_missing() + f = Text("title").is_missing() + f = Num("price").is_missing() + f = Geo("location").is_missing() + f = Timestamp("created_at").is_missing() + """ + return FilterExpression(f"ismissing(@{self._field})") + def check_operator_misuse(func: Callable) -> Callable: @wraps(func) diff --git a/redisvl/redis/connection.py b/redisvl/redis/connection.py index 7bdbcdfa..5ec562d0 100644 --- a/redisvl/redis/connection.py +++ b/redisvl/redis/connection.py @@ -20,7 +20,7 @@ from redisvl.version import __version__ -def compare_versions(version1, version2): +def compare_versions(version1: str, version2: str): """ Compare two Redis version strings numerically. @@ -105,19 +105,27 @@ def parse_attrs(attrs): # TODO 'WITHSUFFIXTRIE' is another boolean attr, but is not returned by ft.info original = attrs.copy() parsed_attrs = {} - if "NOSTEM" in attrs: - parsed_attrs["no_stem"] = True - attrs.remove("NOSTEM") - if "CASESENSITIVE" in attrs: - parsed_attrs["case_sensitive"] = True - attrs.remove("CASESENSITIVE") - if "SORTABLE" in attrs: - parsed_attrs["sortable"] = True - attrs.remove("SORTABLE") - if "UNF" in attrs: - attrs.remove("UNF") # UNF present on sortable numeric fields only + + # Handle all boolean attributes first, regardless of position + boolean_attrs = { + "NOSTEM": "no_stem", + "CASESENSITIVE": "case_sensitive", + "SORTABLE": "sortable", + "INDEXMISSING": "index_missing", + "INDEXEMPTY": "index_empty", + } + + for redis_attr, python_attr in boolean_attrs.items(): + if redis_attr in attrs: + parsed_attrs[python_attr] = True + attrs.remove(redis_attr) + + # Handle UNF which is associated with SORTABLE + if "UNF" in attrs: + attrs.remove("UNF") # UNF present on sortable numeric fields only try: + # Parse remaining attributes as key-value pairs starting from index 6 parsed_attrs.update( {attrs[i].lower(): attrs[i + 1] for i in range(6, len(attrs), 2)} ) diff --git a/redisvl/schema/fields.py b/redisvl/schema/fields.py index 1df294d8..f36ed465 100644 --- a/redisvl/schema/fields.py +++ b/redisvl/schema/fields.py @@ -61,6 +61,8 @@ class BaseFieldAttributes(BaseModel): sortable: bool = Field(default=False) """Enable faster result sorting on the field at runtime""" + index_missing: bool = Field(default=False) + """Allow indexing and searching for missing values (documents without the field)""" class TextFieldAttributes(BaseFieldAttributes): @@ -74,6 +76,8 @@ class TextFieldAttributes(BaseFieldAttributes): """Keep a suffix trie with all terms which match the suffix to optimize certain queries""" phonetic_matcher: Optional[str] = None """Used to perform phonetic matching during search""" + index_empty: bool = Field(default=False) + """Allow indexing and searching for empty strings""" class TagFieldAttributes(BaseFieldAttributes): @@ -85,6 +89,8 @@ class TagFieldAttributes(BaseFieldAttributes): """Treat text as case sensitive or not. By default, tag characters are converted to lowercase""" withsuffixtrie: bool = Field(default=False) """Keep a suffix trie with all terms which match the suffix to optimize certain queries""" + index_empty: bool = Field(default=False) + """Allow indexing and searching for empty strings""" class NumericFieldAttributes(BaseFieldAttributes): @@ -112,6 +118,8 @@ class BaseVectorFieldAttributes(BaseModel): """The distance metric used to measure query relevance""" initial_cap: Optional[int] = None """Initial vector capacity in the index affecting memory allocation size of the index""" + index_missing: bool = Field(default=False) + """Allow indexing and searching for missing values (documents without the field)""" @field_validator("algorithm", "datatype", "distance_metric", mode="before") @classmethod @@ -129,6 +137,8 @@ def field_data(self) -> Dict[str, Any]: } if self.initial_cap is not None: # Only include it if it's set field_data["INITIAL_CAP"] = self.initial_cap + if self.index_missing: # Only include it if it's set + field_data["INDEXMISSING"] = True return field_data @@ -190,14 +200,30 @@ class TextField(BaseField): def as_redis_field(self) -> RedisField: name, as_name = self._handle_names() - return RedisTextField( - name, - as_name=as_name, - weight=self.attrs.weight, # type: ignore - no_stem=self.attrs.no_stem, # type: ignore - phonetic_matcher=self.attrs.phonetic_matcher, # type: ignore - sortable=self.attrs.sortable, - ) + # Build arguments for RedisTextField + kwargs: Dict[str, Any] = { + "weight": self.attrs.weight, # type: ignore + "no_stem": self.attrs.no_stem, # type: ignore + "sortable": self.attrs.sortable, + } + + # Only add as_name if it's not None + if as_name is not None: + kwargs["as_name"] = as_name + + # Only add phonetic_matcher if it's not None + if self.attrs.phonetic_matcher is not None: # type: ignore + kwargs["phonetic_matcher"] = self.attrs.phonetic_matcher # type: ignore + + # Add INDEXMISSING if enabled + if self.attrs.index_missing: # type: ignore + kwargs["index_missing"] = True + + # Add INDEXEMPTY if enabled + if self.attrs.index_empty: # type: ignore + kwargs["index_empty"] = True + + return RedisTextField(name, **kwargs) class TagField(BaseField): @@ -208,13 +234,26 @@ class TagField(BaseField): def as_redis_field(self) -> RedisField: name, as_name = self._handle_names() - return RedisTagField( - name, - as_name=as_name, - separator=self.attrs.separator, # type: ignore - case_sensitive=self.attrs.case_sensitive, # type: ignore - sortable=self.attrs.sortable, - ) + # Build arguments for RedisTagField + kwargs: Dict[str, Any] = { + "separator": self.attrs.separator, # type: ignore + "case_sensitive": self.attrs.case_sensitive, # type: ignore + "sortable": self.attrs.sortable, + } + + # Only add as_name if it's not None + if as_name is not None: + kwargs["as_name"] = as_name + + # Add INDEXMISSING if enabled + if self.attrs.index_missing: # type: ignore + kwargs["index_missing"] = True + + # Add INDEXEMPTY if enabled + if self.attrs.index_empty: # type: ignore + kwargs["index_empty"] = True + + return RedisTagField(name, **kwargs) class NumericField(BaseField): @@ -225,11 +264,20 @@ class NumericField(BaseField): def as_redis_field(self) -> RedisField: name, as_name = self._handle_names() - return RedisNumericField( - name, - as_name=as_name, - sortable=self.attrs.sortable, - ) + # Build arguments for RedisNumericField + kwargs: Dict[str, Any] = { + "sortable": self.attrs.sortable, + } + + # Only add as_name if it's not None + if as_name is not None: + kwargs["as_name"] = as_name + + # Add INDEXMISSING if enabled + if self.attrs.index_missing: # type: ignore + kwargs["index_missing"] = True + + return RedisNumericField(name, **kwargs) class GeoField(BaseField): @@ -240,11 +288,20 @@ class GeoField(BaseField): def as_redis_field(self) -> RedisField: name, as_name = self._handle_names() - return RedisGeoField( - name, - as_name=as_name, - sortable=self.attrs.sortable, - ) + # Build arguments for RedisGeoField + kwargs: Dict[str, Any] = { + "sortable": self.attrs.sortable, + } + + # Only add as_name if it's not None + if as_name is not None: + kwargs["as_name"] = as_name + + # Add INDEXMISSING if enabled + if self.attrs.index_missing: # type: ignore + kwargs["index_missing"] = True + + return RedisGeoField(name, **kwargs) class FlatVectorField(BaseField): diff --git a/redisvl/schema/schema.py b/redisvl/schema/schema.py index 90617d18..443a5ce1 100644 --- a/redisvl/schema/schema.py +++ b/redisvl/schema/schema.py @@ -432,11 +432,14 @@ def to_dict(self) -> Dict[str, Any]: Returns: Dict[str, Any]: The index schema as a dictionary. """ - dict_schema = model_to_dict(self) - # cast fields back to a pure list - dict_schema["fields"] = [ - field for field_name, field in dict_schema["fields"].items() - ] + # Manually serialize to ensure all field attributes are preserved + dict_schema = { + "index": model_to_dict(self.index), + "fields": [ + model_to_dict(field) for field_name, field in self.fields.items() + ], + "version": self.version, + } return dict_schema def to_yaml(self, file_path: str, overwrite: bool = True) -> None: diff --git a/redisvl/utils/utils.py b/redisvl/utils/utils.py index 3aeb8b15..c56048e4 100644 --- a/redisvl/utils/utils.py +++ b/redisvl/utils/utils.py @@ -38,6 +38,10 @@ def model_to_dict(model: BaseModel) -> Dict[str, Any]: def serialize_item(item): if isinstance(item, Enum): return item.value.lower() + elif isinstance(item, BaseModel): + # Recursively serialize nested BaseModel instances with exclude_defaults=False + nested_data = item.model_dump(exclude_none=True, exclude_defaults=False) + return {key: serialize_item(value) for key, value in nested_data.items()} elif isinstance(item, dict): return {key: serialize_item(value) for key, value in item.items()} elif isinstance(item, list): @@ -45,7 +49,8 @@ def serialize_item(item): else: return item - serialized_data = model.model_dump(exclude_none=True) + # Use exclude_defaults=False to preserve all field attributes including new ones + serialized_data = model.model_dump(exclude_none=True, exclude_defaults=False) for key, value in serialized_data.items(): serialized_data[key] = serialize_item(value) return serialized_data @@ -170,29 +175,51 @@ def wrapper(*args, **kwargs): def sync_wrapper(fn: Callable[[], Coroutine[Any, Any, Any]]) -> Callable[[], None]: def wrapper(): + # Check if the interpreter is shutting down + if sys is None or getattr(sys, "_getframe", None) is None: + # Interpreter is shutting down, skip cleanup + return + try: loop = asyncio.get_running_loop() except RuntimeError: loop = None + except Exception: + # Any other exception during loop detection means we should skip cleanup + return + try: if loop is None or not loop.is_running(): + # Check if asyncio module is still available + if asyncio is None: + return + loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) task = loop.create_task(fn()) loop.run_until_complete(task) - except RuntimeError: + except (RuntimeError, AttributeError, TypeError) as e: # This could happen if an object stored an event loop and now - # that event loop is closed. There's nothing we can do other than - # advise the user to use explicit cleanup methods. + # that event loop is closed, or if asyncio modules are being + # torn down during interpreter shutdown. # # Uses logging module instead of get_logger() to avoid I/O errors # if the wrapped function is called as a finalizer. - logging.info( - f"Could not run the async function {fn.__name__} because the event loop is closed. " - "This usually means the object was not properly cleaned up. Please use explicit " - "cleanup methods (e.g., disconnect(), close()) or use the object as an async " - "context manager.", - ) + if logging is not None: + try: + logging.info( + f"Could not run the async function {fn.__name__} because the event loop is closed " + "or the interpreter is shutting down. " + "This usually means the object was not properly cleaned up. Please use explicit " + "cleanup methods (e.g., disconnect(), close()) or use the object as an async " + "context manager.", + ) + except Exception: + # Even logging failed, interpreter is really shutting down + pass + return + except Exception: + # Any other unexpected exception should be silently ignored during shutdown return return wrapper diff --git a/tests/conftest.py b/tests/conftest.py index c4c708cd..b6b27746 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,8 +6,9 @@ import pytest from testcontainers.compose import DockerCompose +from redisvl.exceptions import RedisModuleVersionError from redisvl.index.index import AsyncSearchIndex, SearchIndex -from redisvl.redis.connection import RedisConnectionFactory +from redisvl.redis.connection import RedisConnectionFactory, compare_versions from redisvl.redis.utils import array_to_buffer from redisvl.utils.vectorize import HFTextVectorizer @@ -565,3 +566,77 @@ def hash_preprocess(item: dict) -> dict: # run the test yield index + + +# Version checking utilities +def get_redis_version(client): + """Get Redis version from client info.""" + return client.info()["redis_version"] + + +async def get_redis_version_async(client): + """Get Redis version from async client info.""" + info = await client.info() + return info["redis_version"] + + +def skip_if_redis_version_below(client, min_version: str, message: str = None): + """ + Skip test if Redis version is below minimum required. + + Args: + client: Redis client instance + min_version: Minimum required Redis version + message: Custom skip message + """ + redis_version = get_redis_version(client) + if not compare_versions(redis_version, min_version): + skip_msg = message or f"Redis version {redis_version} < {min_version} required" + pytest.skip(skip_msg) + + +async def skip_if_redis_version_below_async( + client, min_version: str, message: str = None +): + """ + Skip test if Redis version is below minimum required (async version). + + Args: + client: Async Redis client instance + min_version: Minimum required Redis version + message: Custom skip message + """ + redis_version = await get_redis_version_async(client) + if not compare_versions(redis_version, min_version): + skip_msg = message or f"Redis version {redis_version} < {min_version} required" + pytest.skip(skip_msg) + + +def skip_if_module_version_error(func, *args, **kwargs): + """ + Execute function and skip test if RedisModuleVersionError is raised. + + Args: + func: Function to execute + *args: Arguments for the function + **kwargs: Keyword arguments for the function + """ + try: + return func(*args, **kwargs) + except RedisModuleVersionError: + pytest.skip("Required Redis modules not available or version too low") + + +async def skip_if_module_version_error_async(func, *args, **kwargs): + """ + Execute async function and skip test if RedisModuleVersionError is raised. + + Args: + func: Async function to execute + *args: Arguments for the function + **kwargs: Keyword arguments for the function + """ + try: + return await func(*args, **kwargs) + except RedisModuleVersionError: + pytest.skip("Required Redis modules not available or version too low") diff --git a/tests/docker-compose.yml b/tests/docker-compose.yml index 1441e30e..f301d08f 100644 --- a/tests/docker-compose.yml +++ b/tests/docker-compose.yml @@ -1,7 +1,6 @@ -version: "3.9" services: redis: - image: "${REDIS_IMAGE}" + image: "${REDIS_IMAGE:-redis/redis-stack-server:latest}" ports: - "6379" environment: diff --git a/tests/integration/test_aggregation.py b/tests/integration/test_aggregation.py index 73625cb9..3561b1de 100644 --- a/tests/integration/test_aggregation.py +++ b/tests/integration/test_aggregation.py @@ -2,9 +2,9 @@ from redisvl.index import SearchIndex from redisvl.query import HybridQuery -from redisvl.query.filter import Geo, GeoRadius, Num, Tag, Text -from redisvl.redis.connection import compare_versions +from redisvl.query.filter import FilterExpression, Geo, GeoRadius, Num, Tag, Text from redisvl.redis.utils import array_to_buffer +from tests.conftest import skip_if_redis_version_below @pytest.fixture @@ -58,9 +58,7 @@ def hash_preprocess(item: dict) -> dict: def test_aggregation_query(index): - redis_version = index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.2.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" text_field = "description" @@ -139,9 +137,7 @@ def test_empty_query_string(): def test_aggregation_query_with_filter(index): - redis_version = index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.2.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" text_field = "description" @@ -167,9 +163,7 @@ def test_aggregation_query_with_filter(index): def test_aggregation_query_with_geo_filter(index): - redis_version = index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.2.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" text_field = "description" @@ -195,9 +189,7 @@ def test_aggregation_query_with_geo_filter(index): @pytest.mark.parametrize("alpha", [0.1, 0.5, 0.9]) def test_aggregate_query_alpha(index, alpha): - redis_version = index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.2.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" text_field = "description" @@ -224,9 +216,7 @@ def test_aggregate_query_alpha(index, alpha): def test_aggregate_query_stopwords(index): - redis_version = index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.2.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" text_field = "description" @@ -260,9 +250,7 @@ def test_aggregate_query_stopwords(index): def test_aggregate_query_with_text_filter(index): - redis_version = index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.2.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(index.client, "7.2.0") text = "a medical professional with expertise in lung cancer" text_field = "description" diff --git a/tests/integration/test_async_search_index.py b/tests/integration/test_async_search_index.py index 94da698a..3c19a23d 100644 --- a/tests/integration/test_async_search_index.py +++ b/tests/integration/test_async_search_index.py @@ -22,8 +22,10 @@ @pytest.fixture -def index_schema(): - return IndexSchema.from_dict({"index": {"name": "my_index"}, "fields": fields}) +def index_schema(worker_id): + return IndexSchema.from_dict( + {"index": {"name": f"my_index_{worker_id}"}, "fields": fields} + ) @pytest.fixture @@ -55,7 +57,8 @@ def async_index_from_yaml(worker_id): def test_search_index_properties(index_schema, async_index): assert async_index.schema == index_schema # custom settings - assert async_index.name == index_schema.index.name == "my_index" + assert async_index.name == index_schema.index.name + assert async_index.name.startswith("my_index_") assert async_index.client # default settings assert async_index.prefix == index_schema.index.prefix == "rvl" diff --git a/tests/integration/test_connection.py b/tests/integration/test_connection.py index 8e2d2ea8..39807a46 100644 --- a/tests/integration/test_connection.py +++ b/tests/integration/test_connection.py @@ -8,13 +8,16 @@ from redisvl.exceptions import RedisModuleVersionError from redisvl.redis.connection import ( RedisConnectionFactory, - compare_versions, convert_index_info_to_schema, unpack_redis_modules, validate_modules, ) from redisvl.schema import IndexSchema from redisvl.version import __version__ +from tests.conftest import ( + skip_if_redis_version_below, + skip_if_redis_version_below_async, +) EXPECTED_LIB_NAME = f"redis-py(redisvl_v{__version__})" @@ -166,9 +169,7 @@ def test_unknown_redis(self): def test_validate_redis(client): - redis_version = client.info()["redis_version"] - if not compare_versions(redis_version, "7.2.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(client, "7.2.0") RedisConnectionFactory.validate_sync_redis(client) lib_name = client.client_info() assert lib_name["lib-name"] == EXPECTED_LIB_NAME @@ -176,18 +177,14 @@ def test_validate_redis(client): @pytest.mark.asyncio async def test_validate_async_redis(async_client): - redis_version = (await async_client.info())["redis_version"] - if not compare_versions(redis_version, "7.2.0"): - pytest.skip("Not using a late enough version of Redis") + await skip_if_redis_version_below_async(async_client, "7.2.0") await RedisConnectionFactory.validate_async_redis(async_client) lib_name = await async_client.client_info() assert lib_name["lib-name"] == EXPECTED_LIB_NAME def test_validate_redis_custom_lib_name(client): - redis_version = client.info()["redis_version"] - if not compare_versions(redis_version, "7.2.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(client, "7.2.0") RedisConnectionFactory.validate_sync_redis(client, "langchain_v0.1.0") lib_name = client.client_info() assert lib_name["lib-name"] == f"redis-py(redisvl_v{__version__};langchain_v0.1.0)" @@ -195,9 +192,7 @@ def test_validate_redis_custom_lib_name(client): @pytest.mark.asyncio async def test_validate_async_redis_custom_lib_name(async_client): - redis_version = (await async_client.info())["redis_version"] - if not compare_versions(redis_version, "7.2.0"): - pytest.skip("Not using a late enough version of Redis") + await skip_if_redis_version_below_async(async_client, "7.2.0") await RedisConnectionFactory.validate_async_redis(async_client, "langchain_v0.1.0") lib_name = await async_client.client_info() assert lib_name["lib-name"] == f"redis-py(redisvl_v{__version__};langchain_v0.1.0)" diff --git a/tests/integration/test_llmcache.py b/tests/integration/test_llmcache.py index 9a16f17c..6da425de 100644 --- a/tests/integration/test_llmcache.py +++ b/tests/integration/test_llmcache.py @@ -12,6 +12,7 @@ from redisvl.index.index import AsyncSearchIndex, SearchIndex from redisvl.query.filter import Num, Tag, Text from redisvl.utils.vectorize import HFTextVectorizer +from tests.conftest import skip_if_module_version_error @pytest.fixture(scope="session") @@ -915,23 +916,25 @@ def test_no_key_collision_on_identical_prompts(redis_url, worker_id, hf_vectoriz assert len(filtered_results) == 2 -def test_create_cache_with_different_vector_types(worker_id): +def test_create_cache_with_different_vector_types(worker_id, redis_url): try: - bfloat_cache = SemanticCache(name=f"bfloat_cache_{worker_id}", dtype="bfloat16") + bfloat_cache = SemanticCache( + name=f"bfloat_cache_{worker_id}", dtype="bfloat16", redis_url=redis_url + ) bfloat_cache.store("bfloat16 prompt", "bfloat16 response") float16_cache = SemanticCache( - name=f"float16_cache_{worker_id}", dtype="float16" + name=f"float16_cache_{worker_id}", dtype="float16", redis_url=redis_url ) float16_cache.store("float16 prompt", "float16 response") float32_cache = SemanticCache( - name=f"float32_cache_{worker_id}", dtype="float32" + name=f"float32_cache_{worker_id}", dtype="float32", redis_url=redis_url ) float32_cache.store("float32 prompt", "float32 response") float64_cache = SemanticCache( - name=f"float64_cache_{worker_id}", dtype="float64" + name=f"float64_cache_{worker_id}", dtype="float64", redis_url=redis_url ) float64_cache.store("float64 prompt", "float64 response") @@ -939,20 +942,23 @@ def test_create_cache_with_different_vector_types(worker_id): cache.set_threshold(0.6) assert len(cache.check("float prompt", num_results=5)) == 1 except: - pytest.skip("Not using a late enough version of Redis") + pytest.skip("Required Redis modules not available or version too low") def test_bad_dtype_connecting_to_existing_cache(redis_url, worker_id): - try: - cache = SemanticCache( + def create_cache(): + return SemanticCache( name=f"float64_cache_{worker_id}", dtype="float64", redis_url=redis_url ) - same_type = SemanticCache( + + def create_same_type(): + return SemanticCache( name=f"float64_cache_{worker_id}", dtype="float64", redis_url=redis_url ) - # under the hood uses from_existing - except RedisModuleVersionError: - pytest.skip("Not using a late enough version of Redis") + + cache = skip_if_module_version_error(create_cache) + same_type = skip_if_module_version_error(create_same_type) + # under the hood uses from_existing with pytest.raises(ValueError): bad_type = SemanticCache( diff --git a/tests/integration/test_message_history.py b/tests/integration/test_message_history.py index c23e691e..716c8edd 100644 --- a/tests/integration/test_message_history.py +++ b/tests/integration/test_message_history.py @@ -6,6 +6,7 @@ from redisvl.exceptions import RedisModuleVersionError from redisvl.extensions.constants import ID_FIELD_NAME from redisvl.extensions.message_history import MessageHistory, SemanticMessageHistory +from tests.conftest import skip_if_module_version_error @pytest.fixture @@ -558,38 +559,50 @@ def test_semantic_drop(semantic_history): ] -def test_different_vector_dtypes(): +def test_different_vector_dtypes(redis_url): try: - bfloat_sess = SemanticMessageHistory(name="bfloat_history", dtype="bfloat16") + bfloat_sess = SemanticMessageHistory( + name="bfloat_history", dtype="bfloat16", redis_url=redis_url + ) bfloat_sess.add_message({"role": "user", "content": "bfloat message"}) - float16_sess = SemanticMessageHistory(name="float16_history", dtype="float16") + float16_sess = SemanticMessageHistory( + name="float16_history", dtype="float16", redis_url=redis_url + ) float16_sess.add_message({"role": "user", "content": "float16 message"}) - float32_sess = SemanticMessageHistory(name="float32_history", dtype="float32") + float32_sess = SemanticMessageHistory( + name="float32_history", dtype="float32", redis_url=redis_url + ) float32_sess.add_message({"role": "user", "content": "float32 message"}) - float64_sess = SemanticMessageHistory(name="float64_history", dtype="float64") + float64_sess = SemanticMessageHistory( + name="float64_history", dtype="float64", redis_url=redis_url + ) float64_sess.add_message({"role": "user", "content": "float64 message"}) for sess in [bfloat_sess, float16_sess, float32_sess, float64_sess]: sess.set_distance_threshold(0.7) assert len(sess.get_relevant("float message")) == 1 + sess.delete() # Clean up except: - pytest.skip("Not using a late enough version of Redis") + pytest.skip("Required Redis modules not available or version too low") -def test_bad_dtype_connecting_to_exiting_history(redis_url, hf_vectorizer): - try: - history = SemanticMessageHistory( +def test_bad_dtype_connecting_to_exiting_history(redis_url): + def create_history(): + return SemanticMessageHistory( name="float64 history", dtype="float64", redis_url=redis_url ) - same_type = SemanticMessageHistory( + + def create_same_type(): + return SemanticMessageHistory( name="float64 history", dtype="float64", redis_url=redis_url ) - # under the hood uses from_existing - except RedisModuleVersionError: - pytest.skip("Not using a late enough version of Redis") + + history = skip_if_module_version_error(create_history) + same_type = skip_if_module_version_error(create_same_type) + # under the hood uses from_existing with pytest.raises(ValueError): bad_type = SemanticMessageHistory( diff --git a/tests/integration/test_query.py b/tests/integration/test_query.py index 82310622..bbd56339 100644 --- a/tests/integration/test_query.py +++ b/tests/integration/test_query.py @@ -1,3 +1,4 @@ +import os import uuid from datetime import timedelta @@ -23,8 +24,7 @@ Timestamp, ) from redisvl.redis.utils import array_to_buffer - -# TODO expand to multiple schema types and sync + async +from tests.conftest import skip_if_redis_version_below @pytest.fixture @@ -923,3 +923,400 @@ def test_vector_query_with_ef_runtime_flat_index(flat_index, vector_query, sampl # However, the index should raise an error if EF_RUNTIME is set on a flat index. with pytest.raises(QueryValidationError): # noqa: F821 flat_index.query(vector_query) + + +@pytest.fixture +def missing_fields_index(worker_id, client): + """Create an index with INDEXMISSING and INDEXEMPTY enabled fields and test data.""" + skip_if_redis_version_below(client, "7.2.0") + + # Create an index with INDEXMISSING enabled fields (filterable fields only) + missing_index = SearchIndex.from_dict( + { + "index": { + "name": f"missing_test_index_{worker_id}", + "prefix": f"missing_{worker_id}", + "storage_type": "hash", + }, + "fields": [ + # Text field with both INDEXMISSING and INDEXEMPTY + { + "name": "title", + "type": "text", + "attrs": {"index_missing": True, "index_empty": True}, + }, + # Tag field with both INDEXMISSING and INDEXEMPTY + { + "name": "category", + "type": "tag", + "attrs": {"index_missing": True, "index_empty": True}, + }, + # Numeric field with INDEXMISSING + {"name": "price", "type": "numeric", "attrs": {"index_missing": True}}, + # Geo field with INDEXMISSING + {"name": "location", "type": "geo", "attrs": {"index_missing": True}}, + # Regular field without INDEXMISSING for comparison + {"name": "description", "type": "text"}, + ], + }, + redis_client=client, + ) + + # Create the index + missing_index.create(overwrite=True) + + # Load test data with different missing field scenarios + test_data = [ + { + "id": "complete", + "title": "Complete Document", + "category": "electronics", + "price": 99, + "location": "37.7749,-122.4194", + "description": "A complete document with all fields", + }, + { + "id": "empty_strings", + "title": "", # Empty title + "category": "", # Empty category + "price": 150, + "location": "40.7128,-74.0060", + "description": "Document with empty string values", + }, + { + "id": "partial_missing", + "title": "Partial Document", + # missing category field + "price": 75, + # missing location field + "description": "Document missing some fields", + }, + { + "id": "mostly_missing", + # missing title, category, price, location fields + "description": "Document with most fields missing", + }, + { + "id": "zero_price", + "title": "Zero Price Item", + "category": "free", + "price": 0, # Valid zero value + "location": "34.0522,-118.2437", + "description": "Document with zero price", + }, + ] + + missing_index.load(test_data, id_field="id") + + yield missing_index, f"missing_{worker_id}" + + # Clean up + missing_index.delete(drop=True) + + +def test_basic_missing_field_queries(missing_fields_index): + """ + Test the fundamental is_missing() functionality across all supported field types. + + This test validates that Redis v2.10's INDEXMISSING feature works correctly for: + - Text fields: Search for documents completely missing a text field + - Tag fields: Search for documents missing tag/categorical data + - Numeric fields: Search for documents missing numerical values + - Geo fields: Search for documents missing location data + + Why this matters: INDEXMISSING enables data quality checks, incomplete record + identification, and conditional processing based on field presence. This is the + foundation test ensuring the core functionality works for each filterable field type. + """ + missing_index, prefix = missing_fields_index + + # Test missing text field + missing_title_query = FilterQuery( + filter_expression=Text("title").is_missing(), + return_fields=["id", "description"], + ) + results = missing_index.query(missing_title_query) + assert len(results) == 1 + assert results[0]["id"] == f"{prefix}:mostly_missing" + + # Test missing tag field + missing_category_query = FilterQuery( + filter_expression=Tag("category").is_missing(), + return_fields=["id", "description"], + ) + results = missing_index.query(missing_category_query) + assert len(results) == 2 # partial_missing and mostly_missing + result_ids = {result["id"] for result in results} + assert f"{prefix}:partial_missing" in result_ids + assert f"{prefix}:mostly_missing" in result_ids + + # Test missing numeric field + missing_price_query = FilterQuery( + filter_expression=Num("price").is_missing(), return_fields=["id", "description"] + ) + results = missing_index.query(missing_price_query) + assert len(results) == 1 + assert results[0]["id"] == f"{prefix}:mostly_missing" + + # Test missing geo field + missing_location_query = FilterQuery( + filter_expression=Geo("location").is_missing(), + return_fields=["id", "description"], + ) + results = missing_index.query(missing_location_query) + assert len(results) == 2 # partial_missing and mostly_missing + result_ids = {result["id"] for result in results} + assert f"{prefix}:partial_missing" in result_ids + assert f"{prefix}:mostly_missing" in result_ids + + +def test_missing_vs_empty_field_distinction(missing_fields_index): + """ + Test the critical distinction between missing fields and empty string values. + + This test validates that Redis v2.10's INDEXEMPTY and INDEXMISSING features work + differently: + - INDEXMISSING: Finds documents where the field is completely absent + - INDEXEMPTY: Enables searching for empty string values ("") in TEXT/TAG fields + + Why this matters: Applications need to distinguish between "no data provided" + (missing field) vs "empty data provided" (empty string). This enables more + sophisticated data validation and quality checks. For example, a user might + submit a form with an empty title field vs not providing a title at all. + """ + missing_index, prefix = missing_fields_index + + # Find documents missing the title field entirely (field not present) + missing_title_query = FilterQuery( + filter_expression=Text("title").is_missing(), + return_fields=["id", "description"], + ) + results = missing_index.query(missing_title_query) + assert len(results) == 1 + assert results[0]["id"] == f"{prefix}:mostly_missing" + + # Verify that documents with empty strings are NOT found by is_missing() + # Empty string documents have the field present but with "" value + for result in results: + assert result["id"] != f"{prefix}:empty_strings" + + # Test the same distinction for tag fields + missing_category_query = FilterQuery( + filter_expression=Tag("category").is_missing(), + return_fields=["id", "description"], + ) + results = missing_index.query(missing_category_query) + + # Should find documents missing category field, but not those with empty categories + missing_ids = {result["id"] for result in results} + assert f"{prefix}:mostly_missing" in missing_ids + assert f"{prefix}:partial_missing" in missing_ids + assert f"{prefix}:empty_strings" not in missing_ids # Has empty string, not missing + + +def test_missing_fields_business_logic_integration(missing_fields_index): + """ + Test combining missing field filters with business logic for real-world scenarios. + + This test demonstrates practical applications where missing field detection is + combined with business rules: + - Inventory management: High-value items missing categorization + - Data quality: Products missing price information + - Content management: Articles missing required metadata + + Why this matters: Real applications rarely search for missing fields in isolation. + They combine missing field detection with business logic to identify actionable + data quality issues, incomplete records that need attention, or documents ready + for specific processing workflows. + """ + missing_index, prefix = missing_fields_index + + # Business scenario: Find high-value items (>$50) missing category information + # This helps identify expensive products that need proper categorization + high_value_missing_category = FilterQuery( + filter_expression=(Num("price") > 50) & Tag("category").is_missing(), + return_fields=["id", "price", "description"], + ) + results = missing_index.query(high_value_missing_category) + assert len(results) == 1 # partial_missing has price=75 and missing category + assert results[0]["id"] == f"{prefix}:partial_missing" + + # Business scenario: Find all documents missing price data for inventory audit + # This helps identify products that need pricing information + missing_price_audit = FilterQuery( + filter_expression=Num("price").is_missing(), return_fields=["id", "description"] + ) + results = missing_index.query(missing_price_audit) + assert len(results) == 1 + result_ids = {result["id"] for result in results} + assert f"{prefix}:mostly_missing" in result_ids + + # Business scenario: Find items that are either free (price=0) OR have missing price + free_or_no_price = FilterQuery( + filter_expression=(Num("price") == 0) | Num("price").is_missing(), + return_fields=["id", "price", "description"], + ) + results = missing_index.query(free_or_no_price) + assert len(results) >= 1 # At least mostly_missing, possibly zero_price if indexed + result_ids = {result["id"] for result in results} + assert f"{prefix}:mostly_missing" in result_ids + + +def test_complex_missing_field_combinations(missing_fields_index): + """ + Test complex logical combinations of missing field filters using AND/OR operations. + + This test validates that missing field filters can be combined with logical + operators to create sophisticated queries: + - OR operations: Find documents missing ANY of several critical fields + - AND operations: Find documents missing ALL of a set of fields + - Mixed operations: Complex business rules about data completeness + + Why this matters: Real-world data quality checks often need to identify records + with multiple types of missing data. This enables flexible data quality rules + and completeness scoring for filterable fields. + """ + missing_index, prefix = missing_fields_index + + # Find documents missing ANY critical business field (data quality red flags) + missing_any_critical_field = FilterQuery( + filter_expression=( + Text("title").is_missing() + | Tag("category").is_missing() + | Num("price").is_missing() + ), + return_fields=["id", "description"], + ) + results = missing_index.query(missing_any_critical_field) + assert len(results) == 2 # partial_missing and mostly_missing + result_ids = {result["id"] for result in results} + assert f"{prefix}:partial_missing" in result_ids + assert f"{prefix}:mostly_missing" in result_ids + + # Find documents missing ALL critical fields (severely incomplete records) + missing_all_critical_fields = FilterQuery( + filter_expression=( + Text("title").is_missing() + & Tag("category").is_missing() + & Num("price").is_missing() + ), + return_fields=["id", "description"], + ) + results = missing_index.query(missing_all_critical_fields) + assert len(results) == 1 + assert results[0]["id"] == f"{prefix}:mostly_missing" + + # Find documents complete enough for display (have both title AND category) + display_ready_documents = FilterQuery( + filter_expression=(Text("title") != "") & (Tag("category") != ""), + return_fields=["id", "title", "category"], + ) + results = missing_index.query(display_ready_documents) + assert len(results) >= 2 # Should find complete and zero_price documents + + +def test_data_quality_completeness_analysis(missing_fields_index): + """ + Test data quality analysis patterns using missing field detection. + + This test demonstrates how missing field queries enable data quality monitoring: + - Calculate data completeness rates across the dataset + - Identify records suitable for different processing workflows + - Measure the impact of data quality issues on searchable content + + Why this matters: Organizations need to monitor and improve data quality over time. + Missing field detection enables automated data quality dashboards, completeness + scoring, and identification of records that need human review or automated + enrichment. This is essential for maintaining high-quality searchable content. + """ + missing_index, prefix = missing_fields_index + + # Calculate total searchable documents (baseline for quality metrics) + total_docs = missing_index.query(CountQuery("*")) + assert total_docs >= 3 # At least 3 docs should be indexed and searchable + + # Identify incomplete records missing required business fields + incomplete_records = FilterQuery( + filter_expression=(Text("title").is_missing() | Num("price").is_missing()), + return_fields=["id"], + ) + incomplete_docs = missing_index.query(incomplete_records) + incomplete_count = len(incomplete_docs) + + # Calculate data completeness rate for monitoring + completion_rate = (total_docs - incomplete_count) / total_docs + assert ( + completion_rate > 0.3 + ) # At least 30% of docs should be complete with test data + + # Identify records ready for public display (quality threshold check) + display_ready = FilterQuery( + filter_expression=Text("title") != "", + return_fields=["id", "title", "description"], + ) + results = missing_index.query(display_ready) + assert len(results) >= 1 # Should have at least some displayable documents + + +def test_missing_fields_workflow_filtering(missing_fields_index): + """ + Test missing field filters in workflow and processing scenarios. + + This test validates using missing field detection to filter documents for + specific processing workflows: + - Content processing: Only process documents with sufficient text content + - Recommendation systems: Only include items with pricing data + - Content pipelines: Filter documents ready for each processing stage + + Why this matters: Modern applications have multi-stage processing pipelines + where different stages require different fields to be present. Missing field + detection enables intelligent filtering to ensure each processing step only + receives documents it can actually process, improving efficiency and preventing + errors in downstream systems. + """ + missing_index, prefix = missing_fields_index + + # Workflow 1: Identify documents ready for text processing (need title content) + text_processing_ready = FilterQuery( + filter_expression=( + (Text("title") == "Complete Document") + | (Text("title") == "Partial Document") + | (Text("title") == "Zero Price Item") + ), + return_fields=["id", "title", "description"], + ) + results = missing_index.query(text_processing_ready) + + # Verify all results have text content for processing + for result in results: + assert "title" in result and result["title"] != "" + assert "description" in result + + # Workflow 2: Identify items suitable for price-based recommendations + price_based_recommendations = FilterQuery( + filter_expression=Num("price") >= 0, # Has valid price data + return_fields=["id", "price", "title"], + ) + results = missing_index.query(price_based_recommendations) + assert len(results) >= 2 # Should find multiple documents with pricing + + # Verify all results have pricing data for recommendations + for result in results: + assert "price" in result + + # Workflow 3: Complete document processing (have all key fields) + complete_processing_ready = FilterQuery( + filter_expression=Text("title") == "Complete Document", + return_fields=["id", "title", "category", "price"], + ) + results = missing_index.query(complete_processing_ready) + + # If we find the complete document, verify it has the expected fields + if len(results) > 0: + for result in results: + assert "title" in result and result["title"] != "" + assert "category" in result + assert "price" in result + else: + # If not indexed, that's also acceptable behavior for this test + pass diff --git a/tests/integration/test_search_index.py b/tests/integration/test_search_index.py index ffb5dff5..f2d99b16 100644 --- a/tests/integration/test_search_index.py +++ b/tests/integration/test_search_index.py @@ -32,8 +32,10 @@ @pytest.fixture -def index_schema(): - return IndexSchema.from_dict({"index": {"name": "my_index"}, "fields": fields}) +def index_schema(worker_id): + return IndexSchema.from_dict( + {"index": {"name": f"my_index_{worker_id}"}, "fields": fields} + ) @pytest.fixture @@ -65,7 +67,8 @@ def index_from_yaml(worker_id): def test_search_index_properties(index_schema, index): assert index.schema == index_schema # custom settings - assert index.name == index_schema.index.name == "my_index" + assert index.name == index_schema.index.name + assert index.name.startswith("my_index_") # default settings assert index.prefix == index_schema.index.prefix == "rvl" diff --git a/tests/integration/test_semantic_router.py b/tests/integration/test_semantic_router.py index 2c162502..e0c2c939 100644 --- a/tests/integration/test_semantic_router.py +++ b/tests/integration/test_semantic_router.py @@ -13,6 +13,7 @@ RoutingConfig, ) from redisvl.redis.connection import compare_versions +from tests.conftest import skip_if_redis_version_below def get_base_path(): @@ -88,9 +89,7 @@ def test_get_non_existing_route(semantic_router): def test_single_query(semantic_router): - redis_version = semantic_router._index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(semantic_router._index.client, "7.0.0") match = semantic_router("hello") assert match.name == "greeting" @@ -98,18 +97,14 @@ def test_single_query(semantic_router): def test_single_query_no_match(semantic_router): - redis_version = semantic_router._index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(semantic_router._index.client, "7.0.0") match = semantic_router("unknown_phrase") assert match.name is None def test_multiple_query(semantic_router): - redis_version = semantic_router._index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(semantic_router._index.client, "7.0.0") matches = semantic_router.route_many("hello", max_k=2) assert len(matches) > 0 @@ -127,9 +122,7 @@ def test_update_routing_config(semantic_router): def test_vector_query(semantic_router): - redis_version = semantic_router._index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(semantic_router._index.client, "7.0.0") vector = semantic_router.vectorizer.embed("goodbye") match = semantic_router(vector=vector) @@ -137,9 +130,7 @@ def test_vector_query(semantic_router): def test_vector_query_no_match(semantic_router): - redis_version = semantic_router._index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(semantic_router._index.client, "7.0.0") vector = [ 0.0 @@ -372,9 +363,7 @@ def test_deprecated_dtype_argument(routes, redis_url): def test_deprecated_distance_threshold_argument(semantic_router, routes, redis_url): - redis_version = semantic_router._index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(semantic_router._index.client, "7.0.0") router = SemanticRouter( name="test_pass_through_dtype", @@ -389,9 +378,7 @@ def test_deprecated_distance_threshold_argument(semantic_router, routes, redis_u def test_routes_different_distance_thresholds_get_two( semantic_router, routes, redis_url ): - redis_version = semantic_router._index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(semantic_router._index.client, "7.0.0") routes[0].distance_threshold = 0.5 routes[1].distance_threshold = 0.7 @@ -411,9 +398,7 @@ def test_routes_different_distance_thresholds_get_two( def test_routes_different_distance_thresholds_get_one( semantic_router, routes, redis_url ): - redis_version = semantic_router._index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(semantic_router._index.client, "7.0.0") routes[0].distance_threshold = 0.5 @@ -433,9 +418,7 @@ def test_routes_different_distance_thresholds_get_one( def test_add_delete_route_references(semantic_router): - redis_version = semantic_router._index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(semantic_router._index.client, "7.0.0") # Add new references to an existing route added_refs = semantic_router.add_route_references( @@ -477,8 +460,7 @@ def test_add_delete_route_references(semantic_router): def test_from_existing(client, redis_url, routes): - if not compare_versions(client.info()["redis_version"], "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(client, "7.0.0") # connect separately router = SemanticRouter( diff --git a/tests/integration/test_threshold_optimizer.py b/tests/integration/test_threshold_optimizer.py index c30bf931..f2689ac7 100644 --- a/tests/integration/test_threshold_optimizer.py +++ b/tests/integration/test_threshold_optimizer.py @@ -10,12 +10,12 @@ from redisvl.extensions.cache.llm import SemanticCache from redisvl.extensions.router import Route, SemanticRouter from redisvl.extensions.router.schema import RoutingConfig -from redisvl.redis.connection import compare_versions from redisvl.utils.optimize import ( CacheThresholdOptimizer, EvalMetric, RouterThresholdOptimizer, ) +from tests.conftest import skip_if_redis_version_below @pytest.fixture @@ -92,9 +92,7 @@ def test_routes_different_distance_thresholds_optimizer_default( routes, redis_url, test_data_optimization, hf_vectorizer ): redis = Redis.from_url(redis_url) - redis_version = redis.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(redis, "7.0.0") zero_threshold = 0.0 @@ -127,9 +125,7 @@ def test_routes_different_distance_thresholds_optimizer_precision( routes, redis_url, test_data_optimization, hf_vectorizer ): redis = Redis.from_url(redis_url) - redis_version = redis.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(redis, "7.0.0") zero_threshold = 0.0 @@ -164,9 +160,7 @@ def test_routes_different_distance_thresholds_optimizer_recall( routes, redis_url, test_data_optimization, hf_vectorizer, client ): redis = Redis.from_url(redis_url) - redis_version = redis.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(redis, "7.0.0") zero_threshold = 0.0 @@ -205,9 +199,7 @@ def test_optimize_threshold_cache_default(redis_url): distance_threshold=null_threshold, ) - redis_version = cache._index.client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(cache._index.client, "7.0.0") paris_key = cache.store(prompt="what is the capital of france?", response="paris") rabat_key = cache.store(prompt="what is the capital of morocco?", response="rabat") @@ -226,9 +218,7 @@ def test_optimize_threshold_cache_default(redis_url): def test_optimize_threshold_cache_precision(client, redis_url): - redis_version = client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(client, "7.0.0") null_threshold = 0.0 cache = SemanticCache( @@ -254,9 +244,7 @@ def test_optimize_threshold_cache_precision(client, redis_url): def test_optimize_threshold_cache_recall(client, redis_url): - redis_version = client.info()["redis_version"] - if not compare_versions(redis_version, "7.0.0"): - pytest.skip("Not using a late enough version of Redis") + skip_if_redis_version_below(client, "7.0.0") null_threshold = 0.0 cache = SemanticCache( diff --git a/tests/unit/test_fields.py b/tests/unit/test_fields.py index 0c0d504e..bd8b41ac 100644 --- a/tests/unit/test_fields.py +++ b/tests/unit/test_fields.py @@ -1,5 +1,3 @@ -from typing import Any, Optional, Tuple - import pytest from redis.commands.search.field import GeoField as RedisGeoField from redis.commands.search.field import NumericField as RedisNumericField @@ -219,3 +217,208 @@ def test_create_unknown_field_type(): with pytest.raises(ValueError) as excinfo: FieldFactory.create_field("unknown", "example_field") assert "Unknown field type: unknown" in str(excinfo.value) + + +# Tests for new index_missing and index_empty attributes +def test_field_attributes_index_missing_and_empty(): + """Test the new index_missing and index_empty field attributes.""" + + # Test TextField with both attributes + text_field = TextField( + name="description", + attrs={"index_missing": True, "index_empty": True, "sortable": True}, + ) + assert text_field.attrs.index_missing == True + assert text_field.attrs.index_empty == True + assert text_field.attrs.sortable == True + + # Test TagField with both attributes + tag_field = TagField( + name="tags", + attrs={"index_missing": True, "index_empty": True, "case_sensitive": True}, + ) + assert tag_field.attrs.index_missing == True + assert tag_field.attrs.index_empty == True + assert tag_field.attrs.case_sensitive == True + + # Test NumericField with index_missing only (index_empty not supported) + num_field = NumericField( + name="price", attrs={"index_missing": True, "sortable": True} + ) + assert num_field.attrs.index_missing == True + assert num_field.attrs.sortable == True + + # Test GeoField with index_missing only (index_empty not supported) + geo_field = GeoField(name="location", attrs={"index_missing": True}) + assert geo_field.attrs.index_missing == True + + # Test vector fields with index_missing + flat_vector_field = FlatVectorField( + name="embedding", + attrs={"algorithm": "flat", "dims": 128, "index_missing": True}, + ) + assert flat_vector_field.attrs.index_missing == True + assert flat_vector_field.attrs.dims == 128 + + hnsw_vector_field = HNSWVectorField( + name="embedding2", + attrs={"algorithm": "hnsw", "dims": 256, "index_missing": True}, + ) + assert hnsw_vector_field.attrs.index_missing == True + assert hnsw_vector_field.attrs.dims == 256 + + +def test_default_index_missing_and_empty_values(): + """Test that index_missing and index_empty default to False.""" + + # Test default values for text field + text_field = TextField(name="description") + assert text_field.attrs.index_missing == False + assert text_field.attrs.index_empty == False + + # Test default values for tag field + tag_field = TagField(name="tags") + assert tag_field.attrs.index_missing == False + assert tag_field.attrs.index_empty == False + + # Test default values for numeric field + num_field = NumericField(name="price") + assert num_field.attrs.index_missing == False + + # Test default values for geo field + geo_field = GeoField(name="location") + assert geo_field.attrs.index_missing == False + + # Test default values for vector fields + flat_vector_field = FlatVectorField( + name="embedding", attrs={"algorithm": "flat", "dims": 128} + ) + assert flat_vector_field.attrs.index_missing == False + + hnsw_vector_field = HNSWVectorField( + name="embedding2", attrs={"algorithm": "hnsw", "dims": 256} + ) + assert hnsw_vector_field.attrs.index_missing == False + + +@pytest.mark.parametrize( + "field_class,field_name,extra_attrs,supports_index_empty", + [ + (TextField, "text_field", {"weight": 2.0}, True), + (TagField, "tag_field", {"separator": "|"}, True), + (NumericField, "num_field", {"sortable": True}, False), + (GeoField, "geo_field", {"sortable": True}, False), + ], +) +def test_redis_field_creation_with_index_attributes( + field_class, field_name, extra_attrs, supports_index_empty +): + """Test that index_missing and index_empty are properly passed to Redis field objects.""" + + # Test with index_missing=True + attrs = {"index_missing": True} + attrs.update(extra_attrs) + + if supports_index_empty: + attrs["index_empty"] = True + + field = field_class(name=field_name, attrs=attrs) + redis_field = field.as_redis_field() + + # Check that the field was created successfully + assert redis_field.name == field_name + + # For Redis fields, these attributes would be passed as keyword arguments + # We can't directly inspect them, but we can verify the field creation doesn't fail + + +def test_vector_fields_redis_creation_with_index_missing(): + """Test that vector fields properly handle index_missing in Redis field creation.""" + + # Test FlatVectorField with index_missing + flat_field = FlatVectorField( + name="flat_embedding", + attrs={ + "algorithm": "flat", + "dims": 128, + "index_missing": True, + "block_size": 100, + }, + ) + redis_field = flat_field.as_redis_field() + assert isinstance(redis_field, RedisVectorField) + assert redis_field.name == "flat_embedding" + + # Test HNSWVectorField with index_missing + hnsw_field = HNSWVectorField( + name="hnsw_embedding", + attrs={"algorithm": "hnsw", "dims": 256, "index_missing": True, "m": 24}, + ) + redis_field = hnsw_field.as_redis_field() + assert isinstance(redis_field, RedisVectorField) + assert redis_field.name == "hnsw_embedding" + + +def test_vector_field_data_includes_index_missing(): + """Test that vector field field_data includes INDEXMISSING when enabled.""" + + # Test with index_missing=True + flat_field_with_missing = FlatVectorField( + name="embedding", + attrs={"algorithm": "flat", "dims": 128, "index_missing": True}, + ) + field_data = flat_field_with_missing.attrs.field_data + assert "INDEXMISSING" in field_data + assert field_data["INDEXMISSING"] == True + + # Test with index_missing=False (default) + flat_field_without_missing = FlatVectorField( + name="embedding", attrs={"algorithm": "flat", "dims": 128} + ) + field_data = flat_field_without_missing.attrs.field_data + assert "INDEXMISSING" not in field_data + + # Test HNSW field with index_missing=True + hnsw_field_with_missing = HNSWVectorField( + name="embedding", + attrs={"algorithm": "hnsw", "dims": 256, "index_missing": True}, + ) + field_data = hnsw_field_with_missing.attrs.field_data + assert "INDEXMISSING" in field_data + assert field_data["INDEXMISSING"] == True + + +def test_field_factory_with_new_attributes(): + """Test FieldFactory.create_field with the new index attributes.""" + + # Test creating TextField with new attributes + text_field = FieldFactory.create_field( + "text", "description", attrs={"index_missing": True, "index_empty": True} + ) + assert isinstance(text_field, TextField) + assert text_field.attrs.index_missing == True + assert text_field.attrs.index_empty == True + + # Test creating TagField with new attributes + tag_field = FieldFactory.create_field( + "tag", "categories", attrs={"index_missing": True, "index_empty": True} + ) + assert isinstance(tag_field, TagField) + assert tag_field.attrs.index_missing == True + assert tag_field.attrs.index_empty == True + + # Test creating NumericField with index_missing + num_field = FieldFactory.create_field( + "numeric", "price", attrs={"index_missing": True} + ) + assert isinstance(num_field, NumericField) + assert num_field.attrs.index_missing == True + + # Test creating vector field with index_missing + vector_field = FieldFactory.create_field( + "vector", + "embedding", + attrs={"algorithm": "flat", "dims": 128, "index_missing": True}, + ) + assert isinstance(vector_field, FlatVectorField) + assert vector_field.attrs.index_missing == True diff --git a/tests/unit/test_filter.py b/tests/unit/test_filter.py index dae74240..7b2a1261 100644 --- a/tests/unit/test_filter.py +++ b/tests/unit/test_filter.py @@ -470,8 +470,6 @@ def test_timestamp_invalid_input(): def test_timestamp_filter_combination(): """Test combining timestamp filters with other filters.""" - from redisvl.query.filter import Num, Tag - ts = Timestamp("created_at") > datetime(2023, 3, 1) num = Num("age") > 30 tag = Tag("status") == "active" @@ -482,3 +480,48 @@ def test_timestamp_filter_combination(): assert str(combined).startswith("((@created_at:") assert "@age:[(30 +inf]" in str(combined) assert "@status:{active}" in str(combined) + + +def test_is_missing_filter_methods(): + """Test the new is_missing() method for all filter types.""" + # Test all filter types + tag_missing = Tag("brand").is_missing() + text_missing = Text("title").is_missing() + num_missing = Num("price").is_missing() + geo_missing = Geo("location").is_missing() + timestamp_missing = Timestamp("created_at").is_missing() + + # Check that they generate the correct query strings + assert str(tag_missing) == "ismissing(@brand)" + assert str(text_missing) == "ismissing(@title)" + assert str(num_missing) == "ismissing(@price)" + assert str(geo_missing) == "ismissing(@location)" + assert str(timestamp_missing) == "ismissing(@created_at)" + + +def test_is_missing_filter_combinations(): + """Test combining is_missing filters with other filters.""" + # Test combining is_missing with regular filters + missing_brand = Tag("brand").is_missing() + has_price = Num("price") > 100 + has_tag = Tag("category") == "electronics" + + # Test AND combinations + combined_and = missing_brand & has_price + combined_str = str(combined_and) + assert "ismissing(@brand)" in combined_str + assert "@price:[(100 +inf]" in combined_str + + # Test OR combinations + combined_or = missing_brand | has_tag + combined_str = str(combined_or) + assert "ismissing(@brand)" in combined_str + assert "@category:{electronics}" in combined_str + assert " | " in combined_str + + # Test complex combinations + complex_filter = (missing_brand & has_price) | has_tag + complex_str = str(complex_filter) + assert "ismissing(@brand)" in complex_str + assert "@price:[(100 +inf]" in complex_str + assert "@category:{electronics}" in complex_str diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 26878cb5..3829b85a 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -178,3 +178,131 @@ def test_from_yaml_file_not_found(): """Test loading from yaml with file not found.""" with pytest.raises(FileNotFoundError): IndexSchema.from_yaml("nonexistent_file") + + +def test_schema_with_index_missing_and_empty_attributes(): + """Test schema creation and operations with INDEXMISSING and INDEXEMPTY attributes.""" + schema_dict = { + "index": { + "name": "test-missing-empty", + "prefix": "test", + "storage_type": "hash", + }, + "fields": [ + { + "name": "title", + "type": "text", + "attrs": {"index_missing": True, "index_empty": True, "sortable": True}, + }, + { + "name": "tags", + "type": "tag", + "attrs": {"index_missing": True, "index_empty": True}, + }, + { + "name": "price", + "type": "numeric", + "attrs": {"index_missing": True, "sortable": True}, + }, + {"name": "location", "type": "geo", "attrs": {"index_missing": True}}, + { + "name": "embedding", + "type": "vector", + "attrs": { + "algorithm": "flat", + "dims": 128, + "distance_metric": "cosine", + "index_missing": True, + }, + }, + ], + } + + # Test schema creation + schema = IndexSchema.from_dict(schema_dict) + + # Verify field attributes are correctly set + assert schema.fields["title"].attrs.index_missing == True + assert schema.fields["title"].attrs.index_empty == True + assert schema.fields["title"].attrs.sortable == True + + assert schema.fields["tags"].attrs.index_missing == True + assert schema.fields["tags"].attrs.index_empty == True + + assert schema.fields["price"].attrs.index_missing == True + assert schema.fields["price"].attrs.sortable == True + + assert schema.fields["location"].attrs.index_missing == True + + assert schema.fields["embedding"].attrs.index_missing == True + assert schema.fields["embedding"].attrs.dims == 128 + + # Test Redis field conversion + redis_fields = schema.redis_fields + assert len(redis_fields) == 5 + + # Verify all fields can be converted to Redis fields successfully + for field_name, field in schema.fields.items(): + redis_field = field.as_redis_field() + assert redis_field.name == field_name + + +def test_schema_serialization_with_new_attributes(): + """Test schema creation and field attribute handling with INDEXMISSING and INDEXEMPTY attributes.""" + original_schema_dict = { + "index": { + "name": "test-serialization", + "prefix": "ser", + "storage_type": "hash", + }, + "fields": [ + { + "name": "description", + "type": "text", + "attrs": {"index_missing": True, "index_empty": True, "weight": 2.0}, + }, + { + "name": "categories", + "type": "tag", + "attrs": {"index_missing": True, "index_empty": True, "separator": "|"}, + }, + {"name": "score", "type": "numeric", "attrs": {"index_missing": True}}, + { + "name": "vector_field", + "type": "vector", + "attrs": { + "algorithm": "hnsw", + "dims": 256, + "index_missing": True, + "m": 24, + }, + }, + ], + } + + # Create schema from dict + schema = IndexSchema.from_dict(original_schema_dict) + + # Verify field attributes are correctly set after creation + assert schema.fields["description"].attrs.index_missing == True + assert schema.fields["description"].attrs.index_empty == True + assert schema.fields["description"].attrs.weight == 2.0 + + assert schema.fields["categories"].attrs.index_missing == True + assert schema.fields["categories"].attrs.index_empty == True + assert schema.fields["categories"].attrs.separator == "|" + + assert schema.fields["score"].attrs.index_missing == True + + assert schema.fields["vector_field"].attrs.index_missing == True + assert schema.fields["vector_field"].attrs.dims == 256 + assert schema.fields["vector_field"].attrs.m == 24 + + # Test that Redis field conversion works with new attributes + for field_name, field in schema.fields.items(): + redis_field = field.as_redis_field() + assert redis_field.name == field_name + + # Test that the schema has the correct number of fields + assert len(schema.fields) == 4 + assert schema.index.name == "test-serialization"