Skip to content

Commit

Permalink
Add ability to clear ALL data associated with an index (redis#179)
Browse files Browse the repository at this point in the history
This PR introduces the `clear()` method to the core SearchEngine class
in RedisVL, enabling all data associated with an index to be cleared
out while leaving the index itself in place.

Useful for manual cache eviction, manual session clearing, and more.

This PR also updates the extension classes to use the new `clear()`
method as opposed to the SCAN ITER approach.
  • Loading branch information
tylerhutcherson authored Jul 9, 2024
1 parent 61e7338 commit 9ca93ef
Show file tree
Hide file tree
Showing 12 changed files with 135 additions and 22 deletions.
35 changes: 33 additions & 2 deletions docs/user_guide/getting_started_01.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -653,13 +653,44 @@
"## Cleanup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we will clean up after our work. First, you can optionally flush all data from Redis associated with the index by\n",
"using the `.clear()` method. This will leave the secondary index in place for future insertions or updates.\n",
"\n",
"But if you want to clean up everything, including the index, just use `.delete()`\n",
"which will by default remove the index AND the underlying data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# (optionally) clear all data from Redis associated with the index\n",
"await index.clear()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# but the index is still in place\n",
"await index.exists()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# clean up the index\n",
"# remove / delete the index in its entirety\n",
"await index.delete()"
]
}
Expand All @@ -680,7 +711,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
"version": "3.10.14"
},
"orig_nbformat": 4,
"vscode": {
Expand Down
12 changes: 6 additions & 6 deletions docs/user_guide/vectorizers_04.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -531,14 +531,14 @@
}
],
"source": [
"from redisvl.utils.vectorize import MistralAITextVectorizer\n",
"# from redisvl.utils.vectorize import MistralAITextVectorizer\n",
"\n",
"mistral = MistralAITextVectorizer()\n",
"# mistral = MistralAITextVectorizer()\n",
"\n",
"# mebed a sentence using their asyncronous method\n",
"test = await mistral.aembed(\"This is a test sentence.\")\n",
"print(\"Vector dimensions: \", len(test))\n",
"print(test[:10])"
"# # embed a sentence using their asynchronous method\n",
"# test = await mistral.aembed(\"This is a test sentence.\")\n",
"# print(\"Vector dimensions: \", len(test))\n",
"# print(test[:10])"
]
},
{
Expand Down
5 changes: 1 addition & 4 deletions redisvl/extensions/llmcache/semantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,7 @@ def set_vectorizer(self, vectorizer: BaseVectorizer) -> None:

def clear(self) -> None:
"""Clear the cache of all keys while preserving the index."""
with self._index.client.pipeline(transaction=False) as pipe: # type: ignore
for key in self._index.client.scan_iter(match=f"{self._index.prefix}:*"): # type: ignore
pipe.delete(key)
pipe.execute()
self._index.clear()

def delete(self) -> None:
"""Clear the semantic cache of all keys and remove the underlying search
Expand Down
5 changes: 1 addition & 4 deletions redisvl/extensions/session_manager/semantic_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,7 @@ def set_scope(

def clear(self) -> None:
"""Clears the chat session history."""
with self._index.client.pipeline(transaction=False) as pipe: # type: ignore
for key in self._index.client.scan_iter(match=f"{self._index.prefix}:*"): # type: ignore
pipe.delete(key)
pipe.execute()
self._index.clear()

def delete(self) -> None:
"""Clear all conversation keys and remove the search index."""
Expand Down
43 changes: 42 additions & 1 deletion redisvl/index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
from redis.commands.search.indexDefinition import IndexDefinition

from redisvl.index.storage import HashStorage, JsonStorage
from redisvl.query.query import BaseQuery, CountQuery, FilterQuery
from redisvl.query import BaseQuery, CountQuery, FilterQuery
from redisvl.query.filter import FilterExpression
from redisvl.redis.connection import (
RedisConnectionFactory,
convert_index_info_to_schema,
Expand Down Expand Up @@ -476,6 +477,26 @@ def delete(self, drop: bool = True):
except:
logger.exception("Error while deleting index")

def clear(self) -> int:
    """Clear all keys in Redis associated with the index, leaving the index
    available and in-place for future insertions or updates.

    Returns:
        int: Count of records deleted from Redis.
    """
    # Track deleted records
    total_records_deleted: int = 0

    # Paginate using queries and delete in batches
    for batch in self.paginate(
        FilterQuery(FilterExpression("*"), return_fields=["id"]), page_size=500
    ):
        batch_keys = [record["id"] for record in batch]
        # Guard against an empty page: redis DEL requires at least one key.
        if batch_keys:
            records_deleted = self._redis_client.delete(*batch_keys)  # type: ignore
            total_records_deleted += records_deleted  # type: ignore

    return total_records_deleted

def load(
self,
data: Iterable[Any],
Expand Down Expand Up @@ -894,6 +915,26 @@ async def delete(self, drop: bool = True):
logger.exception("Error while deleting index")
raise

async def clear(self) -> int:
    """Clear all keys in Redis associated with the index, leaving the index
    available and in-place for future insertions or updates.

    Returns:
        int: Count of records deleted from Redis.
    """
    # Track deleted records
    total_records_deleted: int = 0

    # Paginate using queries and delete in batches
    async for batch in self.paginate(
        FilterQuery(FilterExpression("*"), return_fields=["id"]), page_size=500
    ):
        batch_keys = [record["id"] for record in batch]
        # Guard against an empty page: redis DEL requires at least one key.
        if batch_keys:
            records_deleted = await self._redis_client.delete(*batch_keys)  # type: ignore
            total_records_deleted += records_deleted  # type: ignore

    return total_records_deleted

async def load(
self,
data: Iterable[Any],
Expand Down
10 changes: 8 additions & 2 deletions redisvl/query/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
from redisvl.query.query import CountQuery, FilterQuery, RangeQuery, VectorQuery
from redisvl.query.query import (
BaseQuery,
CountQuery,
FilterQuery,
RangeQuery,
VectorQuery,
)

__all__ = ["VectorQuery", "FilterQuery", "RangeQuery", "CountQuery"]
__all__ = ["BaseQuery", "VectorQuery", "FilterQuery", "RangeQuery", "CountQuery"]
2 changes: 1 addition & 1 deletion schemas/schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ index:
fields:
- name: user
type: tag
path: '.user'
path: '$.user'
- name: credit_score
type: tag
path: '$.credit_score'
Expand Down
12 changes: 12 additions & 0 deletions tests/integration/test_async_search_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,18 @@ async def test_search_index_delete(async_client, async_index):
)


@pytest.mark.asyncio
async def test_search_index_clear(async_client, async_index):
    # Attach the client and (re)create the index from scratch.
    async_index.set_client(async_client)
    await async_index.create(overwrite=True, drop=True)

    # Load a single known record so the deleted count is predictable.
    records = [{"id": "1", "test": "foo"}]
    await async_index.load(records, id_field="id")

    # clear() should remove every record but keep the index definition.
    deleted = await async_index.clear()
    assert deleted == len(records)
    assert await async_index.exists()

@pytest.mark.asyncio
async def test_search_index_load_and_fetch(async_client, async_index):
async_index.set_client(async_client)
Expand Down
7 changes: 7 additions & 0 deletions tests/integration/test_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,11 @@ def hash_preprocess(item: dict) -> dict:
for field in return_fields:
assert getattr(doc1, field) == doc2[field]

count_deleted_keys = index.clear()
assert count_deleted_keys == len(sample_data)

assert index.exists() == True

index.delete()

assert index.exists() == False
7 changes: 7 additions & 0 deletions tests/integration/test_flow_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,4 +93,11 @@ async def hash_preprocess(item: dict) -> dict:
for field in return_fields:
assert getattr(doc1, field) == doc2[field]

count_deleted_keys = await index.clear()
assert count_deleted_keys == len(sample_data)

assert await index.exists() == True

await index.delete()

assert await index.exists() == False
11 changes: 11 additions & 0 deletions tests/integration/test_search_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,17 @@ def test_search_index_delete(client, index):
assert index.name not in convert_bytes(index.client.execute_command("FT._LIST"))


def test_search_index_clear(client, index):
    # Attach the client and (re)create the index from scratch.
    index.set_client(client)
    index.create(overwrite=True, drop=True)

    # Load a single known record so the deleted count is predictable.
    records = [{"id": "1", "test": "foo"}]
    index.load(records, id_field="id")

    # clear() should remove every record but keep the index definition.
    deleted = index.clear()
    assert deleted == len(records)
    assert index.exists()

def test_search_index_load_and_fetch(client, index):
index.set_client(client)
index.create(overwrite=True, drop=True)
Expand Down
8 changes: 6 additions & 2 deletions tests/integration/test_vectorizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def skip_vectorizer() -> bool:
VertexAITextVectorizer,
CohereTextVectorizer,
AzureOpenAITextVectorizer,
MistralAITextVectorizer,
# MistralAITextVectorizer,
CustomTextVectorizer,
]
)
Expand Down Expand Up @@ -218,7 +218,11 @@ def bad_return_type(text: str) -> str:


@pytest.fixture(
params=[OpenAITextVectorizer, MistralAITextVectorizer, CustomTextVectorizer]
params=[
OpenAITextVectorizer,
# MistralAITextVectorizer,
CustomTextVectorizer,
]
)
def avectorizer(request, skip_vectorizer):
if skip_vectorizer:
Expand Down

0 comments on commit 9ca93ef

Please sign in to comment.