-
Notifications
You must be signed in to change notification settings - Fork 2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: added support for elasticsearch as a datasource #402
Changes from 1 commit
0168301
ab67c9d
ef0219e
45e071d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
--- | ||
title: '💾 Vector Database' | ||
--- | ||
|
||
We support two vector databases: `Chroma` and `Elasticsearch`. | ||
`Chroma` is used as a default database. | ||
|
||
### Elasticsearch | ||
To use `Elasticsearch` as the vector database, we need to use the app type `CustomApp`. | ||
```python | ||
import os | ||
from embedchain import CustomApp | ||
from embedchain.config import CustomAppConfig, ElasticsearchDBConfig | ||
from embedchain.models import Providers, EmbeddingFunctions, VectorDatabases | ||
|
||
os.environ["OPENAI_API_KEY"] = 'OPENAI_API_KEY' | ||
|
||
es_config = ElasticsearchDBConfig( | ||
# elasticsearch url or list of nodes url with different hosts and ports. | ||
es_url='https://localhost:9200', | ||
# pass named parameters supported by Python Elasticsearch client | ||
ca_certs="/path/to/http_ca.crt", | ||
basic_auth=("username", "password") | ||
) | ||
config = CustomAppConfig( | ||
embedding_fn=EmbeddingFunctions.OPENAI, | ||
provider=Providers.OPENAI, | ||
db_type=VectorDatabases.ELASTICSEARCH, | ||
es_config=es_config, | ||
) | ||
es_app = CustomApp(config) | ||
``` | ||
- Set `db_type=VectorDatabases.ELASTICSEARCH` and `es_config=ElasticsearchDBConfig(es_url='')` in `CustomAppConfig`. | ||
- `ElasticsearchDBConfig` accepts `es_url` as an Elasticsearch URL or as a list of node URLs with different hosts and ports. Additionally, we can pass named parameters supported by the Python Elasticsearch client. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
import logging | ||
|
||
from embedchain.config.BaseConfig import BaseConfig | ||
from embedchain.config.vectordbs import ElasticsearchDBConfig | ||
from embedchain.models import VectorDatabases, VectorDimensions | ||
|
||
|
||
|
@@ -20,6 +21,7 @@ def __init__( | |
collection_name=None, | ||
db_type: VectorDatabases = None, | ||
vector_dim: VectorDimensions = None, | ||
es_config: ElasticsearchDBConfig = None, | ||
): | ||
""" | ||
:param log_level: Optional. (String) Debug level | ||
|
@@ -32,6 +34,7 @@ def __init__( | |
:param collection_name: Optional. Collection name for the database. | ||
:param db_type: Optional. type of Vector database to use | ||
:param vector_dim: Vector dimension generated by embedding fn | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should raise an error if Similar handling is done here: https://github.com/embedchain/embedchain/blob/5e94980aaa801843661ccd18a16f46ed8c28a871/embedchain/apps/CustomApp.py#L84 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think we can skip this part. right now vector_dim is not used with chroma as it computes the dimension itself. but in future we may need it. |
||
:param es_config: Optional. elasticsearch database config to be used for connection | ||
""" | ||
self._setup_logging(log_level) | ||
self.collection_name = collection_name if collection_name else "embedchain_store" | ||
|
@@ -43,12 +46,13 @@ def __init__( | |
db_type=db_type, | ||
vector_dim=vector_dim, | ||
collection_name=self.collection_name, | ||
es_config=es_config, | ||
) | ||
self.id = id | ||
return | ||
|
||
@staticmethod | ||
def get_db(db, embedding_fn, host, port, db_type, vector_dim, collection_name): | ||
def get_db(db, embedding_fn, host, port, db_type, vector_dim, collection_name, es_config): | ||
""" | ||
Get db based on db_type, db with default database (`ChromaDb`) | ||
:param Optional. (Vector) database to use for embeddings. | ||
|
@@ -58,6 +62,7 @@ def get_db(db, embedding_fn, host, port, db_type, vector_dim, collection_name): | |
:param db_type: Optional. db type to use. Supported values (`es`, `chroma`) | ||
:param vector_dim: Vector dimension generated by embedding fn | ||
:param collection_name: Optional. Collection name for the database. | ||
:param es_config: Optional. elasticsearch database config to be used for connection | ||
:raises ValueError: BaseAppConfig knows no default embedding function. | ||
:returns: database instance | ||
""" | ||
|
@@ -70,7 +75,9 @@ def get_db(db, embedding_fn, host, port, db_type, vector_dim, collection_name): | |
if db_type == VectorDatabases.ELASTICSEARCH: | ||
from embedchain.vectordb.elasticsearch_db import ElasticsearchDB | ||
|
||
return ElasticsearchDB(embedding_fn=embedding_fn, vector_dim=vector_dim, collection_name=collection_name) | ||
return ElasticsearchDB( | ||
embedding_fn=embedding_fn, vector_dim=vector_dim, collection_name=collection_name, es_config=es_config | ||
) | ||
|
||
from embedchain.vectordb.chroma_db import ChromaDB | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from typing import Any, Dict, List, Union
|
||
from embedchain.config.BaseConfig import BaseConfig | ||
|
||
|
||
class ElasticsearchDBConfig(BaseConfig):
    """
    Config to initialize an elasticsearch client.

    :param es_url: Optional. elasticsearch url or list of nodes url to be used for connection
    :param ES_EXTRA_PARAMS: extra named params that can be passed to elasticsearch;
        they are collected into a dict and stored unchanged.
    """

    def __init__(self, es_url: Union[str, List[str], None] = None, **ES_EXTRA_PARAMS: Any):
        # Stored exactly as given; no validation or defaulting is done here.
        self.ES_URL = es_url
        # The annotation on ``**kwargs`` describes each *value*, so it is ``Any``
        # (not ``Dict[...]``); the collected mapping itself is a ``Dict[str, Any]``.
        self.ES_EXTRA_PARAMS = ES_EXTRA_PARAMS
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,33 +1,33 @@ | ||
import unittest | ||
from unittest.mock import MagicMock, Mock, patch | ||
from unittest.mock import Mock | ||
|
||
from embedchain.config import ElasticsearchDBConfig | ||
from embedchain.vectordb.elasticsearch_db import ElasticsearchDB | ||
|
||
|
||
class TestEsDB(unittest.TestCase): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know this isn't a helpful comment, but maybe more positive tests wouldn't hurt. Like testing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes we should have both positive and negative tests There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need help here, need to figure out how to mock elasticsearch client to successfully test both positive and negative test cases. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @pc9 : can you open a new issue for this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure will do that. |
||
def setUp(self): | ||
# set mock es client | ||
self.mock_client = MagicMock() | ||
self.mock_client.indices.exists.return_value = True | ||
self.es_config = ElasticsearchDBConfig() | ||
self.vector_dim = 384 | ||
|
||
def test_init_with_invalid_embedding_fn(self): | ||
# Test if an exception is raised when an invalid embedding_fn is provided | ||
with self.assertRaises(ValueError): | ||
ElasticsearchDB(embedding_fn=None) | ||
|
||
def test_init_with_invalid_es_config(self): | ||
# Test if an exception is raised when an invalid es_config is provided | ||
with self.assertRaises(ValueError): | ||
ElasticsearchDB(embedding_fn=Mock(), es_config=None) | ||
|
||
def test_init_with_invalid_vector_dim(self): | ||
# Test if an exception is raised when an invalid vector_dim is provided | ||
with self.assertRaises(ValueError): | ||
ElasticsearchDB(embedding_fn=Mock(), es_client=self.mock_client, vector_dim=None) | ||
|
||
def test_init_with_valid_embedding_and_client(self): | ||
# check for successful creation of ElasticsearchDB instance | ||
esdb = ElasticsearchDB(embedding_fn=Mock(), es_client=self.mock_client, vector_dim=1024) | ||
self.assertIsInstance(esdb, ElasticsearchDB) | ||
ElasticsearchDB(embedding_fn=Mock(), es_config=self.es_config, vector_dim=None) | ||
|
||
@patch("os.getenv") # Mock the os.getenv function to return None for ES_ENDPOINT | ||
def test_init_with_missing_endpoint(self, mock_os_getenv): | ||
# Test if an exception is raised when ES_ENDPOINT is missing | ||
mock_os_getenv.return_value = None | ||
def test_init_with_invalid_collection_name(self): | ||
# Test if an exception is raised when an invalid collection_name is provided | ||
with self.assertRaises(ValueError): | ||
ElasticsearchDB(embedding_fn=Mock()) | ||
ElasticsearchDB( | ||
embedding_fn=Mock(), es_config=self.es_config, vector_dim=self.vector_dim, collection_name=None | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Overall, I think we have to talk about the name of this section. Why not call it what it is,
Vector Database
? And then the sections are not clear to me. it should probably bebut saying
"Chromadb" is used as default.
and then jumping to an Elasticsearch example might be confusing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed. I have added the `Vector Database` and `Elasticsearch` headings. I was unsure what to add under `ChromaDb`, so I have skipped it.