Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added data-format restructuring #92

Merged
merged 6 commits into from
Jul 7, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
resolved merge conflict
  • Loading branch information
Aaishik Dutta authored and Aaishik Dutta committed Jul 7, 2023
commit 2a061c1b46826205d380824cfeb9aaaff9e11ee8
9 changes: 9 additions & 0 deletions embedchain/data_format/data_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,19 @@
from embedchain.loaders.web_page import WebPageLoader
from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
from embedchain.loaders.local_text import LocalTextLoader
from embedchain.loaders.docx_file import DocxFileLoader
from embedchain.chunkers.youtube_video import YoutubeVideoChunker
from embedchain.chunkers.pdf_file import PdfFileChunker
from embedchain.chunkers.web_page import WebPageChunker
from embedchain.chunkers.qna_pair import QnaPairChunker
from embedchain.chunkers.text import TextChunker
from embedchain.chunkers.docx_file import DocxFileChunker


class DataFormat:
"""
DataFormat is an internal utility class which abstracts the mapping from a
data type to the loader and chunker used for it.
"""
def __init__(self, data_type):
self.loader = self._get_loader(data_type)
self.chunker = self._get_chunker(data_type)
Expand All @@ -28,6 +34,7 @@ def _get_loader(self, data_type):
'web_page': WebPageLoader(),
'qna_pair': LocalQnaPairLoader(),
'text': LocalTextLoader(),
'docx': DocxFileLoader(),
}
if data_type in loaders:
return loaders[data_type]
Expand All @@ -48,8 +55,10 @@ def _get_chunker(self, data_type):
'web_page': WebPageChunker(),
'qna_pair': QnaPairChunker(),
'text': TextChunker(),
'docx': DocxFileChunker(),
}
if data_type in chunkers:
return chunkers[data_type]
else:
raise ValueError(f"Unsupported data type: {data_type}")

17 changes: 11 additions & 6 deletions embedchain/embedchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,11 @@
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from embedchain.config import InitConfig, AddConfig, QueryConfig, ChatConfig
from embedchain.config.QueryConfig import DEFAULT_PROMPT

from embedchain.vectordb.chroma_db import ChromaDB
from embedchain.data_format import DataFormat


gpt4all_model = None

load_dotenv()
Expand All @@ -38,7 +34,10 @@ def __init__(self, config: InitConfig):
self.collection = self.config.db.collection
self.user_asks = []

def add(self, data_type, url):



def add(self, data_type, url, config: AddConfig = None):
"""
Adds the data from the given URL to the vector db.
Loads the data, chunks it, creates an embedding for each chunk
Expand All @@ -48,6 +47,9 @@ def add(self, data_type, url):
:param url: The URL where the data is located.
:param config: Optional. The `AddConfig` instance to use as configuration options.
"""
if config is None:
config = AddConfig()

data_format = DataFormat(data_type)
self.user_asks.append([data_type, url])
self.load_and_embed(data_format.loader, data_format.chunker, url)
Expand All @@ -62,6 +64,9 @@ def add_local(self, data_type, content, config: AddConfig = None):
:param content: The local data. Refer to the `README` for formatting.
:param config: Optional. The `AddConfig` instance to use as configuration options.
"""
if config is None:
config = AddConfig()

data_format = DataFormat(data_type)
self.user_asks.append([data_type, content])
self.load_and_embed(data_format.loader, data_format.chunker, content)
Expand Down Expand Up @@ -354,7 +359,7 @@ def __init__(self, person, config: InitConfig = None):
class PersonApp(EmbedChainPersonApp, App):
"""
The Person app.
Extends functionality from EmbedChainPersonApp and App
Extends functionality from EmbedChainPersonApp and App
"""
def query(self, input_query, config: QueryConfig = None):
query_config = QueryConfig(
Expand Down
You are viewing a condensed version of this merge commit. You can view the full changes here.