Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: load chunker from config #270

Merged
11 commits merged on
Jul 17, 2023
12 changes: 3 additions & 9 deletions docs/advanced/configuration.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Here's the readme example with configuration options.
```python
import os
from embedchain import App
from embedchain.config import InitConfig, AddConfig, QueryConfig
from embedchain.config import InitConfig, AddConfig, QueryConfig, ChunkerConfig
from chromadb.utils import embedding_functions

# Example: use your own embedding function
Expand All @@ -25,14 +25,8 @@ config = InitConfig(ef=embedding_functions.OpenAIEmbeddingFunction(
naval_chat_bot = App(config)

# Example: define your own chunker config for `youtube_video`
youtube_add_config = {
"chunker": {
"chunk_size": 1000,
"chunk_overlap": 100,
"length_function": len,
}
}
naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44", AddConfig(**youtube_add_config))
chunker_config = ChunkerConfig(chunk_size=1000, chunk_overlap=100, length_function=len)
naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44", AddConfig(chunker=chunker_config))

add_config = AddConfig()
naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf", add_config)
Expand Down
14 changes: 6 additions & 8 deletions embedchain/chunkers/docx_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 1000,
"chunk_overlap": 0,
"length_function": len,
}


class DocxFileChunker(BaseChunker):
"""Chunker for .docx file."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
14 changes: 6 additions & 8 deletions embedchain/chunkers/pdf_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 1000,
"chunk_overlap": 0,
"length_function": len,
}


class PdfFileChunker(BaseChunker):
"""Chunker for PDF file."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=1000, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
14 changes: 6 additions & 8 deletions embedchain/chunkers/qna_pair.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 300,
"chunk_overlap": 0,
"length_function": len,
}


class QnaPairChunker(BaseChunker):
"""Chunker for QnA pair."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
14 changes: 6 additions & 8 deletions embedchain/chunkers/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 300,
"chunk_overlap": 0,
"length_function": len,
}


class TextChunker(BaseChunker):
"""Chunker for text."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
14 changes: 6 additions & 8 deletions embedchain/chunkers/web_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 500,
"chunk_overlap": 0,
"length_function": len,
}


class WebPageChunker(BaseChunker):
"""Chunker for web page."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
14 changes: 6 additions & 8 deletions embedchain/chunkers/youtube_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,16 @@
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig

TEXT_SPLITTER_CHUNK_PARAMS = {
"chunk_size": 2000,
"chunk_overlap": 0,
"length_function": len,
}


class YoutubeVideoChunker(BaseChunker):
"""Chunker for Youtube video."""

def __init__(self, config: Optional[ChunkerConfig] = None):
if config is None:
config = TEXT_SPLITTER_CHUNK_PARAMS
text_splitter = RecursiveCharacterTextSplitter(**config)
config = ChunkerConfig(chunk_size=2000, chunk_overlap=0, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
length_function=config.length_function,
)
super().__init__(text_splitter)
12 changes: 6 additions & 6 deletions embedchain/config/AddConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ class ChunkerConfig(BaseConfig):

def __init__(
self,
chunk_size: Optional[int] = 4000,
chunk_overlap: Optional[int] = 200,
length_function: Optional[Callable[[str], int]] = len,
chunk_size: Optional[int] = None,
chunk_overlap: Optional[int] = None,
length_function: Optional[Callable[[str], int]] = None,
):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.length_function = length_function
self.chunk_size = chunk_size if chunk_size else 2000
self.chunk_overlap = chunk_overlap if chunk_overlap else 0
self.length_function = length_function if length_function else len


class LoaderConfig(BaseConfig):
Expand Down
2 changes: 1 addition & 1 deletion embedchain/config/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .AddConfig import AddConfig # noqa: F401
from .AddConfig import AddConfig, ChunkerConfig # noqa: F401
from .BaseConfig import BaseConfig # noqa: F401
from .ChatConfig import ChatConfig # noqa: F401
from .InitConfig import InitConfig # noqa: F401
Expand Down
7 changes: 2 additions & 5 deletions tests/chunkers/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import unittest

from embedchain.chunkers.text import TextChunker
from embedchain.config import ChunkerConfig


class TestTextChunker(unittest.TestCase):
Expand All @@ -11,11 +12,7 @@ def test_chunks(self):
Test the chunks generated by TextChunker.
# TODO: Not a very precise test.
"""
chunker_config = {
"chunk_size": 10,
"chunk_overlap": 0,
"length_function": len,
}
chunker_config = ChunkerConfig(chunk_size=10, chunk_overlap=0, length_function=len)
chunker = TextChunker(config=chunker_config)
text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

Expand Down