From 1e9301f1643372143eb7bcdf27394d1e2dc7c2b5 Mon Sep 17 00:00:00 2001 From: aaishikdutta Date: Sun, 25 Jun 2023 01:10:10 +0530 Subject: [PATCH 01/13] added SitemapLoader --- README.md | 7 +++++++ embedchain/embedchain.py | 3 +++ embedchain/loaders/site_map.py | 32 ++++++++++++++++++++++++++++++++ setup.py | 1 + 4 files changed, 43 insertions(+) create mode 100644 embedchain/loaders/site_map.py diff --git a/README.md b/README.md index 6628ec78c2..0377d80ac8 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,13 @@ To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. ```python app.add_local('qna_pair', ("Question", "Answer")) ``` +### Sitemap + +To add a XML site map containing list of all urls, use the data_type as `site_map` and enter the sitemap url. Eg: + +```python +app.add('site_map', 'a_valid_sitemap_url/sitemap.xml') +``` ### More Formats coming soon diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index e2b8302ce4..dba354c4bc 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -8,6 +8,7 @@ from embedchain.loaders.youtube_video import YoutubeVideoLoader from embedchain.loaders.pdf_file import PdfFileLoader from embedchain.loaders.web_page import WebPageLoader +from embedchain.loaders.site_map import SitemapLoader from embedchain.loaders.local_qna_pair import LocalQnaPairLoader from embedchain.loaders.local_text import LocalTextLoader from embedchain.chunkers.youtube_video import YoutubeVideoChunker @@ -53,6 +54,7 @@ def _get_loader(self, data_type): 'web_page': WebPageLoader(), 'qna_pair': LocalQnaPairLoader(), 'text': LocalTextLoader(), + 'site_map': SitemapLoader(), } if data_type in loaders: return loaders[data_type] @@ -73,6 +75,7 @@ def _get_chunker(self, data_type): 'web_page': WebPageChunker(), 'qna_pair': QnaPairChunker(), 'text': TextChunker(), + 'site_map': WebPageChunker(), } if data_type in chunkers: return chunkers[data_type] diff --git a/embedchain/loaders/site_map.py b/embedchain/loaders/site_map.py new file mode 100644 index 0000000000..1627dc77eb --- /dev/null +++ b/embedchain/loaders/site_map.py @@ -0,0 +1,32 @@ +import requests + +from bs4 import BeautifulSoup +from embedchain.loaders.web_page import WebPageLoader + +class SitemapLoader: + def load_data(self, sitemap_url): + """ + This method takes a sitemap url as input and retrieves + all the urls to use the WebPageLoader to load content + of each page. + """ + output = [] + web_page_loader = WebPageLoader() + + response = requests.get(sitemap_url) + + if response.status_code == 200: + soup = BeautifulSoup(response.text, features="xml") + links = [link.text for link in soup.find_all('loc')] + + for link in links: + each_load_data = web_page_loader.load_data(link) + # WebPageLoader returns a list with single element which is extracted and appended to + # the output list containing data for all pages + output.append(each_load_data[0]) + + return output + + else: + raise response.raise_for_status() + diff --git a/setup.py b/setup.py index e838ec48a7..955bca4e8d 100644 --- a/setup.py +++ b/setup.py @@ -29,5 +29,6 @@ "beautifulsoup4", "pypdf", "pytube", + "lxml", ] ) From 7cb75549d39f7b75cc961630ab4d1cfa4c8eab91 Mon Sep 17 00:00:00 2001 From: Aaishik Dutta Date: Tue, 11 Jul 2023 20:48:58 +0530 Subject: [PATCH 02/13] added sitemap modified --- embedchain/loaders/site_map.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/embedchain/loaders/site_map.py b/embedchain/loaders/site_map.py index 1627dc77eb..5b4a2db5cc 100644 --- a/embedchain/loaders/site_map.py +++ b/embedchain/loaders/site_map.py @@ -1,14 +1,15 @@ import requests - from bs4 import BeautifulSoup + from embedchain.loaders.web_page import WebPageLoader + class SitemapLoader: def load_data(self, sitemap_url): """ - This method takes a sitemap url as input and retrieves - all the urls to use the WebPageLoader to load content - of each page. + This method takes a sitemap url as input and retrieves + all the urls to use the WebPageLoader to load content + of each page. """ output = [] web_page_loader = WebPageLoader() @@ -17,16 +18,16 @@ def load_data(self, sitemap_url): if response.status_code == 200: soup = BeautifulSoup(response.text, features="xml") - links = [link.text for link in soup.find_all('loc')] + links = [link.text for link in soup.find_all("loc")] for link in links: each_load_data = web_page_loader.load_data(link) - # WebPageLoader returns a list with single element which is extracted and appended to - # the output list containing data for all pages + # WebPageLoader returns a list with single element + # which is extracted and appended to the output list + # containing data for all pages output.append(each_load_data[0]) return output - + else: raise response.raise_for_status() - From 49d10b0dcd989b54a2c2ae1e7f78aeaf55440b94 Mon Sep 17 00:00:00 2001 From: Aaishik Dutta Date: Wed, 12 Jul 2023 10:46:44 +0530 Subject: [PATCH 03/13] added refactor and lint format fixes --- embedchain/config/InitConfig.py | 1 + embedchain/loaders/site_map.py | 29 ++++++++++------------------- embedchain/vectordb/chroma_db.py | 2 +- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/embedchain/config/InitConfig.py b/embedchain/config/InitConfig.py index fb4daf2e50..923ab879a9 100644 --- a/embedchain/config/InitConfig.py +++ b/embedchain/config/InitConfig.py @@ -62,6 +62,7 @@ def _set_db_to_default(self): Sets database to default (`ChromaDb`). """ from embedchain.vectordb.chroma_db import ChromaDB + self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port) def _setup_logging(self, debug_level): diff --git a/embedchain/loaders/site_map.py b/embedchain/loaders/site_map.py index 5b4a2db5cc..e38a2be8f7 100644 --- a/embedchain/loaders/site_map.py +++ b/embedchain/loaders/site_map.py @@ -7,27 +7,18 @@ class SitemapLoader: def load_data(self, sitemap_url): """ - This method takes a sitemap url as input and retrieves - all the urls to use the WebPageLoader to load content + This method takes a sitemap URL as input and retrieves + all the URLs to use the WebPageLoader to load content of each page. """ output = [] web_page_loader = WebPageLoader() - response = requests.get(sitemap_url) - - if response.status_code == 200: - soup = BeautifulSoup(response.text, features="xml") - links = [link.text for link in soup.find_all("loc")] - - for link in links: - each_load_data = web_page_loader.load_data(link) - # WebPageLoader returns a list with single element - # which is extracted and appended to the output list - # containing data for all pages - output.append(each_load_data[0]) - - return output - - else: - raise response.raise_for_status() + response.raise_for_status() + + soup = BeautifulSoup(response.text, "xml") + links = [link.text for link in soup.find_all("loc")] + for link in links: + each_load_data = web_page_loader.load_data(link) + output.append(each_load_data) + return [data[0] for data in output] diff --git a/embedchain/vectordb/chroma_db.py b/embedchain/vectordb/chroma_db.py index 2142f9d3c5..72408104a5 100644 --- a/embedchain/vectordb/chroma_db.py +++ b/embedchain/vectordb/chroma_db.py @@ -1,5 +1,5 @@ -import os import logging +import os import chromadb from chromadb.utils import embedding_functions From 7b12ca54de0c0360316a1ddb7b0ad2bd1c8b998a Mon Sep 17 00:00:00 2001 From: Deshraj Yadav Date: Tue, 11 Jul 2023 23:29:13 -0700 Subject: [PATCH 04/13] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3b193a8967..649f27bcce 100644 --- a/README.md +++ b/README.md @@ -296,7 +296,7 @@ app.add_local('qna_pair', ("Question", "Answer")) ``` ### Sitemap -To add a XML site map containing list of all urls, use the data_type as `site_map` and enter the sitemap url. Eg: +To add a XML site map containing list of all urls, use the data_type as `sitemap` and enter the sitemap url. Eg: ```python app.add('site_map', 'a_valid_sitemap_url/sitemap.xml') From 7aedab5cdba8231137015a1ed5c277f5c80d2e4d Mon Sep 17 00:00:00 2001 From: Deshraj Yadav Date: Tue, 11 Jul 2023 23:29:19 -0700 Subject: [PATCH 05/13] Update embedchain/data_formatter/data_formatter.py --- embedchain/data_formatter/data_formatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py index a1a2de430b..c57ec9a8f3 100644 --- a/embedchain/data_formatter/data_formatter.py +++ b/embedchain/data_formatter/data_formatter.py @@ -40,7 +40,7 @@ def _get_loader(self, data_type, config): "qna_pair": LocalQnaPairLoader(), "text": LocalTextLoader(), "docx": DocxFileLoader(), - "site_map": SitemapLoader(), + "sitemap": SitemapLoader(), } if data_type in loaders: return loaders[data_type] From 8063b565b4cfc00d3e4a3820ee1ec7ff17aaa46a Mon Sep 17 00:00:00 2001 From: Deshraj Yadav Date: Tue, 11 Jul 2023 23:29:24 -0700 Subject: [PATCH 06/13] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 649f27bcce..be88d23a33 100644 --- a/README.md +++ b/README.md @@ -299,7 +299,7 @@ app.add_local('qna_pair', ("Question", "Answer")) To add a XML site map containing list of all urls, use the data_type as `sitemap` and enter the sitemap url. Eg: ```python -app.add('site_map', 'a_valid_sitemap_url/sitemap.xml') +app.add('sitemap', 'a_valid_sitemap_url/sitemap.xml') ``` ### Reusing a Vector DB From b50c0b8ccc556d3dd4486e1f13a9fcb5712ba361 Mon Sep 17 00:00:00 2001 From: Deshraj Yadav Date: Tue, 11 Jul 2023 23:29:30 -0700 Subject: [PATCH 07/13] Update embedchain/data_formatter/data_formatter.py --- embedchain/data_formatter/data_formatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py index c57ec9a8f3..553c512b51 100644 --- a/embedchain/data_formatter/data_formatter.py +++ b/embedchain/data_formatter/data_formatter.py @@ -62,7 +62,7 @@ def _get_chunker(self, data_type, config): "qna_pair": QnaPairChunker(config), "text": TextChunker(config), "docx": DocxFileChunker(config), - "site_map": WebPageChunker(config), + "sitemap": WebPageChunker(config), } if data_type in chunkers: return chunkers[data_type] From a93d37cd6f532f4e2ef77c708a22a0ccac91a333 Mon Sep 17 00:00:00 2001 From: Aaishik Dutta Date: Wed, 12 Jul 2023 12:03:38 +0530 Subject: [PATCH 08/13] incorporated review comments --- README.md | 4 ++-- embedchain/data_formatter/data_formatter.py | 6 +++--- embedchain/loaders/{site_map.py => sitemap.py} | 0 3 files changed, 5 insertions(+), 5 deletions(-) rename embedchain/loaders/{site_map.py => sitemap.py} (100%) diff --git a/README.md b/README.md index 3b193a8967..be88d23a33 100644 --- a/README.md +++ b/README.md @@ -296,10 +296,10 @@ app.add_local('qna_pair', ("Question", "Answer")) ``` ### Sitemap -To add a XML site map containing list of all urls, use the data_type as `site_map` and enter the sitemap url. Eg: +To add a XML site map containing list of all urls, use the data_type as `sitemap` and enter the sitemap url. Eg: ```python -app.add('site_map', 'a_valid_sitemap_url/sitemap.xml') +app.add('sitemap', 'a_valid_sitemap_url/sitemap.xml') ``` ### Reusing a Vector DB diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py index a1a2de430b..6e6e18f765 100644 --- a/embedchain/data_formatter/data_formatter.py +++ b/embedchain/data_formatter/data_formatter.py @@ -9,7 +9,7 @@ from embedchain.loaders.local_qna_pair import LocalQnaPairLoader from embedchain.loaders.local_text import LocalTextLoader from embedchain.loaders.pdf_file import PdfFileLoader -from embedchain.loaders.site_map import SitemapLoader +from embedchain.loaders.sitemap import SitemapLoader from embedchain.loaders.web_page import WebPageLoader from embedchain.loaders.youtube_video import YoutubeVideoLoader @@ -40,7 +40,7 @@ def _get_loader(self, data_type, config): "qna_pair": LocalQnaPairLoader(), "text": LocalTextLoader(), "docx": DocxFileLoader(), - "site_map": SitemapLoader(), + "sitemap": SitemapLoader(), } if data_type in loaders: return loaders[data_type] @@ -62,7 +62,7 @@ def _get_chunker(self, data_type, config): "qna_pair": QnaPairChunker(config), "text": TextChunker(config), "docx": DocxFileChunker(config), - "site_map": WebPageChunker(config), + "sitemap": WebPageChunker(config), } if data_type in chunkers: return chunkers[data_type] diff --git a/embedchain/loaders/site_map.py b/embedchain/loaders/sitemap.py similarity index 100% rename from embedchain/loaders/site_map.py rename to embedchain/loaders/sitemap.py From 5dd54c2c0c63e7f426351cf1236aad50bfe14fcd Mon Sep 17 00:00:00 2001 From: Aaishik Dutta Date: Fri, 21 Jul 2023 23:31:32 +0530 Subject: [PATCH 09/13] added fix for PersonSourceApp not instantiating --- embedchain/apps/PersonApp.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/embedchain/apps/PersonApp.py b/embedchain/apps/PersonApp.py index d868a32eb6..a5488f6bfa 100644 --- a/embedchain/apps/PersonApp.py +++ b/embedchain/apps/PersonApp.py @@ -29,19 +29,19 @@ class PersonApp(EmbedChainPersonApp, App): Extends functionality from EmbedChainPersonApp and App """ - def query(self, input_query, config: QueryConfig = None): + def query(self, input_query, config: QueryConfig = None, dry_run=False): self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT) query_config = QueryConfig( template=self.template, ) - return super().query(input_query, query_config) + return super().query(input_query, query_config, dry_run) - def chat(self, input_query, config: ChatConfig = None): + def chat(self, input_query, config: ChatConfig = None, dry_run=False): self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT_WITH_HISTORY) chat_config = ChatConfig( template=self.template, ) - return super().chat(input_query, chat_config) + return super().chat(input_query, chat_config, dry_run) class PersonOpenSourceApp(EmbedChainPersonApp, OpenSourceApp): @@ -51,12 +51,14 @@ class PersonOpenSourceApp(EmbedChainPersonApp, OpenSourceApp): """ def query(self, input_query, config: QueryConfig = None): + self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT) query_config = QueryConfig( template=self.template, ) return super().query(input_query, query_config) def chat(self, input_query, config: ChatConfig = None): + self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT_WITH_HISTORY) chat_config = ChatConfig( template=self.template, ) From 5ec0aa11bcc5d3e30e21c52ea69f6a797f4b628f Mon Sep 17 00:00:00 2001 From: Aaishik Dutta Date: Fri, 21 Jul 2023 23:33:55 +0530 Subject: [PATCH 10/13] added fix for PersonSourceApp not instantiating --- embedchain/apps/PersonApp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/embedchain/apps/PersonApp.py b/embedchain/apps/PersonApp.py index a5488f6bfa..4fe6c278a7 100644 --- a/embedchain/apps/PersonApp.py +++ b/embedchain/apps/PersonApp.py @@ -29,19 +29,19 @@ class PersonApp(EmbedChainPersonApp, App): Extends functionality from EmbedChainPersonApp and App """ - def query(self, input_query, config: QueryConfig = None, dry_run=False): + def query(self, input_query, config: QueryConfig = None): self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT) query_config = QueryConfig( template=self.template, ) - return super().query(input_query, query_config, dry_run) + return super().query(input_query, query_config) - def chat(self, input_query, config: ChatConfig = None, dry_run=False): + def chat(self, input_query, config: ChatConfig = None): self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT_WITH_HISTORY) chat_config = ChatConfig( template=self.template, ) - return super().chat(input_query, chat_config, dry_run) + return super().chat(input_query, chat_config) class PersonOpenSourceApp(EmbedChainPersonApp, OpenSourceApp): From e3ae3b25f96a39b5bad24139cf8f5bc51376b756 Mon Sep 17 00:00:00 2001 From: Aaishik Dutta Date: Sat, 22 Jul 2023 11:17:44 +0530 Subject: [PATCH 11/13] added dry_run to Person App --- embedchain/apps/PersonApp.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/embedchain/apps/PersonApp.py b/embedchain/apps/PersonApp.py index 4fe6c278a7..3a0378c1aa 100644 --- a/embedchain/apps/PersonApp.py +++ b/embedchain/apps/PersonApp.py @@ -29,19 +29,19 @@ class PersonApp(EmbedChainPersonApp, App): Extends functionality from EmbedChainPersonApp and App """ - def query(self, input_query, config: QueryConfig = None): + def query(self, input_query, config: QueryConfig = None, dry_run=False): self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT) query_config = QueryConfig( template=self.template, ) - return super().query(input_query, query_config) + return super().query(input_query, query_config, dry_run) - def chat(self, input_query, config: ChatConfig = None): + def chat(self, input_query, config: ChatConfig = None, dry_run=False): self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT_WITH_HISTORY) chat_config = ChatConfig( template=self.template, ) - return super().chat(input_query, chat_config) + return super().chat(input_query, chat_config, dry_run) class PersonOpenSourceApp(EmbedChainPersonApp, OpenSourceApp): @@ -50,16 +50,16 @@ class PersonOpenSourceApp(EmbedChainPersonApp, OpenSourceApp): Extends functionality from EmbedChainPersonApp and OpenSourceApp """ - def query(self, input_query, config: QueryConfig = None): + def query(self, input_query, config: QueryConfig = None, dry_run=False): self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT) query_config = QueryConfig( template=self.template, ) - return super().query(input_query, query_config) + return super().query(input_query, query_config, dry_run) - def chat(self, input_query, config: ChatConfig = None): + def chat(self, input_query, config: ChatConfig = None, dry_run=False): self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT_WITH_HISTORY) chat_config = ChatConfig( template=self.template, ) - return super().chat(input_query, chat_config) + return super().chat(input_query, chat_config, dry_run) From da77ca1879bfa79da0e35d293bdea30951b30f12 Mon Sep 17 00:00:00 2001 From: Aaishik Dutta Date: Sat, 22 Jul 2023 13:12:56 +0530 Subject: [PATCH 12/13] fixed test case --- tests/vectordb/test_chroma_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/vectordb/test_chroma_db.py b/tests/vectordb/test_chroma_db.py index de009b9b20..46d4c7c4fe 100644 --- a/tests/vectordb/test_chroma_db.py +++ b/tests/vectordb/test_chroma_db.py @@ -17,7 +17,7 @@ def test_init_with_host_and_port(self): host = "test-host" port = "1234" - with patch.object(chromadb, "Client") as mock_client: + with patch.object(chromadb, "HttpClient") as mock_client: _db = ChromaDB(host=host, port=port, embedding_fn=len) expected_settings = Settings( From d4b93dbfa2f1c4cb9a2b3f49ca45bd56b2b777f1 Mon Sep 17 00:00:00 2001 From: aaishikdutta <107566376+aaishikdutta@users.noreply.github.com> Date: Mon, 24 Jul 2023 13:29:36 +0530 Subject: [PATCH 13/13] Update README.md Added update badge url --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9868a03206..86a02f19a6 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # embedchain [![PyPI](https://img.shields.io/pypi/v/embedchain)](https://pypi.org/project/embedchain/) -[![Discord](https://dcbadge.vercel.app/api/server/nhvCbCtKV?style=flat)](https://discord.gg/6PzXDgEjG5) +[![Discord](https://dcbadge.vercel.app/api/server/6PzXDgEjG5?style=flat)](https://discord.gg/6PzXDgEjG5) [![Twitter](https://img.shields.io/twitter/follow/embedchain)](https://twitter.com/embedchain) [![Substack](https://img.shields.io/badge/Substack-%23006f5c.svg?logo=substack)](https://embedchain.substack.com/) [![Open in Colab](https://camo.githubusercontent.com/84f0493939e0c4de4e6dbe113251b4bfb5353e57134ffd9fcab6b8714514d4d1/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/drive/138lMWhENGeEu7Q1-6lNbNTHGLZXBBz_B?usp=sharing)