Upgrade to v1.7.0 and copy docs folder (deepset-ai#3014)

* update version to 1.7.0 * copy docs * update openapi * generate schemas * make update_json_schema() idempotent * update docs, schema and openapi
jamescalam · Aug 15, 2022 · baefd32 · baefd32
1 parent d617553
commit baefd32
Show file tree

Hide file tree

Showing 99 changed files with 36,356 additions and 13 deletions.
diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-1.6.1rc0
+1.7.0
diff --git a/docs/_src/api/openapi/openapi-1.7.0.json b/docs/_src/api/openapi/openapi-1.7.0.json
diff --git a/docs/_src/api/openapi/openapi.json b/docs/_src/api/openapi/openapi.json
@@ -2,7 +2,7 @@
  "openapi": "3.0.2",
  "info": {
  "title": "Haystack REST API",
- "version": "1.6.1rc0"
+ "version": "1.7.0"
  },
  "paths": {
  "/initialized": {

diff --git a/docs/v1.7.0/Makefile b/docs/v1.7.0/Makefile
@@ -0,0 +1,25 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+
+SPHINXBUILD := sphinx-build
+MAKEINFO := makeinfo
+
+BUILDDIR := build
+SOURCE := _src/
+# SPHINXFLAGS := -a -W -n -A local=1 -d $(BUILDDIR)/doctree
+SPHINXFLAGS := -A local=1 -d $(BUILDDIR)/doctree
+SPHINXOPTS := $(SPHINXFLAGS) $(SOURCE)
+
+# Put it first so that "make" without argument is like "make help".
+# The source directory variable defined by this Makefile is SOURCE; SOURCEDIR
+# is never set here and would expand to an empty string.
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCE)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ $(SPHINXBUILD) -M $@ $(SPHINXOPTS) $(BUILDDIR)/$@
diff --git a/docs/v1.7.0/_src/api/Makefile b/docs/v1.7.0/_src/api/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/v1.7.0/_src/api/_static/floating_sidebar.css b/docs/v1.7.0/_src/api/_static/floating_sidebar.css
@@ -0,0 +1,29 @@
+div.sphinxsidebarwrapper {
+ position: relative;
+ top: 0px;
+ padding: 0;
+}
+
+div.sphinxsidebar {
+ margin: 0;
+ padding: 0 15px 0 15px;
+ width: 210px;
+ float: left;
+ font-size: 1em;
+ text-align: left;
+}
+
+div.sphinxsidebar .logo {
+ font-size: 1.8em;
+ color: #0A507A;
+ font-weight: 300;
+ text-align: center;
+}
+
+div.sphinxsidebar .logo img {
+ vertical-align: middle;
+}
+
+div.sphinxsidebar .download a img {
+ vertical-align: middle;
+}
diff --git a/docs/v1.7.0/_src/api/_templates/xxlayout.html b/docs/v1.7.0/_src/api/_templates/xxlayout.html
@@ -0,0 +1,46 @@
+{# put the sidebar before the body #}
+{% block sidebar1 %}{{ sidebar() }}{% endblock %}
+{% block sidebar2 %}{% endblock %}
+
+{% block extrahead %}
+ <link href='https://fonts.googleapis.com/css?family=Open+Sans:300,400,700'
+ rel='stylesheet' type='text/css' />
+{{ super() }}
+{#- if not embedded #}
+ <style type="text/css">
+ table.right { float: left; margin-left: 20px; }
+ table.right td { border: 1px solid #ccc; }
+ {% if pagename == 'index' %}
+ .related { display: none; }
+ {% endif %}
+ </style>
+ <script>
+ // intelligent scrolling of the sidebar content
+ $(window).scroll(function() {
+ var sb = $('.sphinxsidebarwrapper');
+ var win = $(window);
+ var sbh = sb.height();
+ var offset = $('.sphinxsidebar').position()['top'];
+ var wintop = win.scrollTop();
+ var winbot = wintop + win.innerHeight();
+ var curtop = sb.position()['top'];
+ var curbot = curtop + sbh;
+ // does sidebar fit in window?
+ if (sbh < win.innerHeight()) {
+ // yes: easy case -- always keep at the top
+ sb.css('top', $u.min([$u.max([0, wintop - offset - 10]),
+ $(document).height() - sbh - 200]));
+ } else {
+ // no: only scroll if top/bottom edge of sidebar is at
+ // top/bottom edge of window
+ if (curtop > wintop && curbot > winbot) {
+ sb.css('top', $u.max([wintop - offset - 10, 0]));
+ } else if (curtop < wintop && curbot < winbot) {
+ sb.css('top', $u.min([winbot - sbh - offset - 20,
+ $(document).height() - sbh - 200]));
+ }
+ }
+ });
+ </script>
+{#- endif #}
+{% endblock %}
diff --git a/docs/v1.7.0/_src/api/api/crawler.md b/docs/v1.7.0/_src/api/api/crawler.md
@@ -0,0 +1,144 @@
+<a id="crawler"></a>
+
+# Module crawler
+
+<a id="crawler.Crawler"></a>
+
+## Crawler
+
+```python
+class Crawler(BaseComponent)
+```
+
+Crawl texts from a website so that we can use them later in Haystack as a corpus for search / question answering etc.
+
+**Example:**
+```python
+from haystack.nodes.connector import Crawler
+
+crawler = Crawler(output_dir="crawled_files")
+# crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
+docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
+                     filter_urls=["haystack.deepset.ai/overview/"])
+```
+
+<a id="crawler.Crawler.__init__"></a>
+
+#### Crawler.\_\_init\_\_
+
+```python
+def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None)
+```
+
+Init object with basic params for crawling (can be overwritten later).
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http(s) address(es) (can also be supplied later when calling crawl())
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+0: Only initial list of urls
+1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+- `extract_hidden_text`: Whether to extract the hidden text contained in page.
+E.g. the text can be inside a span with style="display: none"
+- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
+dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
+E.g. 2: Crawler will wait 2 seconds before scraping page
+- `crawler_naming_function`: A function mapping the crawled page to a file name.
+By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
+E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+ This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
+ 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+ This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
+
+<a id="crawler.Crawler.crawl"></a>
+
+#### Crawler.crawl
+
+```python
+def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> List[Path]
+```
+
+Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
+file per URL, including text and basic meta data).
+You can optionally specify via `filter_urls` to only crawl URLs that match a certain pattern.
+All parameters are optional here and only meant to overwrite instance attributes at runtime.
+If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used.
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http addresses or single http address
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+0: Only initial list of urls
+1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
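+In this case the id will be generated by using the content and the defined metadata.
+- `extract_hidden_text`: Whether to extract the hidden text contained in page.
+E.g. the text can be inside a span with style="display: none"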
+- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
+dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
+E.g. 2: Crawler will wait 2 seconds before scraping page
+- `crawler_naming_function`: A function mapping the crawled page to a file name.
+By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
+E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+ This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
+ 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+ This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
+
+**Returns**:
+
+List of paths where the crawled webpages got stored
+
+<a id="crawler.Crawler.run"></a>
+
+#### Crawler.run
+
+```python
+def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
+```
+
+Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http addresses or single http address
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+0: Only initial list of urls
+1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+- `return_documents`: Return json files content
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+- `extract_hidden_text`: Whether to extract the hidden text contained in page.
+E.g. the text can be inside a span with style="display: none"
+- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on
+dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted.
+E.g. 2: Crawler will wait 2 seconds before scraping page
+- `crawler_naming_function`: A function mapping the crawled page to a file name.
+By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url.
+E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", url)
+ This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores.
+ 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest()
+ This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content.
+
+**Returns**:
+
+Tuple({"paths": List of filepaths, ...}, Name of output edge)
+