Add CI for windows runner (#1458)

* Feat: Removing use of temp file while downloading archive from url along with adding CI for windows and mac platform * Windows CI by default installing pytorch gpu hence updating CI to pick cpu version * fixing mac cache build issue * updating windows pip install command for torch * another attempt * updating ci * Adding sudo * fixing ls failure on windows * another attempt to fix build issue * Saving env variable of test files * Adding debug log * Github action differ on windows * adding debug * another attempt * Windows have different ways to receive env * fixing template * minor fix * Adding debug * Removing use of json * Adding back fromJson * adding toJson * removing print * another attempt * disabling parallel run at least for testing * installing docker for mac runner * correcting docker install command * Linux dockers are not supported in windows * Removing mac changes * Upgrading pytorch * using lts pytorch * Separating win and ubuntu * Install java 11 * enabling linux container env * docker cli command * docker cli command * start elastic service * List all service * correcting service name * Attempt to fix multiple test run * convert to json * another attempt to check * Updating build cache step * attempt * Add tika * Separating windows CI * Changing CI name * Skipping test which does not work in windows * Skipping tests for windows * create cleanup function in conftest * adding skipif marker on tests * Run windows PR on only push to master * Addressing review comments * Enabling windows ci for this PR * Tika init is being called when importing tika function * handling tika import issue * handling tika import issue in test * Fixing import issue * removing tika fixture * Removing fixture from tests * Disable windows ci on pull request * Add back extra pytorch install step Co-authored-by: Malte Pietsch <[email protected]>
deepset-ai · Oct 29, 2021 · e5b4b62 · e5b4b62
1 parent 08341f5
commit e5b4b62
Show file tree

Hide file tree

Showing 12 changed files with 131 additions and 25 deletions.
diff --git a/.github/workflows/ci.yml → .github/workflows/linux_ci.yml b/.github/workflows/ci.yml → .github/workflows/linux_ci.yml
@@ -1,4 +1,4 @@
-name: Build
+name: Linux CI
 
 on:
  push:
@@ -33,7 +33,7 @@ jobs:
  uses: actions/cache@v2
  with:
  path: ${{ env.pythonLocation }}
- key: ${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+ key: linux-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
  - name: Install dependencies
  if: steps.cache-python-env.outputs.cache-hit != 'true'
  run: |
@@ -70,7 +70,7 @@ jobs:
  uses: actions/cache@v2
  with:
  path: ${{ env.pythonLocation }}
- key: ${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+ key: linux-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
  - name: Run Elasticsearch
  run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx128m" elasticsearch:7.9.2
 

diff --git a/.github/workflows/windows_ci.yml b/.github/workflows/windows_ci.yml
@@ -0,0 +1,97 @@
+name: Windows CI
+
+on:
+ push:
+ branches: [ master ]
+# pull_request:
+# branches: [ master ]
+
+jobs:
+ type-check:
+ runs-on: windows-latest
+ steps:
+ - uses: actions/checkout@v2
+ - uses: actions/setup-python@v2
+ with:
+ python-version: 3.8
+ - name: Test with mypy
+ run: |
+ pip install mypy types-Markdown types-requests types-PyYAML pydantic
+ mypy haystack
+
+ build-cache:
+ needs: type-check
+ runs-on: windows-latest
+
+ steps:
+ - uses: actions/checkout@v2
+ - uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+ - run: echo "date=$(date +'%Y-%m-%d')" >> $env:GITHUB_ENV
+ - name: Cache
+ id: cache-python-env
+ uses: actions/cache@v2
+ with:
+ path: ${{ env.pythonLocation }}
+ key: windows-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+ - name: Install Pytorch on windows
+ run: |
+ pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+ - name: Install dependencies
+ if: steps.cache-python-env.outputs.cache-hit != 'true'
+ run: |
+ python -m pip install --upgrade pip
+ pip install --upgrade --upgrade-strategy eager -r requirements-dev.txt -e .
+ pip install --upgrade --upgrade-strategy eager -f https://download.pytorch.org/whl/torch_stable.html -r requirements.txt -e .
+ pip install torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+cpu.html
+
+ prepare-build:
+ needs: build-cache
+ # Runs on Ubuntu because this step errors on Windows; it only lists the test files anyway
+ runs-on: ubuntu-20.04
+ steps:
+ - uses: actions/checkout@v2
+ - id: set-matrix
+ run: |
+ echo "::set-output name=matrix::$(cd test && ls -d test_*.py | jq -R . | jq -cs .)"
+ outputs:
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
+
+ build:
+ needs: prepare-build
+ runs-on: windows-latest
+ strategy:
+ matrix:
+ test-path: ${{fromJson(needs.prepare-build.outputs.matrix)}}
+ fail-fast: false
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python 3.7
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+ - run: echo "date=$(date +'%Y-%m-%d')" >> $env:GITHUB_ENV
+ - name: Cache
+ uses: actions/cache@v2
+ with:
+ path: ${{ env.pythonLocation }}
+ key: windows-${{ env.pythonLocation }}-${{ env.date }}-${{ hashFiles('setup.py') }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements-dev.txt') }}
+
+ # Windows runners can't run Linux containers. See https://github.com/actions/virtual-environments/issues/1143
+ - name: Set up Windows test env
+ run: |
+ choco install xpdf-utils
+ choco install openjdk11
+ refreshenv
+ choco install tesseract --pre
+ choco install elasticsearch --version=7.9.2
+ refreshenv
+ Get-Service elasticsearch-service-x64 | Start-Service
+
+ # We have to skip test files from which no test is going to run.
+ # On Windows quite a few tests are disabled, and files left without any runnable test will throw an error; see https://github.com/pytest-dev/pytest/issues/812
+ # Skipped here: test_ray, test_knowledge_graph and test_connector (test_utils and test_preprocessor are not excluded by the condition below)
+ - name: Run tests
+ if: ${{ !contains(fromJSON('["test_ray.py", "test_knowledge_graph.py", "test_connector.py"]'), matrix.test-path) }}
+ run: cd test && pytest --document_store_type=memory,faiss,elasticsearch -m "not tika and not graphdb" -s ${{ matrix.test-path }}
diff --git a/haystack/utils/preprocessing.py b/haystack/utils/preprocessing.py
@@ -8,7 +8,6 @@
  BaseConverter, 
  DocxToTextConverter,
  PDFToTextConverter,
- TikaConverter,
  TextConverter
 )
 
@@ -99,6 +98,11 @@ def tika_convert_files_to_dicts(
  :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
  :param split_paragraphs: split text in paragraphs.
  """
+ try:
+ from haystack.nodes.file_converter import TikaConverter
+ except Exception as ex:
+ logger.error("Tika not installed. Please install tika and try again. Error: {}".format(ex))
+ raise ex
  converter = TikaConverter()
  paths = [p for p in Path(dir_path).glob("**/*")]
  allowed_suffixes = [".pdf", ".txt"]

diff --git a/test/conftest.py b/test/conftest.py
@@ -224,7 +224,7 @@ def tika_fixture():
 
 
 @pytest.fixture(scope="session")
-def xpdf_fixture(tika_fixture):
+def xpdf_fixture():
  verify_installation = run(["pdftotext"], shell=True)
  if verify_installation.returncode == 127:
  if platform.startswith("linux"):

diff --git a/test/test_document_store.py b/test/test_document_store.py
@@ -68,6 +68,7 @@ def test_write_with_duplicate_doc_ids_custom_index(document_store):
  # writing to the default, empty index should still work
  document_store.write_documents(documents, duplicate_documents="fail")
 
+
 def test_get_all_documents_without_filters(document_store_with_docs):
  documents = document_store_with_docs.get_all_documents()
  assert all(isinstance(d, Document) for d in documents)
@@ -812,7 +813,7 @@ def test_get_meta_values_by_key(document_store):
 
 
 @pytest.mark.elasticsearch
-def test_elasticsearch_custom_fields(elasticsearch_fixture):
+def test_elasticsearch_custom_fields():
  client = Elasticsearch()
  client.indices.delete(index='haystack_test_custom', ignore=[404])
  document_store = ElasticsearchDocumentStore(index="haystack_test_custom", content_field="custom_text_field",

diff --git a/test/test_faiss_and_milvus.py b/test/test_faiss_and_milvus.py
@@ -1,4 +1,5 @@
-import time
+import sys
+
 import faiss
 import math
 import numpy as np
@@ -19,6 +20,7 @@
 ]
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_index_save_and_load(tmp_path):
  document_store = FAISSDocumentStore(
  sql_url=f"sqlite:https:////{tmp_path/'haystack_test.db'}",
@@ -47,6 +49,7 @@ def test_faiss_index_save_and_load(tmp_path):
  assert not new_document_store.progress_bar
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_index_save_and_load_custom_path(tmp_path):
  document_store = FAISSDocumentStore(
  sql_url=f"sqlite:https:////{tmp_path/'haystack_test.db'}",
@@ -95,7 +98,7 @@ def test_faiss_write_docs(document_store, index_buffer_size, batch_size):
  stored_emb = document_store.faiss_indexes[document_store.index].reconstruct(int(doc.meta["vector_id"]))
  # compare original input vec with stored one (ignore extra dim added by hnsw)
  assert np.allclose(original_doc["embedding"], stored_emb, rtol=0.01)
- 
+
 
 @pytest.mark.slow
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@@ -158,6 +161,7 @@ def test_update_with_empty_store(document_store, retriever):
  assert len(documents_indexed) == len(DOCUMENTS)
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 @pytest.mark.parametrize("index_factory", ["Flat", "HNSW", "IVF1,Flat"])
 def test_faiss_retrieving(index_factory, tmp_path):
  document_store = FAISSDocumentStore(
@@ -253,7 +257,7 @@ def test_delete_docs_by_id_with_filters(document_store, retriever):
  all_ids_left = [doc.id for doc in documents]
  assert all(doc_id in all_ids_left for doc_id in ids_not_to_delete)
 
- 
+
 
 @pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
 @pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
@@ -271,6 +275,7 @@ def test_pipeline(document_store, retriever):
  assert len(output["documents"]) == 3
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_passing_index_from_outside(tmp_path):
  d = 768
  nlist = 2
@@ -295,6 +300,7 @@ def test_faiss_passing_index_from_outside(tmp_path):
  assert 0 <= int(doc.meta["vector_id"]) <= 7
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_cosine_similarity(tmp_path):
  document_store = FAISSDocumentStore(
  sql_url=f"sqlite:https:////{tmp_path/'haystack_test_faiss.db'}", similarity='cosine'
@@ -322,7 +328,7 @@ def test_faiss_cosine_similarity(tmp_path):
 
  # check if the stored embedding was normalized
  assert np.allclose(original_emb[0], result_emb, rtol=0.01)
- 
+
  # check if the score is plausible for cosine similarity
  assert 0 <= doc.score <= 1.0
 
@@ -342,7 +348,7 @@ def embed_documents(self, docs):
  assert not np.allclose(original_emb[0], doc.embedding, rtol=0.01)
 
 
-
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner")
 def test_faiss_cosine_sanity_check(tmp_path):
  document_store = FAISSDocumentStore(
  sql_url=f"sqlite:https:////{tmp_path/'haystack_test_faiss.db'}", similarity='cosine',

diff --git a/test/test_file_converter.py b/test/test_file_converter.py
@@ -13,7 +13,7 @@
  # "Converter", [PDFToTextConverter, TikaConverter, PDFToTextOCRConverter]
  "Converter", [PDFToTextOCRConverter]
 )
-def test_convert(Converter, xpdf_fixture):
+def test_convert(Converter):
  converter = Converter()
  document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
  pages = document["content"].split("\f")
@@ -31,7 +31,7 @@ def test_convert(Converter, xpdf_fixture):
 
 @pytest.mark.tika
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_table_removal(Converter, xpdf_fixture):
+def test_table_removal(Converter):
  converter = Converter(remove_numeric_tables=True)
  document = converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
  pages = document["content"].split("\f")
@@ -42,7 +42,7 @@ def test_table_removal(Converter, xpdf_fixture):
 
 @pytest.mark.tika
 @pytest.mark.parametrize("Converter", [PDFToTextConverter, TikaConverter])
-def test_language_validation(Converter, xpdf_fixture, caplog):
+def test_language_validation(Converter, caplog):
  converter = Converter(valid_languages=["en"])
  converter.convert(file_path=Path("samples/pdf/sample_pdf_1.pdf"))
  assert (

diff --git a/test/test_generator.py b/test/test_generator.py
@@ -1,3 +1,4 @@
+import sys
 from typing import List
 
 import numpy as np
@@ -428,6 +429,7 @@ def test_generator_pipeline(document_store, retriever, rag_generator):
  assert "berlin" in answers[0]["answer"]
 
 
+@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Gives memory allocation error on windows runner")
 @pytest.mark.slow
 @pytest.mark.generator
 @pytest.mark.parametrize("document_store", ["memory"], indirect=True)

diff --git a/test/test_knowledge_graph.py b/test/test_knowledge_graph.py
@@ -6,8 +6,9 @@
 from haystack.document_stores import GraphDBKnowledgeGraph
 from haystack.utils import fetch_archive_from_http
 
+
 @pytest.mark.graphdb
-def test_graph_retrieval(graphdb_fixture):
+def test_graph_retrieval():
  # TODO rename doc_dir
  graph_dir = "../data/tutorial10_knowledge_graph/"
  s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip"

diff --git a/test/test_preprocessor.py b/test/test_preprocessor.py
@@ -18,7 +18,6 @@
 """
 
 
-@pytest.mark.tika
 def test_preprocess_sentence_split():
  document = {"content": TEXT}
  preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="sentence", split_respect_sentence_boundary=False)
@@ -32,7 +31,6 @@ def test_preprocess_sentence_split():
  assert len(documents) == 2
 
 
-@pytest.mark.tika
 def test_preprocess_word_split():
  document = {"content": TEXT}
  preprocessor = PreProcessor(split_length=10, split_overlap=0, split_by="word", split_respect_sentence_boundary=False)
@@ -56,7 +54,6 @@ def test_preprocess_word_split():
  assert len(documents) == 15
 
 
-@pytest.mark.tika
 def test_preprocess_passage_split():
  document = {"content": TEXT}
  preprocessor = PreProcessor(split_length=1, split_overlap=0, split_by="passage", split_respect_sentence_boundary=False)
@@ -68,7 +65,6 @@ def test_preprocess_passage_split():
  assert len(documents) == 2
 
 
-@pytest.mark.tika
 def test_clean_header_footer():
  converter = PDFToTextConverter()
  document = converter.convert(file_path=Path("samples/pdf/sample_pdf_2.pdf")) # file contains header/footer

diff --git a/test/test_retriever.py b/test/test_retriever.py
@@ -94,7 +94,7 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):
 
 
 @pytest.mark.elasticsearch
-def test_elasticsearch_custom_query(elasticsearch_fixture):
+def test_elasticsearch_custom_query():
  client = Elasticsearch()
  client.indices.delete(index="haystack_test_custom", ignore=[404])
  document_store = ElasticsearchDocumentStore(

diff --git a/test/test_utils.py b/test/test_utils.py
@@ -1,17 +1,16 @@
 import pytest
 
-from haystack.preprocessor.utils import convert_files_to_dicts, tika_convert_files_to_dicts
-from haystack.preprocessor.cleaning import clean_wiki_text
+from haystack.utils.preprocessing import convert_files_to_dicts, tika_convert_files_to_dicts
+from haystack.utils.cleaning import clean_wiki_text
 
 
-@pytest.mark.tika
-def test_convert_files_to_dicts(xpdf_fixture):
+def test_convert_files_to_dicts():
  documents = convert_files_to_dicts(dir_path="samples", clean_func=clean_wiki_text, split_paragraphs=True)
  assert documents and len(documents) > 0
 
 
 @pytest.mark.tika
-def test_tika_convert_files_to_dicts(tika_fixture):
+def test_tika_convert_files_to_dicts():
  documents = tika_convert_files_to_dicts(dir_path="samples", clean_func=clean_wiki_text, split_paragraphs=True)
  assert documents and len(documents) > 0