Refactored and enabled the QA end-to-end tests.

Vikaslakkacs · Sep 1, 2023 · 91eec78 · 91eec78
1 parent d365c40
commit 91eec78
Show file tree

Hide file tree

Showing 11 changed files with 298 additions and 205 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Example for using [custom LLM and multiple KBs](./examples/multi_kb/README.md)
 - Support for [`PROMPTS_DIR`](./docs/user_guide/advanced/prompt-customization.md#prompt-configuration).
 - [#101](https://github.com/NVIDIA/NeMo-Guardrails/pull/101) Support for [using OpenAI embeddings](./docs/user_guide/configuration-guide.md#the-embeddings-model) models in addition to SentenceTransformers.
+- First set of end-to-end QA tests for the example configurations.
 
 ### Changed
 

diff --git a/qa/README.md b/qa/README.md
@@ -41,9 +41,17 @@ Please refer to the [installation guide](installation-guide.md) for instructions
 2. Change the directory to the `nemoguardrails/qa` folder, and then run all the automated tests
 
  ```bash
- > python -m pytest test_*.py
+ > QA=True python -m pytest test_*.py
  ```
 
+NOTE: The QA tests are skipped by default because they are expensive (i.e., they make live calls to OpenAI and other services). To enable them, set the `QA` environment variable to any non-empty value (e.g., `QA=True`).
+
+Alternatively, you can also run the tests from the root of the project:
+
+```bash
+> QA=True pytest qa
+```
+
 3. If there are any failure(s), analyze the corresponding example test log.
 
  ```bash

diff --git a/qa/chatter.py b/qa/chatter.py
@@ -18,7 +18,6 @@
 import subprocess
 import traceback
 
-
 EXAMPLES_FOLDER = os.path.join(os.path.dirname(os.path.dirname(__file__)), "examples")
 
 
@@ -30,7 +29,14 @@ def create_chatter(name, configname, logger):
  logger.info(f"config: {config}")
  try:
  command = ["nemoguardrails", "chat", f"--config={config}"]
- chatter = subprocess.Popen(command, cwd=cwd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+ chatter = subprocess.Popen(
+ command,
+ cwd=cwd,
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ )
  if chatter is not None:
  output = chatter.stdout.readline().strip()
  logger.info(f"output: {output}")
@@ -43,6 +49,7 @@ def create_chatter(name, configname, logger):
 
  return chatter
 
+
 def close_chatter(chatter):
  """Close the given chatter"""
  if chatter is not None:

diff --git a/qa/logger.py b/qa/logger.py
@@ -23,10 +23,10 @@ def create_logger(filename):
  logger.setLevel(logging.INFO)
 
  # Create a file handler
- file_handler = logging.FileHandler(filename, mode='w')
+ file_handler = logging.FileHandler(filename, mode="w")
 
  # Configure the formatter and add it to the file handler
- formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
  file_handler.setFormatter(formatter)
 
  # Add the file handler to the logger

diff --git a/qa/test_execution_rails.py b/qa/test_execution_rails.py
@@ -12,52 +12,51 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 
 import pytest
 
-from unittest import TestCase
-from .logger import create_logger
-from .chatter import create_chatter, close_chatter
+from .utils import ExampleConfigChatterTestCase
 
+QA_MODE = os.environ.get("QA")
 
-class TestExecutionRails(TestCase):
- logger = None
- chatter = None
 
- @classmethod
- def setUpClass(cls) -> None:
- super().setUpClass()
- # Create a logger and a chatter
- cls.logger = create_logger('execution_rails.log')
- cls.chatter = create_chatter("execution_rails", "execution_rails/sample_rails", cls.logger)
-
- @classmethod
- def tearDownClass(cls) -> None:
- super().tearDownClass()
- close_chatter(cls.chatter)
+class TestExecutionRails(ExampleConfigChatterTestCase):
+ example_name = "execution_rails"
 
+ @pytest.mark.skipif(not QA_MODE, reason="Not in QA mode.")
  @pytest.mark.unit
  def test_execution_rails(self):
  """Test the execution_rails example"""
- self.logger.info("Running test_execution_rails...")
-
  # Define Q&A messages
  messages = {
- "Hi there!": ["Hi! How can I assist you today?", "Hello again! How can I help you?", "Hello again! What can I help you with today?", "Hello again! What can I do for you?", "Hello! What can I help you with today?", "Hello again! How can I help you today?", "Hello again! How can I be of assistance?", "Hello there! How can I help you today?", "Hello! How can I assist you today?", "Hey there!", "Hi! How can I help you today?", "Hello! How can I help you today?", "Hello, how can I help you today?", "Hello there! How can I help you?"],
- "How can you help?": ["I am an AI assistant that helps answer mathematical questions. My core mathematical skills are powered by wolfram alpha.", "How are you feeling today?"],
- "What is 434 + 56*7.5?": ["434 + 56*7.5 is equal to 854.", "The result is 854.", "The result of 434 + 56*7.5 is 854.", "The answer is 854.", "434 + 56 * 7.5 is equal to 854."]
+ "Hi there!": [
+ "Hi! How can I assist you today?",
+ "Hello again! How can I help you?",
+ "Hello again! What can I help you with today?",
+ "Hello again! What can I do for you?",
+ "Hello! What can I help you with today?",
+ "Hello again! How can I help you today?",
+ "Hello again! How can I be of assistance?",
+ "Hello there! How can I help you today?",
+ "Hello! How can I assist you today?",
+ "Hey there!",
+ "Hi! How can I help you today?",
+ "Hello! How can I help you today?",
+ "Hello, how can I help you today?",
+ "Hello there! How can I help you?",
+ ],
+ "How can you help?": [
+ "I am an AI assistant that helps answer mathematical questions. My core mathematical skills are powered by wolfram alpha.",
+ "How are you feeling today?",
+ ],
+ "What is 434 + 56*7.5?": [
+ "434 + 56*7.5 is equal to 854.",
+ "The result is 854.",
+ "The result of 434 + 56*7.5 is 854.",
+ "The answer is 854.",
+ "434 + 56 * 7.5 is equal to 854.",
+ ],
  }
 
- if self.chatter is not None:
- # Process the questions and validate the answers
- for question, expected_answers in messages.items():
- self.logger.info(f"User: {question}")
- # Send the question to chatter
- self.chatter.stdin.write(question + '\n')
- self.chatter.stdin.flush()
-
- # Read the answer from chatter
- output = self.chatter.stdout.readline().strip()
- self.logger.info(f"Bot: {output}")
- # Validate the answer
- assert len([answer for answer in expected_answers if answer in output]) > 0
+ self.run_test(messages)
diff --git a/qa/test_grounding_rail.py b/qa/test_grounding_rail.py
@@ -12,57 +12,59 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 
 import pytest
 
-from unittest import TestCase
-from .logger import create_logger
-from .chatter import create_chatter, close_chatter
-from .validator import are_strings_semantically_same
+from .utils import ExampleConfigChatterTestCase
 
+QA_MODE = os.environ.get("QA")
 
-class TestGroundingRail(TestCase):
- logger = None
- chatter = None
 
- @classmethod
- def setUpClass(cls) -> None:
- super().setUpClass()
- # Create a logger and a chatter
- cls.logger = create_logger('grounding_rail.log')
- cls.chatter = create_chatter("grounding_rail", "grounding_rail", cls.logger)
-
- @classmethod
- def tearDownClass(cls) -> None:
- super().tearDownClass()
- close_chatter(cls.chatter)
+class TestGroundingRail(ExampleConfigChatterTestCase):
+ example_name = "grounding_rail"
 
+ @pytest.mark.skipif(not QA_MODE, reason="Not in QA mode.")
  @pytest.mark.unit
  def test_grounding_rail(self):
  """Test the grounding_rail example"""
- self.logger.info("Running test_grounding_rail...")
-
  # Define Q&A messages
  messages = {
- "Hi there!": ["Hi! How can I assist you today?", "Hello again! How can I help you?", "Hello again! What can I help you with today?", "Hello again! What can I do for you?", "Hello! What can I help you with today?", "Hello again! How can I help you today?", "Hello again! How can I be of assistance?", "Hello there! How can I help you today?", "Hello! How can I assist you today?", "Hey there!", "Hi! How can I help you today?", "Hello! How can I help you today?", "Hello, how can I help you today?", "Hello there! How can I help you?"],
- "How many jobs were added in the transportation industry?": ["In March, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000). Employment in transportation and warehousing has shown little net change in recent months.", " In March, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000).", "According to the March 2023 US jobs report, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000)."],
- "What was the unemployment rate for senior citizens?": ["I'm sorry, I do not know the answer to that question.", "I'm sorry, I don't know the answer to that question.", "I'm sorry, I don't have the information you're looking for. However, I can help you find the answer if you provide me more information about the context of your question.", "I'm sorry, I don't know the answer to that question. Would you like me to look for more information?", "I'm sorry, I don't seem to have the answer to that question. Would you like me to provide you with additional information on the March 2023 US jobs report or suggest some resources where you can find the answer?", "I'm not sure of the exact answer to that question, but according to the March 2023 jobs report, the unemployment rate for persons age 65 and over was 5.5 percent, down from 5.8 percent in February.", "I'm sorry, I don't know the answer to that question. However, I can provide you with the latest US jobs report from March 2023 which may contain the answer you are looking for.", "I'm sorry, I don't have the information you are looking for. Would you like me to search for other sources on the topic?"],
- "How many CUDA cores does a 4090 have?": ["I'm sorry, I do not know the answer to that question. However, I can provide you with a link to the NVIDIA website where you can find more information about the GeForce RTX 4090 GPU: https://www.nvidia.com/en-us/geforce/graphics-cards/rtx-4090/", "I'm sorry, I don't have the information you're looking for. You may need to consult an expert or search online for the answer."]
+ "Hi there!": [
+ "Hi! How can I assist you today?",
+ "Hello again! How can I help you?",
+ "Hello again! What can I help you with today?",
+ "Hello again! What can I do for you?",
+ "Hello! What can I help you with today?",
+ "Hello again! How can I help you today?",
+ "Hello again! How can I be of assistance?",
+ "Hello there! How can I help you today?",
+ "Hello! How can I assist you today?",
+ "Hey there!",
+ "Hi! How can I help you today?",
+ "Hello! How can I help you today?",
+ "Hello, how can I help you today?",
+ "Hello there! How can I help you?",
+ ],
+ "How many jobs were added in the transportation industry?": [
+ "In March, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000). Employment in transportation and warehousing has shown little net change in recent months.",
+ " In March, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000).",
+ "According to the March 2023 US jobs report, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000).",
+ ],
+ "What was the unemployment rate for senior citizens?": [
+ "I'm sorry, I do not know the answer to that question.",
+ "I'm sorry, I don't know the answer to that question.",
+ "I'm sorry, I don't have the information you're looking for. However, I can help you find the answer if you provide me more information about the context of your question.",
+ "I'm sorry, I don't know the answer to that question. Would you like me to look for more information?",
+ "I'm sorry, I don't seem to have the answer to that question. Would you like me to provide you with additional information on the March 2023 US jobs report or suggest some resources where you can find the answer?",
+ "I'm not sure of the exact answer to that question, but according to the March 2023 jobs report, the unemployment rate for persons age 65 and over was 5.5 percent, down from 5.8 percent in February.",
+ "I'm sorry, I don't know the answer to that question. However, I can provide you with the latest US jobs report from March 2023 which may contain the answer you are looking for.",
+ "I'm sorry, I don't have the information you are looking for. Would you like me to search for other sources on the topic?",
+ ],
+ "How many CUDA cores does a 4090 have?": [
+ "I'm sorry, I do not know the answer to that question. However, I can provide you with a link to the NVIDIA website where you can find more information about the GeForce RTX 4090 GPU: https://www.nvidia.com/en-us/geforce/graphics-cards/rtx-4090/",
+ "I'm sorry, I don't have the information you're looking for. You may need to consult an expert or search online for the answer.",
+ ],
  }
 
- if self.chatter is not None:
- # Process the questions and validate the answers
- for question, expected_answers in messages.items():
- self.logger.info(f"User: {question}")
- # Send the question to chatter
- self.chatter.stdin.write(question + '\n')
- self.chatter.stdin.flush()
-
- # Read the answer from chatter
- output = self.chatter.stdout.readline().strip()
- self.logger.info(f"Bot: {output}")
- # Validate the answer
- if len([answer for answer in expected_answers if answer in output]) > 0:
- assert True
- else:
- assert are_strings_semantically_same(expected_answers[0], output)
+ self.run_test(messages)