Skip to content

Commit

Permalink
Refactored and enabled the QA end-to-end tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
drazvan committed Sep 1, 2023
1 parent d365c40 commit 91eec78
Show file tree
Hide file tree
Showing 11 changed files with 298 additions and 205 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Example for using [custom LLM and multiple KBs](./examples/multi_kb/README.md)
- Support for [`PROMPTS_DIR`](./docs/user_guide/advanced/prompt-customization.md#prompt-configuration).
- [#101](https://github.com/NVIDIA/NeMo-Guardrails/pull/101) Support for [using OpenAI embeddings](./docs/user_guide/configuration-guide.md#the-embeddings-model) models in addition to SentenceTransformers.
- First set of end-to-end QA tests for the example configurations.

### Changed

Expand Down
10 changes: 9 additions & 1 deletion qa/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,17 @@ Please refer to the [installation guide](installation-guide.md) for instructions
2. Change the directory to the `nemoguardrails/qa` folder, and then run all the automated tests:

```bash
> python -m pytest test_*.py
> QA=True python -m pytest test_*.py
```

NOTE: The QA tests are skipped by default because they are expensive (i.e., they make live calls to OpenAI and other services). To enable them, set the `QA` environment variable to any non-empty value (e.g., `QA=True`).

Alternatively, you can also run the tests from the root of the project:

```bash
> QA=True pytest qa
```

3. If there are any failures, analyze the corresponding example test log.

```bash
Expand Down
11 changes: 9 additions & 2 deletions qa/chatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import subprocess
import traceback


EXAMPLES_FOLDER = os.path.join(os.path.dirname(os.path.dirname(__file__)), "examples")


Expand All @@ -30,7 +29,14 @@ def create_chatter(name, configname, logger):
logger.info(f"config: {config}")
try:
command = ["nemoguardrails", "chat", f"--config={config}"]
chatter = subprocess.Popen(command, cwd=cwd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
chatter = subprocess.Popen(
command,
cwd=cwd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
if chatter is not None:
output = chatter.stdout.readline().strip()
logger.info(f"output: {output}")
Expand All @@ -43,6 +49,7 @@ def create_chatter(name, configname, logger):

return chatter


def close_chatter(chatter):
"""Close the given chatter"""
if chatter is not None:
Expand Down
4 changes: 2 additions & 2 deletions qa/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ def create_logger(filename):
logger.setLevel(logging.INFO)

# Create a file handler
file_handler = logging.FileHandler(filename, mode='w')
file_handler = logging.FileHandler(filename, mode="w")

# Configure the formatter and add it to the file handler
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
file_handler.setFormatter(formatter)

# Add the file handler to the logger
Expand Down
69 changes: 34 additions & 35 deletions qa/test_execution_rails.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,52 +12,51 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import pytest

from unittest import TestCase
from .logger import create_logger
from .chatter import create_chatter, close_chatter
from .utils import ExampleConfigChatterTestCase

QA_MODE = os.environ.get("QA")

class TestExecutionRails(TestCase):
logger = None
chatter = None

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
# Create a logger and a chatter
cls.logger = create_logger('execution_rails.log')
cls.chatter = create_chatter("execution_rails", "execution_rails/sample_rails", cls.logger)

@classmethod
def tearDownClass(cls) -> None:
super().tearDownClass()
close_chatter(cls.chatter)
class TestExecutionRails(ExampleConfigChatterTestCase):
example_name = "execution_rails"

@pytest.mark.skipif(not QA_MODE, reason="Not in QA mode.")
@pytest.mark.unit
def test_execution_rails(self):
"""Test the execution_rails example"""
self.logger.info("Running test_execution_rails...")

# Define Q&A messages
messages = {
"Hi there!": ["Hi! How can I assist you today?", "Hello again! How can I help you?", "Hello again! What can I help you with today?", "Hello again! What can I do for you?", "Hello! What can I help you with today?", "Hello again! How can I help you today?", "Hello again! How can I be of assistance?", "Hello there! How can I help you today?", "Hello! How can I assist you today?", "Hey there!", "Hi! How can I help you today?", "Hello! How can I help you today?", "Hello, how can I help you today?", "Hello there! How can I help you?"],
"How can you help?": ["I am an AI assistant that helps answer mathematical questions. My core mathematical skills are powered by wolfram alpha.", "How are you feeling today?"],
"What is 434 + 56*7.5?": ["434 + 56*7.5 is equal to 854.", "The result is 854.", "The result of 434 + 56*7.5 is 854.", "The answer is 854.", "434 + 56 * 7.5 is equal to 854."]
"Hi there!": [
"Hi! How can I assist you today?",
"Hello again! How can I help you?",
"Hello again! What can I help you with today?",
"Hello again! What can I do for you?",
"Hello! What can I help you with today?",
"Hello again! How can I help you today?",
"Hello again! How can I be of assistance?",
"Hello there! How can I help you today?",
"Hello! How can I assist you today?",
"Hey there!",
"Hi! How can I help you today?",
"Hello! How can I help you today?",
"Hello, how can I help you today?",
"Hello there! How can I help you?",
],
"How can you help?": [
"I am an AI assistant that helps answer mathematical questions. My core mathematical skills are powered by wolfram alpha.",
"How are you feeling today?",
],
"What is 434 + 56*7.5?": [
"434 + 56*7.5 is equal to 854.",
"The result is 854.",
"The result of 434 + 56*7.5 is 854.",
"The answer is 854.",
"434 + 56 * 7.5 is equal to 854.",
],
}

if self.chatter is not None:
# Process the questions and validate the answers
for question, expected_answers in messages.items():
self.logger.info(f"User: {question}")
# Send the question to chatter
self.chatter.stdin.write(question + '\n')
self.chatter.stdin.flush()

# Read the answer from chatter
output = self.chatter.stdout.readline().strip()
self.logger.info(f"Bot: {output}")
# Validate the answer
assert len([answer for answer in expected_answers if answer in output]) > 0
self.run_test(messages)
82 changes: 42 additions & 40 deletions qa/test_grounding_rail.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,57 +12,59 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import pytest

from unittest import TestCase
from .logger import create_logger
from .chatter import create_chatter, close_chatter
from .validator import are_strings_semantically_same
from .utils import ExampleConfigChatterTestCase

QA_MODE = os.environ.get("QA")

class TestGroundingRail(TestCase):
logger = None
chatter = None

@classmethod
def setUpClass(cls) -> None:
super().setUpClass()
# Create a logger and a chatter
cls.logger = create_logger('grounding_rail.log')
cls.chatter = create_chatter("grounding_rail", "grounding_rail", cls.logger)

@classmethod
def tearDownClass(cls) -> None:
super().tearDownClass()
close_chatter(cls.chatter)
class TestGroundingRail(ExampleConfigChatterTestCase):
example_name = "grounding_rail"

@pytest.mark.skipif(not QA_MODE, reason="Not in QA mode.")
@pytest.mark.unit
def test_grounding_rail(self):
"""Test the grounding_rail example"""
self.logger.info("Running test_grounding_rail...")

# Define Q&A messages
messages = {
"Hi there!": ["Hi! How can I assist you today?", "Hello again! How can I help you?", "Hello again! What can I help you with today?", "Hello again! What can I do for you?", "Hello! What can I help you with today?", "Hello again! How can I help you today?", "Hello again! How can I be of assistance?", "Hello there! How can I help you today?", "Hello! How can I assist you today?", "Hey there!", "Hi! How can I help you today?", "Hello! How can I help you today?", "Hello, how can I help you today?", "Hello there! How can I help you?"],
"How many jobs were added in the transportation industry?": ["In March, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000). Employment in transportation and warehousing has shown little net change in recent months.", " In March, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000).", "According to the March 2023 US jobs report, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000)."],
"What was the unemployment rate for senior citizens?": ["I'm sorry, I do not know the answer to that question.", "I'm sorry, I don't know the answer to that question.", "I'm sorry, I don't have the information you're looking for. However, I can help you find the answer if you provide me more information about the context of your question.", "I'm sorry, I don't know the answer to that question. Would you like me to look for more information?", "I'm sorry, I don't seem to have the answer to that question. Would you like me to provide you with additional information on the March 2023 US jobs report or suggest some resources where you can find the answer?", "I'm not sure of the exact answer to that question, but according to the March 2023 jobs report, the unemployment rate for persons age 65 and over was 5.5 percent, down from 5.8 percent in February.", "I'm sorry, I don't know the answer to that question. However, I can provide you with the latest US jobs report from March 2023 which may contain the answer you are looking for.", "I'm sorry, I don't have the information you are looking for. Would you like me to search for other sources on the topic?"],
"How many CUDA cores does a 4090 have?": ["I'm sorry, I do not know the answer to that question. However, I can provide you with a link to the NVIDIA website where you can find more information about the GeForce RTX 4090 GPU: https://www.nvidia.com/en-us/geforce/graphics-cards/rtx-4090/", "I'm sorry, I don't have the information you're looking for. You may need to consult an expert or search online for the answer."]
"Hi there!": [
"Hi! How can I assist you today?",
"Hello again! How can I help you?",
"Hello again! What can I help you with today?",
"Hello again! What can I do for you?",
"Hello! What can I help you with today?",
"Hello again! How can I help you today?",
"Hello again! How can I be of assistance?",
"Hello there! How can I help you today?",
"Hello! How can I assist you today?",
"Hey there!",
"Hi! How can I help you today?",
"Hello! How can I help you today?",
"Hello, how can I help you today?",
"Hello there! How can I help you?",
],
"How many jobs were added in the transportation industry?": [
"In March, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000). Employment in transportation and warehousing has shown little net change in recent months.",
" In March, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000).",
"According to the March 2023 US jobs report, employment in transportation and warehousing changed little (+10,000). Couriers and messengers (+7,000) and air transportation (+6,000) added jobs, while warehousing and storage lost jobs (-12,000).",
],
"What was the unemployment rate for senior citizens?": [
"I'm sorry, I do not know the answer to that question.",
"I'm sorry, I don't know the answer to that question.",
"I'm sorry, I don't have the information you're looking for. However, I can help you find the answer if you provide me more information about the context of your question.",
"I'm sorry, I don't know the answer to that question. Would you like me to look for more information?",
"I'm sorry, I don't seem to have the answer to that question. Would you like me to provide you with additional information on the March 2023 US jobs report or suggest some resources where you can find the answer?",
"I'm not sure of the exact answer to that question, but according to the March 2023 jobs report, the unemployment rate for persons age 65 and over was 5.5 percent, down from 5.8 percent in February.",
"I'm sorry, I don't know the answer to that question. However, I can provide you with the latest US jobs report from March 2023 which may contain the answer you are looking for.",
"I'm sorry, I don't have the information you are looking for. Would you like me to search for other sources on the topic?",
],
"How many CUDA cores does a 4090 have?": [
"I'm sorry, I do not know the answer to that question. However, I can provide you with a link to the NVIDIA website where you can find more information about the GeForce RTX 4090 GPU: https://www.nvidia.com/en-us/geforce/graphics-cards/rtx-4090/",
"I'm sorry, I don't have the information you're looking for. You may need to consult an expert or search online for the answer.",
],
}

if self.chatter is not None:
# Process the questions and validate the answers
for question, expected_answers in messages.items():
self.logger.info(f"User: {question}")
# Send the question to chatter
self.chatter.stdin.write(question + '\n')
self.chatter.stdin.flush()

# Read the answer from chatter
output = self.chatter.stdout.readline().strip()
self.logger.info(f"Bot: {output}")
# Validate the answer
if len([answer for answer in expected_answers if answer in output]) > 0:
assert True
else:
assert are_strings_semantically_same(expected_answers[0], output)
self.run_test(messages)
Loading

0 comments on commit 91eec78

Please sign in to comment.