From f8b7045c86d130fb8d384c04c228f0620600c109 Mon Sep 17 00:00:00 2001 From: Kevin Blin Date: Fri, 24 Mar 2023 11:02:59 +0100 Subject: [PATCH 1/4] feat: add moral-exceptQA evals --- evals/registry/data/moral-exceptQA/samples_2.jsonl | 3 +++ evals/registry/data/moral-exceptQA/samples_3.jsonl | 3 +++ evals/registry/evals/moral-exceptQA.yaml | 12 ++++++++++++ 3 files changed, 18 insertions(+) create mode 100644 evals/registry/data/moral-exceptQA/samples_2.jsonl create mode 100644 evals/registry/data/moral-exceptQA/samples_3.jsonl create mode 100644 evals/registry/evals/moral-exceptQA.yaml diff --git a/evals/registry/data/moral-exceptQA/samples_2.jsonl b/evals/registry/data/moral-exceptQA/samples_2.jsonl new file mode 100644 index 0000000000..68fe3014c4 --- /dev/null +++ b/evals/registry/data/moral-exceptQA/samples_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81ebe19e89e9cd7f2eb1be5aa9f4c28f42f8ccbf09ecd26bb248b1c32d224fb1 +size 88223 diff --git a/evals/registry/data/moral-exceptQA/samples_3.jsonl b/evals/registry/data/moral-exceptQA/samples_3.jsonl new file mode 100644 index 0000000000..a44c686b8f --- /dev/null +++ b/evals/registry/data/moral-exceptQA/samples_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68d5aedf8b070fcfe6730113ad89ee027c34b49a5be35cf7569febf1e73531db +size 91183 diff --git a/evals/registry/evals/moral-exceptQA.yaml b/evals/registry/evals/moral-exceptQA.yaml new file mode 100644 index 0000000000..dadebe7f83 --- /dev/null +++ b/evals/registry/evals/moral-exceptQA.yaml @@ -0,0 +1,12 @@ +moral-exceptQA: + id: match_moral_exceptQA.test.v1 + description: Testing the models ability to align with human intuition on when is it acceptable to break an established moral norm. + metrics: [accuracy] +moral-exceptQA.test.v2: + class: evals.elsuite.basic.match:Match + args: + samples_jsonl: moral-exceptQA/samples_2.jsonl +moral-exceptQA.test.v3: + class: evals.elsuite.basic.match:Match + args: + samples_jsonl: moral-exceptQA/samples_3.jsonl From 53b0072c88602f0002239f383ea9b60dd53fbb7f Mon Sep 17 00:00:00 2001 From: Kevin Blin Date: Fri, 31 Mar 2023 15:49:15 +0200 Subject: [PATCH 2/4] fix: improve prompt and remove 3rd choice option --- evals/registry/data/moral-exceptQA/samples.jsonl | 3 +++ evals/registry/data/moral-exceptQA/samples_2.jsonl | 3 --- evals/registry/data/moral-exceptQA/samples_3.jsonl | 3 --- evals/registry/evals/moral-exceptQA.yaml | 10 +++------- 4 files changed, 6 insertions(+), 13 deletions(-) create mode 100644 evals/registry/data/moral-exceptQA/samples.jsonl delete mode 100644 evals/registry/data/moral-exceptQA/samples_2.jsonl delete mode 100644 evals/registry/data/moral-exceptQA/samples_3.jsonl diff --git a/evals/registry/data/moral-exceptQA/samples.jsonl b/evals/registry/data/moral-exceptQA/samples.jsonl new file mode 100644 index 0000000000..64bf4afaf6 --- /dev/null +++ b/evals/registry/data/moral-exceptQA/samples.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd16d05af13b06239f584186b5e72dc21a32760bf06ec22ba274b29b336eba4a +size 87566 diff --git a/evals/registry/data/moral-exceptQA/samples_2.jsonl b/evals/registry/data/moral-exceptQA/samples_2.jsonl deleted file mode 100644 index 68fe3014c4..0000000000 --- a/evals/registry/data/moral-exceptQA/samples_2.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:81ebe19e89e9cd7f2eb1be5aa9f4c28f42f8ccbf09ecd26bb248b1c32d224fb1 -size 88223 diff --git a/evals/registry/data/moral-exceptQA/samples_3.jsonl b/evals/registry/data/moral-exceptQA/samples_3.jsonl deleted file mode 100644 index a44c686b8f..0000000000 --- a/evals/registry/data/moral-exceptQA/samples_3.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:68d5aedf8b070fcfe6730113ad89ee027c34b49a5be35cf7569febf1e73531db -size 91183 diff --git a/evals/registry/evals/moral-exceptQA.yaml b/evals/registry/evals/moral-exceptQA.yaml index dadebe7f83..b03a7f3753 100644 --- a/evals/registry/evals/moral-exceptQA.yaml +++ b/evals/registry/evals/moral-exceptQA.yaml @@ -1,12 +1,8 @@ moral-exceptQA: id: match_moral_exceptQA.test.v1 - description: Testing the models ability to align with human intuition on when is it acceptable to break an established moral norm. + description: This eval tests the models ability to align with human intuition on when is it acceptable to break an established moral norm. metrics: [accuracy] -moral-exceptQA.test.v2: +moral-exceptQA.test.v1: class: evals.elsuite.basic.match:Match args: - samples_jsonl: moral-exceptQA/samples_2.jsonl -moral-exceptQA.test.v3: - class: evals.elsuite.basic.match:Match - args: - samples_jsonl: moral-exceptQA/samples_3.jsonl + samples_jsonl: moral-exceptQA/samples.jsonl From e412c2a26b7772f15e43251ad4b03a235389d552 Mon Sep 17 00:00:00 2001 From: Kevin Blin Date: Sun, 2 Apr 2023 10:31:23 +0200 Subject: [PATCH 3/4] fix: typos in system prompts --- evals/registry/data/moral-exceptQA/samples.jsonl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/registry/data/moral-exceptQA/samples.jsonl b/evals/registry/data/moral-exceptQA/samples.jsonl index 64bf4afaf6..2414afffa3 100644 --- a/evals/registry/data/moral-exceptQA/samples.jsonl +++ b/evals/registry/data/moral-exceptQA/samples.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd16d05af13b06239f584186b5e72dc21a32760bf06ec22ba274b29b336eba4a -size 87566 +oid sha256:ebc93df2ec630632ab52820e6a33aa44350f5307b2f5b38f892b4cd5d34682d4 +size 87418 From fe64c7c5434b9203f142f55952f70fae06874964 Mon Sep 17 00:00:00 2001 From: Kevin Blin Date: Wed, 12 Apr 2023 15:39:35 +0200 Subject: [PATCH 4/4] fix: correctly rename yaml test --- .../data/{moral-exceptQA => moral_exceptQA}/samples.jsonl | 0 .../evals/{moral-exceptQA.yaml => moral_exceptQA.yaml} | 8 ++++---- 2 files changed, 4 insertions(+), 4 deletions(-) rename evals/registry/data/{moral-exceptQA => moral_exceptQA}/samples.jsonl (100%) rename evals/registry/evals/{moral-exceptQA.yaml => moral_exceptQA.yaml} (63%) diff --git a/evals/registry/data/moral-exceptQA/samples.jsonl b/evals/registry/data/moral_exceptQA/samples.jsonl similarity index 100% rename from evals/registry/data/moral-exceptQA/samples.jsonl rename to evals/registry/data/moral_exceptQA/samples.jsonl diff --git a/evals/registry/evals/moral-exceptQA.yaml b/evals/registry/evals/moral_exceptQA.yaml similarity index 63% rename from evals/registry/evals/moral-exceptQA.yaml rename to evals/registry/evals/moral_exceptQA.yaml index b03a7f3753..550016ecd5 100644 --- a/evals/registry/evals/moral-exceptQA.yaml +++ b/evals/registry/evals/moral_exceptQA.yaml @@ -1,8 +1,8 @@ -moral-exceptQA: - id: match_moral_exceptQA.test.v1 +moral_exceptQA: + id: moral_exceptQA.test.v1 description: This eval tests the models ability to align with human intuition on when is it acceptable to break an established moral norm. metrics: [accuracy] -moral-exceptQA.test.v1: +moral_exceptQA.test.v1: class: evals.elsuite.basic.match:Match args: - samples_jsonl: moral-exceptQA/samples.jsonl + samples_jsonl: moral_exceptQA/samples.jsonl