From 7b68f5f6734d7ad9903112d10fcc381e7f85fd54 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Thu, 9 Mar 2023 08:21:39 +0000
Subject: [PATCH 01/36] add boolq_pt template and christykoh as included user

---
 elk/promptsource/templates.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/elk/promptsource/templates.py b/elk/promptsource/templates.py
index 5f8a4f34..f34a1933 100644
--- a/elk/promptsource/templates.py
+++ b/elk/promptsource/templates.py
@@ -24,7 +24,7 @@
 
 # These are users whose datasets should be included in the results returned by
 # filter_english_datasets (regardless of their metadata)
-INCLUDED_USERS = {"Zaid", "craffel", "lauritowal"}
+INCLUDED_USERS = {"Zaid", "craffel", "lauritowal", "christykoh"}
 
 
 def highlight(input):

From edfdd6d5f44e55533c5d67767a11f45ecea52d79 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Tue, 14 Mar 2023 05:17:05 +0000
Subject: [PATCH 02/36] add templates.yaml for boolqpt

---
 .../christykoh/boolq_pt/templates.yaml        | 189 ++++++++++++++++++
 1 file changed, 189 insertions(+)
 create mode 100644 elk/promptsource/templates/christykoh/boolq_pt/templates.yaml

diff --git a/elk/promptsource/templates/christykoh/boolq_pt/templates.yaml b/elk/promptsource/templates/christykoh/boolq_pt/templates.yaml
new file mode 100644
index 00000000..fbf4b5b0
--- /dev/null
+++ b/elk/promptsource/templates/christykoh/boolq_pt/templates.yaml
@@ -0,0 +1,189 @@
+dataset: boolq_pt
+templates:
+  3e386463-1715-4578-9cba-07d11a0d3b61: !Template
+    answer_choices: False ||| True
+    id: 3e386463-1715-4578-9cba-07d11a0d3b61
+    jinja: 'Passagem: {{passage}}
+
+
+      Depois de ler esta passagem, tenho uma pergunta: {{question}}? Verdadeiro ou falso?
+      |||
+
+      {% if label != -1 %}
+
+      {{answer_choices[label]}}
+      
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: after_reading
+    reference: ''
+  492f0f88-4370-46cd-839b-1de37a55aeda: !Template
+    answer_choices: No ||| Yes
+    id: 492f0f88-4370-46cd-839b-1de37a55aeda
+    jinja: "{{ passage }} \nPergunta: {{ question }}\nResposta: ||| \n{% if label !=\
+      \ -1 %}\n{{ answer_choices[label] }}\n{% endif %}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: GPT-3 Style
+    reference: Same as Figure G29, p. 58 of the GPT-3 paper
+  6cb6a026-c070-470a-b75d-bb8fdf424e35: !Template
+    answer_choices: No ||| Yes
+    id: 6cb6a026-c070-470a-b75d-bb8fdf424e35
+    jinja: "{{ passage }}\n\nDepois de ler isso, eu me pergunto {{ question }}? |||\n{% if\
+      \ label != -1 %}\n{{ answer_choices[label] }} \n{% endif %}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: "I wonder\u2026"
+    reference: ''
+  7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5: !Template
+    answer_choices: No ||| Yes
+    id: 7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5
+    jinja: 'Texto: {{passage}}
+
+
+      Responda sim/não à seguinte pergunta: {{question}}? Sim ou não? |||
+
+      {% if label != -1 %}
+
+      {{answer_choices[label]}}
+
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: yes_no_question
+    reference: ''
+  7d21d974-0624-4d4f-9e8c-644e2d009cb5: !Template
+    answer_choices: No ||| Yes
+    id: 7d21d974-0624-4d4f-9e8c-644e2d009cb5
+    jinja: "{{ passage }}\n\nDepois de ler isso, você poderia me dizer {{ question }}? \
+      \ ||| {% if label != -1 %}{{ answer_choices[label] }}\n{% endif %}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: "could you tell me\u2026"
+    reference: ''
+  922d3e87-ac58-4731-84d1-f0a40e47afb5: !Template
+    answer_choices: No ||| Yes
+    id: 922d3e87-ac58-4731-84d1-f0a40e47afb5
+    jinja: "EXAME\n1. Responda sim ou não.\nDocumento: {{passage}}\nPergunta: {{question}}? \
+      \ ||| \n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: exam
+    reference: ''
+  9a1bf459-8047-437c-9def-f21e960429cc: !Template
+    answer_choices: No ||| Yes
+    id: 9a1bf459-8047-437c-9def-f21e960429cc
+    jinja: 'Com base na seguinte passagem, {{ question }}? {{ passage }}
+
+
+      |||
+
+      {% if label != -1 %}
+
+      {{ answer_choices[label] }}
+      
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: based on the following passage
+    reference: "Adapted from Perez et al. 2021 and Schick & Sch\xFCtze 2021."
+  9f4c6b0a-437b-40c0-b467-db4b7218d38d: !Template
+    answer_choices: False ||| True
+    id: 9f4c6b0a-437b-40c0-b467-db4b7218d38d
+    jinja: 'Exercício: leia o texto e responda à questão com Verdadeiro ou Falso.
+
+
+      Texto: {{passage}}
+
+      Pergunta: {{question}}? |||
+
+      {% if label != -1 %}
+
+      {{answer_choices[label]}}
+
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: exercise
+    reference: ''
+  b2b3cb60-d6e3-491c-a09a-8201e13e417e: !Template
+    answer_choices: No ||| Yes
+    id: b2b3cb60-d6e3-491c-a09a-8201e13e417e
+    jinja: '{{ passage }}
+
+      Com base na passagem anterior, {{ question }}? ||| {% if label != -1 %}{{ answer_choices[label]
+      }}
+
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: based on the previous passage
+    reference: "Adapted from Perez et al. 2021 and Schick & Sch\xFCtze 2021."
+  eb78772c-e81e-4b8a-a77b-b75efd1c212a: !Template
+    answer_choices: False ||| True
+    id: eb78772c-e81e-4b8a-a77b-b75efd1c212a
+    jinja: '{{passage}}
+
+
+      P: {{question}}? Verdadeiro ou falso? |||
+
+      {% if label != -1 %}
+
+      {{answer_choices[label]}}
+
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: valid_binary
+    reference: ''

From d2857985e596a567dc5af6244da46f7a245e938b Mon Sep 17 00:00:00 2001
From: Reagan Lee <reaganjlee@gmail.com>
Date: Sun, 26 Mar 2023 07:25:38 +0000
Subject: [PATCH 03/36] pt yaml

---
 .../templates/boolq_pt/templates.yaml         | 189 ++++++++++++++++++
 1 file changed, 189 insertions(+)
 create mode 100644 elk/promptsource/templates/boolq_pt/templates.yaml

diff --git a/elk/promptsource/templates/boolq_pt/templates.yaml b/elk/promptsource/templates/boolq_pt/templates.yaml
new file mode 100644
index 00000000..93528f32
--- /dev/null
+++ b/elk/promptsource/templates/boolq_pt/templates.yaml
@@ -0,0 +1,189 @@
+dataset: boolq_pt
+templates:
+  3e386463-1715-4578-9cba-07d11a0d3b61: !Template
+    answer_choices: False ||| True
+    id: 3e386463-1715-4578-9cba-07d11a0d3b61
+    jinja: 'Passagem: {{passage}}
+
+
+      Depois de ler esta passagem, tenho uma pergunta: {{question}}? Verdadeiro ou falso?
+      |||
+
+      {% if label != -1 %}
+
+      {{answer_choices[label]}}
+
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: after_reading
+    reference: ''
+  492f0f88-4370-46cd-839b-1de37a55aeda: !Template
+    answer_choices: No ||| Yes
+    id: 492f0f88-4370-46cd-839b-1de37a55aeda
+    jinja: "{{ passage }} \nPergunta: {{ question }}\nResposta: ||| \n{% if label !=\
+      \ -1 %}\n{{ answer_choices[label] }}\n{% endif %}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: GPT-3 Style
+    reference: Same as Figure G29, p. 58 of the GPT-3 paper
+  6cb6a026-c070-470a-b75d-bb8fdf424e35: !Template
+    answer_choices: No ||| Yes
+    id: 6cb6a026-c070-470a-b75d-bb8fdf424e35
+    jinja: "{{ passage }}\n\nDepois de ler isso, eu me pergunto {{ question }}? |||\n{% if\
+      \ label != -1 %}\n{{ answer_choices[label] }} \n{% endif %}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: "I wonder\u2026"
+    reference: ''
+  7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5: !Template
+    answer_choices: No ||| Yes
+    id: 7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5
+    jinja: 'Texto: {{passage}}
+
+
+      Responda sim/não à seguinte pergunta: {{question}}? Sim ou não? |||
+
+      {% if label != -1 %}
+
+      {{answer_choices[label]}}
+
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: yes_no_question
+    reference: ''
+  7d21d974-0624-4d4f-9e8c-644e2d009cb5: !Template
+    answer_choices: No ||| Yes
+    id: 7d21d974-0624-4d4f-9e8c-644e2d009cb5
+    jinja: "{{ passage }}\n\nDepois de ler isso, você poderia me dizer {{ question }}? \
+      \ ||| {% if label != -1 %}{{ answer_choices[label] }}\n{% endif %}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: "could you tell me\u2026"
+    reference: ''
+  922d3e87-ac58-4731-84d1-f0a40e47afb5: !Template
+    answer_choices: No ||| Yes
+    id: 922d3e87-ac58-4731-84d1-f0a40e47afb5
+    jinja: "EXAME\n1. Responda sim ou não.\nDocumento: {{passage}}\nPergunta: {{question}}? \
+      \ ||| \n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: exam
+    reference: ''
+  9a1bf459-8047-437c-9def-f21e960429cc: !Template
+    answer_choices: No ||| Yes
+    id: 9a1bf459-8047-437c-9def-f21e960429cc
+    jinja: 'Com base na seguinte passagem, {{ question }}? {{ passage }}
+
+
+      |||
+
+      {% if label != -1 %}
+
+      {{ answer_choices[label] }}
+
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: based on the following passage
+    reference: "Adapted from Perez et al. 2021 and Schick & Sch\xFCtze 2021."
+  9f4c6b0a-437b-40c0-b467-db4b7218d38d: !Template
+    answer_choices: False ||| True
+    id: 9f4c6b0a-437b-40c0-b467-db4b7218d38d
+    jinja: 'Exercício: leia o texto e responda à questão com Verdadeiro ou Falso.
+
+
+      Texto: {{passage}}
+
+      Pergunta: {{question}}? |||
+
+      {% if label != -1 %}
+
+      {{answer_choices[label]}}
+
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: exercise
+    reference: ''
+  b2b3cb60-d6e3-491c-a09a-8201e13e417e: !Template
+    answer_choices: No ||| Yes
+    id: b2b3cb60-d6e3-491c-a09a-8201e13e417e
+    jinja: '{{ passage }}
+
+      Com base na passagem anterior, {{ question }}? ||| {% if label != -1 %}{{ answer_choices[label]
+      }}
+
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: based on the previous passage
+    reference: "Adapted from Perez et al. 2021 and Schick & Sch\xFCtze 2021."
+  eb78772c-e81e-4b8a-a77b-b75efd1c212a: !Template
+    answer_choices: False ||| True
+    id: eb78772c-e81e-4b8a-a77b-b75efd1c212a
+    jinja: '{{passage}}
+
+
+      P: {{question}}? Verdadeiro ou falso? |||
+
+      {% if label != -1 %}
+
+      {{answer_choices[label]}}
+
+      {% endif %}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: valid_binary
+    reference: ''

From 0ff1609b4d7fd7dbd50b825e1e3039eeeaec1465 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 5 Apr 2023 08:08:17 +0000
Subject: [PATCH 04/36] add ag_news template, translated to pt

---
 elk/promptsource/templates.py                 |   2 +-
 .../christykoh/ag_news_pt/templates.yaml      | 215 ++++++++++++++++++
 2 files changed, 216 insertions(+), 1 deletion(-)
 create mode 100644 elk/promptsource/templates/christykoh/ag_news_pt/templates.yaml

diff --git a/elk/promptsource/templates.py b/elk/promptsource/templates.py
index a19526e1..36480b1e 100644
--- a/elk/promptsource/templates.py
+++ b/elk/promptsource/templates.py
@@ -24,7 +24,7 @@
 
 # These are users whose datasets should be included in the results returned by
 # filter_english_datasets (regardless of their metadata)
-INCLUDED_USERS = {"Zaid", "craffel", "lauritowal"}
+INCLUDED_USERS = {"Zaid", "craffel", "lauritowal", "christykoh"}
 
 
 def highlight(input):
diff --git a/elk/promptsource/templates/christykoh/ag_news_pt/templates.yaml b/elk/promptsource/templates/christykoh/ag_news_pt/templates.yaml
new file mode 100644
index 00000000..2dc94749
--- /dev/null
+++ b/elk/promptsource/templates/christykoh/ag_news_pt/templates.yaml
@@ -0,0 +1,215 @@
+dataset: ag_news_pt
+templates:
+  24e44a81-a18a-42dd-a71c-5b31b2d2cb39: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: 24e44a81-a18a-42dd-a71c-5b31b2d2cb39
+    jinja: "Qual rótulo melhor descreve este artigo de notícias?\n{{text}} ||| \n{{answer_choices[label]\
+      \ }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: classify_question_first
+    reference: ''
+  8fdc1056-1029-41a1-9c67-354fc2b8ceaf: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: 8fdc1056-1029-41a1-9c67-354fc2b8ceaf
+    jinja: "Isso é uma notícia sobre {{\"política mundial, esportes, negócios,\
+       \ ou ciência e tecnologia\"}}?\n{{text}} \n||| \n{{answer_choices[label] }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: classify_with_choices_question_first
+    reference: ''
+  918267e0-af68-4117-892d-2dbe66a58ce9: !Template
+    answer_choices: Político ||| Atleta ||| Executivo de negócios ||| Cientista
+    id: 918267e0-af68-4117-892d-2dbe66a58ce9
+    jinja: 'Você recomendaria o seguinte artigo para um {{"político"}}, um {{"atleta"}},
+       um {{"executivo de negócios"}} ou um {{"cientista"}}?
+
+
+      {{ text }}
+
+      |||
+
+      {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: recommend
+    reference: ''
+  9345df33-4f23-4944-a33c-eef94e626862: !Template
+    answer_choices: Notícias do mundo ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: 9345df33-4f23-4944-a33c-eef94e626862
+    jinja: "{{text}} \n\nEm qual das seguintes seções de um jornal este\
+       \ artigo provavelmente apareceria? {{\"Notícias do mundo\"}}, {{\"Esportes\"}}, {{\"Negócios\"\
+       }} ou {{\"Ciência e Tecnologia\"}}? ||| \n{{answer_choices[label] }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: which_section_choices
+    reference: ''
+  98534347-fff7-4c39-a795-4e69a44791f7: !Template
+    answer_choices: Notícias do mundo ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: 98534347-fff7-4c39-a795-4e69a44791f7
+    jinja: "{{text}} \n\nEm qual seção de um jornal este artigo provavelmente\
+       \ apareceria? ||| \n{{answer_choices[label] }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: which_section
+    reference: ''
+  b1d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b1d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: 'Considere o seguinte exemplo: '''''' {{text}} ''''''
+      Opção 1: {{answer_choices[label]}}.
+      Opção 2: {{answer_choices[1 - label]}}. Entre a opção 1 e a opção 2, o tópico deste exemplo é |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_1
+    reference: Burns et al
+  b2d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b2d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: 'Considere o seguinte exemplo: '''''' {{text}} ''''''
+      Opção 1: {{answer_choices[label]}}.
+      Opção 2: {{answer_choices[1 - label]}}. Entre a opção 1 e a opção 2, qual é o tópico deste exemplo? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_2
+    reference: Burns et al
+  b3d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b3d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: 'Considere o seguinte exemplo: '''''' {{text}} ''''''
+      Qual é o tópico deste exemplo, opção 1: {{answer_choices[label]}},
+      ou opção 2: {{answer_choices[1 - label]}}? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_3
+    reference: Burns et al
+  b4d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b4d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: '{{text}}
+    Opção 1: {{answer_choices[label]}}. Opção 2: {{answer_choices[1 - label]}}. Entre a opção 1 e a opção 2, o tópico deste exemplo é |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_4
+    reference: Burns et al
+  b5d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b5d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: '{{text}}
+    Opção 1: {{answer_choices[label]}}. Opção 2: {{answer_choices[1 - label]}}. Entre a opção 1 e a opção 2, qual é o tema deste exemplo? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_5
+    reference: Burns et al
+  b6d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b6d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: '{{text}}
+    Qual é o tópico deste exemplo, escolha 1: {{answer_choices[label]}}, ou escolha 2: {{answer_choices[1 - label]}}? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_6
+    reference: Burns et al
+  b7d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b7d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: '{{text}}
+    Qual rótulo melhor descreve este artigo de notícias, escolha 1: {{answer_choices[label]}}, ou escolha 2: {{answer_choices[1 - label]}}? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_7
+    reference: Burns et al
+  b8d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b8d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: '{{text}}
+    Em qual seção de um jornal esse artigo provavelmente apareceria, escolha 1: {{answer_choices[label]}}, ou escolha 2: {{answer_choices[1 - label]}}? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_8
+    reference: Burns et al
+  b401b0ee-6ffe-4a91-8e15-77ee073cd858: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b401b0ee-6ffe-4a91-8e15-77ee073cd858
+    jinja: "{{text}} \nIsto é uma notícia sobre {{\"política mundial, esportes,\
+       \ negócios ou ciência e tecnologia\"}}? ||| \n{{answer_choices[label] }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: classify_with_choices
+    reference: ''
+  cb355f33-7e8c-4455-a72b-48d315bd4f60: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: cb355f33-7e8c-4455-a72b-48d315bd4f60
+    jinja: "{{text}} \nQual rótulo melhor descreve esta notícia? ||| \n{{answer_choices[label]\
+      \ }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: classify
+    reference: ''

From 7180a643a9cd1acf9b02b9154cf517cf792cc752 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 5 Apr 2023 08:08:17 +0000
Subject: [PATCH 05/36] add ag_news template, translated to pt

---
 .../christykoh/ag_news_pt/templates.yaml      | 215 ++++++++++++++++++
 1 file changed, 215 insertions(+)
 create mode 100644 elk/promptsource/templates/christykoh/ag_news_pt/templates.yaml

diff --git a/elk/promptsource/templates/christykoh/ag_news_pt/templates.yaml b/elk/promptsource/templates/christykoh/ag_news_pt/templates.yaml
new file mode 100644
index 00000000..2dc94749
--- /dev/null
+++ b/elk/promptsource/templates/christykoh/ag_news_pt/templates.yaml
@@ -0,0 +1,215 @@
+dataset: ag_news_pt
+templates:
+  24e44a81-a18a-42dd-a71c-5b31b2d2cb39: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: 24e44a81-a18a-42dd-a71c-5b31b2d2cb39
+    jinja: "Qual rótulo melhor descreve este artigo de notícias?\n{{text}} ||| \n{{answer_choices[label]\
+      \ }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: classify_question_first
+    reference: ''
+  8fdc1056-1029-41a1-9c67-354fc2b8ceaf: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: 8fdc1056-1029-41a1-9c67-354fc2b8ceaf
+    jinja: "Isso é uma notícia sobre {{\"política mundial, esportes, negócios,\
+       \ ou ciência e tecnologia\"}}?\n{{text}} \n||| \n{{answer_choices[label] }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: classify_with_choices_question_first
+    reference: ''
+  918267e0-af68-4117-892d-2dbe66a58ce9: !Template
+    answer_choices: Político ||| Atleta ||| Executivo de negócios ||| Cientista
+    id: 918267e0-af68-4117-892d-2dbe66a58ce9
+    jinja: 'Você recomendaria o seguinte artigo para um {{"político"}}, um {{"atleta"}},
+       um {{"executivo de negócios"}} ou um {{"cientista"}}?
+
+
+      {{ text }}
+
+      |||
+
+      {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: recommend
+    reference: ''
+  9345df33-4f23-4944-a33c-eef94e626862: !Template
+    answer_choices: Notícias do mundo ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: 9345df33-4f23-4944-a33c-eef94e626862
+    jinja: "{{text}} \n\nEm qual das seguintes seções de um jornal este\
+       \ artigo provavelmente apareceria? {{\"Notícias do mundo\"}}, {{\"Esportes\"}}, {{\"Negócios\"\
+       }} ou {{\"Ciência e Tecnologia\"}}? ||| \n{{answer_choices[label] }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: which_section_choices
+    reference: ''
+  98534347-fff7-4c39-a795-4e69a44791f7: !Template
+    answer_choices: Notícias do mundo ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: 98534347-fff7-4c39-a795-4e69a44791f7
+    jinja: "{{text}} \n\nEm qual seção de um jornal este artigo provavelmente\
+       \ apareceria? ||| \n{{answer_choices[label] }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: which_section
+    reference: ''
+  b1d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b1d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: 'Considere o seguinte exemplo: '''''' {{text}} ''''''
+      Opção 1: {{answer_choices[label]}}.
+      Opção 2: {{answer_choices[1 - label]}}. Entre a opção 1 e a opção 2, o tópico deste exemplo é |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_1
+    reference: Burns et al
+  b2d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b2d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: 'Considere o seguinte exemplo: '''''' {{text}} ''''''
+      Opção 1: {{answer_choices[label]}}.
+      Opção 2: {{answer_choices[1 - label]}}. Entre a opção 1 e a opção 2, qual é o tópico deste exemplo? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_2
+    reference: Burns et al
+  b3d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b3d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: 'Considere o seguinte exemplo: '''''' {{text}} ''''''
+      Qual é o tópico deste exemplo, opção 1: {{answer_choices[label]}},
+      ou opção 2: {{answer_choices[1 - label]}}? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_3
+    reference: Burns et al
+  b4d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b4d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: '{{text}}
+    Opção 1: {{answer_choices[label]}}. Opção 2: {{answer_choices[1 - label]}}. Entre a opção 1 e a opção 2, o tópico deste exemplo é |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_4
+    reference: Burns et al
+  b5d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b5d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: '{{text}}
+    Opção 1: {{answer_choices[label]}}. Opção 2: {{answer_choices[1 - label]}}. Entre a opção 1 e a opção 2, qual é o tema deste exemplo? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_5
+    reference: Burns et al
+  b6d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b6d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: '{{text}}
+    Qual é o tópico deste exemplo, escolha 1: {{answer_choices[label]}}, ou escolha 2: {{answer_choices[1 - label]}}? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_6
+    reference: Burns et al
+  b7d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b7d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: '{{text}}
+    Qual rótulo melhor descreve este artigo de notícias, escolha 1: {{answer_choices[label]}}, ou escolha 2: {{answer_choices[1 - label]}}? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_7
+    reference: Burns et al
+  b8d49782-dfd9-41a5-87a6-8fceab9c2198: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b8d49782-dfd9-41a5-87a6-8fceab9c2198
+    jinja: '{{text}}
+    Em qual seção de um jornal esse artigo provavelmente apareceria, escolha 1: {{answer_choices[label]}}, ou escolha 2: {{answer_choices[1 - label]}}? |||  {{answer_choices[label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics: []
+      original_task: true
+    name: burns_8
+    reference: Burns et al
+  b401b0ee-6ffe-4a91-8e15-77ee073cd858: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: b401b0ee-6ffe-4a91-8e15-77ee073cd858
+    jinja: "{{text}} \nIsto é uma notícia sobre {{\"política mundial, esportes,\
+       \ negócios ou ciência e tecnologia\"}}? ||| \n{{answer_choices[label] }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: classify_with_choices
+    reference: ''
+  cb355f33-7e8c-4455-a72b-48d315bd4f60: !Template
+    answer_choices: Política mundial ||| Esportes ||| Negócios ||| Ciência e Tecnologia
+    id: cb355f33-7e8c-4455-a72b-48d315bd4f60
+    jinja: "{{text}} \nQual rótulo melhor descreve esta notícia? ||| \n{{answer_choices[label]\
+      \ }}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: classify
+    reference: ''

From b8f5e8b58f70524e27fc6faa8fe76dbb174682c1 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Fri, 7 Apr 2023 07:38:44 +0000
Subject: [PATCH 06/36] save eval runs to separate subfolders by target dataset

---
 elk/evaluation/evaluate.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/elk/evaluation/evaluate.py b/elk/evaluation/evaluate.py
index 17aa073c..5f2fd1d2 100644
--- a/elk/evaluation/evaluate.py
+++ b/elk/evaluation/evaluate.py
@@ -39,13 +39,16 @@ class Eval(Serializable):
     normalization: Literal["legacy", "none", "elementwise", "meanonly"] = "meanonly"
 
     debug: bool = False
-    out_dir: Optional[Path] = None
+    out_dir_suffix: Optional[Path] = None # custom name for subdir in transfer_eval folder
     num_gpus: int = -1
 
     concatenated_layer_offset: int = 0
 
     def execute(self):
-        transfer_eval = elk_reporter_dir() / self.source / "transfer_eval"
+        if self.out_dir_suffix == None:
+            self.out_dir_suffix = '-'.join(self.data.prompts.datasets).replace(' ', '_')
+
+        transfer_eval = elk_reporter_dir() / self.source / "transfer_eval" / self.out_dir_suffix
 
         run = Evaluate(cfg=self, out_dir=transfer_eval)
         run.evaluate()

From 51fed5d5046b30cbae22f0e4425315547b2741a0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 7 Apr 2023 07:41:08 +0000
Subject: [PATCH 07/36] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 elk/evaluation/evaluate.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/elk/evaluation/evaluate.py b/elk/evaluation/evaluate.py
index 5f2fd1d2..6daa7bd8 100644
--- a/elk/evaluation/evaluate.py
+++ b/elk/evaluation/evaluate.py
@@ -39,16 +39,20 @@ class Eval(Serializable):
     normalization: Literal["legacy", "none", "elementwise", "meanonly"] = "meanonly"
 
     debug: bool = False
-    out_dir_suffix: Optional[Path] = None # custom name for subdir in transfer_eval folder
+    out_dir_suffix: Optional[
+        Path
+    ] = None  # custom name for subdir in transfer_eval folder
     num_gpus: int = -1
 
     concatenated_layer_offset: int = 0
 
     def execute(self):
-        if self.out_dir_suffix == None:
-            self.out_dir_suffix = '-'.join(self.data.prompts.datasets).replace(' ', '_')
+        if self.out_dir_suffix is None:
+            self.out_dir_suffix = "-".join(self.data.prompts.datasets).replace(" ", "_")
 
-        transfer_eval = elk_reporter_dir() / self.source / "transfer_eval" / self.out_dir_suffix
+        transfer_eval = (
+            elk_reporter_dir() / self.source / "transfer_eval" / self.out_dir_suffix
+        )
 
         run = Evaluate(cfg=self, out_dir=transfer_eval)
         run.evaluate()

From 266084463343ac11b8e956ad0910685ae0455a40 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Fri, 7 Apr 2023 09:14:00 +0000
Subject: [PATCH 08/36] eval multiple datasets

---
 elk/evaluation/evaluate.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/elk/evaluation/evaluate.py b/elk/evaluation/evaluate.py
index 5f2fd1d2..51d6b838 100644
--- a/elk/evaluation/evaluate.py
+++ b/elk/evaluation/evaluate.py
@@ -39,19 +39,19 @@ class Eval(Serializable):
     normalization: Literal["legacy", "none", "elementwise", "meanonly"] = "meanonly"
 
     debug: bool = False
-    out_dir_suffix: Optional[Path] = None # custom name for subdir in transfer_eval folder
+    out_dir: Optional[Path] = None
     num_gpus: int = -1
 
     concatenated_layer_offset: int = 0
 
     def execute(self):
-        if self.out_dir_suffix == None:
-            self.out_dir_suffix = '-'.join(self.data.prompts.datasets).replace(' ', '_')
+        datasets = self.data.prompts.datasets
 
-        transfer_eval = elk_reporter_dir() / self.source / "transfer_eval" / self.out_dir_suffix
+        transfer_dir = elk_reporter_dir() / self.source / "transfer_eval"
 
-        run = Evaluate(cfg=self, out_dir=transfer_eval)
-        run.evaluate()
+        for dataset in datasets:
+            run = Evaluate(cfg=self, out_dir=transfer_dir / dataset)
+            run.evaluate()
 
 
 @dataclass

From b2302099165c8b82a1239b2b02da6e2d6c48b42a Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Fri, 7 Apr 2023 17:40:19 +0000
Subject: [PATCH 09/36] change prompt answer choices to Portuguese

---
 .../templates/christykoh/boolq_pt/templates.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/elk/promptsource/templates/christykoh/boolq_pt/templates.yaml b/elk/promptsource/templates/christykoh/boolq_pt/templates.yaml
index fbf4b5b0..1120d3cd 100644
--- a/elk/promptsource/templates/christykoh/boolq_pt/templates.yaml
+++ b/elk/promptsource/templates/christykoh/boolq_pt/templates.yaml
@@ -1,7 +1,7 @@
 dataset: boolq_pt
 templates:
   3e386463-1715-4578-9cba-07d11a0d3b61: !Template
-    answer_choices: False ||| True
+    answer_choices: Falso ||| Verdadeiro
     id: 3e386463-1715-4578-9cba-07d11a0d3b61
     jinja: 'Passagem: {{passage}}
 
@@ -24,7 +24,7 @@ templates:
     name: after_reading
     reference: ''
   492f0f88-4370-46cd-839b-1de37a55aeda: !Template
-    answer_choices: No ||| Yes
+    answer_choices: Não ||| Sim
     id: 492f0f88-4370-46cd-839b-1de37a55aeda
     jinja: "{{ passage }} \nPergunta: {{ question }}\nResposta: ||| \n{% if label !=\
       \ -1 %}\n{{ answer_choices[label] }}\n{% endif %}"
@@ -38,7 +38,7 @@ templates:
     name: GPT-3 Style
     reference: Same as Figure G29, p. 58 of the GPT-3 paper
   6cb6a026-c070-470a-b75d-bb8fdf424e35: !Template
-    answer_choices: No ||| Yes
+    answer_choices: Não ||| Sim
     id: 6cb6a026-c070-470a-b75d-bb8fdf424e35
     jinja: "{{ passage }}\n\nDepois de ler isso, eu me pergunto {{ question }}? |||\n{% if\
       \ label != -1 %}\n{{ answer_choices[label] }} \n{% endif %}"
@@ -52,7 +52,7 @@ templates:
     name: "I wonder\u2026"
     reference: ''
   7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5: !Template
-    answer_choices: No ||| Yes
+    answer_choices: Não ||| Sim
     id: 7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5
     jinja: 'Texto: {{passage}}
 
@@ -74,7 +74,7 @@ templates:
     name: yes_no_question
     reference: ''
   7d21d974-0624-4d4f-9e8c-644e2d009cb5: !Template
-    answer_choices: No ||| Yes
+    answer_choices: Não ||| Sim
     id: 7d21d974-0624-4d4f-9e8c-644e2d009cb5
     jinja: "{{ passage }}\n\nDepois de ler isso, você poderia me dizer {{ question }}? \
       \ ||| {% if label != -1 %}{{ answer_choices[label] }}\n{% endif %}"
@@ -88,7 +88,7 @@ templates:
     name: "could you tell me\u2026"
     reference: ''
   922d3e87-ac58-4731-84d1-f0a40e47afb5: !Template
-    answer_choices: No ||| Yes
+    answer_choices: Não ||| Sim
     id: 922d3e87-ac58-4731-84d1-f0a40e47afb5
     jinja: "EXAME\n1. Responda sim ou não.\nDocumento: {{passage}}\nPergunta: {{question}}? \
       \ ||| \n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}"
@@ -102,7 +102,7 @@ templates:
     name: exam
     reference: ''
   9a1bf459-8047-437c-9def-f21e960429cc: !Template
-    answer_choices: No ||| Yes
+    answer_choices: Não ||| Sim
     id: 9a1bf459-8047-437c-9def-f21e960429cc
     jinja: 'Com base na seguinte passagem, {{ question }}? {{ passage }}
 
@@ -148,7 +148,7 @@ templates:
     name: exercise
     reference: ''
   b2b3cb60-d6e3-491c-a09a-8201e13e417e: !Template
-    answer_choices: No ||| Yes
+    answer_choices: Não ||| Sim
     id: b2b3cb60-d6e3-491c-a09a-8201e13e417e
     jinja: '{{ passage }}
 

From 74c9915bb31c4e353941b71011062168552baabe Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Tue, 11 Apr 2023 21:31:55 +0000
Subject: [PATCH 10/36] add imdb_pt template

---
 .../christykoh/imdb_pt/templates.yaml         | 219 ++++++++++++++++++
 1 file changed, 219 insertions(+)
 create mode 100644 elk/promptsource/templates/christykoh/imdb_pt/templates.yaml

diff --git a/elk/promptsource/templates/christykoh/imdb_pt/templates.yaml b/elk/promptsource/templates/christykoh/imdb_pt/templates.yaml
new file mode 100644
index 00000000..8fba1f39
--- /dev/null
+++ b/elk/promptsource/templates/christykoh/imdb_pt/templates.yaml
@@ -0,0 +1,219 @@
+dataset: imdb_pt
+templates:
+  02ff2949-0f45-4d97-941e-6fa4c0afbc2d: !Template
+    answer_choices: negativo ||| positivo
+    id: 02ff2949-0f45-4d97-941e-6fa4c0afbc2d
+    jinja: 'A crítica de filme a seguir expressa que sentimento? {{text}}
+
+
+      ||| {{ answer_choices
+      [label] }}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Movie Expressed Sentiment 2
+    reference: ''
+  2351d12a-e630-4d19-8b41-e199266e38f7: !Template
+    answer_choices: ruim ||| bom
+    id: 2351d12a-e630-4d19-8b41-e199266e38f7
+    jinja: '{{text}}
+
+
+      O crítico achou este filme {{"bom ou mal"}}? ||| {{ answer_choices
+      [label] }}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Reviewer Opinion bad good choices
+    reference: ''
+  5f372fb1-795a-47b6-8ddf-c4fd1579e76a: !Template
+    answer_choices: negativo ||| positivo
+    id: 5f372fb1-795a-47b6-8ddf-c4fd1579e76a
+    jinja: "{{text}} \nEsta avaliação é {{\"positivo ou negativo\"}}? ||| \n{{answer_choices[label]}}"
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: 'Sentiment with choices '
+    reference: ''
+  866474a5-1498-46b7-bfee-ac0c5160707f: !Template
+    answer_choices: negativo ||| positivo
+    id: 866474a5-1498-46b7-bfee-ac0c5160707f
+    jinja: '{{text}}
+
+
+      Como o espectador se sente sobre o filme?
+
+
+      ||| {{ answer_choices
+      [label] }}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Reviewer Sentiment Feeling
+    reference: ''
+  96538f30-f2c1-430e-8fc6-936a16966d9c: !Template
+    answer_choices: negativo ||| positivo
+    id: 96538f30-f2c1-430e-8fc6-936a16966d9c
+    jinja: '{{text}} Que sentimento o escritor expressa pelo filme? ||| {{
+      answer_choices [label] }}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Writer Expressed Sentiment
+    reference: ''
+  af51297c-38a3-4d6c-a8b5-04b1243d7443: !Template
+    answer_choices: negativo ||| positivo
+    id: af51297c-38a3-4d6c-a8b5-04b1243d7443
+    jinja: '{{text}}
+
+      O sentimento expresso pelo filme é ||| {{ answer_choices
+      [label] }}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Movie Expressed Sentiment
+    reference: ''
+  b93b74ac-fe95-40b4-9610-318b46ab820f: !Template
+    answer_choices: negativo ||| positivo
+    id: b93b74ac-fe95-40b4-9610-318b46ab820f
+    jinja: '{{text}}
+
+
+      Qual é o sentimento expresso neste texto?
+
+
+      ||| {{ answer_choices
+      [label] }}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Text Expressed Sentiment
+    reference: ''
+  b9b5d79d-f0b3-4bec-a724-f585db3e93ff: !Template
+    answer_choices: negativo ||| positivo
+    id: b9b5d79d-f0b3-4bec-a724-f585db3e93ff
+    jinja: '{{text}}
+
+
+    Isso definitivamente não é um ||| {{ answer_choices [1-label]}} avaliação.'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: false
+    name: Negation template for positive and negative
+    reference: ''
+  bd82ba0f-01d4-4fa1-bf8d-07e392c00cd9: !Template
+    answer_choices: Não ||| Sim
+    id: bd82ba0f-01d4-4fa1-bf8d-07e392c00cd9
+    jinja: '{{text}}
+
+      O crítico gostou do filme? ||| {{ answer_choices [label]}}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Reviewer Enjoyment Yes No
+    reference: ''
+  c70d1687-2421-49a2-9553-91b8bac4cfbe: !Template
+    answer_choices: negativo ||| positivo
+    id: c70d1687-2421-49a2-9553-91b8bac4cfbe
+    jinja: '{{text}}
+
+      Qual é o sentimento expresso pelo crítico para o filme?
+
+      ||| {{ answer_choices [label] }}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Reviewer Expressed Sentiment
+    reference: ''
+  dacb5f03-dc80-428c-b707-8574436675c9: !Template
+    answer_choices: 0 ||| 1
+    id: dacb5f03-dc80-428c-b707-8574436675c9
+    jinja: 'Considere o seguinte exemplo:  '''''' {{text}} ''''''
+
+
+      Entre {{ answer_choices[0] }} e {{answer_choices[1] }}, que é o sentimento
+      deste exemplo?
+
+      ||| {{ answer_choices[label] }}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: burns_2
+    reference: ''
+  e01970ab-42c0-4e6e-a08f-4940d889ef37: !Template
+    answer_choices: Eles não gostaram! ||| Eles adoraram
+    id: e01970ab-42c0-4e6e-a08f-4940d889ef37
+    jinja: '{{text}}
+
+      Como o crítico se sente sobre o filme? |||
+
+      {{ answer_choices[label] }}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: false
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: Reviewer Enjoyment
+    reference: ''
+  eb791ab2-d2b4-4be6-a569-64086983abee: !Template
+    answer_choices: 0 ||| 1
+    id: eb791ab2-d2b4-4be6-a569-64086983abee
+    jinja: 'Considere o seguinte exemplo:  '''''' {{text}} ''''''
+
+      Entre {{ answer_choices[0] }} e {{answer_choices[1] }}, o sentimento de
+      este exemplo é ||| {{ answer_choices[label] }}'
+    metadata: !TemplateMetadata
+      choices_in_prompt: true
+      languages:
+      - pt
+      metrics:
+      - Accuracy
+      original_task: true
+    name: burns_1
+    reference: ''

From 85fd9e40b56ff6ba4b103d48c6962c0b24b12b6b Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Tue, 11 Apr 2023 21:36:00 +0000
Subject: [PATCH 11/36] implement prompt sharing, generate combined
 templates.yaml

---
 elk/extraction/extraction.py     |  2 ++
 elk/extraction/prompt_loading.py | 31 +++++++++++++++++++++++++++++--
 elk/promptsource/templates.py    | 10 ++++++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index 187428fc..3285a491 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -54,6 +54,7 @@ class Extract(Serializable):
     token_loc: Literal["first", "last", "mean"] = "last"
     min_gpu_mem: Optional[int] = None
     num_gpus: int = -1
+    combined_prompter_path: Optional[str] = None # if template file does not exist, combine from datasets and save to this path
 
     def __post_init__(self, layer_stride: int):
         if self.layers and layer_stride > 1:
@@ -98,6 +99,7 @@ def extract_hiddens(
         stream=cfg.prompts.stream,
         rank=rank,
         world_size=world_size,
+        combined_prompter_path=cfg.combined_prompter_path
     )  # this dataset is already sharded, but hasn't been truncated to max_examples
 
     # AutoModel should do the right thing here in nearly all cases. We don't actually
diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index a494be5a..e1322f73 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -1,3 +1,4 @@
+from os.path import exists
 from dataclasses import dataclass
 from random import Random
 from typing import Any, Iterator, Literal, Optional
@@ -40,7 +41,7 @@ class PromptConfig(Serializable):
         num_shots: The number of examples to use in few-shot prompts. If zero, prompts
             are zero-shot. Defaults to 0.
         num_variants: The number of prompt templates to apply to each predicate upon
-            call to __getitem__. Use -1 to apply all available templates. Defaults to 1.
+            call to __getitem__. Use -1 to apply all available templates. Defaults to -1.
         seed: The seed to use for prompt randomization. Defaults to 42.
         stream: Whether to stream the dataset from the Internet. Defaults to False.
     """
@@ -78,6 +79,7 @@ def load_prompts(
     stream: bool = False,
     rank: int = 0,
     world_size: int = 1,
+    combined_prompter_path: str = ""
 ) -> Iterator[dict]:
     """Load a dataset full of prompts generated from the specified datasets.
 
@@ -100,10 +102,24 @@ def load_prompts(
     train_datasets = []
     rng = Random(seed)
 
+    # If combined template is not empty and does not exist as a file yet, need to aggregate
+    # Init/create a new file for combining templates
+    combined_prompter = None
+    if combined_prompter_path:
+        print("Combining templates into shared prompter.")
+        combined_prompter = DatasetTemplates("combined_templates", combined_prompter_path)
+    # should_aggregate_templates = (combined_prompter and not exists(combined_prompter.yaml_path))
+    # print("should aggregate: ", should_aggregate_templates)
+
     # First load the datasets and prompters. We need to know the minimum number of
     # templates for any dataset in order to make sure we don't run out of prompts.
     for ds_string in dataset_strings:
         ds_name, _, config_name = ds_string.partition(" ")
+        prompter = DatasetTemplates(ds_name, config_name)
+        # Populate combined prompter with templates from different datasets
+        # if should_aggregate_templates:
+        combined_prompter.templates.update(prompter.get_templates_with_new_uuids())
+        print("len of prompter templates is ", len(combined_prompter.templates))
         prompters.append(DatasetTemplates(ds_name, config_name))
 
         ds_dict = assert_type(
@@ -136,11 +152,22 @@ def load_prompts(
         train_datasets.append(train_ds)
 
     min_num_templates = min(len(prompter.templates) for prompter in prompters)
+    # if should_aggregate_templates:
+    
+    if combined_prompter:
+        # save combined templates to yaml file
+        print("saving aggregate templates")
+        combined_prompter.sync_mapping()
+        combined_prompter.write_to_file()
+        min_num_templates = len(combined_prompter.templates)
+        print("length of combined_prompter templates is ", min_num_templates)
+
     num_variants = (
         min_num_templates
         if num_variants == -1
         else min(num_variants, min_num_templates)
     )
+    print()
     assert num_variants > 0
     if rank == 0:
         print(f"Using {num_variants} variants of each prompt")
@@ -179,7 +206,7 @@ def load_prompts(
                 label_column=label_column,
                 num_classes=num_classes,
                 num_variants=num_variants,
-                prompter=prompter,
+                prompter=prompter if not combined_prompter else combined_prompter,
                 rng=rng,
                 fewshot_iter=fewshot_iter,
             )
diff --git a/elk/promptsource/templates.py b/elk/promptsource/templates.py
index ea4e9196..68855cf6 100644
--- a/elk/promptsource/templates.py
+++ b/elk/promptsource/templates.py
@@ -543,6 +543,16 @@ def delete_folder(self) -> None:
             if len(os.listdir(base_folder)) == 0:
                 rmtree(base_folder)
 
+    def get_templates_with_new_uuids(self) -> dict:
+        """
+        Generate new uuids for templates, used when merging template datasets.
+        """
+        new_templates = {}
+        for template in self.templates.values():
+            template.id = str(uuid.uuid4())
+            new_templates[template.id] = template
+        return new_templates
+
     def __getitem__(self, template_key: str) -> "Template":
         return self.templates[self.name_to_id_mapping[template_key]]
 

From 8383f26d922c83be45ffa54f39faea49b933726b Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 06:25:57 +0000
Subject: [PATCH 12/36] fix num templates logic

---
 elk/extraction/prompt_loading.py                  | 15 ++++++++-------
 .../templates/christykoh/imdb_pt/templates.yaml   |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index e1322f73..8af7cf67 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -118,9 +118,10 @@ def load_prompts(
         prompter = DatasetTemplates(ds_name, config_name)
         # Populate combined prompter with templates from different datasets
         # if should_aggregate_templates:
-        combined_prompter.templates.update(prompter.get_templates_with_new_uuids())
-        print("len of prompter templates is ", len(combined_prompter.templates))
-        prompters.append(DatasetTemplates(ds_name, config_name))
+        if combined_prompter:
+            combined_prompter.templates.update(prompter.get_templates_with_new_uuids())
+            print("len of prompter templates is ", len(combined_prompter.templates))
+            prompters.append(DatasetTemplates(ds_name, config_name))
 
         ds_dict = assert_type(
             dict, load_dataset(ds_name, config_name or None, streaming=stream)
@@ -150,17 +151,17 @@ def load_prompts(
 
         raw_datasets.append(split)
         train_datasets.append(train_ds)
-
-    min_num_templates = min(len(prompter.templates) for prompter in prompters)
-    # if should_aggregate_templates:
     
-    if combined_prompter:
+    min_num_templates = -1
+    if combined_prompter != None:
         # save combined templates to yaml file
         print("saving aggregate templates")
         combined_prompter.sync_mapping()
         combined_prompter.write_to_file()
         min_num_templates = len(combined_prompter.templates)
         print("length of combined_prompter templates is ", min_num_templates)
+    else: 
+        min_num_templates = min(len(prompter.templates) for prompter in prompters)
 
     num_variants = (
         min_num_templates
diff --git a/elk/promptsource/templates/christykoh/imdb_pt/templates.yaml b/elk/promptsource/templates/christykoh/imdb_pt/templates.yaml
index 8fba1f39..bc18de77 100644
--- a/elk/promptsource/templates/christykoh/imdb_pt/templates.yaml
+++ b/elk/promptsource/templates/christykoh/imdb_pt/templates.yaml
@@ -18,7 +18,7 @@ templates:
     name: Movie Expressed Sentiment 2
     reference: ''
   2351d12a-e630-4d19-8b41-e199266e38f7: !Template
-    answer_choices: ruim ||| bom
+    answer_choices: mal ||| bom
     id: 2351d12a-e630-4d19-8b41-e199266e38f7
     jinja: '{{text}}
 

From df41ab49b4b4499504261cd7b6c7ada916e12c2b Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 06:33:00 +0000
Subject: [PATCH 13/36] fix pt answer choice

---
 elk/promptsource/templates/christykoh/imdb_pt/templates.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/elk/promptsource/templates/christykoh/imdb_pt/templates.yaml b/elk/promptsource/templates/christykoh/imdb_pt/templates.yaml
index 8fba1f39..bc18de77 100644
--- a/elk/promptsource/templates/christykoh/imdb_pt/templates.yaml
+++ b/elk/promptsource/templates/christykoh/imdb_pt/templates.yaml
@@ -18,7 +18,7 @@ templates:
     name: Movie Expressed Sentiment 2
     reference: ''
   2351d12a-e630-4d19-8b41-e199266e38f7: !Template
-    answer_choices: ruim ||| bom
+    answer_choices: mal ||| bom
     id: 2351d12a-e630-4d19-8b41-e199266e38f7
     jinja: '{{text}}
 

From a132f4011b095f219bc50700540b3064658ea786 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 12 Apr 2023 06:43:27 +0000
Subject: [PATCH 14/36] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 elk/promptsource/templates/christykoh/boolq_pt/templates.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/elk/promptsource/templates/christykoh/boolq_pt/templates.yaml b/elk/promptsource/templates/christykoh/boolq_pt/templates.yaml
index 1120d3cd..a83c4d95 100644
--- a/elk/promptsource/templates/christykoh/boolq_pt/templates.yaml
+++ b/elk/promptsource/templates/christykoh/boolq_pt/templates.yaml
@@ -12,7 +12,7 @@ templates:
       {% if label != -1 %}
 
       {{answer_choices[label]}}
-      
+
       {% endif %}'
     metadata: !TemplateMetadata
       choices_in_prompt: true
@@ -112,7 +112,7 @@ templates:
       {% if label != -1 %}
 
       {{ answer_choices[label] }}
-      
+
       {% endif %}'
     metadata: !TemplateMetadata
       choices_in_prompt: false

From c71bf1ce6862ca217f5e63c6f7d1685e8b6c974a Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 06:47:15 +0000
Subject: [PATCH 15/36] remove empty prompt_dataset file

---
 elk/extraction/prompt_dataset.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 elk/extraction/prompt_dataset.py

diff --git a/elk/extraction/prompt_dataset.py b/elk/extraction/prompt_dataset.py
deleted file mode 100644
index e69de29b..00000000

From 1ec5787ab12a6b68b83d909c6385ce7b466b86cb Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 07:01:16 +0000
Subject: [PATCH 16/36] fix empty prompters bug

---
 elk/extraction/prompt_loading.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index 77be12b1..bfbf905b 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -118,7 +118,7 @@ def load_prompts(
         if combined_prompter:
             combined_prompter.templates.update(prompter.get_templates_with_new_uuids())
             print("len of prompter templates is ", len(combined_prompter.templates))
-            prompters.append(DatasetTemplates(ds_name, config_name))
+        prompters.append(DatasetTemplates(ds_name, config_name))
 
         ds_dict = assert_type(
             dict, load_dataset(ds_name, config_name or None, streaming=stream)
@@ -150,7 +150,7 @@ def load_prompts(
         train_datasets.append(train_ds)
     
     min_num_templates = -1
-    if combined_prompter != None:
+    if combined_prompter:
         # save combined templates to yaml file
         print("saving aggregate templates")
         combined_prompter.sync_mapping()

From 66c7a6b82ca8fd7c15a59d78fc1aa75b3eaade34 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 10:00:31 +0000
Subject: [PATCH 17/36] fix multiclass label bug

---
 elk/extraction/prompt_loading.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index c46d3d7b..3d0d6587 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -259,7 +259,7 @@ def qa_cat(q: str, a: str) -> str:
         raise ValueError(f'Prompt duplicated {dup_count} times! "{maybe_dup}"')
 
     return dict(
-        label=label,
+        label=new_label,
         prompts=prompts,
         template_names=prompter.all_template_names,
     )

From d91acac50a6eff252a9caede0d282a60f92b85af Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 10:33:33 +0000
Subject: [PATCH 18/36] move prompt combination to PromptConfig post_init logic

---
 elk/extraction/extraction.py     |  4 +++
 elk/extraction/prompt_loading.py | 50 +++++++++++++++-----------------
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index 4b6cd3a9..c17d080f 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -270,8 +270,12 @@ def get_splits() -> SplitDict:
     model_cfg = AutoConfig.from_pretrained(cfg.model)
     num_variants = cfg.prompts.num_variants
 
+    # if combined prompter flag is set, combine prompt templates
+
+    # extraneous, remove ?
     ds_name, _, config_name = cfg.prompts.datasets[0].partition(" ")
     info = get_dataset_config_info(ds_name, config_name or None)
+    # ? end
 
     layer_cols = {
         f"hidden_{layer}": Array3D(
diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index bfbf905b..332d5e7d 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -26,8 +26,8 @@
 class PromptConfig(Serializable):
     """
     Args:
-        dataset: Space-delimited name of the HuggingFace dataset to use, e.g.
-            `"super_glue boolq"` or `"imdb"`.
+        datasets: List of space-delimited names of the HuggingFace datasets to use, e.g.
+            [`"super_glue boolq", "imdb"]`.
         balance: Whether to force class balance in the dataset using undersampling.
         data_dir: The directory to use for caching the dataset. Defaults to
             `~/.cache/huggingface/datasets`.
@@ -44,6 +44,9 @@ class PromptConfig(Serializable):
             call to __getitem__. Use -1 to apply all available templates. Defaults to -1.
         seed: The seed to use for prompt randomization. Defaults to 42.
         stream: Whether to stream the dataset from the Internet. Defaults to False.
+        combined_prompter_path: Path to save a combined template file to, when testing
+            prompt invariance across multiple datasets, and will be interpreted as a subpath
+            of `combined_paths` in the promptsource templates dir. Defaults to empty string.
     """
 
     datasets: list[str] = field(positional=True)
@@ -55,6 +58,7 @@ class PromptConfig(Serializable):
     num_variants: int = -1
     seed: int = 42
     stream: bool = False
+    combined_prompter_path: str = ""
 
     def __post_init__(self):
         if len(self.max_examples) > 2:
@@ -69,6 +73,21 @@ def __post_init__(self):
         if len(self.max_examples) == 1:
             self.max_examples *= 2
 
+        # Combining prompts
+        if self.combined_prompter_path:
+            print("Copying templates across datasets to combined_templates/ " +
+                f"{self.combined_prompter_path}/templates.yaml")
+            combined_prompter = DatasetTemplates("combined_templates", self.combined_prompter_path)
+            for ds_string in self.datasets:
+                ds_name, _, config_name = ds_string.partition(" ")
+                prompter = DatasetTemplates(ds_name, config_name)
+                combined_prompter.templates.update(prompter.get_templates_with_new_uuids())
+                print("len of prompter templates is ", len(combined_prompter.templates))
+            combined_prompter.write_to_file()
+
+            # Update datasets reference to use combined prompter
+            self.datasets = [f"combined_templates {self.combined_prompter_path}"] *  len(self.datasets)
+
 
 def load_prompts(
     *dataset_strings: str,
@@ -78,8 +97,7 @@ def load_prompts(
     split_type: Literal["train", "val"] = "train",
     stream: bool = False,
     rank: int = 0,
-    world_size: int = 1,
-    combined_prompter_path: str = ""
+    world_size: int = 1
 ) -> Iterator[dict]:
     """Load a dataset full of prompts generated from the specified datasets.
 
@@ -102,22 +120,11 @@ def load_prompts(
     train_datasets = []
     rng = Random(seed)
 
-    # If flag is set, init/create a new file for combining templates
-    combined_prompter = None
-    if combined_prompter_path:
-        print("Combining templates into shared prompter.")
-        combined_prompter = DatasetTemplates("combined_templates", combined_prompter_path)
-
     # First load the datasets and prompters. We need to know the minimum number of
     # templates for any dataset in order to make sure we don't run out of prompts.
     for ds_string in dataset_strings:
         ds_name, _, config_name = ds_string.partition(" ")
         prompter = DatasetTemplates(ds_name, config_name)
-        # Populate combined prompter with templates from different datasets
-        # if should_aggregate_templates:
-        if combined_prompter:
-            combined_prompter.templates.update(prompter.get_templates_with_new_uuids())
-            print("len of prompter templates is ", len(combined_prompter.templates))
         prompters.append(DatasetTemplates(ds_name, config_name))
 
         ds_dict = assert_type(
@@ -149,16 +156,7 @@ def load_prompts(
         raw_datasets.append(split)
         train_datasets.append(train_ds)
     
-    min_num_templates = -1
-    if combined_prompter:
-        # save combined templates to yaml file
-        print("saving aggregate templates")
-        combined_prompter.sync_mapping()
-        combined_prompter.write_to_file()
-        min_num_templates = len(combined_prompter.templates)
-        print("length of combined_prompter templates is ", min_num_templates)
-    else: 
-        min_num_templates = min(len(prompter.templates) for prompter in prompters)
+    min_num_templates = min(len(prompter.templates) for prompter in prompters)
 
     num_variants = (
         min_num_templates
@@ -283,7 +281,7 @@ def qa_cat(q: str, a: str) -> str:
         raise ValueError(f'Prompt duplicated {dup_count} times! "{maybe_dup}"')
 
     return dict(
-        label=label,
+        label=new_label,
         prompts=prompts,
         template_names=prompter.all_template_names,
     )

From 0c2f5c4d6ab6bc6eafccfba2bf954f8d009b01eb Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 10:52:10 +0000
Subject: [PATCH 19/36] fix refactor bugs, runnable state

---
 elk/extraction/extraction.py     | 6 +-----
 elk/extraction/prompt_loading.py | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index c17d080f..88e39328 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -59,7 +59,6 @@ class Extract(Serializable):
     token_loc: Literal["first", "last", "mean"] = "last"
     min_gpu_mem: Optional[int] = None
     num_gpus: int = -1
-    combined_prompter_path: Optional[str] = None # if template file does not exist, combine from datasets and save to this path
 
     def __post_init__(self, layer_stride: int):
         if self.layers and layer_stride > 1:
@@ -103,8 +102,7 @@ def extract_hiddens(
         split_type=split_type,
         stream=cfg.prompts.stream,
         rank=rank,
-        world_size=world_size,
-        combined_prompter_path=cfg.combined_prompter_path
+        world_size=world_size
     )  # this dataset is already sharded, but hasn't been truncated to max_examples
 
     model = instantiate_model(
@@ -270,8 +268,6 @@ def get_splits() -> SplitDict:
     model_cfg = AutoConfig.from_pretrained(cfg.model)
     num_variants = cfg.prompts.num_variants
 
-    # if combined prompter flag is set, combine prompt templates
-
     # extraneous, remove ?
     ds_name, _, config_name = cfg.prompts.datasets[0].partition(" ")
     info = get_dataset_config_info(ds_name, config_name or None)
diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index d80da4c4..9821ddcf 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -202,7 +202,7 @@ def load_prompts(
                 label_column=label_column,
                 num_classes=num_classes,
                 num_variants=num_variants,
-                prompter=prompter if not combined_prompter else combined_prompter,
+                prompter=prompter,
                 rng=rng,
                 fewshot_iter=fewshot_iter,
             )

From 066cd447f6e5abe4fe64c0af0d3f18284cad588b Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 12:09:05 +0000
Subject: [PATCH 20/36] rewrite template merging, regenerate prompter every run

---
 elk/extraction/extraction.py     |  3 ++-
 elk/extraction/prompt_loading.py | 34 +++++++++++++++++++-------------
 elk/promptsource/templates.py    | 13 ++++++------
 3 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index 88e39328..fe70603a 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -102,7 +102,8 @@ def extract_hiddens(
         split_type=split_type,
         stream=cfg.prompts.stream,
         rank=rank,
-        world_size=world_size
+        world_size=world_size,
+        combined_template_path=cfg.prompts.combined_template_path
     )  # this dataset is already sharded, but hasn't been truncated to max_examples
 
     model = instantiate_model(
diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index 9821ddcf..c840f212 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -44,7 +44,7 @@ class PromptConfig(Serializable):
             call to __getitem__. Use -1 to apply all available templates. Defaults to -1.
         seed: The seed to use for prompt randomization. Defaults to 42.
         stream: Whether to stream the dataset from the Internet. Defaults to False.
-        combined_prompter_path: Path to save a combined template file to, when testing
+        combined_template_path: Path to save a combined template file to, when testing
             prompt invariance across multiple datasets, and will be interpreted as a subpath
             of `combined_paths` in the promptsource templates dir. Defaults to empty string.
     """
@@ -58,7 +58,7 @@ class PromptConfig(Serializable):
     num_variants: int = -1
     seed: int = 42
     stream: bool = False
-    combined_prompter_path: str = ""
+    combined_template_path: str = ""
 
     def __post_init__(self):
         if len(self.max_examples) > 2:
@@ -74,21 +74,20 @@ def __post_init__(self):
             self.max_examples *= 2
 
         # Combining prompts
-        if self.combined_prompter_path:
+        if self.combined_template_path:
             print("Copying templates across datasets to combined_templates/ " +
-                f"{self.combined_prompter_path}/templates.yaml")
-            combined_prompter = DatasetTemplates("combined_templates", self.combined_prompter_path)
+                f"{self.combined_template_path}/templates.yaml")
+            combined_prompter = DatasetTemplates("combined_templates", self.combined_template_path)
+            combined_prompter.templates = {}
             for ds_string in self.datasets:
                 ds_name, _, config_name = ds_string.partition(" ")
                 prompter = DatasetTemplates(ds_name, config_name)
-                combined_prompter.templates.update(prompter.get_templates_with_new_uuids())
-                print("len of prompter templates is ", len(combined_prompter.templates))
+                # TODO: Verify that cols are same; if not, warn that templates could not be combined.
+                combined_prompter.merge_templates_from(prompter)
+                # combined_prompter.templates.update(prompter.get_templates_with_new_uuids())
+            print("Total number of templates gathered: ", len(combined_prompter.templates))
             combined_prompter.write_to_file()
 
-            # Update datasets reference to use combined prompter
-            self.datasets = [f"combined_templates {self.combined_prompter_path}"] *  len(self.datasets)
-
-
 def load_prompts(
     *dataset_strings: str,
     num_shots: int = 0,
@@ -97,7 +96,8 @@ def load_prompts(
     split_type: Literal["train", "val"] = "train",
     stream: bool = False,
     rank: int = 0,
-    world_size: int = 1
+    world_size: int = 1,
+    combined_template_path: str = ""
 ) -> Iterator[dict]:
     """Load a dataset full of prompts generated from the specified datasets.
 
@@ -124,8 +124,10 @@ def load_prompts(
     # templates for any dataset in order to make sure we don't run out of prompts.
     for ds_string in dataset_strings:
         ds_name, _, config_name = ds_string.partition(" ")
-        prompter = DatasetTemplates(ds_name, config_name)
-        prompters.append(DatasetTemplates(ds_name, config_name))
+
+        if combined_template_path == "":
+            prompter = DatasetTemplates(ds_name, config_name)
+            prompters.append(DatasetTemplates(ds_name, config_name))
 
         ds_dict = assert_type(
             dict, load_dataset(ds_name, config_name or None, streaming=stream)
@@ -156,6 +158,10 @@ def load_prompts(
         raw_datasets.append(split)
         train_datasets.append(train_ds)
     
+    if combined_template_path:
+        combined_prompter = DatasetTemplates("combined_templates", combined_template_path)
+        prompters = [combined_prompter] * len(dataset_strings)
+
     min_num_templates = min(len(prompter.templates) for prompter in prompters)
 
     num_variants = (
diff --git a/elk/promptsource/templates.py b/elk/promptsource/templates.py
index 68855cf6..38d3f87f 100644
--- a/elk/promptsource/templates.py
+++ b/elk/promptsource/templates.py
@@ -543,15 +543,14 @@ def delete_folder(self) -> None:
             if len(os.listdir(base_folder)) == 0:
                 rmtree(base_folder)
 
-    def get_templates_with_new_uuids(self) -> dict:
+    def merge_templates_from(self, src: "DatasetTemplates"):
         """
-        Generate new uuids for templates, used when merging template datasets.
+        Merge templates from src, re-keying each one under a freshly generated uuid.
         """
-        new_templates = {}
-        for template in self.templates.values():
-            template.id = str(uuid.uuid4())
-            new_templates[template.id] = template
-        return new_templates
+        for template in src.templates.values():
+            template.id = str(uuid.uuid4())  # keep id in sync with its dict key
+            self.templates[template.id] = template
+        self.sync_mapping()
 
     def __getitem__(self, template_key: str) -> "Template":
         return self.templates[self.name_to_id_mapping[template_key]]

From 846b78c0f664a48c556231f41a7717cf399b348b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 12 Apr 2023 12:16:15 +0000
Subject: [PATCH 21/36] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 elk/extraction/extraction.py     |  2 +-
 elk/extraction/prompt_loading.py | 23 ++++++++++++++++-------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index fe70603a..037aa828 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -103,7 +103,7 @@ def extract_hiddens(
         stream=cfg.prompts.stream,
         rank=rank,
         world_size=world_size,
-        combined_template_path=cfg.prompts.combined_template_path
+        combined_template_path=cfg.prompts.combined_template_path,
     )  # this dataset is already sharded, but hasn't been truncated to max_examples
 
     model = instantiate_model(
diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index c840f212..7f8b5b36 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -75,9 +75,13 @@ def __post_init__(self):
 
         # Combining prompts
         if self.combined_template_path:
-            print("Copying templates across datasets to combined_templates/ " +
-                f"{self.combined_template_path}/templates.yaml")
-            combined_prompter = DatasetTemplates("combined_templates", self.combined_template_path)
+            print(
+                "Copying templates across datasets to combined_templates/ "
+                + f"{self.combined_template_path}/templates.yaml"
+            )
+            combined_prompter = DatasetTemplates(
+                "combined_templates", self.combined_template_path
+            )
             combined_prompter.templates = {}
             for ds_string in self.datasets:
                 ds_name, _, config_name = ds_string.partition(" ")
@@ -85,9 +89,12 @@ def __post_init__(self):
                 # TODO: Verify that cols are same; if not, warn that templates could not be combined.
                 combined_prompter.merge_templates_from(prompter)
                 # combined_prompter.templates.update(prompter.get_templates_with_new_uuids())
-            print("Total number of templates gathered: ", len(combined_prompter.templates))
+            print(
+                "Total number of templates gathered: ", len(combined_prompter.templates)
+            )
             combined_prompter.write_to_file()
 
+
 def load_prompts(
     *dataset_strings: str,
     num_shots: int = 0,
@@ -97,7 +104,7 @@ def load_prompts(
     stream: bool = False,
     rank: int = 0,
     world_size: int = 1,
-    combined_template_path: str = ""
+    combined_template_path: str = "",
 ) -> Iterator[dict]:
     """Load a dataset full of prompts generated from the specified datasets.
 
@@ -157,9 +164,11 @@ def load_prompts(
 
         raw_datasets.append(split)
         train_datasets.append(train_ds)
-    
+
     if combined_template_path:
-        combined_prompter = DatasetTemplates("combined_templates", combined_template_path)
+        combined_prompter = DatasetTemplates(
+            "combined_templates", combined_template_path
+        )
         prompters = [combined_prompter] * len(dataset_strings)
 
     min_num_templates = min(len(prompter.templates) for prompter in prompters)

From 1aecd7ecd18a738bba07340b28d5e79277d62e4e Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 12:22:58 +0000
Subject: [PATCH 22/36] line len fixes

---
 elk/extraction/prompt_loading.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index c840f212..1ccba7ae 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -41,12 +41,13 @@ class PromptConfig(Serializable):
         num_shots: The number of examples to use in few-shot prompts. If zero, prompts
             are zero-shot. Defaults to 0.
         num_variants: The number of prompt templates to apply to each predicate upon
-            call to __getitem__. Use -1 to apply all available templates. Defaults to -1.
+            call to __getitem__. Use -1 to apply all available templates. Defaults to
+            -1.
         seed: The seed to use for prompt randomization. Defaults to 42.
         stream: Whether to stream the dataset from the Internet. Defaults to False.
         combined_template_path: Path to save a combined template file to, when testing
-            prompt invariance across multiple datasets, and will be interpreted as a subpath
-            of `combined_paths` in the promptsource templates dir. Defaults to empty string.
+            prompt invariance across multiple datasets, and will be interpreted as a 
+            subpath of `combined_paths` in the templates dir. Defaults to empty string.
     """
 
     datasets: list[str] = field(positional=True)
@@ -77,15 +78,16 @@ def __post_init__(self):
         if self.combined_template_path:
             print("Copying templates across datasets to combined_templates/ " +
                 f"{self.combined_template_path}/templates.yaml")
-            combined_prompter = DatasetTemplates("combined_templates", self.combined_template_path)
+            combined_prompter = DatasetTemplates("combined_templates", 
+                self.combined_template_path)
             combined_prompter.templates = {}
             for ds_string in self.datasets:
                 ds_name, _, config_name = ds_string.partition(" ")
                 prompter = DatasetTemplates(ds_name, config_name)
-                # TODO: Verify that cols are same; if not, warn that templates could not be combined.
+                # TODO: Verify that cols are same; if not, warn that templates 
+                #       could not be combined.
                 combined_prompter.merge_templates_from(prompter)
-                # combined_prompter.templates.update(prompter.get_templates_with_new_uuids())
-            print("Total number of templates gathered: ", len(combined_prompter.templates))
+            print("Total number of templates: ", len(combined_prompter.templates))
             combined_prompter.write_to_file()
 
 def load_prompts(

From b0c0f6331221c15ffc71248ee835514a097ff68a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 12 Apr 2023 12:26:30 +0000
Subject: [PATCH 23/36] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 elk/extraction/prompt_loading.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index c6715f6b..6e5edc54 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -46,7 +46,7 @@ class PromptConfig(Serializable):
         seed: The seed to use for prompt randomization. Defaults to 42.
         stream: Whether to stream the dataset from the Internet. Defaults to False.
         combined_template_path: Path to save a combined template file to, when testing
-            prompt invariance across multiple datasets, and will be interpreted as a 
+            prompt invariance across multiple datasets, and will be interpreted as a
             subpath of `combined_paths` in the templates dir. Defaults to empty string.
     """
 
@@ -87,7 +87,7 @@ def __post_init__(self):
             for ds_string in self.datasets:
                 ds_name, _, config_name = ds_string.partition(" ")
                 prompter = DatasetTemplates(ds_name, config_name)
-                # TODO: Verify that cols are same; if not, warn that templates 
+                # TODO: Verify that cols are same; if not, warn that templates
                 #       could not be combined.
                 combined_prompter.merge_templates_from(prompter)
             print("Total number of templates: ", len(combined_prompter.templates))

From 2da069adb7fad011e17417f5fa3e5c4f6f61e63a Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 17:25:14 +0000
Subject: [PATCH 24/36] update README with prompt invariance argument

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 96d51ee1..5c7fbd3c 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,11 @@ The following will generate a CCS (Contrast Consistent Search) reporter instead
 elk elicit microsoft/deberta-v2-xxlarge-mnli imdb --net ccs
 ```
 
+To test prompt invariance across multiple datasets, use the `--combined_template_path` command line argument, which will create a new `templates.yaml` file with templates from all the datasets.
+```bash
+elk elicit bigscience/bloomz-560m christykoh/ag_news_pt ag_news --combined_template_path=spar_w/ag_news
+```
+
 The following command will evaluate the probe from the run naughty-northcutt on the hidden states extracted from the model deberta-v2-xxlarge-mnli for the imdb dataset. It will result in an `eval.csv` and `cfg.yaml` file, which are stored under a subfolder in `elk-reporters/naughty-northcutt/transfer_eval`.
 
 ```bash

From 4c6d344b0bc004ab9b20461c7278df4a4fc44b66 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 20:01:27 +0000
Subject: [PATCH 25/36] fix bugs, add dataset col checks

---
 elk/extraction/extraction.py     |  3 +--
 elk/extraction/prompt_loading.py | 33 +++++++++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index 51210a11..81cab1cc 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -266,10 +266,9 @@ def get_splits() -> SplitDict:
     model_cfg = AutoConfig.from_pretrained(cfg.model)
     num_variants = cfg.prompts.num_variants
 
-    # extraneous, remove ?
+    # Retrieve info, used to get splits
     ds_name, _, config_name = cfg.prompts.datasets[0].partition(" ")
     info = get_dataset_config_info(ds_name, config_name or None)
-    # ? end
 
     layer_cols = {
         f"hidden_{layer}": Array3D(
diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index 6e5edc54..6cfcb18f 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -7,6 +7,7 @@
     Dataset,
     Features,
     load_dataset,
+    load_dataset_builder,
 )
 from datasets.distributed import split_dataset_by_node
 from simple_parsing.helpers import Serializable, field
@@ -84,14 +85,41 @@ def __post_init__(self):
                 "combined_templates", self.combined_template_path
             )
             combined_prompter.templates = {}
+            prev_num_features = 0
+            prev_num_label_classes = 0
             for ds_string in self.datasets:
                 ds_name, _, config_name = ds_string.partition(" ")
                 prompter = DatasetTemplates(ds_name, config_name)
-                # TODO: Verify that cols are same; if not, warn that templates
-                #       could not be combined.
+
+                # Verify that number of features and number of classes for ClassLabel
+                # are the same across datasets.
+                ds_builder = load_dataset_builder(ds_name, config_name or None)
+                num_features = len(ds_builder.info.features)
+                if prev_num_features > 0 and num_features != prev_num_features:
+                    print("WARNING: Datasets do not have the same number of features;",
+                    f"{ds_name} has {num_features} features while prev has",
+                    f"{prev_num_features}. Prompting datasets separately.")
+                    self.combined_template_path = ""
+                    break
+                prev_num_features = num_features
+                num_classes = ds_builder.info.features['label'].num_classes
+                if prev_num_label_classes > 0 and num_classes != prev_num_label_classes:
+                    print("WARNING: Datasets do not have the same number of ClassLabel",
+                    f"classes; {ds_name} has {num_classes} classes while prev has",
+                    f"{prev_num_label_classes}. Prompting datasets separately.")
+                    self.combined_template_path = ""
+                    break
+                prev_num_label_classes = num_classes
+
+                # Once verified, merge templates.
                 combined_prompter.merge_templates_from(prompter)
+        
+        # Write to file if successfully merged all prompts.
+        if self.combined_template_path:
             print("Total number of templates: ", len(combined_prompter.templates))
             combined_prompter.write_to_file()
+            print("Saved to promptsource/templates/combined_templates/" +
+            f"{self.combined_template_path}.yaml")
 
 
 def load_prompts(
@@ -177,7 +205,6 @@ def load_prompts(
         if num_variants == -1
         else min(num_variants, min_num_templates)
     )
-    print()
     assert num_variants > 0
     if rank == 0:
         print(f"Using {num_variants} variants of each prompt")

From 53d186b7ecfdb7e1b7d449bc1cb8d743e52369db Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 12 Apr 2023 20:01:54 +0000
Subject: [PATCH 26/36] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 elk/extraction/prompt_loading.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index 6cfcb18f..57c3d64f 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -96,30 +96,36 @@ def __post_init__(self):
                 ds_builder = load_dataset_builder(ds_name, config_name or None)
                 num_features = len(ds_builder.info.features)
                 if prev_num_features > 0 and num_features != prev_num_features:
-                    print("WARNING: Datasets do not have the same number of features;",
-                    f"{ds_name} has {num_features} features while prev has",
-                    f"{prev_num_features}. Prompting datasets separately.")
+                    print(
+                        "WARNING: Datasets do not have the same number of features;",
+                        f"{ds_name} has {num_features} features while prev has",
+                        f"{prev_num_features}. Prompting datasets separately.",
+                    )
                     self.combined_template_path = ""
                     break
                 prev_num_features = num_features
-                num_classes = ds_builder.info.features['label'].num_classes
+                num_classes = ds_builder.info.features["label"].num_classes
                 if prev_num_label_classes > 0 and num_classes != prev_num_label_classes:
-                    print("WARNING: Datasets do not have the same number of ClassLabel",
-                    f"classes; {ds_name} has {num_classes} classes while prev has",
-                    f"{prev_num_label_classes}. Prompting datasets separately.")
+                    print(
+                        "WARNING: Datasets do not have the same number of ClassLabel",
+                        f"classes; {ds_name} has {num_classes} classes while prev has",
+                        f"{prev_num_label_classes}. Prompting datasets separately.",
+                    )
                     self.combined_template_path = ""
                     break
                 prev_num_label_classes = num_classes
 
                 # Once verified, merge templates.
                 combined_prompter.merge_templates_from(prompter)
-        
+
         # Write to file if successfully merged all prompts.
         if self.combined_template_path:
             print("Total number of templates: ", len(combined_prompter.templates))
             combined_prompter.write_to_file()
-            print("Saved to promptsource/templates/combined_templates/" +
-            f"{self.combined_template_path}.yaml")
+            print(
+                "Saved to promptsource/templates/combined_templates/"
+                + f"{self.combined_template_path}.yaml"
+            )
 
 
 def load_prompts(

From b78355f9ce5bb1c1770022652de3a50d71c04881 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 20:13:54 +0000
Subject: [PATCH 27/36] fix prompter init typing

---
 elk/extraction/prompt_loading.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index 6cfcb18f..478c540c 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -76,6 +76,7 @@ def __post_init__(self):
             self.max_examples *= 2
 
         # Combining prompts
+        combined_prompter: "DatasetTemplates" = None
         if self.combined_template_path:
             print(
                 "Copying templates across datasets to combined_templates/ "

From 1975410f9d26ca0076283b9d45076cfcb1d9e849 Mon Sep 17 00:00:00 2001
From: Walter Laurito <Lauritowal@yahoo.com>
Date: Wed, 12 Apr 2023 13:21:22 -0700
Subject: [PATCH 28/36] Update README.md

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 5c7fbd3c..21828f7b 100644
--- a/README.md
+++ b/README.md
@@ -26,17 +26,17 @@ The following will generate a CCS (Contrast Consistent Search) reporter instead
 elk elicit microsoft/deberta-v2-xxlarge-mnli imdb --net ccs
 ```
 
-To test prompt invariance across multiple datasets, use the `--combined_template_path` command line argument, which will create a new `templates.yaml` file with templates from all the datasets.
-```bash
-elk elicit bigscience/bloomz-560m christykoh/ag_news_pt ag_news --combined_template_path=spar_w/ag_news
-```
-
 The following command will evaluate the probe from the run naughty-northcutt on the hidden states extracted from the model deberta-v2-xxlarge-mnli for the imdb dataset. It will result in an `eval.csv` and `cfg.yaml` file, which are stored under a subfolder in `elk-reporters/naughty-northcutt/transfer_eval`.
 
 ```bash
 elk eval naughty-northcutt microsoft/deberta-v2-xxlarge-mnli imdb
 ```
 
+To test prompt invariance across multiple datasets, pass the `--combined_template_path` command line argument, which creates a new `templates.yaml` file containing the templates from all the listed datasets.
+```bash
+elk elicit bigscience/bloomz-560m christykoh/ag_news_pt ag_news --combined_template_path=spar_w/ag_news
+```
+
 ## Caching
 
 The hidden states resulting from `elk elicit` are cached as a HuggingFace dataset to avoid having to recompute them every time we want to train a probe. The cache is stored in the same place as all other HuggingFace datasets, which is usually `~/.cache/huggingface/datasets`.

From 8a5fb0d819b7a7ed9a1b7c4e4cea2a1403aecea2 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 20:23:37 +0000
Subject: [PATCH 29/36] try to fix typing again

---
 elk/extraction/prompt_loading.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index c62209a1..9b2f8e78 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -76,7 +76,7 @@ def __post_init__(self):
             self.max_examples *= 2
 
         # Combining prompts
-        combined_prompter: "DatasetTemplates" = None
+        combined_prompter = None
         if self.combined_template_path:
             print(
                 "Copying templates across datasets to combined_templates/ "

From a7f5a8b8ec2ada9f476368f6fd1df15bfc2a6f42 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 12 Apr 2023 20:30:01 +0000
Subject: [PATCH 30/36] assert datasettemplates type

---
 elk/extraction/prompt_loading.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index 9b2f8e78..9a80fe86 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -121,8 +121,9 @@ def __post_init__(self):
 
         # Write to file if successfully merged all prompts.
         if self.combined_template_path:
-            print("Total number of templates: ", len(combined_prompter.templates))
-            combined_prompter.write_to_file()
+            prompter = assert_type(DatasetTemplates, combined_prompter)
+            print("Total number of templates: ", len(prompter.templates))
+            prompter.write_to_file()
             print(
                 "Saved to promptsource/templates/combined_templates/"
                 + f"{self.combined_template_path}.yaml"

From 7af1a1bba97b6ba193deb6488e07e3363870564b Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Thu, 13 Apr 2023 09:24:59 +0000
Subject: [PATCH 31/36] bugfix to run eval separately on each dataset

---
 elk/evaluation/evaluate.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/elk/evaluation/evaluate.py b/elk/evaluation/evaluate.py
index 6aca58f5..b954e661 100644
--- a/elk/evaluation/evaluate.py
+++ b/elk/evaluation/evaluate.py
@@ -48,6 +48,7 @@ def execute(self):
         transfer_dir = elk_reporter_dir() / self.source / "transfer_eval"
 
         for dataset in datasets:
+            self.data.prompts.datasets = [dataset]
             run = Evaluate(cfg=self, out_dir=transfer_dir / dataset)
             run.evaluate()
 

From 74551fa1a6a60399e41cbcf109de578ba0bc725b Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Thu, 13 Apr 2023 09:34:36 +0000
Subject: [PATCH 32/36] add combine_evals flag to differentiate a multi dataset
 eval from a batch of single-dataset evals

---
 elk/evaluation/evaluate.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/elk/evaluation/evaluate.py b/elk/evaluation/evaluate.py
index b954e661..9cfdd6eb 100644
--- a/elk/evaluation/evaluate.py
+++ b/elk/evaluation/evaluate.py
@@ -41,16 +41,22 @@ class Eval(Serializable):
     num_gpus: int = -1
     skip_baseline: bool = False
     concatenated_layer_offset: int = 0
+    combine_evals: bool = False
 
     def execute(self):
         datasets = self.data.prompts.datasets
 
         transfer_dir = elk_reporter_dir() / self.source / "transfer_eval"
 
-        for dataset in datasets:
-            self.data.prompts.datasets = [dataset]
-            run = Evaluate(cfg=self, out_dir=transfer_dir / dataset)
+        if self.combine_evals:
+            run = Evaluate(cfg=self, out_dir=transfer_dir / ", ".join(datasets))
             run.evaluate()
+        else:
+            # eval on each dataset separately
+            for dataset in datasets:
+                self.data.prompts.datasets = [dataset]
+                run = Evaluate(cfg=self, out_dir=transfer_dir / dataset)
+                run.evaluate()
 
 
 @dataclass

From f7a4713f3601bd82d5659a53479d0ed61060cbdf Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 19 Apr 2023 15:44:40 +0000
Subject: [PATCH 33/36] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 elk/extraction/prompt_loading.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index e6432500..f0ca68e6 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -1,8 +1,8 @@
-from os.path import exists
 from collections import Counter
 from copy import deepcopy
 from dataclasses import dataclass
 from itertools import zip_longest
+from os.path import exists
 from random import Random
 from typing import Any, Iterator, Literal, Optional
 
@@ -48,7 +48,7 @@ class PromptConfig(Serializable):
             -1.
         seed: The seed to use for prompt randomization. Defaults to 42.
         stream: Whether to stream the dataset from the Internet. Defaults to False.
-        combined_template_output_path: Path to save a combined template file to, when 
+        combined_template_output_path: Path to save a combined template file to, when
             applying prompt invariance across multiple datasets. Interpreted as a
             subpath of `combined_paths` in the templates dir. Defaults to empty string.
     """
@@ -127,11 +127,11 @@ def combine_templates(self):
             "Saved to promptsource/templates/combined_templates/"
             + f"{self.combined_template_output_path}.yaml"
         )
-    
+
     def verify_cols(self, ds_builder, ref_ds_builder) -> bool:
-        '''Verify that number of features and number of classes for ClassLabel
-            match the expected values.
-        '''
+        """Verify that number of features and number of classes for ClassLabel
+        match the expected values.
+        """
         expected_features = len(ref_ds_builder.info.features)
         expected_classes = ref_ds_builder.info.features["label"].num_classes
         num_features = len(ds_builder.info.features)
@@ -198,15 +198,13 @@ def load_prompts(
         An iterable of prompt dictionaries.
     """
     ds_name, _, config_name = ds_string.partition(" ")
-    
+
     prompter = None
     if combined_template_output_path and exists(combined_template_output_path):
-        prompter = DatasetTemplates(
-            "combined_templates", combined_template_output_path
-        )
+        prompter = DatasetTemplates("combined_templates", combined_template_output_path)
     else:
         prompter = DatasetTemplates(ds_name, config_name)
-    
+
     ds_dict = assert_type(
         dict, load_dataset(ds_name, config_name or None, streaming=stream)
     )

From 6e2f54c0a8fa14c140056c8f72263bf69e7398aa Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Wed, 19 Apr 2023 15:51:00 +0000
Subject: [PATCH 34/36] define ds_name

---
 elk/extraction/prompt_loading.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index e6432500..733027dc 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -136,6 +136,7 @@ def verify_cols(self, ds_builder, ref_ds_builder) -> bool:
         expected_classes = ref_ds_builder.info.features["label"].num_classes
         num_features = len(ds_builder.info.features)
         num_classes = ds_builder.info.features["label"].num_classes
+        ds_name = ds_builder.builder_name
         if expected_features > 0 and num_features != expected_features:
             print(
                 "WARNING: Datasets do not have the same number of features;",

From 5f0f32aed9a4547bc6720286135a995e188a2673 Mon Sep 17 00:00:00 2001
From: Christy Koh <christykoh@berkeley.edu>
Date: Thu, 20 Apr 2023 07:38:39 +0000
Subject: [PATCH 35/36] fix ds_name bug

---
 elk/extraction/prompt_loading.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/elk/extraction/prompt_loading.py b/elk/extraction/prompt_loading.py
index a4102554..88687e0c 100644
--- a/elk/extraction/prompt_loading.py
+++ b/elk/extraction/prompt_loading.py
@@ -115,7 +115,7 @@ def combine_templates(self):
             if i == 0:
                 # Set first dataset as reference
                 ref_ds_builder = ds_builder
-            elif not self.verify_cols(ds_builder, ref_ds_builder):
+            elif not self.verify_cols(ref_ds_builder, ds_builder, ds_name):
                 return
 
             # Once verified, merge templates.
@@ -128,7 +128,7 @@ def combine_templates(self):
             + f"{self.combined_template_output_path}.yaml"
         )
 
-    def verify_cols(self, ds_builder, ref_ds_builder) -> bool:
+    def verify_cols(self, ref_ds_builder, ds_builder, ds_name) -> bool:
         """Verify that number of features and number of classes for ClassLabel
         match the expected values.
         """
@@ -136,7 +136,6 @@ def verify_cols(self, ds_builder, ref_ds_builder) -> bool:
         expected_classes = ref_ds_builder.info.features["label"].num_classes
         num_features = len(ds_builder.info.features)
         num_classes = ds_builder.info.features["label"].num_classes
-        ds_name = ds_builder.builder_name
         if expected_features > 0 and num_features != expected_features:
             print(
                 "WARNING: Datasets do not have the same number of features;",

From 8fa07b4672dee1e9c6ac386f246c8d38d564fbb1 Mon Sep 17 00:00:00 2001
From: Walter Laurito <Lauritowal@yahoo.com>
Date: Thu, 20 Apr 2023 04:06:35 -0700
Subject: [PATCH 36/36] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 4c0c35dd..d3f55e5b 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,9 @@ The following command will evaluate the probe from the run naughty-northcutt on
 elk eval naughty-northcutt microsoft/deberta-v2-xxlarge-mnli imdb
 ```
 
-For prompt invariance across multiple datasets, use the `--combined_template_path` command line argument, which will create a new `templates.yaml` file with templates from all the datasets.
+For prompt invariance across multiple datasets, use the `--combined_template_output_path` command-line argument, which creates a new `templates.yaml` file containing the templates from all the datasets.
 ```bash
-elk elicit bigscience/bloomz-560m christykoh/ag_news_pt ag_news --combined_template_path=spar_w/ag_news
+elk elicit bigscience/bloomz-560m christykoh/ag_news_pt ag_news --combined_template_output_path=spar_w/ag_news
 ```
 
 The following runs `elicit` on the Cartesian product of the listed models and datasets, storing it in a special folder ELK_DIR/sweeps/<memorable_name>. Moreover, `--add_pooled` adds an additional dataset that pools all of the datasets together.