diff --git a/evals/data.py b/evals/data.py index 27a833a114..e96bb4e4e1 100644 --- a/evals/data.py +++ b/evals/data.py @@ -200,12 +200,12 @@ def default(self, o: Any) -> str: return _to_py_types(o) -def jsondumps(o: Any, **kwargs: Any) -> str: - return json.dumps(o, cls=EnhancedJSONEncoder, **kwargs) +def jsondumps(o: Any, ensure_ascii: bool = False, **kwargs: Any) -> str: + return json.dumps(o, cls=EnhancedJSONEncoder, ensure_ascii=ensure_ascii, **kwargs) -def jsondump(o: Any, fp: Any, **kwargs: Any) -> None: - json.dump(o, fp, cls=EnhancedJSONEncoder, **kwargs) +def jsondump(o: Any, fp: Any, ensure_ascii: bool = False, **kwargs: Any) -> None: + json.dump(o, fp, cls=EnhancedJSONEncoder, ensure_ascii=ensure_ascii, **kwargs) def jsonloads(s: str, **kwargs: Any) -> Any: diff --git a/evals/record.py b/evals/record.py index c4cb3d5102..d02ab20d5f 100644 --- a/evals/record.py +++ b/evals/record.py @@ -293,16 +293,12 @@ def __init__(self, log_path: Optional[str], run_spec: RunSpec): self.event_file_path = log_path if log_path is not None: with bf.BlobFile(log_path, "wb") as f: - f.write( - ( - jsondumps({"spec": dataclasses.asdict(run_spec)}, ensure_ascii=False) + "\n" - ).encode("utf-8") - ) + f.write((jsondumps({"spec": dataclasses.asdict(run_spec)}) + "\n").encode("utf-8")) def _flush_events_internal(self, events_to_write: Sequence[Event]): start = time.time() try: - lines = [jsondumps(event, ensure_ascii=False) + "\n" for event in events_to_write] + lines = [jsondumps(event) + "\n" for event in events_to_write] except TypeError as e: logger.error(f"Failed to serialize events: {events_to_write}") raise e @@ -318,8 +314,8 @@ def _flush_events_internal(self, events_to_write: Sequence[Event]): self._flushes_done += 1 def record_final_report(self, final_report: Any): - with bf.BlobFile(self.event_file_path, "a") as f: - f.write(jsondumps({"final_report": final_report}) + "\n") + with bf.BlobFile(self.event_file_path, "ab") as f: + f.write((jsondumps({"final_report": final_report}) + "\n").encode("utf-8")) logging.info(f"Final report: {final_report}. Logged to {self.event_file_path}") @@ -345,8 +341,8 @@ def __init__( self._conn = snowflake_connection if log_path is not None: - with bf.BlobFile(log_path, "w") as f: - f.write(jsondumps({"spec": dataclasses.asdict(run_spec)}) + "\n") + with bf.BlobFile(log_path, "wb") as f: + f.write((jsondumps({"spec": dataclasses.asdict(run_spec)}) + "\n").encode("utf-8")) query = """ INSERT ALL INTO runs (run_id, model_name, eval_name, base_eval, split, run_config, settings, created_by, created_at) @@ -411,15 +407,15 @@ def _flush_events_internal(self, events_to_write: Sequence[Event]): ) idx_l = idx_r - with bf.BlobFile(self.event_file_path, "a") as f: - f.writelines(lines) + with bf.BlobFile(self.event_file_path, "ab") as f: + f.write(b"".join([l.encode("utf-8") for l in lines])) self._last_flush_time = time.time() self._flushes_done += 1 def record_final_report(self, final_report: Any): with self._writing_lock: - with bf.BlobFile(self.event_file_path, "a") as f: - f.write(jsondumps({"final_report": final_report}) + "\n") + with bf.BlobFile(self.event_file_path, "ab") as f: + f.write((jsondumps({"final_report": final_report}) + "\n").encode("utf-8")) query = """ UPDATE runs SET final_report = PARSE_JSON(%(final_report)s) diff --git a/evals/registry/eval_sets/test-modelgraded.yaml b/evals/registry/eval_sets/test-modelgraded.yaml index fedbd572e1..a86552b7ad 100644 --- a/evals/registry/eval_sets/test-modelgraded.yaml +++ b/evals/registry/eval_sets/test-modelgraded.yaml @@ -12,3 +12,4 @@ test-modelgraded: - rap-people-vs-people - rap-animals-vs-fruits - rap-people-vs-fruits + - mg-humor-people_jp diff --git a/scripts/modelgraded_generator.py b/scripts/modelgraded_generator.py index c2b1dd69c6..157e4bf47a 100644 --- a/scripts/modelgraded_generator.py +++ b/scripts/modelgraded_generator.py @@ -180,6 +180,7 @@ def format(template: str, **kwargs: dict[str, str]) -> str: data_dir = f"{REGISTRY_PATH}/data/test_modelgraded" yaml_str = f"# This file is generated by {os.path.basename(__file__)}\n\n" +evals = [] for prompt_name, subject in unlabeled_target_sets: prompt = unlabeled_prompts[prompt_name]["prompt"] samples = [{"input": format(prompt, subject=s)} for s in subjects[subject]] @@ -201,9 +202,12 @@ def format(template: str, **kwargs: dict[str, str]) -> str: ) + "\n\n" ) + evals += [f"mg-{prompt_name}-{subject}: {file_name}"] yaml_file = f"{REGISTRY_PATH}/evals/test-modelgraded-generated.yaml" with open(yaml_file, "w") as f: f.write(yaml_str) print(f"wrote {yaml_file}") +for e in evals: + print(e)