Remove filecache (openai#389)
Remove filecache since it's causing confusion and it's not necessary at
the moment.
jwang47 committed Mar 21, 2023
1 parent 5ef78ec commit e83f0ba
Showing 4 changed files with 2 additions and 35 deletions.
4 changes: 0 additions & 4 deletions README.md
@@ -72,10 +72,6 @@ Do you have any examples of evals implemented in multiple different ways?

- Yes! In particular, see `evals/registry/evals/coqa.yaml`. We have implemented small subsets of the [CoQA](https://stanfordnlp.github.io/coqa/) dataset for various eval templates to help illustrate the differences.

I changed my data but this isn't reflected when running my eval, what's going on?

- Your data may have been cached to `/tmp/filecache`. Try removing this cache and rerunning your eval.

When I run an eval, it sometimes hangs at the very end (after the final report). What's going on?

- This is a known issue, but you should be able to interrupt it safely and the eval should finish immediately after.
4 changes: 2 additions & 2 deletions docs/build-eval.md
@@ -44,7 +44,7 @@ Register the eval by adding a file to `evals/registry/evals/<eval_name>.yaml` us
samples_jsonl: <eval_name>/samples.jsonl
```

Upon running the eval, the data will be searched for in `evals/registry/data`, e.g. if `test_match/samples.jsonl` is the provided filepath the data is expected to be in `evals/registry/data/test_match/samples.jsonl`.

The naming convention for evals is in the form `<eval_name>.<split>.<version>`.
- `<eval_name>` is the eval name, used to group evals whose scores are comparable.
@@ -59,7 +59,7 @@ You can now run your eval on your data from the CLI with your choice of model:
```
oaieval gpt-3.5-turbo <eval_name>
```
Congratulations, you have built your eval! Keep iterating on it until you are confident in the results. Remember, if you change the data file, remove `/tmp/filecache` so that the eval is run with your updated data.
Congratulations, you have built your eval! Keep iterating on it until you are confident in the results.
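The registration snippet in the hunk above is truncated at `samples_jsonl`. Assuming the `test_match` example used elsewhere in this guide, a complete registry entry might look like the following sketch (the `s1` split, `simple-v0` version, and `Match` class are illustrative, not prescribed by this diff):

```yaml
test_match:
  id: test_match.s1.simple-v0
  metrics: [accuracy]
test_match.s1.simple-v0:
  class: evals.elsuite.basic.match:Match
  args:
    samples_jsonl: test_match/samples.jsonl
```

The top-level key is the `<eval_name>` used on the CLI; it points at the fully qualified `<eval_name>.<split>.<version>` entry, which in turn names the eval class and its data file.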

## For model-graded evals: a step-by-step workflow

2 changes: 0 additions & 2 deletions docs/custom-eval.md
@@ -146,5 +146,3 @@ If you run with the `gpt-3.5-turbo` model, you should see an output similar to t
... [oaieval.py:172] accuracy: 1.0
... [record.py:309] Logged 6 rows of events to /tmp/evallogs/<run_id>_gpt-3.5-turbo_arithmetic.jsonl: insert_time=2.038ms
```

If you notice evals has cached your data and you need to clear that cache, you can do so with `rm -rf /tmp/filecache`.
27 changes: 0 additions & 27 deletions evals/data.py
@@ -4,16 +4,13 @@
import csv
import dataclasses
import gzip
import hashlib
import itertools
import json
import logging
import os
import pickle
import urllib
from collections.abc import Iterator
from functools import partial
from pathlib import Path
from typing import Any, Sequence, Union

import blobfile as bf
@@ -93,27 +90,6 @@ def _stream_jsonl_file(path) -> Iterator:
yield json.loads(line)


def filecache(func):
DIR = "/tmp/filecache"
name = func.__name__

def wrapper(*args, **kwargs):
md5 = hashlib.md5((name + ":" + str((args, kwargs))).encode("utf-8")).hexdigest()
pkl_path = f"{DIR}/{md5}.pkl"
if os.path.exists(pkl_path):
logger.debug(f"Loading from file cache: {pkl_path}")
with open(pkl_path, "rb") as f:
return pickle.load(f)
result = func(*args, **kwargs)
Path(DIR).mkdir(parents=True, exist_ok=True)
with open(pkl_path, "wb") as f:
pickle.dump(result, f)
return result

return wrapper


@filecache
def get_lines(path) -> list[dict]:
"""
Get a list of lines from a file.
@@ -122,7 +98,6 @@ def get_lines(path) -> list[dict]:
return f.readlines()


@filecache
def get_jsonl(path: str) -> list[dict]:
"""
Extract json lines from the given path.
@@ -139,12 +114,10 @@ def get_jsonl(path: str) -> list[dict]:
return _get_jsonl_file(path)


@filecache
def get_jsonls(paths: Sequence[str], line_limit=None) -> list[dict]:
return list(iter_jsonls(paths, line_limit))


@filecache
def get_json(path) -> dict:
if bf.isdir(path):
raise ValueError("Path is a directory, only files are supported")
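The confusion the commit message cites follows directly from how the deleted decorator computed its key: the MD5 digest covers only the function name and its arguments (the file *path*), never the file *contents* or mtime, so editing a data file did not invalidate the cache. The following is a minimal self-contained sketch of that behavior, using a fresh temporary directory rather than the repo's `/tmp/filecache`:

```python
import hashlib
import pickle
import tempfile
from pathlib import Path

# Fresh cache dir for this demo (the removed code used a fixed /tmp path).
CACHE_DIR = Path(tempfile.mkdtemp())


def filecache(func):
    """Sketch of the removed decorator: results are keyed on the function
    name and arguments only, never on file contents or modification time."""
    name = func.__name__

    def wrapper(*args, **kwargs):
        key = hashlib.md5((name + ":" + str((args, kwargs))).encode("utf-8")).hexdigest()
        pkl_path = CACHE_DIR / f"{key}.pkl"
        if pkl_path.exists():
            with open(pkl_path, "rb") as f:
                return pickle.load(f)  # cache hit: file on disk is never re-read
        result = func(*args, **kwargs)
        CACHE_DIR.mkdir(parents=True, exist_ok=True)
        with open(pkl_path, "wb") as f:
            pickle.dump(result, f)
        return result

    return wrapper


@filecache
def read_text(path):
    with open(path) as f:
        return f.read()


sample = CACHE_DIR / "sample.txt"
sample.write_text("old data")
first = read_text(str(sample))    # populates the cache
sample.write_text("new data")     # edit the data file...
second = read_text(str(sample))   # ...but the stale cached result comes back
```

Because the second call hits the pickle keyed on the unchanged path, `second` still holds `"old data"`, which is exactly the "I changed my data but this isn't reflected" FAQ entry this commit deletes from the README.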