Highly Repetitive & Pattern Incrementing #4

Merged
merged 6 commits on Aug 17, 2023
Changes from 1 commit
Improved Filters, Added Test over Annotations
aflah02 committed Jul 30, 2023
commit 4dce3fb369a1ada6e380ac2f7d6be6c485a6b8f2
40 changes: 39 additions & 1 deletion filters/highly_repetitive.py
@@ -47,8 +47,46 @@ def break_and_compare_wrapper(ls: list, start_k: int, end_k: int) -> list:
     """
     # end_k is inclusive
     ls = list(ls)
+    length = len(ls)
+    half = length // 2
     for k in range(start_k, end_k + 1):
+        for i in range(0, half):
+            # remove some tokens from the end as well
+            rem = 2
+            # when rem = 0 -> 0.91 0.73 0.81
+            # when rem = 1 -> 0.91 0.78 0.84
+            # when rem = 2 -> 0.90 0.80 0.84
+            # when rem = 3 -> 0.89 0.80 0.84
+            # when rem = 4 -> 0.89 0.80 0.84
+            # when rem = 5 -> 0.89 0.80 0.84
+            # when rem = 6 -> 0.89 0.80 0.84
+            for j in range(0, rem + 1):
+                result = break_and_compare(ls[i:length - j], k)
+                if result:
+                    return result, k
+            result = break_and_compare(ls[i:], k)
+            if result:
+                return result, i, k
         result = break_and_compare(ls, k)
         if result:
             return result, k
-        return [], -1
+    return [], -1
+
+if __name__ == "__main__":
+    # from transformers import AutoTokenizer
+    # inp = """0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    # 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff"""
+    # tokenizer = AutoTokenizer.from_pretrained(
+    #     "EleutherAI/pythia-70m-deduped",
+    # )
+    # inp = tokenizer(inp)['input_ids']
+    # print(inp)
+    # # for token in inp:
+    # #     print(token, tokenizer.decode(token))
+    # print(break_and_compare_wrapper(inp, 2, 30))
+    ls = [1]
+    start_k = 1
+    end_k = 3
+    expected = ([1], 1)
+    output = break_and_compare_wrapper(ls, start_k, end_k)
+    print(output)
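
For orientation, a minimal usage sketch of the updated wrapper (my example, not part of the diff; note that most paths return a (chunk, k) 2-tuple, matching the tests below, while the ls[i:] branch returns a (chunk, i, k) 3-tuple):

# Hypothetical usage sketch; assumes it is run from the filters/ directory
# so that highly_repetitive.py is importable.
from highly_repetitive import break_and_compare_wrapper

tokens = [1, 2, 3, 1, 2, 3, 1, 2, 3]
result = break_and_compare_wrapper(tokens, 2, 10)
# On a hit this prints the repeated chunk and the matching k, e.g. ([1, 2, 3], 2);
# on a miss it prints ([], -1).
print(result)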
4 changes: 2 additions & 2 deletions filters/pattern_incrementing.py
@@ -1,6 +1,6 @@
 import re

-def replace_non_numeric_with_whitespace(text):
+def replace_non_numeric_with_whitespace(text: str) -> str:
     # Replace non-numeric characters with whitespace
     # cleaned_text = re.sub(r'[^0-9]', ' ', text)
     new_text = ""
@@ -39,7 +39,7 @@ def replace_non_numeric_with_whitespace(text):

     return cleaned_text

-def incrementing_sequences_filter(text):
+def incrementing_sequences_filter(text: str) -> bool:
     # count number of numeric and non-numeric characters
     num_numeric = 0
     num_non_numeric = 0
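
Similarly, a hedged usage sketch for the newly annotated filter (my example strings, not from the PR; the bool return is taken from the added -> bool annotation):

# Hypothetical usage sketch; assumes it is run from the filters/ directory
# so that pattern_incrementing.py is importable.
from pattern_incrementing import incrementing_sequences_filter

print(incrementing_sequences_filter("1 2 3 4 5 6 7 8"))      # expected: True (incrementing numeric run)
print(incrementing_sequences_filter("the quick brown fox"))  # expected: False (no numeric pattern)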
2 changes: 1 addition & 1 deletion filters/test_highly_repetitive.py
@@ -40,7 +40,7 @@ def test_break_and_compare_wrapper_matching_chunks_within_range():
     ls = [1, 2, 3, 1, 2, 3, 1, 2, 3]
     start_k = 2
     end_k = 4
-    expected = ([1, 2, 3], 3)
+    expected = ([1, 2, 3], 2)
     output = break_and_compare_wrapper(ls, start_k, end_k)
     assert output == expected, f"Test case 1 failed. Output: {output}, Expected: {expected}"

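These tests can be run from the repository root with pytest, e.g. pytest filters/test_highly_repetitive.py (assuming pytest is installed).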
60 changes: 60 additions & 0 deletions filters/testing_over_annotations/test.py
@@ -0,0 +1,60 @@
+import pandas as pd
+import os
+import sys
+from transformers import AutoTokenizer
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from pattern_incrementing import incrementing_sequences_filter
+from highly_repetitive import break_and_compare_wrapper
+
+df = pd.read_csv('filters/testing_over_annotations/test_data_full.csv')
+df = df[['shortened_text', 'Category']]
+
+all_categories = ['code', 'nl', 'pattern-incrementing', 'pattern-repeating', 'duplicated',
+                  'template', 'code+nl', 'empty/blank', 'other', 'random', 'structured']
+
+target_category = "pattern-repeating"
+use_tokenizer = True
+
+df.dropna(inplace=True)
+
+text = df['shortened_text'].to_list()
+category = df['Category'].to_list()
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "EleutherAI/pythia-70m-deduped",
+)
+
+ls = []
+ls_true = [0]*len(category)  # note: currently unused
+for t in text:
+    # normalize whitespace:
+    # replace newlines, carriage returns and tabs with spaces
+    t = t.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
+    # collapse multiple spaces into a single space
+    t = ' '.join(t.split())
+    if use_tokenizer:
+        t = tokenizer(t)['input_ids']
+    resp = break_and_compare_wrapper(t, 2, 10)
+    if resp[-1] != -1:
+        ls.append(target_category)
+    else:
+        ls.append("")
+print(ls)
+print(category)
+
+from sklearn.metrics import classification_report
+from sklearn.preprocessing import LabelEncoder
+encoder = LabelEncoder()
+encoder.fit(ls + category)
+
+encoded_list1 = encoder.transform(ls)
+encoded_list2 = encoder.transform(category)
+
+print(encoded_list1)
+print(encoded_list2)
+
+# print a classification report comparing the gold annotations with the filter's predictions
+print(classification_report(encoded_list2, encoded_list1))
+
+df['predicted'] = ls
+df.to_csv('filters/testing_over_annotations/test_data_full_pred.csv', index=False)
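
One caveat with this script: the predictions list only ever contains target_category or the empty string, while the gold labels span all eleven categories, so most rows of the multi-class report belong to classes the filter never predicts. A hedged one-vs-rest variant (my addition, not in the PR) that binarizes the gold labels first:

# Hypothetical one-vs-rest report; assumes `category`, `ls` and
# `target_category` exist as defined in the script above.
from sklearn.metrics import classification_report

binary_true = [c if c == target_category else "" for c in category]
# classification_report accepts string labels directly, so no LabelEncoder is needed here
print(classification_report(binary_true, ls))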