Highly Repetitive & Pattern Incrementing #4

Merged
merged 6 commits on Aug 17, 2023
Changes from 1 commit
Improved Filters, Added Test over Annotations
aflah02 committed Jul 30, 2023
commit 4dce3fb369a1ada6e380ac2f7d6be6c485a6b8f2
40 changes: 39 additions & 1 deletion filters/highly_repetitive.py
@@ -47,8 +47,46 @@ def break_and_compare_wrapper(ls: list, start_k: int, end_k: int) -> list:
     """
     # end_k is inclusive
     ls = list(ls)
+    length = len(ls)
+    half = length // 2
     for k in range(start_k, end_k + 1):
+        for i in range(0, half):
+            # remove some tokens from the end as well
+            rem = 2
+            # when rem = 0 -> 0.91 0.73 0.81
+            # when rem = 1 -> 0.91 0.78 0.84
+            # when rem = 2 -> 0.90 0.80 0.84
+            # when rem = 3 -> 0.89 0.80 0.84
+            # when rem = 4 -> 0.89 0.80 0.84
+            # when rem = 5 -> 0.89 0.80 0.84
+            # when rem = 6 -> 0.89 0.80 0.84
+            for j in range(0, rem + 1):
+                result = break_and_compare(ls[i:length - j], k)
+                if result:
+                    return result, k
+            result = break_and_compare(ls[i:], k)
+            if result:
+                return result, i, k
         result = break_and_compare(ls, k)
         if result:
             return result, k
-        return [], -1
+    return [], -1
+
+if __name__ == "__main__":
+    # from transformers import AutoTokenizer
+    # inp = """0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    # 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff"""
+    # tokenizer = AutoTokenizer.from_pretrained(
+    #     "EleutherAI/pythia-70m-deduped",
+    # )
+    # inp = tokenizer(inp)['input_ids']
+    # print(inp)
+    # # for token in inp:
+    # #     print(token, tokenizer.decode(token))
+    # print(break_and_compare_wrapper(inp, 2, 30))
+    ls = [1]
+    start_k = 1
+    end_k = 3
+    expected = ([1], 1)
+    output = break_and_compare_wrapper(ls, start_k, end_k)
+    print(output)
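
For orientation, a minimal usage sketch of the updated wrapper (my example, not part of the diff; note that most paths return a (chunk, k) 2-tuple, matching the tests below, while the ls[i:] branch returns a (chunk, i, k) 3-tuple):

# Hypothetical usage sketch; assumes it is run from the filters/ directory
# so that highly_repetitive.py is importable.
from highly_repetitive import break_and_compare_wrapper

tokens = [1, 2, 3, 1, 2, 3, 1, 2, 3]
result = break_and_compare_wrapper(tokens, 2, 10)
# On a hit this prints the repeated chunk and the matching k, e.g. ([1, 2, 3], 2);
# on a miss it prints ([], -1).
print(result)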
4 changes: 2 additions & 2 deletions filters/pattern_incrementing.py
@@ -1,6 +1,6 @@
 import re

-def replace_non_numeric_with_whitespace(text):
+def replace_non_numeric_with_whitespace(text: str) -> str:
     # Replace non-numeric characters with whitespace
     # cleaned_text = re.sub(r'[^0-9]', ' ', text)
     new_text = ""
@@ -39,7 +39,7 @@ def replace_non_numeric_with_whitespace(text):

     return cleaned_text

-def incrementing_sequences_filter(text):
+def incrementing_sequences_filter(text: str) -> bool:
     # count number of numeric and non-numeric characters
     num_numeric = 0
     num_non_numeric = 0
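
Similarly, a hedged usage sketch for the newly annotated filter (my example strings, not from the PR; the bool return is taken from the added -> bool annotation):

# Hypothetical usage sketch; assumes it is run from the filters/ directory
# so that pattern_incrementing.py is importable.
from pattern_incrementing import incrementing_sequences_filter

print(incrementing_sequences_filter("1 2 3 4 5 6 7 8"))      # expected: True (incrementing numeric run)
print(incrementing_sequences_filter("the quick brown fox"))  # expected: False (no numeric pattern)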
2 changes: 1 addition & 1 deletion filters/test_highly_repetitive.py
@@ -40,7 +40,7 @@ def test_break_and_compare_wrapper_matching_chunks_within_range():
     ls = [1, 2, 3, 1, 2, 3, 1, 2, 3]
     start_k = 2
     end_k = 4
-    expected = ([1, 2, 3], 3)
+    expected = ([1, 2, 3], 2)
     output = break_and_compare_wrapper(ls, start_k, end_k)
     assert output == expected, f"Test case 1 failed. Output: {output}, Expected: {expected}"

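These tests can be run from the repository root with pytest, e.g. pytest filters/test_highly_repetitive.py (assuming pytest is installed).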
60 changes: 60 additions & 0 deletions filters/testing_over_annotations/test.py
@@ -0,0 +1,60 @@
+import pandas as pd
+import os
+import sys
+from transformers import AutoTokenizer
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+from pattern_incrementing import incrementing_sequences_filter
+from highly_repetitive import break_and_compare_wrapper
+
+df = pd.read_csv('filters/testing_over_annotations/test_data_full.csv')
+df = df[['shortened_text', 'Category']]
+
+all_categories = ['code', 'nl', 'pattern-incrementing', 'pattern-repeating', 'duplicated',
+                  'template', 'code+nl', 'empty/blank', 'other', 'random', 'structured']
+
+target_category = "pattern-repeating"
+use_tokenizer = True
+
+df.dropna(inplace=True)
+
+text = df['shortened_text'].to_list()
+category = df['Category'].to_list()
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "EleutherAI/pythia-70m-deduped",
+)
+
+ls = []
+ls_true = [0]*len(category)  # note: currently unused
+for t in text:
+    # normalize whitespace:
+    # replace newlines, carriage returns and tabs with spaces
+    t = t.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
+    # collapse multiple spaces into a single space
+    t = ' '.join(t.split())
+    if use_tokenizer:
+        t = tokenizer(t)['input_ids']
+    resp = break_and_compare_wrapper(t, 2, 10)
+    if resp[-1] != -1:
+        ls.append(target_category)
+    else:
+        ls.append("")
+print(ls)
+print(category)
+
+from sklearn.metrics import classification_report
+from sklearn.preprocessing import LabelEncoder
+encoder = LabelEncoder()
+encoder.fit(ls + category)
+
+encoded_list1 = encoder.transform(ls)
+encoded_list2 = encoder.transform(category)
+
+print(encoded_list1)
+print(encoded_list2)
+
+# print a classification report comparing the gold annotations with the filter's predictions
+print(classification_report(encoded_list2, encoded_list1))
+
+df['predicted'] = ls
+df.to_csv('filters/testing_over_annotations/test_data_full_pred.csv', index=False)
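
One caveat with this script: the predictions list only ever contains target_category or the empty string, while the gold labels span all eleven categories, so most rows of the multi-class report belong to classes the filter never predicts. A hedged one-vs-rest variant (my addition, not in the PR) that binarizes the gold labels first:

# Hypothetical one-vs-rest report; assumes `category`, `ls` and
# `target_category` exist as defined in the script above.
from sklearn.metrics import classification_report

binary_true = [c if c == target_category else "" for c in category]
# classification_report accepts string labels directly, so no LabelEncoder is needed here
print(classification_report(binary_true, ls))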