from unittest.mock import MagicMock, patch
import sklearn.naive_bayes
import numpy as np
import pandas as pd
import re

# test csv file
TEST_CSV = 'data/test_info.csv'


class AssertTest(object):
    '''Defines general test behavior.'''

    def __init__(self, params):
        self.assert_param_message = '\n'.join([str(k) + ': ' + str(v) for k, v in params.items()])

    def test(self, assert_condition, assert_message):
        assert assert_condition, assert_message + \
            '\n\nUnit Test Function Parameters\n' + self.assert_param_message


def _print_success_message():
    print('Tests Passed!')


# test numerical_dataframe
def test_numerical_df(numerical_dataframe):

    # test result
    transformed_df = numerical_dataframe(TEST_CSV)

    # check that the result is a DataFrame
    assert isinstance(transformed_df, pd.DataFrame), 'Returned type is {}.'.format(type(transformed_df))

    # check columns
    column_names = list(transformed_df)
    assert 'File' in column_names, 'No `File` column found.'
    assert 'Task' in column_names, 'No `Task` column found.'
    assert 'Category' in column_names, 'No `Category` column found.'
    assert 'Class' in column_names, 'No `Class` column found.'

    # check conversion values
    assert transformed_df.loc[0, 'Category'] == 1, '`heavy` plagiarism mapping test failed.'
    assert transformed_df.loc[2, 'Category'] == 0, '`non` plagiarism mapping test failed.'
    assert transformed_df.loc[30, 'Category'] == 3, '`cut` plagiarism mapping test failed.'
    assert transformed_df.loc[5, 'Category'] == 2, '`light` plagiarism mapping test failed.'
    assert transformed_df.loc[37, 'Category'] == -1, 'Original file mapping test failed; should have Category = -1.'
    assert transformed_df.loc[41, 'Category'] == -1, 'Original file mapping test failed; should have Category = -1.'

    _print_success_message()


def test_containment(complete_df, containment_fn):

    # check basic format and value
    # for n = 1 and the fifth file, `g0pA_taske.txt`
    test_val = containment_fn(complete_df, 1, 'g0pA_taske.txt')

    assert isinstance(test_val, float), 'Returned type is {}.'.format(type(test_val))
    assert test_val <= 1.0, \
        'It appears that the value is not normalized; expected a value <= 1, got: ' + str(test_val)

    # known values for the first few files
    filenames = ['g0pA_taska.txt', 'g0pA_taskb.txt', 'g0pA_taskc.txt', 'g0pA_taskd.txt']
    ngram_1 = [0.39814814814814814, 1.0, 0.86936936936936937, 0.5935828877005348]
    ngram_3 = [0.0093457943925233638, 0.96410256410256412, 0.61363636363636365, 0.15675675675675677]

    # results for comparison
    results_1gram = []
    results_3gram = []

    for i in range(4):
        val_1 = containment_fn(complete_df, 1, filenames[i])
        val_3 = containment_fn(complete_df, 3, filenames[i])
        results_1gram.append(val_1)
        results_3gram.append(val_3)

    # check correct results
    assert all(np.isclose(results_1gram, ngram_1, rtol=1e-04)), \
        'n=1 calculations are incorrect. Double check the intersection calculation.'
    assert all(np.isclose(results_3gram, ngram_3, rtol=1e-04)), \
        'n=3 calculations are incorrect.'

    _print_success_message()
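
# ------------------------------------------------------------------
# Reference sketch (not part of the test suite): one plausible way to
# implement the `containment_fn` that `test_containment` exercises.
# `_example_containment` is a hypothetical name for illustration; it
# assumes `df` has the 'File', 'Task', 'Class', and 'Text' columns the
# tests above rely on, and that containment is defined as the count of
# intersecting word n-grams divided by the n-gram count of the answer.
def _example_containment(df, n, answer_filename):
    from sklearn.feature_extraction.text import CountVectorizer

    # look up the answer text and its task, then the matching original
    # source text (Class == -1) for that same task
    answer_row = df[df['File'] == answer_filename].iloc[0]
    source_text = df[(df['Class'] == -1) &
                     (df['Task'] == answer_row['Task'])]['Text'].values[0]

    # count word n-grams in both texts over a shared vocabulary;
    # row 0 holds the answer counts, row 1 the source counts
    counts = CountVectorizer(analyzer='word', ngram_range=(n, n))
    ngram_counts = counts.fit_transform([answer_row['Text'], source_text]).toarray()

    # the intersection counts each shared n-gram min(answer, source)
    # times; normalizing by the answer's total keeps the value <= 1
    intersection = np.minimum(ngram_counts[0], ngram_counts[1]).sum()
    return intersection / ngram_counts[0].sum()
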
def test_lcs(df, lcs_word):

    test_index = 10  # file 10

    # get answer file text
    answer_text = df.loc[test_index, 'Text']

    # get text for the original file:
    # find the associated task type (one character, a-e),
    # knowing that source texts have Class = -1
    task = df.loc[test_index, 'Task']
    orig_rows = df[(df['Class'] == -1)]
    orig_row = orig_rows[(orig_rows['Task'] == task)]
    source_text = orig_row['Text'].values[0]

    # calculate LCS
    test_val = lcs_word(answer_text, source_text)

    # check type and normalization
    assert isinstance(test_val, float), 'Returned type is {}.'.format(type(test_val))
    assert test_val <= 1.0, \
        'It appears that the value is not normalized; expected a value <= 1, got: ' + str(test_val)

    # known values for the first few files
    lcs_vals = [0.1917808219178082, 0.8207547169811321, 0.8464912280701754, 0.3160621761658031, 0.24257425742574257]

    # results for comparison
    results = []

    for i in range(5):
        # get answer and source text
        answer_text = df.loc[i, 'Text']
        task = df.loc[i, 'Task']
        # we know that source texts have Class = -1
        orig_rows = df[(df['Class'] == -1)]
        orig_row = orig_rows[(orig_rows['Task'] == task)]
        source_text = orig_row['Text'].values[0]
        # calculate LCS
        val = lcs_word(answer_text, source_text)
        results.append(val)

    # check correct results
    assert all(np.isclose(results, lcs_vals, rtol=1e-05)), 'LCS calculations are incorrect.'

    _print_success_message()


def test_data_split(train_x, train_y, test_x, test_y):

    # check types
    assert isinstance(train_x, np.ndarray), \
        'train_x is not an array, instead got type: {}'.format(type(train_x))
    assert isinstance(train_y, np.ndarray), \
        'train_y is not an array, instead got type: {}'.format(type(train_y))
    assert isinstance(test_x, np.ndarray), \
        'test_x is not an array, instead got type: {}'.format(type(test_x))
    assert isinstance(test_y, np.ndarray), \
        'test_y is not an array, instead got type: {}'.format(type(test_y))

    # train + test should hold all 95 submission files
    assert len(train_x) + len(test_x) == 95, \
        'Unexpected amount of train + test data. Expecting 95 answer text files, got ' + str(len(train_x) + len(test_x))
    assert len(test_x) > 1, \
        'Unexpected amount of test data. There should be multiple test files.'

    # check shape
    assert train_x.shape[1] == 2, \
        'train_x should have as many columns as selected features (expected 2), got: {}'.format(train_x.shape[1])
    assert len(train_y.shape) == 1, \
        'train_y should be a 1D array, got shape: {}'.format(train_y.shape)

    _print_success_message()
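
# ------------------------------------------------------------------
# Reference sketch (not part of the test suite): one plausible way to
# implement the word-level `lcs_word` that `test_lcs` exercises.
# `_example_lcs_word` is a hypothetical name for illustration; it
# assumes the normalization the assertions imply, i.e. the length of
# the longest common subsequence of words divided by the word count
# of the answer text.
def _example_lcs_word(answer_text, source_text):
    a_words = answer_text.split()
    s_words = source_text.split()

    # classic dynamic-programming LCS table of size
    # (len(a_words) + 1) x (len(s_words) + 1), initialized to zero
    lcs = np.zeros((len(a_words) + 1, len(s_words) + 1), dtype=int)
    for i, a_word in enumerate(a_words, 1):
        for j, s_word in enumerate(s_words, 1):
            if a_word == s_word:
                # matching words extend the best subsequence by one
                lcs[i][j] = lcs[i - 1][j - 1] + 1
            else:
                # otherwise carry forward the best result so far
                lcs[i][j] = max(lcs[i - 1][j], lcs[i][j - 1])

    # normalize by the answer length, giving a value <= 1
    return lcs[-1][-1] / len(a_words)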