Small fixes

deepset-ai · vblagoje · Apr 9, 2024 · Mar 21, 2024 · Mar 22, 2024 · Mar 22, 2024
commit ca9c8b9a5ea592fe4d728cf8a7585f3921ff3d6f
@@ -455,7 +455,7 @@ def _check_if_in_table(
  break
  return in_table
 
- def _hash_dataframe(self, df: pd.DataFrame, desired_samples=10, hash_length=4) -> str:
+ def _hash_dataframe(self, df: pd.DataFrame, desired_samples=5, hash_length=4) -> str:
  """
  Returns a hash of the DataFrame content, computed from a sample of its rows.
  :param df: The DataFrame to hash.
@@ -465,7 +465,7 @@ def _hash_dataframe(self, df: pd.DataFrame, desired_samples=10, hash_length=4) -
  :returns: A hash of the DataFrame content.
  """
  # hash an adaptive sample of rows rather than every row, because DataFrames can be very large
- hasher = hashlib.sha256()
+ hasher = hashlib.md5()
  total_rows = len(df)
  # sample rate based on DataFrame size and desired number of samples
  sample_rate = max(1, total_rows // desired_samples)

@@ -287,18 +287,19 @@ def test_run_with_docx_file(self, test_files_path):
  def test_hashing_dataframe(self, mock_resolve_value):
  mock_resolve_value.return_value = "test_api_key"
  component = AzureOCRDocumentConverter(endpoint="")
+ hash_length = 32
 
  df = pd.DataFrame({"A": [1, 2, 3]})
  hash_string_1 = component._hash_dataframe(df)
- assert len(hash_string_1) == 64
+ assert len(hash_string_1) == hash_length
 
  df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
  hash_string_2 = component._hash_dataframe(df)
- assert len(hash_string_2) == 64
+ assert len(hash_string_2) == hash_length
 
  df = pd.DataFrame({"B": [4, 5, 6], "A": [1, 2, 3], "D": [7, 8, 9]})
  hash_string_3 = component._hash_dataframe(df)
- assert len(hash_string_3) == 64
+ assert len(hash_string_3) == hash_length
 
  # sanity check only: the chained comparison checks adjacent pairs (1 vs 2, 2 vs 3), not hash_string_1 against hash_string_3
  assert hash_string_1 != hash_string_2 != hash_string_3