Better hashing, add unit tests

deepset-ai · vblagoje · Apr 9, 2024 · Mar 21, 2024 · Mar 22, 2024 · Mar 22, 2024
commit caf58c82dd144adb808c934925293a2ea9e6823f
@@ -287,7 +287,7 @@ def _convert_tables(self, result: AnalyzeResult, meta: Optional[Dict[str, Any]])
  table_df = pd.DataFrame(columns=table_list[0], data=table_list[1:])
 
  # Use custom ID for tables, as columns might not be unique and thus failing in the default ID generation
- pd_hashes = pd.util.hash_pandas_object(table_df, index=True).values
+ pd_hashes = self._hash_dataframe(table_df)
  data = f"{pd_hashes}{table_meta}"
  doc_id = hashlib.sha256(data.encode()).hexdigest()
  converted_tables.append(Document(id=doc_id, dataframe=table_df, meta=table_meta))
@@ -454,3 +454,27 @@ def _check_if_in_table(
  in_table = True
  break
  return in_table
+
+ def _hash_dataframe(self, df: pd.DataFrame, desired_samples=10, hash_length=4) -> str:
+ """
+ Returns a hash of the DataFrame content. The hash is based on the content of the DataFrame.
+ :param df: The DataFrame to hash.
+ :param desired_samples: The desired number of samples to hash.
+ :param hash_length: The length of the hash for each sample.
+
+ :returns: A hash of the DataFrame content.
+ """
+ # take adaptive sample of rows to hash because we can have very large dataframes
+ hasher = hashlib.sha256()
+ total_rows = len(df)
+ # sample rate based on DataFrame size and desired number of samples
+ sample_rate = max(1, total_rows // desired_samples)
+
+ hashes = pd.util.hash_pandas_object(df, index=True)
+ sampled_hashes = hashes[::sample_rate]
+
+ for hash_value in sampled_hashes:
+ partial_hash = str(hash_value)[:hash_length].encode("utf-8")
+ hasher.update(partial_hash)
+
+ return hasher.hexdigest()
@@ -5,6 +5,7 @@
 from typing import Literal
 from unittest.mock import patch
 
+import pandas as pd
 import pytest
 from azure.ai.formrecognizer import AnalyzeResult
 
@@ -281,3 +282,23 @@ def test_run_with_docx_file(self, test_files_path):
  assert "Sample Docx File" in documents[0].content
  assert "Now we are in Page 2" in documents[0].content
  assert "Page 3 was empty this is page 4" in documents[0].content
+
@patch("haystack.utils.auth.EnvVarSecret.resolve_value")
def test_hashing_dataframe(self, mock_resolve_value):
    """
    Sanity check for `_hash_dataframe`: digests are 64 hex characters long, are stable for equal
    content, and differ between the three sample tables.
    """
    mock_resolve_value.return_value = "test_api_key"
    component = AzureOCRDocumentConverter(endpoint="")

    dataframes = [
        pd.DataFrame({"A": [1, 2, 3]}),
        pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}),
        pd.DataFrame({"B": [4, 5, 6], "A": [1, 2, 3], "D": [7, 8, 9]}),
    ]
    hash_strings = [component._hash_dataframe(df) for df in dataframes]

    for hash_string in hash_strings:
        assert len(hash_string) == 64

    # The same content must always produce the same hash.
    assert component._hash_dataframe(dataframes[0].copy()) == hash_strings[0]

    # Doesn't mean much, more for sanity check. A chained `a != b != c` only compares neighbours
    # and would never compare the first hash with the third, so check all pairs via a set.
    assert len(set(hash_strings)) == len(hash_strings)