Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Azure converter updates #7409

Merged
merged 13 commits into from
Apr 9, 2024
Previous commit
Next commit
Small fixes
  • Loading branch information
vblagoje committed Apr 5, 2024
commit ca9c8b9a5ea592fe4d728cf8a7585f3921ff3d6f
4 changes: 2 additions & 2 deletions haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ def _check_if_in_table(
break
return in_table

def _hash_dataframe(self, df: pd.DataFrame, desired_samples=10, hash_length=4) -> str:
def _hash_dataframe(self, df: pd.DataFrame, desired_samples=5, hash_length=4) -> str:
"""
Returns a hash of the DataFrame content. The hash is based on the content of the DataFrame.
:param df: The DataFrame to hash.
Expand All @@ -465,7 +465,7 @@ def _hash_dataframe(self, df: pd.DataFrame, desired_samples=10, hash_length=4) -
:returns: A hash of the DataFrame content.
"""
# take adaptive sample of rows to hash because we can have very large dataframes
hasher = hashlib.sha256()
hasher = hashlib.md5()
total_rows = len(df)
# sample rate based on DataFrame size and desired number of samples
sample_rate = max(1, total_rows // desired_samples)
Expand Down
7 changes: 4 additions & 3 deletions test/components/converters/test_azure_ocr_doc_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,18 +287,19 @@ def test_run_with_docx_file(self, test_files_path):
def test_hashing_dataframe(self, mock_resolve_value):
mock_resolve_value.return_value = "test_api_key"
component = AzureOCRDocumentConverter(endpoint="")
hash_length = 32

df = pd.DataFrame({"A": [1, 2, 3]})
hash_string_1 = component._hash_dataframe(df)
assert len(hash_string_1) == 64
assert len(hash_string_1) == hash_length

df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
hash_string_2 = component._hash_dataframe(df)
assert len(hash_string_2) == 64
assert len(hash_string_2) == hash_length

df = pd.DataFrame({"B": [4, 5, 6], "A": [1, 2, 3], "D": [7, 8, 9]})
hash_string_3 = component._hash_dataframe(df)
assert len(hash_string_3) == 64
assert len(hash_string_3) == hash_length

# doesn't mean much, more for sanity check
assert hash_string_1 != hash_string_2 != hash_string_3
Loading