Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 35 additions & 33 deletions hamilton/caching/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,15 @@ def _compact_hash(digest: bytes) -> str:
return base64.urlsafe_b64encode(digest).decode()


def _hash_bytes(data: bytes) -> str:
"""Hash raw bytes and compact-encode the digest.

All hashing in this module routes through this single helper so the
underlying hashing algorithm can be changed in exactly one place.
"""
return _compact_hash(hashlib.md5(data).digest())


@functools.singledispatch
def hash_value(obj, *args, depth=0, **kwargs) -> str:
"""Fingerprinting strategy that computes a hash of the
Expand Down Expand Up @@ -138,22 +147,27 @@ def hash_repr(obj, *args, **kwargs) -> str:
@hash_value.register(float)
@hash_value.register(bool)
def hash_primitive(obj, *args, **kwargs) -> str:
"""Convert the primitive to a string and hash it
"""Convert the primitive to a string and hash it.

The hash is prefixed with the type name so that values sharing the same
string form but differing in type (e.g. ``1`` vs ``"1"`` vs ``1.0``) do
not collide.

Primitive type returns a hash and doesn't have to handle depth.
"""
hash_object = hashlib.md5(str(obj).encode())
return _compact_hash(hash_object.digest())
return _hash_bytes(f"{type(obj).__name__}:{obj}".encode())


@hash_value.register(bytes)
def hash_bytes(obj, *args, **kwargs) -> str:
"""Convert the primitive to a string and hash it
"""Hash a bytes object.

The hash is prefixed with a ``bytes`` type tag so that ``b"1"`` and the
string ``"1"`` (handled by :func:`hash_primitive`) do not collide.

Primitive type returns a hash and doesn't have to handle depth.
"""
hash_object = hashlib.md5(obj)
return _compact_hash(hash_object.digest())
return _hash_bytes(b"bytes:" + obj)


@hash_value.register(Sequence)
Expand All @@ -162,11 +176,8 @@ def hash_sequence(obj, *args, depth: int = 0, **kwargs) -> str:

Orders matters for the hash since orders matters in a sequence.
"""
hash_object = hashlib.sha224()
for elem in obj:
hash_object.update(hash_value(elem, depth=depth + 1).encode())

return _compact_hash(hash_object.digest())
buffer = b"".join(hash_value(elem, depth=depth + 1).encode() for elem in obj)
return _hash_bytes(buffer)


def hash_unordered_mapping(obj, *args, depth: int = 0, **kwargs) -> str:
Expand All @@ -186,12 +197,10 @@ def hash_unordered_mapping(obj, *args, depth: int = 0, **kwargs) -> str:
for key, value in obj.items():
hashed_mapping[hash_value(key, depth=depth + 1)] = hash_value(value, depth=depth + 1)

hash_object = hashlib.sha224()
for key, value in sorted(hashed_mapping.items()):
hash_object.update(key.encode())
hash_object.update(value.encode())

return _compact_hash(hash_object.digest())
buffer = b"".join(
key.encode() + value.encode() for key, value in sorted(hashed_mapping.items())
)
return _hash_bytes(buffer)


@hash_value.register(Mapping)
Expand All @@ -217,12 +226,11 @@ def hash_mapping(obj, *, ignore_order: bool = True, depth: int = 0, **kwargs) ->
# use the same depth because we're simply dispatching to another implementation
return hash_unordered_mapping(obj, depth=depth)

hash_object = hashlib.sha224()
for key, value in obj.items():
hash_object.update(hash_value(key, depth=depth + 1).encode())
hash_object.update(hash_value(value, depth=depth + 1).encode())

return _compact_hash(hash_object.digest())
buffer = b"".join(
hash_value(key, depth=depth + 1).encode() + hash_value(value, depth=depth + 1).encode()
for key, value in obj.items()
)
return _hash_bytes(buffer)


@hash_value.register(Set)
Expand All @@ -233,14 +241,9 @@ def hash_set(obj, *args, depth: int = 0, **kwargs) -> str:
For the same objects in the set, the hashes will be the
same.
"""
hashes = [hash_value(elem, depth=depth + 1) for elem in obj]
sorted_hashes = sorted(hashes)

hash_object = hashlib.sha224()
for hash in sorted_hashes:
hash_object.update(hash.encode())

return _compact_hash(hash_object.digest())
sorted_hashes = sorted(hash_value(elem, depth=depth + 1) for elem in obj)
buffer = b"".join(hash.encode() for hash in sorted_hashes)
return _hash_bytes(buffer)


@hash_value.register(h_databackends.AbstractPandasDataFrame)
Expand Down Expand Up @@ -268,8 +271,7 @@ def hash_polars_dataframe(obj, *args, depth: int = 0, **kwargs) -> str:
schema_str = ",".join(f"{name}:{dtype}" for name, dtype in obj.schema.items())
schema_hash = hash_bytes(schema_str.encode())
row_hash = hash_sequence(obj.hash_rows().to_list(), depth=depth + 1)
combined = hashlib.md5(schema_hash.encode() + row_hash.encode())
return _compact_hash(combined.digest())
return _hash_bytes(schema_hash.encode() + row_hash.encode())


@hash_value.register(h_databackends.AbstractPolarsColumn)
Expand Down
76 changes: 55 additions & 21 deletions tests/caching/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,14 +126,31 @@ def __init__(self, obj):
assert fingerprint1 != fingerprint2


# ---------------------------------------------------------------------------
# Portability / algorithm-stability guard
#
# The tests below pin literal digests. They cover only types that hash
# deterministically across platforms and library versions: their digest is a
# function of the value's Python representation (or, for numpy, an explicit
# shape + dtype + raw bytes) and the hashing algorithm alone. Pinning them
# guards against an accidental change to the hashing algorithm and documents
# that the fingerprint is reproducible on other machines.
#
# Version-sensitive types (pandas / polars DataFrames, whose digest depends on
# library-version-specific dtype reprs and row-hash internals) are NOT pinned
# here; they are covered by the relational must-differ / must-match tests
# further down, which assert behavior rather than an exact digest.
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
("obj", "expected_hash"),
[
("hello-world", "IJUxIYl1PeatR9_iDL6X7A=="),
(17.31231, "vAYX8MD8yEHK6dwnIPVUaw=="),
(16474, "L_epMRRUy3Qq5foVvFT_OQ=="),
(True, "-CfPRi9ihI3zfF4elKTadA=="),
(b"\x951!\x89u=\xe6\xadG\xdf", "qK2VJ0vVTRJemfC0beO8iA=="),
("hello-world", "L1Q1Kh6_t1atHO_H8RbBeA=="),
(17.31231, "mJPTpPyXDSZgU-u8NuztIQ=="),
(16474, "6MgAp1NbMW0ZZpe_8iKVsg=="),
(True, "J2eGynSuIpd5bwVQzO9VVg=="),
(b"\x951!\x89u=\xe6\xadG\xdf", "d1DufDgRQmqi9Kt4Z2PeUQ=="),
],
)
def test_hash_primitive(obj, expected_hash):
Expand All @@ -144,8 +161,8 @@ def test_hash_primitive(obj, expected_hash):
@pytest.mark.parametrize(
("obj", "expected_hash"),
[
([0, True, "hello-world"], "Pg9LP3Y-8yYsoWLXedPVKDwTAa7W8_fjJNTTUA=="),
((17.0, False, "world"), "wyuuKMuL8rp53_CdYAtyMmyetnTJ9LzmexhJrQ=="),
([0, True, "hello-world"], "mlOjj4yeCrSDFSn5zgdEIg=="),
((17.0, False, "world"), "BcRSGfyKeIOdym9B6TmAyQ=="),
],
)
def test_hash_sequence(obj, expected_hash):
Expand All @@ -156,7 +173,7 @@ def test_hash_sequence(obj, expected_hash):
def test_hash_equals_for_different_sequence_types():
list_obj = [0, True, "hello-world"]
tuple_obj = (0, True, "hello-world")
expected_hash = "Pg9LP3Y-8yYsoWLXedPVKDwTAa7W8_fjJNTTUA=="
expected_hash = "mlOjj4yeCrSDFSn5zgdEIg=="

list_fingerprint = fingerprinting.hash_sequence(list_obj)
tuple_fingerprint = fingerprinting.hash_sequence(tuple_obj)
Expand All @@ -165,7 +182,7 @@ def test_hash_equals_for_different_sequence_types():

def test_hash_ordered_mapping():
obj = {0: True, "key": "value", 17.0: None}
expected_hash = "1zH9TfTu0-nlWXXXYo0vigFFSQajWXov2w4AZQ=="
expected_hash = "GyxyI9-pq-EJJvSAIN509g=="
fingerprint = fingerprinting.hash_mapping(obj, ignore_order=False)
assert fingerprint == expected_hash

Expand All @@ -180,7 +197,7 @@ def test_hash_mapping_where_order_matters():

def test_hash_unordered_mapping():
obj = {0: True, "key": "value", 17.0: None}
expected_hash = "uw0dfSAEgE9nOK3bHgmJ4TR3-VFRqOAoogdRmw=="
expected_hash = "cDuuL2eA3DaSWlWW3u7o9g=="
fingerprint = fingerprinting.hash_mapping(obj, ignore_order=True)
assert fingerprint == expected_hash

Expand All @@ -195,22 +212,16 @@ def test_hash_mapping_where_order_doesnt_matter():

def test_hash_set():
obj = {0, True, "key", "value", 17.0, None}
expected_hash = "dKyAE-ob4_GD-Mb5Lu2R-VJAxGctY4L8JDwc2g=="
expected_hash = "E_f_tjbi6qn7KL3NUCZayg=="
fingerprint = fingerprinting.hash_set(obj)
assert fingerprint == expected_hash


def test_hash_pandas():
"""pandas has a specialized hash function"""
obj = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
expected_hash = "LSHACWyG83JBIggxO9LGrerW3WZEy4nUOmIQoA=="
fingerprint = fingerprinting.hash_pandas_obj(obj)
assert fingerprint == expected_hash


def test_hash_numpy():
array = np.array([[0, 1], [2, 3]])
expected_hash = "tVIm5kJ7G0GZaaifSEtrOQ=="
# dtype is pinned explicitly so the literal digest is reproducible across
# platforms (the default integer dtype is platform-dependent).
array = np.array([[0, 1], [2, 3]], dtype=np.int64)
expected_hash = "024zwZIcWy6r4dlX4AMTow=="
fingerprint = fingerprinting.hash_value(array)
assert fingerprint == expected_hash

Expand All @@ -230,6 +241,13 @@ def test_hash_numpy_different_dtypes_differ():
assert fingerprinting.hash_value(a) != fingerprinting.hash_value(b)


def test_hash_pandas_same_data_matches():
"""Identical pandas DataFrames must produce the same hash (determinism)."""
a = pd.DataFrame({"x": [1, 2], "y": [3, 4]})
b = pd.DataFrame({"x": [1, 2], "y": [3, 4]})
assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b)


def test_hash_polars_different_columns_differ():
"""DataFrames with identical values but different column names must hash differently."""
polars = pytest.importorskip("polars")
Expand All @@ -244,3 +262,19 @@ def test_hash_polars_same_schema_same_data_matches():
a = polars.DataFrame({"x": [1, 2], "y": [3, 4]})
b = polars.DataFrame({"x": [1, 2], "y": [3, 4]})
assert fingerprinting.hash_value(a) == fingerprinting.hash_value(b)


def test_hash_cross_type_primitives_differ():
"""Values with the same string form but different types must hash differently.

Before type tagging, ``str(1) == str("1") == "1"`` collapsed int/str (and
likewise float/str and bytes/str) into identical fingerprints.
"""
fingerprints = {
fingerprinting.hash_value(1),
fingerprinting.hash_value("1"),
fingerprinting.hash_value(b"1"),
fingerprinting.hash_value(1.0),
fingerprinting.hash_value("1.0"),
}
assert len(fingerprints) == 5
Loading