diff --git a/LICENSE b/LICENSE index 60a996edb..4d72e4c07 100644 --- a/LICENSE +++ b/LICENSE @@ -226,3 +226,33 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +------------------------------- + +This product depends on xxhash (the python-xxhash package, https://github.com/ifduyue/python-xxhash), +which is licensed under the BSD 2-Clause License. + +BSD 2-Clause License + +Copyright (c) 2014-2024, Yue Du +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/hamilton/caching/fingerprinting.py b/hamilton/caching/fingerprinting.py index df488b281..f3c1b90dd 100644 --- a/hamilton/caching/fingerprinting.py +++ b/hamilton/caching/fingerprinting.py @@ -36,11 +36,12 @@ import base64 import datetime import functools -import hashlib import logging import sys from collections.abc import Mapping, Sequence, Set +import xxhash + from hamilton.experimental import h_databackends # NoneType is introduced in Python 3.10 @@ -77,12 +78,16 @@ def _compact_hash(digest: bytes) -> str: def _hash_bytes(data: bytes) -> str: - """Hash raw bytes and compact-encode the digest. + """Hash raw bytes with the non-cryptographic xxh3_128 algorithm and + compact-encode the digest. All hashing in this module routes through this single helper so the underlying hashing algorithm can be changed in exactly one place. + xxh3_128 produces a 16-byte digest (24 base64url chars, the same width + as the md5 it replaces) while running substantially faster on + buffer-bound paths. 
""" - return _compact_hash(hashlib.md5(data).digest()) + return _compact_hash(xxhash.xxh3_128(data).digest()) @functools.singledispatch diff --git a/pyproject.toml b/pyproject.toml index 74ad5d586..a8e00171a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ dependencies = [ "pandas", "typing_extensions > 4.0.0", "typing_inspect", + "xxhash>=2.0.0", ] [project.optional-dependencies] diff --git a/tests/caching/test_fingerprinting.py b/tests/caching/test_fingerprinting.py index 9c00e072e..296affd6b 100644 --- a/tests/caching/test_fingerprinting.py +++ b/tests/caching/test_fingerprinting.py @@ -146,11 +146,11 @@ def __init__(self, obj): @pytest.mark.parametrize( ("obj", "expected_hash"), [ - ("hello-world", "L1Q1Kh6_t1atHO_H8RbBeA=="), - (17.31231, "mJPTpPyXDSZgU-u8NuztIQ=="), - (16474, "6MgAp1NbMW0ZZpe_8iKVsg=="), - (True, "J2eGynSuIpd5bwVQzO9VVg=="), - (b"\x951!\x89u=\xe6\xadG\xdf", "d1DufDgRQmqi9Kt4Z2PeUQ=="), + ("hello-world", "EXXR8_e47ElS18aP2lThJA=="), + (17.31231, "tVUSIslYiBcW52c-7w4gvA=="), + (16474, "FAJ-iXM_Hwg9TCRreY8AyA=="), + (True, "qkJEg3-XQKmGWk5sWqmonw=="), + (b"\x951!\x89u=\xe6\xadG\xdf", "pPTyYkSU_x7NLB1Fp_YTyA=="), ], ) def test_hash_primitive(obj, expected_hash): @@ -161,8 +161,8 @@ def test_hash_primitive(obj, expected_hash): @pytest.mark.parametrize( ("obj", "expected_hash"), [ - ([0, True, "hello-world"], "mlOjj4yeCrSDFSn5zgdEIg=="), - ((17.0, False, "world"), "BcRSGfyKeIOdym9B6TmAyQ=="), + ([0, True, "hello-world"], "I98OkNhfxtScJrYNTs4ZfQ=="), + ((17.0, False, "world"), "catgOMSnsbQj1_KELNQscw=="), ], ) def test_hash_sequence(obj, expected_hash): @@ -173,7 +173,7 @@ def test_hash_sequence(obj, expected_hash): def test_hash_equals_for_different_sequence_types(): list_obj = [0, True, "hello-world"] tuple_obj = (0, True, "hello-world") - expected_hash = "mlOjj4yeCrSDFSn5zgdEIg==" + expected_hash = "I98OkNhfxtScJrYNTs4ZfQ==" list_fingerprint = fingerprinting.hash_sequence(list_obj) tuple_fingerprint = 
fingerprinting.hash_sequence(tuple_obj) @@ -182,7 +182,7 @@ def test_hash_equals_for_different_sequence_types(): def test_hash_ordered_mapping(): obj = {0: True, "key": "value", 17.0: None} - expected_hash = "GyxyI9-pq-EJJvSAIN509g==" + expected_hash = "zX6MzhWGAOvxateHIPxOvA==" fingerprint = fingerprinting.hash_mapping(obj, ignore_order=False) assert fingerprint == expected_hash @@ -197,7 +197,7 @@ def test_hash_mapping_where_order_matters(): def test_hash_unordered_mapping(): obj = {0: True, "key": "value", 17.0: None} - expected_hash = "cDuuL2eA3DaSWlWW3u7o9g==" + expected_hash = "4cnTFA4MEEzmBN4a04k6tA==" fingerprint = fingerprinting.hash_mapping(obj, ignore_order=True) assert fingerprint == expected_hash @@ -212,7 +212,7 @@ def test_hash_mapping_where_order_doesnt_matter(): def test_hash_set(): obj = {0, True, "key", "value", 17.0, None} - expected_hash = "E_f_tjbi6qn7KL3NUCZayg==" + expected_hash = "mswHhNBBYN5mv6i-LcEeVw==" fingerprint = fingerprinting.hash_set(obj) assert fingerprint == expected_hash @@ -221,7 +221,7 @@ def test_hash_numpy(): # dtype is pinned explicitly so the literal digest is reproducible across # platforms (the default integer dtype is platform-dependent). array = np.array([[0, 1], [2, 3]], dtype=np.int64) - expected_hash = "024zwZIcWy6r4dlX4AMTow==" + expected_hash = "Y1uek_eQTHejo2YtRvdWPQ==" fingerprint = fingerprinting.hash_value(array) assert fingerprint == expected_hash