From 5786439638476d0b01171d5e140f16a34e9372c4 Mon Sep 17 00:00:00 2001 From: Brian Thorne Date: Mon, 1 May 2023 12:31:43 +1200 Subject: [PATCH] Function to compute Dice coefficients of bitarray pairs (#567) * Function in anonlink.similarities to compute the Dice coefficient on pairs of bitarrays * Add changelog entry cleanup PR * Test with all zeros * Remove ubuntu-18.04 unittests --- .github/workflows/unittests.yml | 2 +- CHANGELOG.rst | 5 ++++ anonlink/similarities/__init__.py | 3 +- anonlink/similarities/_dice_python.py | 41 +++++++++++++++++++++++++ setup.py | 2 +- tests/test_e2e.py | 43 +++++++++++++++++++++++++++ tests/test_similarity_dice.py | 16 ++++++++++ 7 files changed, 109 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index e4cac323..f53fe1c9 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -15,7 +15,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [macos-latest, windows-latest, ubuntu-18.04, ubuntu-20.04] + os: [macos-latest, windows-latest, ubuntu-20.04] python: ["3.8", "3.9", "3.10", "3.11"] steps: diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 40bbda4e..ff04268b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,8 @@ +0.15.3 +====== + +- Added function to compute Dice coefficients of bitarray pairs. #567 + 0.15.2 ====== diff --git a/anonlink/similarities/__init__.py b/anonlink/similarities/__init__.py index 08c6cff7..f7b5001d 100644 --- a/anonlink/similarities/__init__.py +++ b/anonlink/similarities/__init__.py @@ -10,7 +10,8 @@ functions are possible as well. 
""" -from anonlink.similarities._dice_python import dice_coefficient_python +from anonlink.similarities._dice_python import (dice_coefficient_python, + dice_coefficient_pairs_python) from anonlink.similarities._smc import (hamming_similarity, simple_matching_coefficient) diff --git a/anonlink/similarities/_dice_python.py b/anonlink/similarities/_dice_python.py index 2f0b5fe5..ffe30132 100644 --- a/anonlink/similarities/_dice_python.py +++ b/anonlink/similarities/_dice_python.py @@ -2,6 +2,7 @@ from itertools import repeat from typing import Iterable, Optional, Sequence, Tuple +import numpy as np from bitarray import bitarray from anonlink.similarities._utils import (sort_similarities_inplace, @@ -77,3 +78,43 @@ def dice_coefficient_python( sort_similarities_inplace(result_sims, result_indices0, result_indices1) return result_sims, (result_indices0, result_indices1) + + + +def dice_coefficient_pairs_python( + datasets: Sequence[Tuple[bitarray, bitarray]] +): + """Find Dice coefficients of bitarray pairs. + + This version is written in Python, so it does not rely on + architecture-specific instructions. It may be slower than an + accelerated version. + + A similarity is computed for every pair of bitarrays in the input + datasets; the similarity for each pair is returned as a floating-point + value. + + :param datasets: A sequence of candidate pairs. Each pair is a tuple + of bitarrays. + + :return: Similarity scores for every input pair as an array of + floating-point values. + """ + candidate_pair_count = len(datasets) + + # Preallocate the result array. + result_sims = np.zeros(candidate_pair_count, dtype=np.float64) + + for i, (f0, f1) in enumerate(datasets): + f0_count = f0.count() + f1_count = f1.count() + combined_count = f0_count + f1_count + + if combined_count: + score: float = (2.0 * (f0 & f1).count() / combined_count) + else: # Avoid division by zero. 
+ score = 0.0 + + result_sims[i] = score + + return result_sims diff --git a/setup.py b/setup.py index cde1a3b0..433ccea4 100644 --- a/setup.py +++ b/setup.py @@ -68,7 +68,7 @@ setup( name="anonlink", - version='0.15.2', + version='0.15.3', description='Anonymous linkage using cryptographic hashes and bloom filters', long_description=readme, long_description_content_type='text/x-rst', diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 5f8a4d9a..dbd79926 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -273,5 +273,48 @@ def test_greedy_chunked_matching_works(self): assert mapping == merged_mapping + +class TestSimilarityStream(EntityHelperMixin, unittest.TestCase): + + proportion = 0.8 + sample = 150 + + def setUp(self): + self.nl = randomnames.NameList(300) + self.s1, self.s2 = self.nl.generate_subsets(self.sample, self.proportion) + self.key_lists = generate_key_lists('secret', len(self.nl.schema_types)) + self.f1 = tuple(map(itemgetter(0), + bloomfilter.stream_bloom_filters( + self.s1, self.key_lists, self.nl.SCHEMA))) + self.f2 = tuple(map(itemgetter(0), + bloomfilter.stream_bloom_filters( + self.s2, self.key_lists, self.nl.SCHEMA))) + + def test_similarity_stream(self): + candidate_pairs = [] + for f1 in self.f1: + for f2 in self.f2: + candidate_pairs.append((f1, f2)) + + similarity_stream = anonlink.similarities.dice_coefficient_pairs_python( + candidate_pairs + ) + + assert len(similarity_stream) == len(self.f1) * len(self.f2) + + candidate_pairs = anonlink.candidate_generation.find_candidate_pairs( + (self.f1, self.f2), + anonlink.similarities.dice_coefficient_accelerated, + threshold=0.0, + ) + + scores, _, (l_indicies, r_indicies) = candidate_pairs + + for score, l_index, r_index in zip(scores, l_indicies, r_indicies): + # Calculate the index in the streamed candidate pairs list + index = l_index * len(self.f2) + r_index + assert similarity_stream[index] == score + + if __name__ == '__main__': unittest.main() diff --git 
a/tests/test_similarity_dice.py b/tests/test_similarity_dice.py index b40316d7..d751bb54 100644 --- a/tests/test_similarity_dice.py +++ b/tests/test_similarity_dice.py @@ -4,6 +4,7 @@ from clkhash.key_derivation import generate_key_lists from hypothesis import given, strategies +import anonlink.similarities from anonlink import similarities FLOAT_ARRAY_TYPES = 'fd' @@ -258,6 +259,21 @@ def test_all_low(self, sim_fun, k, threshold): assert (rec_is0.typecode in UINT_ARRAY_TYPES and rec_is1.typecode in UINT_ARRAY_TYPES) + def test_candidate_stream_right_low(self): + datasets = list(zip(*[[bitarray('01001011') * 8], + [bitarray('00000000') * 8]])) + sims = anonlink.similarities.dice_coefficient_pairs_python(datasets) + assert len(sims) == 1 + assert all(s == 0.0 for s in sims) + + def test_candidate_stream_all_low(self): + datasets = list(zip(*[[bitarray('00000000') * 8], + [bitarray('00000000') * 8]])) + sims = anonlink.similarities.dice_coefficient_pairs_python(datasets) + + assert len(sims) == 1 + assert all(s == 0.0 for s in sims) + @pytest.mark.parametrize('sim_fun', SIM_FUNS) def test_order(self, sim_fun): similarity = sim_fun(