Closes Bears-R-Us#3009: indexof1d to handle null values (Bears-R-Us…

…#3169) * Closes Bears-R-Us#3009: `indexof1d` to handle null values This PR (closes Bears-R-Us#3009) refactors `indexof1d` to use `find` since they have similar functionality and `find` is fairly optimized and correctly handles null values (once we pass `dropna=False` to the `GroupBy`). The two major differences are how missing values are handled and how many indices get returned when there are duplicates in the search space. `find` would only return the index of the first occurrence and use `-1` to denote missing values, but `indexof1d` returns the indices of all occurrences and removes missing values. To enable this, I added the flags `all_occurrences` and `remove_missing` to `find`. The approach I took involved adding a segmented `mink/maxk`, which I went back and forth on whether it should be user facing. I implemented this by permuting the values and calling the existing `mink/maxk`. I'm not sure if this is the most efficient approach, but my goal was to focus on correctness first and we can optimize later if needed. Wrote tests for `indexof1d` both for the reproducer and in general. * update and add examples in response to PR feedback --------- Co-authored-by: Tess Hayes <stress-tess@users.noreply.github.com>
ajpotts · May 22, 2024 · c05c599 · c05c599
1 parent f0d559f
commit c05c599
Show file tree

Hide file tree

Showing 8 changed files with 383 additions and 168 deletions.
diff --git a/PROTO_tests/tests/setops_test.py b/PROTO_tests/tests/setops_test.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 import pytest
 
 import arkouda as ak
@@ -50,8 +51,8 @@ def make_np_arrays_cross_type(dtype1, dtype2):
             a = np.array([-1, -3, 0, 1, 2, 3]).astype(dtype1)
             c = np.array([-1, 0, 0, 7, 8, 3]).astype(dtype1)
         elif dtype1 == ak.bigint:
-            a = np.array([-1, -3, 0, 1, 2, 3]).astype(ak.uint64) + 2 ** 200
-            c = np.array([-1, 0, 0, 7, 8, 3]).astype(ak.uint64) + 2 ** 200
+            a = np.array([-1, -3, 0, 1, 2, 3]).astype(ak.uint64) + 2**200
+            c = np.array([-1, 0, 0, 7, 8, 3]).astype(ak.uint64) + 2**200
         elif dtype1 == ak.bool:
             a = np.array([True, False, False, True, True])
             c = np.array([True, True, False, False, True])
@@ -62,8 +63,8 @@ def make_np_arrays_cross_type(dtype1, dtype2):
             b = np.array([-1, -11, 0, 4, 5, 3]).astype(dtype2)
             d = np.array([-1, -4, 0, 7, 8, 3]).astype(dtype2)
         elif dtype2 == ak.bigint:
-            b = np.array([-1, -11, 0, 4, 5, 3]).astype(ak.uint64) + 2 ** 200
-            d = np.array([-1, -4, 0, 7, 8, 3]).astype(ak.uint64) + 2 ** 200
+            b = np.array([-1, -11, 0, 4, 5, 3]).astype(ak.uint64) + 2**200
+            d = np.array([-1, -4, 0, 7, 8, 3]).astype(ak.uint64) + 2**200
         elif dtype2 == ak.bool:
             b = np.array([True, True, False, False, True])
             d = np.array([True, True, False, False, True])
@@ -674,3 +675,47 @@ def test_multiarray_validation(self):
         x = [ak.arange(3, dtype=ak.uint64), ak.arange(3)]
         with pytest.raises(TypeError):
             ak.pdarraysetops.multiarray_setop_validation(x, y)
+
+    def test_index_of(self):
+        # index of nan (reproducer from #3009)
+        s = ak.Series(ak.array([1, 2, 3]), index=ak.array([1, 2, np.nan]))
+        assert ak.indexof1d(ak.array([np.nan]), s.index.values).to_list() == [2]
+
+        select_from_list = [
+            ak.randint(-(2**32), 2**32, 10),
+            ak.linspace(-(2**32), 2**32, 10),
+            ak.random_strings_uniform(1, 16, 10),
+        ]
+        for select_from in select_from_list:
+            arr1 = select_from[ak.randint(0, select_from.size, 20)]
+
+            # test unique search space, this should be identical to find
+            # be sure to test when all items are present and when there are items missing
+            for arr2 in select_from, select_from[:5], select_from[5:]:
+                found_in_second = ak.in1d(arr1, arr2)
+                idx_of_first_in_second = ak.indexof1d(arr1, arr2)
+
+                # ensure we match find
+                assert (idx_of_first_in_second == ak.find(arr1, arr2, remove_missing=True)).all()
+
+                # if an element of arr1 is found in arr2, return the index of that item in arr2
+                assert (arr2[idx_of_first_in_second] == arr1[found_in_second]).all()
+
+            # test duplicate items in search space, the easiest way I can think
+            # of to do this is to compare against pandas series getitem
+            arr2 = select_from[ak.randint(0, select_from.size, 20)]
+            pd_s = pd.Series(index=arr1.to_ndarray(), data=arr2.to_ndarray())
+            ak_s = ak.Series(index=arr1, data=arr2)
+
+            arr1_keys = ak.GroupBy(arr1).unique_keys
+            arr2_keys = ak.GroupBy(arr2).unique_keys
+            in_both = ak.intersect1d(arr1_keys, arr2_keys)
+
+            for i in in_both.to_list():
+                pd_i = pd_s[i]
+                ak_i = ak_s[i]
+                if isinstance(pd_i, pd.Series):
+                    assert isinstance(ak_i, ak.Series)
+                    assert pd_i.values.tolist() == ak_i.values.to_list()
+                else:
+                    assert pd_i == ak_i
diff --git a/arkouda/alignment.py b/arkouda/alignment.py
@@ -5,13 +5,14 @@
 import numpy as np  # type: ignore
 
 from arkouda.categorical import Categorical
+from arkouda.client import generic_msg
 from arkouda.dtypes import bigint
 from arkouda.dtypes import float64 as akfloat64
 from arkouda.dtypes import int64 as akint64
 from arkouda.dtypes import uint64 as akuint64
 from arkouda.groupbyclass import GroupBy, broadcast, unique
-from arkouda.numeric import where
-from arkouda.pdarrayclass import pdarray
+from arkouda.numeric import cumsum, where
+from arkouda.pdarrayclass import create_pdarray, pdarray
 from arkouda.pdarraycreation import arange, full, ones, zeros
 from arkouda.pdarraysetops import concatenate, in1d
 from arkouda.sorting import argsort, coargsort
@@ -109,23 +110,102 @@ class NonUniqueError(ValueError):
     pass
 
 
-def find(query, space):
+def find(query, space, all_occurrences=False, remove_missing=False):
     """
-    Return indices of query items in a search list of items (-1 if not found).
+    Return indices of query items in a search list of items.
 
     Parameters
     ----------
     query : (sequence of) array-like
         The items to search for. If multiple arrays, each "row" is an item.
     space : (sequence of) array-like
         The set of items in which to search. Must have same shape/dtype as query.
+    all_occurrences: bool
+        When duplicate terms are present in search space, if all_occurrences is True,
+        return all occurrences found as a SegArray, otherwise return only the first
+        occurrences as a pdarray. Defaults to only finding the first occurrence.
+        Finding all occurrences is not yet supported on sequences of arrays
+    remove_missing: bool
+        If False, return -1 for any items in query not found in space. If True,
+        remove these and only return indices of items that are found.
 
     Returns
     -------
-    indices : pdarray, int64
-        For each item in query, its index in space or -1 if not found.
-    """
+    indices : pdarray or SegArray
+        For each item in query, its index in space. If remove_missing is True,
+        exclude missing values; otherwise return -1. If all_occurrences is False,
+        the return will be a pdarray of the first index where each value in the
+        query appears in the space. If all_occurrences is True, the return will be
+        a SegArray containing every index where each value in the query appears in
+        the space.
 
+    Examples
+    --------
+    >>> select_from = ak.arange(10)
+    >>> arr1 = select_from[ak.randint(0, select_from.size, 20, seed=10)]
+    >>> arr2 = select_from[ak.randint(0, select_from.size, 20, seed=11)]
+    # remove some values to ensure we have some values
+    # which don't appear in the search space
+    >>> arr2 = arr2[arr2 != 9]
+    >>> arr2 = arr2[arr2 != 3]
+
+    # find with defaults (all_occurrences and remove_missing both False)
+    >>> ak.find(arr1, arr2)
+    array([-1 -1 -1 0 1 -1 -1 -1 2 -1 5 -1 8 -1 5 -1 -1 11 5 0])
+
+     # set remove_missing to True, only difference from default
+     # is missing values are excluded
+     >>> ak.find(arr1, arr2, remove_missing=True)
+    array([0 1 2 5 8 5 11 5 0])
+
+    # set all_occurrences to True, the first index of each list
+    # is the first occurrence and should match the default
+    >>> ak.find(arr1, arr2, all_occurrences=True).to_list()
+    [[-1],
+     [-1],
+     [-1],
+     [0, 4],
+     [1, 3, 10],
+     [-1],
+     [-1],
+     [-1],
+     [2, 6, 12, 13],
+     [-1],
+     [5, 7],
+     [-1],
+     [8, 9, 14],
+     [-1],
+     [5, 7],
+     [-1],
+     [-1],
+     [11, 15],
+     [5, 7],
+     [0, 4]]
+
+    # set both remove_missing and all_occurrences to True, missing values
+    # will be empty segments
+    >>> ak.find(arr1, arr2, remove_missing=True, all_occurrences=True).to_list()
+    [[],
+     [],
+     [],
+     [0, 4],
+     [1, 3, 10],
+     [],
+     [],
+     [],
+     [2, 6, 12, 13],
+     [],
+     [5, 7],
+     [],
+     [8, 9, 14],
+     [],
+     [5, 7],
+     [],
+     [],
+     [11, 15],
+     [5, 7],
+     [0, 4]]
+    """
     # Concatenate the space and query in fast (block interleaved) mode
     if isinstance(query, (pdarray, Strings, Categorical)):
         if type(query) is not type(space):
@@ -151,15 +231,48 @@ def find(query, space):
     # All space indices are less than all query indices
     i = concatenate((arange(spacesize), arange(spacesize, spacesize + querysize)), ordered=False)
     # Group on terms
-    g = GroupBy(c)
+    g = GroupBy(c, dropna=False)
     # For each term, count how many times it appears in the search space
     space_multiplicity = g.sum(i < spacesize)[1]
-    # Warn of any duplicate terms in space
-    if (space_multiplicity > 1).any():
-        warn(
-            "Duplicate terms present in search space. Only first instance of each query term\
-            will be reported."
-        )
+    has_duplicates = (space_multiplicity > 1).any()
+    # handle duplicate terms in space
+    if has_duplicates:
+        if all_occurrences:
+            if isinstance(query, Sequence):
+                raise TypeError("finding all_occurrences is not yet supported on sequences of arrays")
+
+            from arkouda.segarray import SegArray
+
+            # use segmented mink to select space_multiplicity number of elements
+            # and create a segarray which contains all the indices
+            # in our query space, instead of just the min for each segment
+
+            # only calculate where to place the negatives if remove_missing is false
+            negative_at = "" if remove_missing else space_multiplicity == 0
+            repMsg = generic_msg(
+                cmd="segmentedExtremaK",
+                args={
+                    "vals": i[g.permutation],
+                    "segs": g.segments,
+                    "segLens": g.size()[1],
+                    "kArray": space_multiplicity,
+                    "isMin": True,
+                    "removeMissing": remove_missing,
+                    "negativeAt": negative_at,
+                },
+            )
+            min_k_vals = create_pdarray(repMsg)
+            seg_idx = g.broadcast(arange(g.segments.size))[i >= spacesize]
+            if not remove_missing:
+                space_multiplicity += negative_at
+            min_k_segs = cumsum(space_multiplicity) - space_multiplicity
+            sa = SegArray(min_k_segs, min_k_vals)
+            return sa[seg_idx]
+        else:
+            warn(
+                "Duplicate terms present in search space. Only first instance of each query term"
+                " will be reported. To return all occurrences, set all_occurrences=True."
+            )
     # For query terms in the space, the min combined index will be the first index of that
     # term in the space
     uspaceidx = g.min(i)[1]
@@ -169,7 +282,8 @@ def find(query, space):
     # Broadcast unique term indices to combined list of space and query terms
     spaceidx = g.broadcast(uspaceidx)
     # Return only the indices of the query terms (remove the search space)
-    return spaceidx[i >= spacesize]
+    pda = spaceidx[i >= spacesize]
+    return pda[pda != -1] if remove_missing else pda
 
 
 def lookup(keys, values, arguments, fillvalue=-1):