diff --git a/PROTO_tests/tests/setops_test.py b/PROTO_tests/tests/setops_test.py
index 3b757f8066..0fc0002c3d 100644
--- a/PROTO_tests/tests/setops_test.py
+++ b/PROTO_tests/tests/setops_test.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 import pytest
 
 import arkouda as ak
@@ -50,8 +51,8 @@ def make_np_arrays_cross_type(dtype1, dtype2):
         a = np.array([-1, -3, 0, 1, 2, 3]).astype(dtype1)
         c = np.array([-1, 0, 0, 7, 8, 3]).astype(dtype1)
     elif dtype1 == ak.bigint:
-        a = np.array([-1, -3, 0, 1, 2, 3]).astype(ak.uint64) + 2 ** 200
-        c = np.array([-1, 0, 0, 7, 8, 3]).astype(ak.uint64) + 2 ** 200
+        a = np.array([-1, -3, 0, 1, 2, 3]).astype(ak.uint64) + 2**200
+        c = np.array([-1, 0, 0, 7, 8, 3]).astype(ak.uint64) + 2**200
     elif dtype1 == ak.bool:
         a = np.array([True, False, False, True, True])
         c = np.array([True, True, False, False, True])
@@ -62,8 +63,8 @@ def make_np_arrays_cross_type(dtype1, dtype2):
         b = np.array([-1, -11, 0, 4, 5, 3]).astype(dtype2)
         d = np.array([-1, -4, 0, 7, 8, 3]).astype(dtype2)
     elif dtype2 == ak.bigint:
-        b = np.array([-1, -11, 0, 4, 5, 3]).astype(ak.uint64) + 2 ** 200
-        d = np.array([-1, -4, 0, 7, 8, 3]).astype(ak.uint64) + 2 ** 200
+        b = np.array([-1, -11, 0, 4, 5, 3]).astype(ak.uint64) + 2**200
+        d = np.array([-1, -4, 0, 7, 8, 3]).astype(ak.uint64) + 2**200
     elif dtype2 == ak.bool:
         b = np.array([True, True, False, False, True])
         d = np.array([True, True, False, False, True])
@@ -674,3 +675,47 @@ def test_multiarray_validation(self):
         x = [ak.arange(3, dtype=ak.uint64), ak.arange(3)]
         with pytest.raises(TypeError):
             ak.pdarraysetops.multiarray_setop_validation(x, y)
+
+    def test_index_of(self):
+        # index of nan (reproducer from #3009)
+        s = ak.Series(ak.array([1, 2, 3]), index=ak.array([1, 2, np.nan]))
+        assert ak.indexof1d(ak.array([np.nan]), s.index.values).to_list() == [2]
+
+        select_from_list = [
+            ak.randint(-(2**32), 2**32, 10),
+            ak.linspace(-(2**32), 2**32, 10),
+            ak.random_strings_uniform(1, 16, 10),
+        ]
+        for select_from in select_from_list:
+            arr1 = select_from[ak.randint(0, select_from.size, 20)]
+
+            # test a unique search space; this should be identical to find.
+            # be sure to test when all items are present and when some items are missing
+            for arr2 in select_from, select_from[:5], select_from[5:]:
+                found_in_second = ak.in1d(arr1, arr2)
+                idx_of_first_in_second = ak.indexof1d(arr1, arr2)
+
+                # ensure we match find
+                assert (idx_of_first_in_second == ak.find(arr1, arr2, remove_missing=True)).all()
+
+                # if an element of arr1 is found in arr2, return the index of that item in arr2
+                assert (arr2[idx_of_first_in_second] == arr1[found_in_second]).all()
+
+            # test duplicate items in the search space; the easiest way to do this
+            # is to compare against pandas Series getitem
+            arr2 = select_from[ak.randint(0, select_from.size, 20)]
+            pd_s = pd.Series(index=arr1.to_ndarray(), data=arr2.to_ndarray())
+            ak_s = ak.Series(index=arr1, data=arr2)
+
+            arr1_keys = ak.GroupBy(arr1).unique_keys
+            arr2_keys = ak.GroupBy(arr2).unique_keys
+            in_both = ak.intersect1d(arr1_keys, arr2_keys)
+
+            for i in in_both.to_list():
+                pd_i = pd_s[i]
+                ak_i = ak_s[i]
+                if isinstance(pd_i, pd.Series):
+                    assert isinstance(ak_i, ak.Series)
+                    assert pd_i.values.tolist() == ak_i.values.to_list()
+                else:
+                    assert pd_i == ak_i
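Context for the pandas-parity check in test_index_of above: pandas getitem on a Series
returns a Series of every match when the index label is duplicated, and a bare scalar
when the label is unique. The assertions hold Arkouda to the same contract. A minimal
standalone illustration (plain pandas behavior, not part of this patch):

    import pandas as pd

    s = pd.Series(index=[1, 2, 2, 3], data=[10, 20, 30, 40])
    assert isinstance(s[2], pd.Series)   # duplicated label -> Series of all matches
    assert s[2].tolist() == [20, 30]
    assert s[1] == 10                    # unique label -> scalar
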
diff --git a/arkouda/alignment.py b/arkouda/alignment.py
index 9af0e70cdb..b67532ed09 100644
--- a/arkouda/alignment.py
+++ b/arkouda/alignment.py
@@ -5,13 +5,14 @@
 import numpy as np  # type: ignore
 
 from arkouda.categorical import Categorical
+from arkouda.client import generic_msg
 from arkouda.dtypes import bigint
 from arkouda.dtypes import float64 as akfloat64
 from arkouda.dtypes import int64 as akint64
 from arkouda.dtypes import uint64 as akuint64
 from arkouda.groupbyclass import GroupBy, broadcast, unique
-from arkouda.numeric import where
-from arkouda.pdarrayclass import pdarray
+from arkouda.numeric import cumsum, where
+from arkouda.pdarrayclass import create_pdarray, pdarray
 from arkouda.pdarraycreation import arange, full, ones, zeros
 from arkouda.pdarraysetops import concatenate, in1d
 from arkouda.sorting import argsort, coargsort
@@ -109,9 +110,9 @@ class NonUniqueError(ValueError):
     pass
 
 
-def find(query, space):
+def find(query, space, all_occurrences=False, remove_missing=False):
     """
-    Return indices of query items in a search list of items (-1 if not found).
+    Return indices of query items in a search list of items.
 
     Parameters
     ----------
@@ -119,13 +120,92 @@
     query : (sequence of) array-like
         The items to search for. If multiple arrays, each "row" is an item.
     space : (sequence of) array-like
        The set of items in which to search. Must have same shape/dtype as query.
+    all_occurrences : bool
+        When duplicate terms are present in the search space, if all_occurrences is
+        True, return all occurrences found as a SegArray; otherwise return only the
+        first occurrence as a pdarray. Defaults to only finding the first occurrence.
+        Finding all occurrences is not yet supported on sequences of arrays.
+    remove_missing : bool
+        If False, return -1 for any items in query not found in space. If True,
+        remove these and only return indices of items that are found.
 
     Returns
     -------
-    indices : pdarray, int64
-        For each item in query, its index in space or -1 if not found.
-    """
+    indices : pdarray or SegArray
+        For each item in query, its index in space. If remove_missing is True,
+        missing values are excluded; otherwise they are returned as -1. If
+        all_occurrences is False, the return will be a pdarray of the first index
+        where each value in the query appears in the space. If all_occurrences is
+        True, the return will be a SegArray containing every index where each value
+        in the query appears in the space.
+
+    Examples
+    --------
+    >>> select_from = ak.arange(10)
+    >>> arr1 = select_from[ak.randint(0, select_from.size, 20, seed=10)]
+    >>> arr2 = select_from[ak.randint(0, select_from.size, 20, seed=11)]
+    # remove some values to ensure there are query values
+    # which don't appear in the search space
+    >>> arr2 = arr2[arr2 != 9]
+    >>> arr2 = arr2[arr2 != 3]
+
+    # find with defaults (all_occurrences and remove_missing both False)
+    >>> ak.find(arr1, arr2)
+    array([-1 -1 -1 0 1 -1 -1 -1 2 -1 5 -1 8 -1 5 -1 -1 11 5 0])
+
+    # set remove_missing to True; the only difference from the default
+    # is that missing values are excluded
+    >>> ak.find(arr1, arr2, remove_missing=True)
+    array([0 1 2 5 8 5 11 5 0])
+
+    # set all_occurrences to True; the first index of each sub-array
+    # is the first occurrence and should match the default
+    >>> ak.find(arr1, arr2, all_occurrences=True).to_list()
+    [[-1],
+     [-1],
+     [-1],
+     [0, 4],
+     [1, 3, 10],
+     [-1],
+     [-1],
+     [-1],
+     [2, 6, 12, 13],
+     [-1],
+     [5, 7],
+     [-1],
+     [8, 9, 14],
+     [-1],
+     [5, 7],
+     [-1],
+     [-1],
+     [11, 15],
+     [5, 7],
+     [0, 4]]
+
+    # set both remove_missing and all_occurrences to True; missing values
+    # will be empty segments
+    >>> ak.find(arr1, arr2, remove_missing=True, all_occurrences=True).to_list()
+    [[],
+     [],
+     [],
+     [0, 4],
+     [1, 3, 10],
+     [],
+     [],
+     [],
+     [2, 6, 12, 13],
+     [],
+     [5, 7],
+     [],
+     [8, 9, 14],
+     [],
+     [5, 7],
+     [],
+     [],
+     [11, 15],
+     [5, 7],
+     [0, 4]]
+    """
     # Concatenate the space and query in fast (block interleaved) mode
     if isinstance(query, (pdarray, Strings, Categorical)):
         if type(query) is not type(space):
@@ -151,15 +231,48 @@
     # All space indices are less than all query indices
     i = concatenate((arange(spacesize), arange(spacesize, spacesize + querysize)), ordered=False)
     # Group on terms
-    g = GroupBy(c)
+    g = GroupBy(c, dropna=False)
     # For each term, count how many times it appears in the search space
     space_multiplicity = g.sum(i < spacesize)[1]
-    # Warn of any duplicate terms in space
-    if (space_multiplicity > 1).any():
-        warn(
-            "Duplicate terms present in search space. Only first instance of each query term\
-                will be reported."
-        )
+    has_duplicates = (space_multiplicity > 1).any()
+    # handle duplicate terms in space
+    if has_duplicates:
+        if all_occurrences:
+            if isinstance(query, Sequence):
+                raise TypeError("finding all_occurrences is not yet supported on sequences of arrays")
+
+            from arkouda.segarray import SegArray
+
+            # use segmented min-k to select space_multiplicity number of elements
+            # and create a SegArray which contains all the indices
+            # into the search space, instead of just the min for each segment
+
+            # only calculate where to place the negatives if remove_missing is False
+            negative_at = "" if remove_missing else space_multiplicity == 0
+            repMsg = generic_msg(
+                cmd="segmentedExtremaK",
+                args={
+                    "vals": i[g.permutation],
+                    "segs": g.segments,
+                    "segLens": g.size()[1],
+                    "kArray": space_multiplicity,
+                    "isMin": True,
+                    "removeMissing": remove_missing,
+                    "negativeAt": negative_at,
+                },
+            )
+            min_k_vals = create_pdarray(repMsg)
+            seg_idx = g.broadcast(arange(g.segments.size))[i >= spacesize]
+            if not remove_missing:
+                space_multiplicity += negative_at
+            min_k_segs = cumsum(space_multiplicity) - space_multiplicity
+            sa = SegArray(min_k_segs, min_k_vals)
+            return sa[seg_idx]
+        else:
+            warn(
+                "Duplicate terms present in search space. Only first instance of each query term"
+                " will be reported. To return all occurrences, set all_occurrences=True."
+            )
     # For query terms in the space, the min combined index will be the first index of that
     # term in the space
     uspaceidx = g.min(i)[1]
@@ -169,7 +282,8 @@
     # Broadcast unique term indices to combined list of space and query terms
     spaceidx = g.broadcast(uspaceidx)
     # Return only the indices of the query terms (remove the search space)
-    return spaceidx[i >= spacesize]
+    pda = spaceidx[i >= spacesize]
+    return pda[pda != -1] if remove_missing else pda
 
 
 def lookup(keys, values, arguments, fillvalue=-1):
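To make the all_occurrences branch above easier to follow: find() concatenates the space
and query, groups equal terms, and for each query element gathers the combined indices
that fall inside the space; the server-side segmentedExtremaK call performs the
per-segment min-k selection at scale. A rough, self-contained NumPy sketch of the same
logic (illustrative only -- the names and the per-element loop are not Arkouda code):

    import numpy as np

    def find_all_occurrences(query, space):
        # Sketch of ak.find(query, space, all_occurrences=True, remove_missing=True).
        c = np.concatenate([space, query])    # space first: space indices < query indices
        order = np.argsort(c, kind="stable")  # stand-in for GroupBy on the combined terms
        sorted_c = c[order]
        starts = np.flatnonzero(np.r_[True, sorted_c[1:] != sorted_c[:-1]])
        bounds = np.r_[starts, c.size]
        seg_of = np.repeat(np.arange(starts.size), np.diff(bounds))
        result = []
        for q in range(space.size, c.size):   # combined positions of the query elements
            g = seg_of[np.flatnonzero(order == q)[0]]
            members = order[bounds[g]:bounds[g + 1]]
            result.append([int(m) for m in members if m < space.size])
        return result

    space = np.array([3, 1, 3, 2])
    query = np.array([3, 0, 2])
    print(find_all_occurrences(query, space))  # [[0, 2], [], [3]]
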
raise ValueError("Column length must match DataFrame length") @@ -1234,10 +1238,12 @@ def _get_head_tail_server(self): if isinstance(self[col].values, Categorical): msg_list.append( f"Categorical+{col}+{self[col].values.codes.name}" - f"+{self[col].values.categories.name}") + f"+{self[col].values.categories.name}" + ) elif isinstance(self[col].values, SegArray): msg_list.append( - f"SegArray+{col}+{self[col].values.segments.name}+{self[col].values.values.name}") + f"SegArray+{col}+{self[col].values.segments.name}+{self[col].values.values.name}" + ) elif isinstance(self[col].values, Strings): msg_list.append(f"Strings+{col}+{self[col].values.name}") elif isinstance(self[col].values, Fields): @@ -1348,8 +1354,9 @@ def _add_new_rows(self, key): for k in self._columns: current_col = UserDict.__getitem__(self, k) default_val = np.nan if current_col.dtype == akfloat64 else 0 - new_col = concatenate([current_col, - full(len(new_keys), default_val, dtype=current_col.dtype)]) + new_col = concatenate( + [current_col, full(len(new_keys), default_val, dtype=current_col.dtype)] + ) UserDict.__setitem__(self, k, new_col) self.update_nrows() @@ -5636,15 +5643,19 @@ def __getitem__(self, key): return self.df._get_rows(indexof1d(key, self.df.index.values)) if isinstance(key, slice): - if key.start is not None and akfind(array([key.start]), self.df.index.values) == -1: + if key.start is not None and akfind(array([key.start]), self.df.index.values)[0] == -1: raise KeyError(f"Index {key.start} not found in DataFrame index") - if key.stop is not None and akfind(array([key.stop]), self.df.index.values) == -1: + if key.stop is not None and akfind(array([key.stop]), self.df.index.values)[0] == -1: raise KeyError(f"Index {key.stop} not found in DataFrame index") - start_idx = (indexof1d(array([key.start]), self.df.index.values)[0] - if key.start is not None else 0) - stop_idx = (indexof1d(array([key.stop]), self.df.index.values)[0] + 1 - if key.stop is not None else self.df.index.size) + start_idx = ( + akfind(array([key.start]), self.df.index.values)[0] if key.start is not None else 0 + ) + stop_idx = ( + akfind(array([key.stop]), self.df.index.values)[0] + 1 + if key.stop is not None + else self.df.index.size + ) indices = arange(start_idx, stop_idx) return self.df._get_rows(indices) @@ -5677,14 +5688,14 @@ def _set_row_col(self, row_key, col_key, val): if is_supported_scalar(val): return self._set_row_col_scalar_val(row_key, col_key, val) else: - assert (isinstance(val, (pdarray, Series, Strings, SegArray))), "Invalid value type" + assert isinstance(val, (pdarray, Series, Strings, SegArray)), "Invalid value type" return self._set_row_col_vector_val(row_key, col_key, val) def _set_row_col_scalar_val(self, row_key, col_key, val): if is_supported_scalar(row_key): if not self.df.index.dtype == dtype(type(row_key)): raise TypeError("Row key must be of the same type as the DataFrame index") - if akfind(array([row_key]), self.df.index.values) == -1: + if akfind(array([row_key]), self.df.index.values)[0] == -1: self.df._add_new_rows(row_key) # updating a single row row_idx = indexof1d(array([row_key]), self.df.index.values) @@ -5703,15 +5714,24 @@ def _set_row_col_scalar_val(self, row_key, col_key, val): if isinstance(row_key, pdarray) and row_key.dtype == akbool: self.df.data[col_key][row_key] = val if isinstance(row_key, slice): - if row_key.start is not None and akfind(array([row_key.start]), self.df.index.values) == -1: + if ( + row_key.start is not None + and akfind(array([row_key.start]), self.df.index.values)[0] 
+            ):
                 raise KeyError(f"Index {row_key.start} not found in DataFrame index")
-            if row_key.stop is not None and akfind(array([row_key.stop]), self.df.index.values) == -1:
+            if row_key.stop is not None and akfind(array([row_key.stop]), self.df.index.values)[0] == -1:
                 raise KeyError(f"Index {row_key.stop} not found in DataFrame index")
-            start_idx = (akfind(array([row_key.start]), self.df.index.values)[0]
-                         if row_key.start is not None else 0)
-            stop_idx = (akfind(array([row_key.stop]), self.df.index.values)[0] + 1
-                        if row_key.stop is not None else self.df.index.size)
+            start_idx = (
+                akfind(array([row_key.start]), self.df.index.values)[0]
+                if row_key.start is not None
+                else 0
+            )
+            stop_idx = (
+                akfind(array([row_key.stop]), self.df.index.values)[0] + 1
+                if row_key.stop is not None
+                else self.df.index.size
+            )
             indices = arange(start_idx, stop_idx)
             self.df.data[col_key][indices] = val
         return None
@@ -5728,17 +5748,25 @@ def _set_row_col_vector_val(self, row_key, col_key, val):
             row_idx = indexof1d(row_key, self.df.index.values)
             self.df.data[col_key][row_idx] = val
         if isinstance(row_key, slice):
-            if row_key.start is not None and akfind(array([row_key.start]), self.df.index.values) == -1:
+            if (
+                row_key.start is not None
+                and akfind(array([row_key.start]), self.df.index.values)[0] == -1
+            ):
                 raise ValueError(f"Index {row_key.start} not found in DataFrame index")
-            if row_key.stop is not None and akfind(array([row_key.stop]), self.df.index.values) == -1:
+            if row_key.stop is not None and akfind(array([row_key.stop]), self.df.index.values)[0] == -1:
                 raise ValueError(f"Index {row_key.stop} not found in DataFrame index")
-            start_idx = (indexof1d(array([row_key.start]), self.df.index.values)[0]
-                         if row_key.start is not None
-                         else 0)
-            stop_idx = (indexof1d(array([row_key.stop]), self.df.index.values)[0] + 1
-                        if row_key.stop is not None
-                        else self.df.index.size)
+            start_idx = (
+                akfind(array([row_key.start]), self.df.index.values)[0]
+                if row_key.start is not None
+                else 0
+            )
+            # + 1 keeps the stop label inclusive, matching the other slice handlers
+            stop_idx = (
+                akfind(array([row_key.stop]), self.df.index.values)[0] + 1
+                if row_key.stop is not None
+                else self.df.index.size
+            )
             indices = arange(start_idx, stop_idx)
             self.df.data[col_key][indices] = val
@@ -5883,7 +5911,6 @@ def _set_row_col(self, row_key, col_key, val):
 
 
 class AtIndexer:
-
     def __init__(self, df) -> None:
         self.df = df
@@ -6235,8 +6262,7 @@ def _inner_join_merge(
         right_cols.remove(on)
     else:
         left_inds, right_inds = inner_join(
-            [left[col].values for col in on],
-            [right[col].values for col in on]
+            [left[col].values for col in on], [right[col].values for col in on]
         )
         new_dict = {col: left[col].iloc[left_inds] for col in on}
         for col in on:
@@ -6628,8 +6654,8 @@ def merge(
     if not isinstance(on, str):
         if not all(
-            isinstance(left[col].values, (pdarray, Strings)) and
-            isinstance(right[col].values, (pdarray, Strings))
+            isinstance(left[col].values, (pdarray, Strings))
+            and isinstance(right[col].values, (pdarray, Strings))
             for col in on
         ):
             raise ValueError("All columns of a multi-column merge must be pdarrays")
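One note on the label-slice handling above: the `+ 1` applied to `stop_idx` makes these
slices inclusive of the stop label, which is the pandas `.loc` convention the indexer is
modeled on (positional slicing stays exclusive). For reference, standard pandas behavior:

    import pandas as pd

    df = pd.DataFrame({"a": [10, 20, 30, 40]}, index=[1, 3, 5, 7])
    assert df.loc[3:5, "a"].tolist() == [20, 30]   # label slice includes the stop label
    assert df.iloc[1:2, 0].tolist() == [20]        # positional slice excludes it
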
diff --git a/arkouda/pdarraysetops.py b/arkouda/pdarraysetops.py
index 44d19d3c52..ef881014cf 100644
--- a/arkouda/pdarraysetops.py
+++ b/arkouda/pdarraysetops.py
@@ -7,10 +7,10 @@
 from arkouda.client import generic_msg
 from arkouda.client_dtypes import BitVector
+from arkouda.dtypes import bigint
 from arkouda.dtypes import bool as akbool
 from arkouda.dtypes import int64 as akint64
 from arkouda.dtypes import uint64 as akuint64
-from arkouda.dtypes import bigint
 from arkouda.groupbyclass import GroupBy, groupable, groupable_element_type, unique
 from arkouda.logger import getArkoudaLogger
 from arkouda.pdarrayclass import create_pdarray, pdarray
@@ -232,22 +232,40 @@ def in1dmulti(a, b, assume_unique=False, symmetric=False):
     return in1d(a, b, assume_unique=assume_unique, symmetric=symmetric)
 
 
-def indexof1d(keys: groupable, arr: groupable) -> Union[pdarray, groupable]:
+def indexof1d(query: groupable, space: groupable) -> pdarray:
     """
-    Returns an integer array of the index values where the values of the first
-    array appear in the second.
+    Return indices of query items in a search list of items. Items not found will be excluded.
+    When duplicate terms are present in the search space, the indices of all occurrences are returned.
 
     Parameters
     ----------
-    keys : pdarray or Strings or Categorical
-        Input array of values to find the indices of in `arr`.
-    arr : pdarray or Strings or Categorical
-        The values to search.
+    query : (sequence of) pdarray or Strings or Categorical
+        The items to search for. If multiple arrays, each "row" is an item.
+    space : (sequence of) pdarray or Strings or Categorical
+        The set of items in which to search. Must have same shape/dtype as query.
 
     Returns
     -------
-    pdarray, int
-        The indices of the values of `keys` in `arr`.
+    indices : pdarray, int64
+        For each item in query, its index in space.
+
+    Notes
+    -----
+    This is an alias of
+    `ak.find(query, space, all_occurrences=True, remove_missing=True).values`
+
+    Examples
+    --------
+    >>> select_from = ak.arange(10)
+    >>> arr1 = select_from[ak.randint(0, select_from.size, 20, seed=10)]
+    >>> arr2 = select_from[ak.randint(0, select_from.size, 20, seed=11)]
+    # remove some values to ensure there are query values
+    # which don't appear in the search space
+    >>> arr2 = arr2[arr2 != 9]
+    >>> arr2 = arr2[arr2 != 3]
+
+    >>> ak.indexof1d(arr1, arr2)
+    array([0 4 1 3 10 2 6 12 13 5 7 8 9 14 5 7 11 15 5 7 0 4])
 
     Raises
     ------
     TypeError
         Raised if either `keys` or `arr` is not a pdarray, Strings, or Categorical object
     RuntimeError
         Raised if the dtype of either array is not supported
@@ -257,19 +275,17 @@
     """
+    from arkouda.alignment import find as akfind
     from arkouda.categorical import Categorical as Categorical_
 
-    if isinstance(keys, (pdarray, Strings, Categorical_)):
-        if isinstance(keys, (Strings, Categorical_)) and not isinstance(arr, (Strings, Categorical_)):
+    if isinstance(query, (pdarray, Strings, Categorical_)):
+        if isinstance(query, (Strings, Categorical_)) and not isinstance(space, (Strings, Categorical_)):
             raise TypeError("Arguments must have compatible types, Strings/Categorical")
-    elif isinstance(keys, pdarray) and not isinstance(arr, pdarray):
+    elif isinstance(query, pdarray) and not isinstance(space, pdarray):
         raise TypeError("If keys is pdarray, arr must also be pdarray")
 
-    repMsg = generic_msg(
-        cmd="indexof1d",
-        args={"keys": keys, "arr": arr},
-    )
-    return create_pdarray(cast(str, repMsg))
+    found = akfind(query, space, all_occurrences=True, remove_missing=True)
+    return found if isinstance(found, pdarray) else found.values
 
 
 # fmt: off
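The Chapel changes that follow add the `segmentedExtremaK` server message used by the
all_occurrences path: for each segment it selects the k smallest (or largest) values and,
when removeMissing is false, writes a single -1 for segments whose term never occurs in
the search space. A small Python reference of that per-segment behavior (descriptive
names, not the server implementation):

    import heapq

    def segmented_extrema_k(values, segments, seg_lens, k_array, is_min=True, negative_at=None):
        out = []
        for j, (off, length, k) in enumerate(zip(segments, seg_lens, k_array)):
            if length > 0 and k > 0:
                seg = values[off:off + length]
                picked = heapq.nsmallest(k, seg) if is_min else heapq.nlargest(k, seg)
                out.extend(sorted(picked))    # the k extrema for this segment
            elif negative_at is not None and negative_at[j]:
                out.append(-1)                # sentinel for a missing term
        return out

    vals = [5, 1, 9, 2, 7, 7, 0]
    segs, lens = [0, 3, 3], [3, 0, 4]         # middle segment is empty
    ks, neg = [2, 0, 3], [False, True, False]
    assert segmented_extrema_k(vals, segs, lens, ks, negative_at=neg) == [1, 5, -1, 0, 2, 7]
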
diff --git a/src/In1d.chpl b/src/In1d.chpl
index 30b761327b..99a944e1c1 100644
--- a/src/In1d.chpl
+++ b/src/In1d.chpl
@@ -89,25 +89,4 @@ module In1d
         }
         return truth;
     }
-
-    // For each value in the first array, find the indices of their appearances
-    // in the second. Results are ordered by the order of the keys.
-    proc indexof1d(keys: ?t, arr: t) throws {
-        var l : list(int, false);
-        var indexLists: [0..
+            forall (segOff, segLen, extremaOff, k) in zip(segments, segLens, extremaOffset, kArray) {
+                if segLen > 0 && k > 0 {
+                    extrema[extremaOff..#k] = computeExtremaValues(values[segOff..#segLen], k, isMin);
+                }
+            }
+            st.addEntry(rname, createSymEntry(extrema));
+        }
+        else {
+            st.checkTable(negativeName);
+            const negative = toSymEntry(getGenericTypedArrayEntry(negativeName, st), bool).a;
+
+            var k_with_negatives = kArray + negative;
+            const extremaOffset = (+ scan k_with_negatives) - k_with_negatives;
+            var extrema: [makeDistDom(+ reduce k_with_negatives)] int;
+
+            // not terribly optimized; a segmented reduction (something like segMin)
+            // would probably be more efficient
+            forall (segOff, segLen, extremaOff, k, n) in zip(segments, segLens, extremaOffset, kArray, negative) {
+                if segLen > 0 && k > 0 {
+                    extrema[extremaOff..#k] = computeExtremaValues(values[segOff..#segLen], k, isMin);
+                }
+                else if n {
+                    extrema[extremaOff] = -1;
+                }
+            }
+            st.addEntry(rname, createSymEntry(extrema));
+        }
+
+        const repMsg = "created " + st.attrib(rname);
+        return new MsgTuple(repMsg, MsgType.NORMAL);
+    }
+
     use CommandMap;
     registerFunction("segmentedReduction", segmentedReductionMsg, getModuleName());
     registerFunction("sizeReduction", sizeReductionMsg, getModuleName());
-}
\ No newline at end of file
+    registerFunction("segmentedExtremaK", segmentedExtremaKMsg, getModuleName());
+}
diff --git a/tests/setops_test.py b/tests/setops_test.py
index 218c288074..6d1e731568 100755
--- a/tests/setops_test.py
+++ b/tests/setops_test.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 from base_test import ArkoudaTest
 from context import arkouda as ak
 
@@ -331,3 +332,49 @@ def test_multiarray_validation(self):
         x = [ak.arange(3, dtype=ak.uint64), ak.arange(3)]
         with self.assertRaises(TypeError):
             ak.pdarraysetops.multiarray_setop_validation(x, y)
+
+    def test_index_of(self):
+        # index of nan (reproducer from #3009)
+        s = ak.Series(ak.array([1, 2, 3]), index=ak.array([1, 2, np.nan]))
+        self.assertTrue(ak.indexof1d(ak.array([np.nan]), s.index.values).to_list() == [2])
+
+        select_from_list = [
+            ak.randint(-(2**32), 2**32, 10),
+            ak.linspace(-(2**32), 2**32, 10),
+            ak.random_strings_uniform(1, 16, 10),
+        ]
+        for select_from in select_from_list:
+            arr1 = select_from[ak.randint(0, select_from.size, 20)]
+
+            # test a unique search space; this should be identical to find.
+            # be sure to test when all items are present and when some items are missing
+            for arr2 in select_from, select_from[:5], select_from[5:]:
+                found_in_second = ak.in1d(arr1, arr2)
+                idx_of_first_in_second = ak.indexof1d(arr1, arr2)
+
+                # ensure we match find
+                self.assertTrue((idx_of_first_in_second == ak.find(arr1, arr2, remove_missing=True)).all())
+
+                # if an element of arr1 is found in arr2, return the index of that item in arr2
+                self.assertTrue(
+                    (arr2[idx_of_first_in_second] == arr1[found_in_second]).all()
+                )
+
+            # test duplicate items in the search space; the easiest way to do this
+            # is to compare against pandas Series getitem
+            arr2 = select_from[ak.randint(0, select_from.size, 20)]
+            pd_s = pd.Series(index=arr1.to_ndarray(), data=arr2.to_ndarray())
+            ak_s = ak.Series(index=arr1, data=arr2)
+
+            arr1_keys = ak.GroupBy(arr1).unique_keys
+            arr2_keys = ak.GroupBy(arr2).unique_keys
+            in_both = ak.intersect1d(arr1_keys, arr2_keys)
+
+            for i in in_both.to_list():
+                pd_i = pd_s[i]
+                ak_i = ak_s[i]
+                if isinstance(pd_i, pd.Series):
+                    self.assertIsInstance(ak_i, ak.Series)
+                    self.assertEqual(pd_i.values.tolist(), ak_i.values.to_list())
+                else:
+                    self.assertEqual(pd_i, ak_i)
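
Taken together, the user-visible surface of this change is small. An illustrative session
(requires a running arkouda_server; outputs follow the semantics documented above):

    import arkouda as ak

    ak.connect()

    space = ak.array([1, 3, 1, 2])
    query = ak.array([1, 4, 2])

    ak.find(query, space)                                  # array([0 -1 3]) -- first hit, -1 if missing
    ak.find(query, space, remove_missing=True)             # array([0 3])
    ak.find(query, space, all_occurrences=True).to_list()  # [[0, 2], [-1], [3]]
    ak.indexof1d(query, space)                             # array([0 2 3]) -- all hits, missing dropped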