From f9a7fa2d7644f801bc2d9d92590414c2267b8ab5 Mon Sep 17 00:00:00 2001
From: hokiegeek2
Date: Mon, 13 Jun 2022 13:26:12 -0400
Subject: [PATCH] 1077 update dtypes to better support scalars (#1492)

* added updated dtypes logic, updated unit tests #1077
* fixed mypy error
* updates for mypy
* mypy updates
* mypy updates
* added tests for #1077
* fixed flake8 errors #1077
* updated formatting per PR review feedback #1077
* updated per PR feedback #1077
* fixed flake8 error #1077
---
 arkouda/dtypes.py              |  68 ++++++++++++++--
 arkouda/groupbyclass.py        | 100 +++++++++++++++------
 arkouda/pdarrayclass.py        | 108 ++++++++++++++++++------
 arkouda/timeclass.py           |  93 +++++++++++++++------
 tests/dtypes_tests.py          |  31 ++++---
 tests/extrema_test.py          |  16 ++--
 tests/numeric_test.py          |  14 ++--
 tests/pdarray_creation_test.py | 145 ++++++++++++++++++++++-----------
 8 files changed, 410 insertions(+), 165 deletions(-)

diff --git a/arkouda/dtypes.py b/arkouda/dtypes.py
index 31ccf45878..fac6ad4c6e 100644
--- a/arkouda/dtypes.py
+++ b/arkouda/dtypes.py
@@ -56,12 +56,34 @@
 # Union aliases used for static and runtime type checking
 bool_scalars = Union[builtins.bool, np.bool_]
 float_scalars = Union[float, np.float64]
-int_scalars = Union[int, np.int64, np.uint64]
-numeric_scalars = Union[float, np.float64, int, np.int64, np.uint8, np.uint64]
+int_scalars = Union[
+    int,
+    np.int8,
+    np.int16,
+    np.int32,
+    np.int64,
+    np.uint8,
+    np.uint16,
+    np.uint32,
+    np.uint64,
+]
+numeric_scalars = Union[float_scalars, int_scalars]
 numeric_and_bool_scalars = Union[bool_scalars, numeric_scalars]
-numpy_scalars = Union[np.float64, np.int64, np.bool_, np.uint8, np.str_, np.uint64]
+numpy_scalars = Union[
+    np.float64,
+    np.int8,
+    np.int16,
+    np.int32,
+    np.int64,
+    np.bool_,
+    np.str_,
+    np.uint8,
+    np.uint16,
+    np.uint32,
+    np.uint64,
+]
 str_scalars = Union[str, np.str_]
-all_scalars = Union[float, np.float64, int, np.int64, np.uint64, builtins.bool, np.bool_, str, np.str_]
+all_scalars = Union[bool_scalars, numeric_scalars, numpy_scalars, str_scalars]
 
 """
 The DType enum defines the supported Arkouda data types in string form.
@@ -94,10 +116,34 @@ def __repr__(self) -> str:  # type: ignore
         return self.value
 
 
-ARKOUDA_SUPPORTED_INTS = (int, np.int64, np.uint64)
+ARKOUDA_SUPPORTED_INTS = (
+    int,
+    np.int8,
+    np.int16,
+    np.int32,
+    np.int64,
+    np.uint8,
+    np.uint16,
+    np.uint32,
+    np.uint64,
+)
 ARKOUDA_SUPPORTED_FLOATS = (float, np.float64)
-ARKOUDA_SUPPORTED_NUMBERS = (int, np.int64, float, np.float64, np.uint64)
-ARKOUDA_SUPPORTED_DTYPES = frozenset([member.value for _, member in DType.__members__.items()])
+ARKOUDA_SUPPORTED_NUMBERS = (
+    int,
+    np.int8,
+    np.int16,
+    np.int32,
+    np.int64,
+    float,
+    np.float64,
+    np.uint8,
+    np.uint16,
+    np.uint32,
+    np.uint64,
+)
+ARKOUDA_SUPPORTED_DTYPES = frozenset(
+    [member.value for _, member in DType.__members__.items()]
+)
 
 DTypes = frozenset([member.value for _, member in DType.__members__.items()])
 DTypeObjects = frozenset([bool, float, float64, int, int64, str, str_, uint8, uint64])
@@ -182,13 +228,17 @@ def resolve_scalar_dtype(val: object) -> str:  # type: ignore
     ):
         return "bool"
     # Python int or np.int* or np.uint*
-    elif isinstance(val, int) or (hasattr(val, "dtype") and cast(np.uint, val).dtype.kind in "ui"):
+    elif isinstance(val, int) or (
+        hasattr(val, "dtype") and cast(np.uint, val).dtype.kind in "ui"
+    ):
         if isinstance(val, np.uint64):
             return "uint64"
         else:
             return "int64"
     # Python float or np.float*
-    elif isinstance(val, float) or (hasattr(val, "dtype") and cast(np.float_, val).dtype.kind == "f"):
+    elif isinstance(val, float) or (
+        hasattr(val, "dtype") and cast(np.float_, val).dtype.kind == "f"
+    ):
         return "float64"
     elif isinstance(val, builtins.str) or isinstance(val, np.str_):
         return "str"
diff --git a/arkouda/groupbyclass.py b/arkouda/groupbyclass.py
index a9895730da..d0aad6745f 100644
--- a/arkouda/groupbyclass.py
+++ b/arkouda/groupbyclass.py
@@ -87,7 +87,9 @@ def unique(
     if hasattr(pda, "_get_grouping_keys"):
         # Single groupable array
         nkeys = 1
-        grouping_keys = cast(list, cast(groupable_element_type, pda)._get_grouping_keys())
+        grouping_keys = cast(
+            list, cast(groupable_element_type, pda)._get_grouping_keys()
+        )
     else:
         # Sequence of groupable arrays
         nkeys = len(pda)
@@ -108,7 +110,11 @@ def unique(
     repMsg = generic_msg(
         cmd="unique",
         args="{} {} {:n} {} {}".format(
-            return_groups, assume_sorted, effectiveKeys, " ".join(keynames), " ".join(keytypes)
+            return_groups,
+            assume_sorted,
+            effectiveKeys,
+            " ".join(keynames),
+            " ".join(keytypes),
         ),
     )
     if return_groups:
@@ -217,7 +223,11 @@ class GroupBy:
     Reductions = GROUPBY_REDUCTION_TYPES
 
     def __init__(
-        self, keys: Optional[groupable], assume_sorted: bool = False, hash_strings: bool = True, **kwargs
+        self,
+        keys: Optional[groupable],
+        assume_sorted: bool = False,
+        hash_strings: bool = True,
+        **kwargs,
     ) -> None:
         # Type Checks required because @typechecked was removed for causing other issues
         # This prevents non-bool values that can be evaluated to true (ie non-empty arrays)
@@ -340,7 +350,9 @@ def aggregate(
         operator = operator.lower()
         if operator not in self.Reductions:
-            raise ValueError(f"Unsupported reduction: {operator}\nMust be one of {self.Reductions}")
+            raise ValueError(
+                f"Unsupported reduction: {operator}\nMust be one of {self.Reductions}"
+            )
 
         # TO DO: remove once logic is ported over to Chapel
         if operator == "nunique":
             return self.nunique(values)
 
         # All other aggregations operate on pdarray
         if cast(pdarray, values).size != self.size:
-            raise ValueError("Attempt to group array using key array of different length")
+            raise ValueError(
+                "Attempt to group array using key array of different length"
+            )
 
         if self.assume_sorted:
             permuted_values = cast(pdarray, values)
@@ -356,11 +370,16 @@ def aggregate(
             permuted_values = cast(pdarray, values)[cast(pdarray, self.permutation)]
 
         cmd = "segmentedReduction"
-        args = "{} {} {} {}".format(permuted_values.name, self.segments.name, operator, skipna)
+        args = "{} {} {} {}".format(
+            permuted_values.name, self.segments.name, operator, skipna
+        )
         repMsg = generic_msg(cmd=cmd, args=args)
         self.logger.debug(repMsg)
         if operator.startswith("arg"):
-            return (self.unique_keys, cast(pdarray, self.permutation[create_pdarray(repMsg)]))
+            return (
+                self.unique_keys,
+                cast(pdarray, self.permutation[create_pdarray(repMsg)]),
+            )
         else:
             return self.unique_keys, create_pdarray(repMsg)
@@ -547,7 +566,9 @@ def min(self, values: pdarray, skipna: bool = True) -> Tuple[groupable, pdarray]
         (array([2, 3, 4]), array([1, 1, 3]))
         """
         if values.dtype == bool:
-            raise TypeError("min is only supported for pdarrays of dtype float64, uint64, and int64")
+            raise TypeError(
+                "min is only supported for pdarrays of dtype float64, uint64, and int64"
+            )
         return self.aggregate(values, "min", skipna)
 
     def max(self, values: pdarray, skipna: bool = True) -> Tuple[groupable, pdarray]:
@@ -594,7 +615,9 @@ def max(self, values: pdarray, skipna: bool = True) -> Tuple[groupable, pdarray]
         (array([2, 3, 4]), array([4, 4, 3]))
         """
         if values.dtype == bool:
-            raise TypeError("max is only supported for pdarrays of dtype float64, uint64, and int64")
+            raise TypeError(
+                "max is only supported for pdarrays of dtype float64, uint64, and int64"
+            )
         return self.aggregate(values, "max", skipna)
 
     def argmin(self, values: pdarray) -> Tuple[groupable, pdarray]:
@@ -751,7 +774,10 @@ def nunique(self, values: groupable) -> Tuple[groupable, pdarray]:
         # or Categorical (the last two have a .group() method).
         # Can't directly test Categorical due to circular import.
         if isinstance(values, pdarray):
-            if cast(pdarray, values).dtype != akint64 and cast(pdarray, values).dtype != akuint64:
+            if (
+                cast(pdarray, values).dtype != akint64
+                and cast(pdarray, values).dtype != akuint64
+            ):
                 raise TypeError("nunique unsupported for this dtype")
             togroup = [ukidx, values]
         elif hasattr(values, "group"):
@@ -774,7 +800,9 @@ def nunique(self, values: groupable) -> Tuple[groupable, pdarray]:
         # Re-join unique counts with original keys (sorting guarantees same order)
         return self.unique_keys, nuniq
 
-    def any(self, values: pdarray) -> Tuple[Union[pdarray, List[Union[pdarray, Strings]]], pdarray]:
+    def any(
+        self, values: pdarray
+    ) -> Tuple[Union[pdarray, List[Union[pdarray, Strings]]], pdarray]:
         """
         Using the permutation stored in the GroupBy instance, group
         another array of values and perform an "or" reduction on
         each group.
@@ -804,7 +832,9 @@ def any(self, values: pdarray) -> Tuple[Union[pdarray, List[Union[pdarray, Strin
             raise TypeError("any is only supported for pdarrays of dtype bool")
         return self.aggregate(values, "any")  # type: ignore
 
-    def all(self, values: pdarray) -> Tuple[Union[pdarray, List[Union[pdarray, Strings]]], pdarray]:
+    def all(
+        self, values: pdarray
+    ) -> Tuple[Union[pdarray, List[Union[pdarray, Strings]]], pdarray]:
         """
         Using the permutation stored in the GroupBy instance, group
         another array of values and perform an "and" reduction on
@@ -838,7 +868,9 @@ def all(self, values: pdarray) -> Tuple[Union[pdarray, List[Union[pdarray, Strin
 
         return self.aggregate(values, "all")  # type: ignore
 
-    def OR(self, values: pdarray) -> Tuple[Union[pdarray, List[Union[pdarray, Strings]]], pdarray]:
+    def OR(
+        self, values: pdarray
+    ) -> Tuple[Union[pdarray, List[Union[pdarray, Strings]]], pdarray]:
         """
         Bitwise OR of values in each segment.
 
@@ -870,11 +902,15 @@ def OR(self, values: pdarray) -> Tuple[Union[pdarray, List[Union[pdarray, String
             Raised if all is not supported for the values dtype
         """
         if values.dtype != akint64 and values.dtype != akuint64:
-            raise TypeError("OR is only supported for pdarrays of dtype int64 or uint64")
+            raise TypeError(
+                "OR is only supported for pdarrays of dtype int64 or uint64"
+            )
         return self.aggregate(values, "or")  # type: ignore
 
-    def AND(self, values: pdarray) -> Tuple[Union[pdarray, List[Union[pdarray, Strings]]], pdarray]:
+    def AND(
+        self, values: pdarray
+    ) -> Tuple[Union[pdarray, List[Union[pdarray, Strings]]], pdarray]:
         """
         Bitwise AND of values in each segment.
 
@@ -906,11 +942,15 @@ def AND(self, values: pdarray) -> Tuple[Union[pdarray, List[Union[pdarray, Strin
             Raised if all is not supported for the values dtype
         """
         if values.dtype != akint64 and values.dtype != akuint64:
-            raise TypeError("AND is only supported for pdarrays of dtype int64 or uint64")
+            raise TypeError(
+                "AND is only supported for pdarrays of dtype int64 or uint64"
+            )
         return self.aggregate(values, "and")  # type: ignore
 
-    def XOR(self, values: pdarray) -> Tuple[Union[pdarray, List[Union[pdarray, Strings]]], pdarray]:
+    def XOR(
+        self, values: pdarray
+    ) -> Tuple[Union[pdarray, List[Union[pdarray, Strings]]], pdarray]:
         """
         Bitwise XOR of values in each segment.
 
@@ -942,7 +982,9 @@ def XOR(self, values: pdarray) -> Tuple[Union[pdarray, List[Union[pdarray, Strin
             Raised if all is not supported for the values dtype
         """
         if values.dtype != akint64 and values.dtype != akuint64:
-            raise TypeError("XOR is only supported for pdarrays of dtype int64 or uint64")
+            raise TypeError(
+                "XOR is only supported for pdarrays of dtype int64 or uint64"
+            )
         return self.aggregate(values, "xor")  # type: ignore
 
 
@@ -1053,7 +1095,9 @@ def build_from_components(user_defined_name: str = None, **kwargs) -> GroupBy:
         if "segments" not in kwargs:
             missingKeys.append("segments")
 
-        raise ValueError(f"Can't build GroupBy. kwargs is missing required keys: {missingKeys}.")
+        raise ValueError(
+            f"Can't build GroupBy. kwargs is missing required keys: {missingKeys}."
+        )
 
     def _get_groupby_required_pieces(self) -> Dict:
         """
@@ -1114,13 +1158,17 @@ def register(self, user_defined_name: str) -> GroupBy:
 
         if isinstance(self.keys, (Strings, pdarray, Categorical)):
             self.keys.register(f"{user_defined_name}_{self.keys.objtype}.keys")
-            self.unique_keys.register(f"{user_defined_name}_{self.keys.objtype}.unique_keys")
+            self.unique_keys.register(
+                f"{user_defined_name}_{self.keys.objtype}.unique_keys"
+            )
         elif isinstance(self.keys, Sequence):
             for x in range(len(self.keys)):
                 # Possible for multiple types in a sequence, so we have to check each key's
                 # type individually
                 if isinstance(self.keys[x], (Strings, pdarray, Categorical)):
-                    self.keys[x].register(f"{x}_{user_defined_name}_{self.keys[x].objtype}.keys")
+                    self.keys[x].register(
+                        f"{x}_{user_defined_name}_{self.keys[x].objtype}.keys"
+                    )
                     self.unique_keys[x].register(
                         f"{x}_{user_defined_name}_{self.keys[x].objtype}.unique_keys"
                     )
@@ -1211,7 +1259,9 @@ def is_registered(self) -> bool:
                 f"^\\d+_{self.name}_.+\\.keys$|^\\d+_{self.name}_.+\\.unique_keys$|"
                 f"^\\d+_{self.name}_.+\\.unique_keys(?=\\.categories$)"
             )
-            cat_regEx = compile(f"^\\d+_{self.name}_{Categorical.objtype}\\.keys(?=\\.codes$)")
+            cat_regEx = compile(
+                f"^\\d+_{self.name}_{Categorical.objtype}\\.keys(?=\\.codes$)"
+            )
 
             simple_registered = list(filter(regEx.match, registry))
             cat_registered = list(filter(cat_regEx.match, registry))
@@ -1296,7 +1346,9 @@ def attach(user_defined_name: str) -> GroupBy:
         matches.sort()
 
         if len(matches) == 0:
-            raise RegistrationError(f"No registered elements with name '{user_defined_name}'")
+            raise RegistrationError(
+                f"No registered elements with name '{user_defined_name}'"
+            )
 
         for name in matches:
             # Parse the name for the dtype and use the proper create method to create the element
@@ -1513,7 +1565,7 @@ def broadcast(
     else:
         pname = permutation.name
         permute = True
-        size = permutation.size
+        size = cast(Union[int, np.int64, np.uint64], permutation.size)
     if size < 1:
         raise ValueError("result size must be greater than zero")
     cmd = "broadcast"
diff --git a/arkouda/pdarrayclass.py b/arkouda/pdarrayclass.py
index 9c6952522d..604c4d7382 100755
--- a/arkouda/pdarrayclass.py
+++ b/arkouda/pdarrayclass.py
@@ -2,7 +2,7 @@
 
 import builtins
 import json
-from typing import List, Sequence, cast
+from typing import List, Sequence, Union, cast
 
 import numpy as np  # type: ignore
 from typeguard import typechecked
@@ -157,7 +157,9 @@ class pdarray:
             "**",
         ]
     )
-    OpEqOps = frozenset(["+=", "-=", "*=", "/=", "//=", "&=", "|=", "^=", "<<=", ">>=", "**="])
+    OpEqOps = frozenset(
+        ["+=", "-=", "*=", "/=", "//=", "&=", "|=", "^=", "<<=", ">>=", "**="]
+    )
     objtype = "pdarray"
 
     __array_priority__ = 1000
@@ -204,7 +206,9 @@ def __str__(self):
     def __repr__(self):
         from arkouda.client import pdarrayIterThresh
 
-        return generic_msg(cmd="repr", args="{} {}".format(self.name, pdarrayIterThresh))
+        return generic_msg(
+            cmd="repr", args="{} {}".format(self.name, pdarrayIterThresh)
+        )
 
     def format_other(self, other: object) -> str:
         """
@@ -289,7 +293,9 @@ def _binop(self, other: pdarray, op: str) -> pdarray:
         if dt not in DTypes:
             raise TypeError(f"Unhandled scalar type: {other} ({type(other)})")
         cmd = "binopvs"
-        args = "{} {} {} {}".format(op, self.name, dt, NUMBER_FORMAT_STRINGS[dt].format(other))
+        args = "{} {} {} {}".format(
+            op, self.name, dt, NUMBER_FORMAT_STRINGS[dt].format(other)
+        )
         repMsg = generic_msg(cmd=cmd, args=args)
         return create_pdarray(repMsg)
 
@@ -334,7 +340,9 @@ def _r_binop(self, other: pdarray, op: str) -> pdarray:
         if dt not in DTypes:
             raise TypeError(f"Unhandled scalar type: {other} ({type(other)})")
         cmd = "binopsv"
-        args = "{} {} {} {}".format(op, dt, NUMBER_FORMAT_STRINGS[dt].format(other), self.name)
+        args = "{} {} {} {}".format(
+            op, dt, NUMBER_FORMAT_STRINGS[dt].format(other), self.name
+        )
         repMsg = generic_msg(cmd=cmd, args=args)
         return create_pdarray(repMsg)
 
@@ -434,13 +442,17 @@ def __ge__(self, other):
         return self._binop(other, ">=")
 
     def __eq__(self, other):
-        if (self.dtype == bool) and (isinstance(other, pdarray) and (other.dtype == bool)):
+        if (self.dtype == bool) and (
+            isinstance(other, pdarray) and (other.dtype == bool)
+        ):
            return ~(self ^ other)
         else:
             return self._binop(other, "==")
 
     def __ne__(self, other):
-        if (self.dtype == bool) and (isinstance(other, pdarray) and (other.dtype == bool)):
+        if (self.dtype == bool) and (
+            isinstance(other, pdarray) and (other.dtype == bool)
+        ):
             return self ^ other
         else:
             return self._binop(other, "!=")
@@ -476,7 +488,9 @@ def opeq(self, other, op):
             raise TypeError(f"Unhandled scalar type: {other} ({type(other)})")
 
         cmd = "opeqvs"
-        args = "{} {} {} {}".format(op, self.name, self.dtype.name, self.format_other(other))
+        args = "{} {} {} {}".format(
+            op, self.name, self.dtype.name, self.format_other(other)
+        )
         generic_msg(cmd=cmd, args=args)
         return self
 
@@ -541,7 +555,9 @@ def __getitem__(self, key):
                 # value = fields[2]
                 return parse_single_value(" ".join(fields[1:]))
             else:
-                raise IndexError(f"[int] {orig_key} is out of bounds with size {self.size}")
+                raise IndexError(
+                    f"[int] {orig_key} is out of bounds with size {self.size}"
+                )
         if isinstance(key, slice):
             (start, stop, stride) = key.indices(self.size)
             logger.debug("start: {} stop: {} stride: {}".format(start, stop, stride))
@@ -555,7 +571,9 @@ def __getitem__(self, key):
                 raise TypeError(f"unsupported pdarray index type {key.dtype}")
             if kind == "bool" and self.size != key.size:
                 raise ValueError(f"size mismatch {self.size} {key.size}")
-            repMsg = generic_msg(cmd="[pdarray]", args="{} {}".format(self.name, key.name))
+            repMsg = generic_msg(
+                cmd="[pdarray]", args="{} {}".format(self.name, key.name)
+            )
             return create_pdarray(repMsg)
         else:
             raise TypeError(f"Unhandled key type: {key} ({type(key)})")
@@ -569,14 +587,19 @@ def __setitem__(self, key, value):
             if key >= 0 and key < self.size:
                 generic_msg(
                     cmd="[int]=val",
-                    args="{} {} {} {}".format(self.name, key, self.dtype.name, self.format_other(value)),
+                    args="{} {} {} {}".format(
+                        self.name, key, self.dtype.name, self.format_other(value)
+                    ),
                 )
             else:
-                raise IndexError(f"index {orig_key} is out of bounds with size {self.size}")
+                raise IndexError(
+                    f"index {orig_key} is out of bounds with size {self.size}"
+                )
         elif isinstance(key, pdarray):
             if isinstance(value, pdarray):
                 generic_msg(
-                    cmd="[pdarray]=pdarray", args="{} {} {}".format(self.name, key.name, value.name)
+                    cmd="[pdarray]=pdarray",
+                    args="{} {} {}".format(self.name, key.name, value.name),
                 )
             else:
                 generic_msg(
@@ -591,13 +614,20 @@ def __setitem__(self, key, value):
             if isinstance(value, pdarray):
                 generic_msg(
                     cmd="[slice]=pdarray",
-                    args="{} {} {} {} {}".format(self.name, start, stop, stride, value.name),
+                    args="{} {} {} {} {}".format(
+                        self.name, start, stop, stride, value.name
+                    ),
                 )
             else:
                 generic_msg(
                     cmd="[slice]=val",
                     args="{} {} {} {} {} {}".format(
-                        self.name, start, stop, stride, self.dtype.name, self.format_other(value)
+                        self.name,
+                        start,
+                        stop,
+                        stride,
+                        self.dtype.name,
+                        self.format_other(value),
                     ),
                 )
         else:
@@ -618,7 +648,10 @@ def fill(self, value: numeric_scalars) -> None:
             Raised if value is not an int, int64, float, or float64
         """
         generic_msg(
-            cmd="set", args="{} {} {}".format(self.name, self.dtype.name, self.format_other(value))
+            cmd="set",
+            args="{} {} {}".format(
+                self.name, self.dtype.name, self.format_other(value)
+            ),
         )
 
     def any(self) -> np.bool_:
@@ -1035,7 +1068,8 @@ def to_ndarray(self) -> np.ndarray:
         )
         # The reply from the server will be binary data
         data = cast(
-            memoryview, generic_msg(cmd="tondarray", args="{}".format(self.name), recv_binary=True)
+            memoryview,
+            generic_msg(cmd="tondarray", args="{}".format(self.name), recv_binary=True),
         )
         # Make sure the received data has the expected length
         if len(data) != self.size * self.dtype.itemsize:
@@ -1232,14 +1266,24 @@ def save(
             generic_msg(
                 cmd=cmd,
                 args="{} {} {} {} {} {} {}".format(
-                    self.name, dataset, m, json_array, self.dtype, strings_placeholder, compressed
+                    self.name,
+                    dataset,
+                    m,
+                    json_array,
+                    self.dtype,
+                    strings_placeholder,
+                    compressed,
                 ),
             ),
         )
 
     @typechecked
     def save_parquet(
-        self, prefix_path: str, dataset: str = "array", mode: str = "truncate", compressed: bool = False
+        self,
+        prefix_path: str,
+        dataset: str = "array",
+        mode: str = "truncate",
+        compressed: bool = False,
     ) -> str:
         """
         Save the pdarray to Parquet. The result is a collection of Parquet files,
@@ -1311,7 +1355,9 @@ def save_parquet(
         )
 
     @typechecked
-    def save_hdf(self, prefix_path: str, dataset: str = "array", mode: str = "truncate") -> str:
+    def save_hdf(
+        self, prefix_path: str, dataset: str = "array", mode: str = "truncate"
+    ) -> str:
         """
         Save the pdarray to HDF5. The result is a collection of HDF5 files,
         one file per locale of the arkouda server, where each filename starts
@@ -1374,7 +1420,11 @@ def save_hdf(
         True
         """
         return self.save(
-            prefix_path=prefix_path, dataset=dataset, mode=mode, compressed=False, file_format="HDF5"
+            prefix_path=prefix_path,
+            dataset=dataset,
+            mode=mode,
+            compressed=False,
+            file_format="HDF5",
         )
 
     @typechecked
@@ -1427,7 +1477,9 @@ def register(self, user_defined_name: str) -> pdarray:
         >>> b.unregister()
         """
         try:
-            rep_msg = generic_msg(cmd="register", args=f"{self.name} {user_defined_name}")
+            rep_msg = generic_msg(
+                cmd="register", args=f"{self.name} {user_defined_name}"
+            )
             if isinstance(rep_msg, bytes):
                 rep_msg = str(rep_msg, "UTF-8")
             if rep_msg != "success":
@@ -1436,7 +1488,9 @@ def register(self, user_defined_name: str) -> pdarray:
             RuntimeError,
             RegistrationError,
         ):  # Registering two objects with the same name is not allowed
-            raise RegistrationError(f"Server was unable to register {user_defined_name}")
+            raise RegistrationError(
+                f"Server was unable to register {user_defined_name}"
+            )
         self.name = user_defined_name
         return self
 
@@ -1538,7 +1592,9 @@ def _get_grouping_keys(self) -> List[pdarray]:
             # Integral pdarrays are their own grouping keys
             return [self]
         else:
-            raise TypeError("Grouping is only supported on numeric data (integral types) and bools.")
+            raise TypeError(
+                "Grouping is only supported on numeric data (integral types) and bools."
+            )
 
 
 # end pdarray class def
@@ -1864,7 +1920,9 @@ def mean(pda: pdarray) -> np.float64:
     RuntimeError
         Raised if there's a server-side error thrown
     """
-    return np.float64(pda.sum()) / pda.size
+    return np.float64(cast(Union[int, np.int64, np.float64], pda.sum())) / cast(
+        Union[int, np.int64], pda.size
+    )
 
 
 @typechecked
diff --git a/arkouda/timeclass.py b/arkouda/timeclass.py
index 207b87cc41..a9963f39da 100644
--- a/arkouda/timeclass.py
+++ b/arkouda/timeclass.py
@@ -1,5 +1,4 @@
 import datetime
-from typing import Union
 from warnings import warn
 
 import numpy as np  # type: ignore
@@ -10,7 +9,7 @@
 from pandas import timedelta_range as pd_timedelta_range  # type: ignore
 from pandas import to_datetime, to_timedelta  # type: ignore
 
-from arkouda.dtypes import int64, intTypes, isSupportedInt
+from arkouda.dtypes import int64, int_scalars, intTypes, isSupportedInt
 from arkouda.numeric import abs as akabs
 from arkouda.numeric import cast
 from arkouda.pdarrayclass import pdarray
@@ -34,13 +33,13 @@
 }
 
 _unit2factor = {
-    "w": 7 * 24 * 60 * 60 * 10**9,
-    "d": 24 * 60 * 60 * 10**9,
-    "h": 60 * 60 * 10**9,
-    "m": 60 * 10**9,
-    "s": 10**9,
-    "ms": 10**6,
-    "us": 10**3,
+    "w": 7 * 24 * 60 * 60 * 10 ** 9,
+    "d": 24 * 60 * 60 * 10 ** 9,
+    "h": 60 * 60 * 10 ** 9,
+    "m": 60 * 10 ** 9,
+    "s": 10 ** 9,
+    "ms": 10 ** 6,
+    "us": 10 ** 3,
     "ns": 1,
 }
 
@@ -66,7 +65,9 @@ class _Timescalar:
     def __init__(self, scalar):
         if isinstance(scalar, np.datetime64) or isinstance(scalar, datetime.datetime):
             scalar = to_datetime(scalar).to_numpy()
-        elif isinstance(scalar, np.timedelta64) or isinstance(scalar, datetime.timedelta):
+        elif isinstance(scalar, np.timedelta64) or isinstance(
+            scalar, datetime.timedelta
+        ):
             scalar = to_timedelta(scalar).to_numpy()
         self.unit = np.datetime_data(scalar.dtype)[0]
         self._factor = _get_factor(self.unit)
@@ -93,12 +94,16 @@ def __init__(self, array, unit: str = _BASE_UNIT):  # type: ignore
         # Convert the input to int64 pdarray of nanoseconds
         elif isinstance(array, pdarray):
             if array.dtype not in intTypes:
-                raise TypeError(f"{self.__class__.__name__} array must have int64 dtype")
+                raise TypeError(
+                    f"{self.__class__.__name__} array must have int64 dtype"
+                )
             # Already int64 pdarray, just scale
             self.unit = unit
             self._factor = _get_factor(self.unit)
             # This makes a copy of the input array, to leave input unchanged
-            self.values = cast(self._factor * array, int64)  # Mimics a datetime64[ns] array
+            self.values = cast(
+                self._factor * array, int64
+            )  # Mimics a datetime64[ns] array
         elif hasattr(array, "dtype"):
             # Handles all pandas and numpy datetime/timedelta arrays
             if array.dtype.kind not in ("M", "m"):
@@ -202,7 +207,8 @@ def round(self, freq):
     def to_ndarray(self):
         __doc__ = super().to_ndarray.__doc__  # noqa
         return np.array(
-            self.values.to_ndarray(), dtype="{}64[ns]".format(self.__class__.__name__.lower())
+            self.values.to_ndarray(),
+            dtype="{}64[ns]".format(self.__class__.__name__.lower()),
         )
 
     def __str__(self):
@@ -231,7 +237,9 @@ def _binop(self, other, op):
         # 2) Get other's int64 data to combine with self's data
         if isinstance(other, Datetime) or self._is_datetime_scalar(other):
             if op not in self.supported_with_datetime:
-                raise TypeError(f"{op} not supported between {self.__class__.__name__} and Datetime")
+                raise TypeError(
+                    f"{op} not supported between {self.__class__.__name__} and Datetime"
+                )
             otherclass = "Datetime"
             if self._is_datetime_scalar(other):
                 otherdata = _Timescalar(other).value
@@ -239,15 +247,21 @@ def _binop(self, other, op):
                 otherdata = other.values
         elif isinstance(other, Timedelta) or self._is_timedelta_scalar(other):
             if op not in self.supported_with_timedelta:
-                raise TypeError(f"{op} not supported between {self.__class__.__name__} and Timedelta")
+                raise TypeError(
+                    f"{op} not supported between {self.__class__.__name__} and Timedelta"
+                )
             otherclass = "Timedelta"
             if self._is_timedelta_scalar(other):
                otherdata = _Timescalar(other).value
             else:
                 otherdata = other.values
-        elif (isinstance(other, pdarray) and other.dtype in intTypes) or isSupportedInt(other):
+        elif (isinstance(other, pdarray) and other.dtype in intTypes) or isSupportedInt(
+            other
+        ):
             if op not in self.supported_with_pdarray:
-                raise TypeError(f"{op} not supported between {self.__class__.__name__} and integer")
+                raise TypeError(
+                    f"{op} not supported between {self.__class__.__name__} and integer"
+                )
             otherclass = "pdarray"
             otherdata = other
         else:
@@ -265,7 +279,9 @@ def _r_binop(self, other, op):
         # First case is pdarray self
         if isinstance(other, pdarray) and other.dtype in intTypes:
             if op not in self.supported_with_r_pdarray:
-                raise TypeError(f"{op} not supported between int64 and {self.__class__.__name__}")
+                raise TypeError(
+                    f"{op} not supported between int64 and {self.__class__.__name__}"
+                )
             callback = self._get_callback("pdarray", op)
             # Need to use other._binop because self.values._r_binop can only handle scalars
             return callback(other._binop(self.values, op))
@@ -286,7 +302,9 @@ def _r_binop(self, other, op):
             otherdata = _Timescalar(other).value
         elif isSupportedInt(other):
             if op not in self.supported_with_r_pdarray:
-                raise TypeError(f"{op} not supported between int64 and {self.__class__.__name__}")
+                raise TypeError(
+                    f"{op} not supported between int64 and {self.__class__.__name__}"
+                )
             otherclass = "pdarray"
             otherdata = other
         else:
@@ -298,7 +316,9 @@ def opeq(self, other, op):
         if isinstance(other, Timedelta) or self._is_timedelta_scalar(other):
             if op not in self.supported_opeq:
-                raise TypeError(f"{self.__class__.__name__} {op} Timedelta not supported")
+                raise TypeError(
+                    f"{self.__class__.__name__} {op} Timedelta not supported"
+                )
             if self._is_timedelta_scalar(other):
                 otherdata = _Timescalar(other).value
             else:
                 otherdata = other.values
@@ -478,8 +498,12 @@ class Timedelta(_AbstractBaseTime):
 
     supported_with_datetime = frozenset(("+"))
     supported_with_r_datetime = frozenset(("+", "-", "/", "//", "%"))
-    supported_with_timedelta = frozenset(("==", "!=", "<", "<=", ">", ">=", "+", "-", "/", "//", "%"))
-    supported_with_r_timedelta = frozenset(("==", "!=", "<", "<=", ">", ">=", "+", "-", "/", "//", "%"))
+    supported_with_timedelta = frozenset(
+        ("==", "!=", "<", "<=", ">", ">=", "+", "-", "/", "//", "%")
+    )
+    supported_with_r_timedelta = frozenset(
+        ("==", "!=", "<", "<=", ">", ">=", "+", "-", "/", "//", "%")
+    )
     supported_opeq = frozenset(("+=", "-=", "%="))
     supported_with_pdarray = frozenset(("*", "//"))
     supported_with_r_pdarray = frozenset(("*"))
@@ -516,7 +540,7 @@ def to_pandas(self):
         """
         return to_timedelta(self.to_ndarray())
 
-    def std(self, ddof: Union[int, np.int64, np.uint64] = 0):
+    def std(self, ddof: int_scalars = 0):
         """
         Returns the standard deviation as a pd.Timedelta object
         """
@@ -593,16 +617,29 @@ def date_range(
     """
    if closed is not None:
        warn(
-            "closed has been deprecated. Please use the inclusive parameter instead.", DeprecationWarning
+            "closed has been deprecated. Please use the inclusive parameter instead.",
+            DeprecationWarning,
        )
        inclusive = closed
 
    return Datetime(
-        pd_date_range(start, end, periods, freq, tz, normalize, name, inclusive=inclusive, **kwargs)
+        pd_date_range(
+            start,
+            end,
+            periods,
+            freq,
+            tz,
+            normalize,
+            name,
+            inclusive=inclusive,
+            **kwargs,
+        )
    )
 
 
-def timedelta_range(start=None, end=None, periods=None, freq=None, name=None, closed=None, **kwargs):
+def timedelta_range(
+    start=None, end=None, periods=None, freq=None, name=None, closed=None, **kwargs
+):
     """Return a fixed frequency TimedeltaIndex, with day as the default
     frequency. Alias for ``ak.Timedelta(pd.timedelta_range(args))``.
     Subject to size limit imposed by client.maxTransferBytes.
@@ -637,4 +674,6 @@ def timedelta_range(
     To learn more about the frequency strings, please see
     `this link `__.
     """
-    return Timedelta(pd_timedelta_range(start, end, periods, freq, name, closed, **kwargs))
+    return Timedelta(
+        pd_timedelta_range(start, end, periods, freq, name, closed, **kwargs)
+    )
diff --git a/tests/dtypes_tests.py b/tests/dtypes_tests.py
index d13d7ed9fe..4df6a7fe6c 100644
--- a/tests/dtypes_tests.py
+++ b/tests/dtypes_tests.py
@@ -150,23 +150,20 @@ def test_SeriesDTypes(self):
 
     def test_scalars(self):
         self.assertEqual("typing.Union[bool, numpy.bool_]", str(ak.bool_scalars))
-        self.assertEqual("typing.Union[float, numpy.float64]", str(ak.float_scalars))
-        self.assertEqual("typing.Union[int, numpy.int64, numpy.uint64]", str(ak.int_scalars))
-        self.assertEqual(
-            "typing.Union[float, numpy.float64, int, numpy.int64, numpy.uint8, numpy.uint64]",
-            str(ak.numeric_scalars),
-        )
-        self.assertEqual("typing.Union[str, numpy.str_]", str(ak.str_scalars))
-        self.assertEqual(
-            "typing.Union[numpy.float64, numpy.int64, numpy.bool_, numpy.uint8, "
-            "numpy.str_, numpy.uint64]",
-            str(ak.numpy_scalars),
-        )
-        self.assertEqual(
-            "typing.Union[float, numpy.float64, int, numpy.int64, numpy.uint64, bool, "
-            "numpy.bool_, str, numpy.str_]",
-            str(ak.all_scalars),
-        )
+        self.assertEqual('typing.Union[float, numpy.float64]', str(ak.float_scalars))
+        self.assertEqual(('typing.Union[int, numpy.int8, numpy.int16, numpy.int32, numpy.int64, ' +
+                          'numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64]'), str(ak.int_scalars))
+        self.assertEqual(('typing.Union[float, numpy.float64, int, numpy.int8, numpy.int16, numpy.int32, ' +
+                          'numpy.int64, numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64]'),
+                         str(ak.numeric_scalars))
+        self.assertEqual('typing.Union[str, numpy.str_]', str(ak.str_scalars))
+        self.assertEqual(('typing.Union[numpy.float64, numpy.int8, numpy.int16, numpy.int32, ' +
+                          'numpy.int64, numpy.bool_, numpy.str_, numpy.uint8, numpy.uint16, numpy.uint32, ' +
+                          'numpy.uint64]'),
+                         str(ak.numpy_scalars))
+        self.assertEqual(('typing.Union[bool, numpy.bool_, float, numpy.float64, int, numpy.int8, ' +
+                          'numpy.int16, numpy.int32, numpy.int64, numpy.uint8, numpy.uint16, numpy.uint32,' +
+                          ' numpy.uint64, numpy.str_, str]'), str(ak.all_scalars))
 
     def test_number_format_strings(self):
         self.assertEqual("{}", dtypes.NUMBER_FORMAT_STRINGS["bool"])
diff --git a/tests/extrema_test.py b/tests/extrema_test.py
index b48717a521..4f6e88bbc4 100644
--- a/tests/extrema_test.py
+++ b/tests/extrema_test.py
@@ -87,7 +87,7 @@ def test_error_handling(self):
             ak.mink(list(range(0, 10)), 1)
 
         with self.assertRaises(TypeError):
-            ak.mink(testArray, "1")
+            ak.mink(testArray, '1')
 
         with self.assertRaises(ValueError):
             ak.mink(testArray, -1)
@@ -110,11 +110,11 @@ def test_error_handling(self):
         testArray = ak.randint(0, 100, 100)
 
         with self.assertRaises(TypeError):
-            ak.maxk(list(range(0, 10)), 1)
+            ak.maxk(list(range(0,10)), 1)
 
         with self.assertRaises(TypeError):
-            ak.maxk(testArray, "1")
-
+            ak.maxk(testArray, '1')
+
         with self.assertRaises(ValueError):
             ak.maxk(testArray, -1)
@@ -136,10 +136,10 @@ def test_error_handling(self):
         testArray = ak.randint(0, 100, 100)
 
         with self.assertRaises(TypeError):
-            ak.argmink(list(range(0, 10)), 1)
+            ak.argmink(list(range(0,10)), 1)
 
         with self.assertRaises(TypeError):
-            ak.argmink(testArray, "1")
+            ak.argmink(testArray, '1')
 
         with self.assertRaises(ValueError):
             ak.argmink(testArray, -1)
@@ -162,10 +162,10 @@ def test_error_handling(self):
         testArray = ak.randint(0, 100, 100)
 
         with self.assertRaises(TypeError):
-            ak.argmaxk(list(range(0, 10)), 1)
+            ak.argmaxk(list(range(0,10)), 1)
 
         with self.assertRaises(TypeError):
-            ak.argmaxk(testArray, "1")
+            ak.argmaxk(testArray, '1')
 
         with self.assertRaises(ValueError):
             ak.argmaxk(testArray, -1)
diff --git a/tests/numeric_test.py b/tests/numeric_test.py
index 1f260f681a..4981866cc9 100644
--- a/tests/numeric_test.py
+++ b/tests/numeric_test.py
@@ -128,13 +128,13 @@ def testHistogram(self):
         self.assertEqual(int, result.dtype)
 
         with self.assertRaises(TypeError):
-            ak.histogram([range(0, 10)], bins=1)
-
+            ak.histogram([range(0,10)], bins=1)
+
         with self.assertRaises(TypeError):
-            ak.histogram(pda, bins="1")
-
+            ak.histogram(pda, bins='1')
+
         with self.assertRaises(TypeError):
-            ak.histogram([range(0, 10)], bins="1")
+            ak.histogram([range(0,10)], bins='1')
 
     def testLog(self):
         na = np.linspace(1, 10, 10)
@@ -142,7 +142,7 @@ def testLog(self):
         self.assertTrue((np.log(na) == ak.log(pda).to_ndarray()).all())
 
         with self.assertRaises(TypeError):
-            ak.log([range(0, 10)])
+            ak.log([range(0,10)])
 
     def testExp(self):
         na = np.linspace(1, 10, 10)
@@ -150,7 +150,7 @@ def testExp(self):
         self.assertTrue((np.exp(na) == ak.exp(pda).to_ndarray()).all())
 
         with self.assertRaises(TypeError):
-            ak.exp([range(0, 10)])
+            ak.exp([range(0,10)])
 
     def testAbs(self):
         na = np.linspace(1, 10, 10)
diff --git a/tests/pdarray_creation_test.py b/tests/pdarray_creation_test.py
index b774613c8e..d7741f7cd1 100644
--- a/tests/pdarray_creation_test.py
+++ b/tests/pdarray_creation_test.py
@@ -104,6 +104,9 @@ def test_arange_dtype(self):
         self.assertListEqual(expected.to_ndarray().tolist(), uint_input.to_ndarray().tolist())
         self.assertEqual(ak.uint64, uint_input.dtype)
 
+        # test int_scalars covers uint8, uint16, uint32
+        ak.arange(np.uint8(1), np.uint16(1000), np.uint32(1))
+
     def test_randint(self):
         testArray = ak.randint(0, 10, 5)
         self.assertIsInstance(testArray, ak.pdarray)
@@ -159,14 +162,17 @@ def test_randint(self):
         with self.assertRaises(ValueError):
             ak.randint(low=1, high=0, size=1, dtype=ak.float64)
 
-        with self.assertRaises(TypeError):
-            ak.randint(0, 1, "1000")
+        with self.assertRaises(TypeError):
+            ak.randint(0,1,'1000')
 
-        with self.assertRaises(TypeError):
-            ak.randint("0", 1, 1000)
+        with self.assertRaises(TypeError):
+            ak.randint('0',1,1000)
+
+        with self.assertRaises(TypeError):
+            ak.randint(0,'1',1000)
 
-        with self.assertRaises(TypeError):
-            ak.randint(0, "1", 1000)
+        # Test that int_scalars covers uint8, uint16, uint32
+        ak.randint(low=np.uint8(1), high=np.uint16(100), size=np.uint32(100))
 
     def test_randint_with_seed(self):
         values = ak.randint(1, 5, 10, seed=2)
@@ -204,6 +210,9 @@ def test_randint_with_seed(self):
             (ak.array([False, True, True, True, True, False, True, True, True, True]) == values).all()
         )
 
+        # Test that int_scalars covers uint8, uint16, uint32
+        ak.randint(np.uint8(1), np.uint32(5), np.uint16(10), seed=np.uint8(2))
+
     def test_uniform(self):
         testArray = ak.uniform(3)
         self.assertIsInstance(testArray, ak.pdarray)
@@ -232,7 +241,10 @@ def test_uniform(self):
             ak.uniform(low=0, high="5", size=100)
 
         with self.assertRaises(TypeError):
-            ak.uniform(low=0, high=5, size="100")
+            ak.uniform(low=0, high=5, size='100')
+
+        # Test that int_scalars covers uint8, uint16, uint32
+        ak.uniform(low=np.uint8(0), high=5, size=np.uint32(100))
 
     def test_zeros(self):
         intZeros = ak.zeros(5, dtype=ak.int64)
@@ -258,9 +270,14 @@ def test_zeros(self):
             ak.zeros(5, dtype=ak.uint8)
 
         with self.assertRaises(TypeError):
-            ak.zeros(5, dtype=str)
-
-    def test_ones(self):
+            ak.zeros(5, dtype=str)
+
+        # Test that int_scalars covers uint8, uint16, uint32
+        ak.zeros(np.uint8(5), dtype=ak.int64)
+        ak.zeros(np.uint16(5), dtype=ak.int64)
+        ak.zeros(np.uint32(5), dtype=ak.int64)
+
+    def test_ones(self):
         intOnes = ak.ones(5, dtype=int)
         self.assertIsInstance(intOnes, ak.pdarray)
         self.assertEqual(int, intOnes.dtype)
@@ -289,7 +306,12 @@ def test_ones(self):
         with self.assertRaises(TypeError):
             ak.ones(5, dtype=str)
 
-    def test_ones_like(self):
+        # Test that int_scalars covers uint8, uint16, uint32
+        ak.ones(np.uint8(5), dtype=ak.int64)
+        ak.ones(np.uint16(5), dtype=ak.int64)
+        ak.ones(np.uint32(5), dtype=ak.int64)
+
+    def test_ones_like(self):
         intOnes = ak.ones(5, dtype=ak.int64)
         intOnesLike = ak.ones_like(intOnes)
 
@@ -344,6 +366,11 @@ def test_full(self):
         with self.assertRaises(TypeError):
             ak.full(5, 8, dtype=str)
 
+        # Test that int_scalars covers uint8, uint16, uint32
+        ak.full(np.uint8(5), np.uint16(5), dtype=int)
+        ak.full(np.uint8(5), np.uint32(5), dtype=int)
+        ak.full(np.uint16(5), np.uint32(5), dtype=int)
+
     def test_full_like(self):
         int_full = ak.full(5, 6, dtype=ak.int64)
         int_full_like = ak.full_like(int_full, 6)
@@ -397,15 +424,20 @@ def test_linspace(self):
         pda = ak.linspace(start=float(5.0), stop=float(0.0), length=np.int64(6))
         self.assertEqual(5.0000, pda[0])
         self.assertEqual(0.0000, pda[5])
+
+        with self.assertRaises(TypeError):
+            ak.linspace(0,'100', 1000)
 
-        with self.assertRaises(TypeError):
-            ak.linspace(0, "100", 1000)
+        with self.assertRaises(TypeError):
+            ak.linspace('0',100, 1000)
 
-        with self.assertRaises(TypeError):
-            ak.linspace("0", 100, 1000)
+        with self.assertRaises(TypeError):
+            ak.linspace(0,100,'1000')
 
-        with self.assertRaises(TypeError):
-            ak.linspace(0, 100, "1000")
+        # Test that int_scalars covers uint8, uint16, uint32
+        ak.linspace(np.uint8(0),np.uint16(100),np.uint32(1000))
+        ak.linspace(np.uint32(0),np.uint16(100),np.uint8(1000))
+        ak.linspace(np.uint16(0),np.uint8(100),np.uint8(1000))
 
     def test_standard_normal(self):
         pda = ak.standard_normal(100)
@@ -425,18 +457,23 @@ def test_standard_normal(self):
         npda = pda.to_ndarray()
         pda = ak.standard_normal(np.int64(100), np.int64(1))
+
+        self.assertTrue((npda == pda.to_ndarray()).all())
 
-        self.assertTrue((npda == pda.to_ndarray()).all())
-
-        with self.assertRaises(TypeError):
-            ak.standard_normal("100")
-
-        with self.assertRaises(TypeError):
-            ak.standard_normal(100.0)
+        with self.assertRaises(TypeError):
+            ak.standard_normal('100')
+
+        with self.assertRaises(TypeError):
+            ak.standard_normal(100.0)
 
-        with self.assertRaises(ValueError):
+        with self.assertRaises(ValueError):
             ak.standard_normal(-1)
 
+        # Test that int_scalars covers uint8, uint16, uint32
+        ak.standard_normal(np.uint8(100))
+        ak.standard_normal(np.uint16(100))
+        ak.standard_normal(np.uint32(100))
+
     def test_random_strings_uniform(self):
         pda = ak.random_strings_uniform(minlen=1, maxlen=5, size=100)
         nda = pda.to_ndarray()
@@ -466,15 +503,16 @@ def test_random_strings_uniform(self):
         with self.assertRaises(ValueError):
             ak.random_strings_uniform(maxlen=5, minlen=5, size=10)
+
+        with self.assertRaises(TypeError):
+            ak.random_strings_uniform(minlen='1', maxlen=5, size=10)
 
-        with self.assertRaises(TypeError):
-            ak.random_strings_uniform(minlen="1", maxlen=5, size=10)
+        with self.assertRaises(TypeError):
+            ak.random_strings_uniform( minlen=1, maxlen='5', size=10)
 
-        with self.assertRaises(TypeError):
-            ak.random_strings_uniform(minlen=1, maxlen="5", size=10)
+        with self.assertRaises(TypeError):
+            ak.random_strings_uniform(minlen=1, maxlen=5, size='10')
 
-        with self.assertRaises(TypeError):
-            ak.random_strings_uniform(minlen=1, maxlen=5, size="10")
 
     def test_random_strings_uniform_with_seed(self):
         pda = ak.random_strings_uniform(minlen=1, maxlen=5, seed=1, size=10)
@@ -496,6 +534,9 @@ def test_random_strings_uniform_with_seed(self):
             (ak.array(["+5", "fp-P", "3Q4k", "~H", "F", "F=`,", "E", "YD", "kBa'", "(t5"]) == pda).all()
         )
 
+        # Test that int_scalars covers uint8, uint16, uint32
+        pda = ak.random_strings_uniform(minlen=np.uint8(1), maxlen=np.uint32(5), seed=np.uint16(1), size=np.uint8(10), characters='printable')
+
     def test_random_strings_lognormal(self):
         pda = ak.random_strings_lognormal(2, 0.25, 100, characters="printable")
         self.assertIsInstance(pda, ak.Strings)
@@ -542,16 +583,19 @@ def test_random_strings_lognormal(self):
         self.assertIsInstance(pda, ak.Strings)
         self.assertEqual(100, len(pda))
         self.assertEqual(str, pda.dtype)
-
-        with self.assertRaises(TypeError):
-            ak.random_strings_lognormal("2", 0.25, 100)
-
-        with self.assertRaises(TypeError):
-            ak.random_strings_lognormal(2, 0.25, "100")
-
-        with self.assertRaises(TypeError):
+
+        with self.assertRaises(TypeError):
+            ak.random_strings_lognormal('2', 0.25, 100)
+
+        with self.assertRaises(TypeError):
+            ak.random_strings_lognormal(2, 0.25, '100')
+
+        with self.assertRaises(TypeError):
             ak.random_strings_lognormal(2, 0.25, 100, 1000000)
 
+        # Test that int_scalars covers uint8, uint16, uint32
+        ak.random_strings_lognormal(np.uint8(2), 0.25, np.uint16(100))
+
     def test_random_strings_lognormal_with_seed(self):
         pda = ak.random_strings_lognormal(2, 0.25, 10, seed=1)
 
@@ -727,15 +771,20 @@ def test_fill(self):
         ones.fill(2)
         self.assertTrue((2 == ones.to_ndarray()).all())
-
-        ones.fill(np.int64(2))
-        self.assertTrue((np.int64(2) == ones.to_ndarray()).all())
-
-        ones.fill(float(2))
-        self.assertTrue((float(2) == ones.to_ndarray()).all())
-
-        ones.fill(np.float64(2))
-        self.assertTrue((np.float64(2) == ones.to_ndarray()).all())
+
+        ones.fill(np.int64(2))
+        self.assertTrue((np.int64(2) == ones.to_ndarray()).all())
+
+        ones.fill(float(2))
+        self.assertTrue((float(2) == ones.to_ndarray()).all())
+
+        ones.fill(np.float64(2))
+        self.assertTrue((np.float64(2) == ones.to_ndarray()).all())
+
+        # Test that int_scalars covers uint8, uint16, uint32
+        ones.fill(np.uint8(2))
+        ones.fill(np.uint16(2))
+        ones.fill(np.uint32(2))
 
     def test_endian(self):
         N = 100
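
Illustrative usage (not part of the patch): with the widened int_scalars and
numeric_scalars aliases above, NumPy's smaller fixed-width integers pass the
client-side type checks anywhere a bound or size is typed as int_scalars, and
resolve_scalar_dtype maps every integer scalar other than np.uint64 onto the
server's int64 dtype. A minimal sketch of the behavior the new tests exercise,
assuming a running arkouda_server on the default host and port:

import numpy as np
import arkouda as ak
from arkouda.dtypes import resolve_scalar_dtype

ak.connect()  # assumes arkouda_server is already running

# Smaller signed/unsigned widths are now accepted, mirroring the new tests
a = ak.arange(np.uint8(1), np.uint16(1000), np.uint32(1))
z = ak.zeros(np.uint16(5), dtype=ak.int64)
z.fill(np.uint8(2))

# Integer scalars resolve to the server-side 64-bit dtypes
print(resolve_scalar_dtype(np.int16(7)))   # "int64"
print(resolve_scalar_dtype(np.uint64(7)))  # "uint64"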