From 65ad5008238cd96896f422764408f58385965f01 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Thu, 19 Nov 2020 10:33:48 -0500 Subject: [PATCH 01/68] add the suffix array function to Arkouda --- src/SACA.chpl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 src/SACA.chpl diff --git a/src/SACA.chpl b/src/SACA.chpl new file mode 100644 index 0000000000..b3b53267f6 --- /dev/null +++ b/src/SACA.chpl @@ -0,0 +1,19 @@ +module SACA{ +// In this module, different algorithms to construct suffix array are provided +//Nov.15, 2020 + +// The first algorithm is divsufsort which is the fastest sequential and OpenMP c codes on suffix array +require "../../../SA/libdivsufsort/include/config.h"; +require "../../../SA/libdivsufsort/include/divsufsort.h"; +require "../../../SA/libdivsufsort/include/divsufsort_private.h"; +require "../../../SA/libdivsufsort/include/lfs.h"; + +require "../../../SA/libdivsufsort/lib/divsufsort.c"; +require "../../../SA/libdivsufsort/lib/sssort.c"; +require "../../../SA/libdivsufsort/lib/trsort.c"; +require "../../../SA/libdivsufsort/lib/utils.c"; +/* +require "/home/z/zd4/SA/nong/saca-k-tois-20130413/saca-k/saca-k.cc"; +*/ +extern proc divsufsort(inputstr:[] uint(8),suffixarray:[] int(32),totallen:int(32)); +} From 7af0b51ac416db0be7a2181a454b03dfb4591247 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Thu, 19 Nov 2020 11:36:20 -0500 Subject: [PATCH 02/68] add suffix array benchmark sa.py --- arkouda/pdarrayclass.py | 31 +++ arkouda/pdarraycreation.py | 61 ++++- arkouda/strings.py | 310 +++++++++++++++++++++++++- benchmarks/gather.py | 16 +- benchmarks/sa.py | 67 ++++++ benchmarks/scatter.py | 4 + src/SegmentedArray.chpl | 444 ++++++++++++++++++++++++++++++++++++- src/SegmentedMsg.chpl | 88 ++++++++ src/arkouda_server.chpl | 1 + 9 files changed, 1015 insertions(+), 7 deletions(-) create mode 100755 benchmarks/sa.py diff --git a/arkouda/pdarrayclass.py b/arkouda/pdarrayclass.py index 86ce546f11..f7724d8e07 100755 --- a/arkouda/pdarrayclass.py +++ b/arkouda/pdarrayclass.py @@ -60,6 +60,37 @@ def unescape(s): raise ValueError(("unsupported value from server {} {}".\ format(mydtype.name, value))) + + + +@typechecked +def parse_single_int_array_value(msg : str) -> object: + """ + Attempt to convert a scalar return value from the arkouda server to a + numpy string in Python. The user should not call this function directly. + + Parameters + ---------- + msg : str + scalar value in string form to be converted to a numpy string + + Returns + ------- + object numpy scalar + """ + fields = msg.split(" ",1) + dtname=fields[0] + mydtype = dtype(dtname) + if mydtype == bool: + if value == "True": + return bool(True) + elif value == "False": + return bool(False) + else: + raise ValueError(("unsupported value from server {} {}".\ + format(mydtype.name, value))) + return fields[1] + # class for the pdarray class pdarray: """ diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 1de19e178d..e827c7a33c 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -8,10 +8,11 @@ from arkouda.dtypes import dtype as akdtype from arkouda.pdarrayclass import pdarray, create_pdarray from arkouda.strings import Strings +from arkouda.strings import Pdarrays __all__ = ["array", "zeros", "ones", "zeros_like", "ones_like", "arange", "linspace", "randint", "uniform", "standard_normal", - "random_strings_uniform", "random_strings_lognormal"] + "random_strings_uniform", "random_strings_lognormal" ,"suffix_array"] numericDTypes = frozenset(["bool", "int64", "float64"]) @@ -600,10 +601,17 @@ def random_strings_uniform(minlen : int, maxlen : int, size : int, "uniform", characters, NUMBER_FORMAT_STRINGS['int64'].format(minlen), NUMBER_FORMAT_STRINGS['int64'].format(maxlen)) +# print("In random_strings_uniform, msg={}".format(msg)) repMsg = generic_msg(msg) +# print("In random_strings_uniform,repMsg={}".format(repMsg)) +# print("In random_strings_uniform,split={}".format(repMsg.split('+'))) +# print("In random_strings_uniform,*split={}".format(*(repMsg.split('+')))) return Strings(*(repMsg.split('+'))) + + + def random_strings_lognormal(logmean : Union[float, int], logstd : float, size : int, characters : str='uppercase') -> Strings: """ @@ -662,3 +670,54 @@ def random_strings_lognormal(logmean : Union[float, int], logstd : float, NUMBER_FORMAT_STRINGS['float64'].format(logstd)) repMsg = generic_msg(msg) return Strings(*(repMsg.split('+'))) + + + +@typechecked +def suffix_array( strings : Strings) -> Pdarrays: + """ + Return the suffix arrays of given strings. The size/shape of each suffix + arrays is the same as the corresponding strings. + A simple example of suffix array is as follow. Given string "banana$", + all the suffixes are as follows. + s[0]="banana$" + s[1]="anana$" + s[2]="nana$" + s[3]="ana$" + s[4]="na$" + s[5]="a$" + s[6]="$" + The suffix array of string "banana$" is the array of indices of sorted suffixes. + s[6]="$" + s[5]="a$" + s[3]="ana$" + s[1]="anana$" + s[0]="banana$" + s[4]="na$" + s[2]="nana$" + so sa=[6,5,3,1,0,4,2] + + Returns + ------- + pdarray + The suffix arrays of the given strings + + See Also + -------- + + Notes + ----- + + Raises + ------ + RuntimeError + Raised if there is a server-side error in executing group request or + creating the pdarray encapsulating the return message + """ + msg = "segmentedSuffixAry {} {} {}".format( strings.objtype, + strings.offsets.name, + strings.bytes.name) + repMsg = generic_msg(msg) + pdarrays= Pdarrays(*(repMsg.split('+'))) + return pdarrays + diff --git a/arkouda/strings.py b/arkouda/strings.py index 6d0bee61f3..eaca43bfae 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -1,7 +1,7 @@ from __future__ import annotations from typing import Tuple, Union from arkouda.client import generic_msg, pdarrayIterThresh -from arkouda.pdarrayclass import pdarray, create_pdarray, parse_single_value +from arkouda.pdarrayclass import pdarray, create_pdarray, parse_single_value,parse_single_int_array_value from arkouda.dtypes import * from arkouda.dtypes import NUMBER_FORMAT_STRINGS from arkouda.logger import getArkoudaLogger @@ -11,7 +11,7 @@ global verbose global pdarrayIterThresh -__all__ = ['Strings'] +__all__ = ['Strings','Pdarrays'] class Strings: """ @@ -75,16 +75,20 @@ def __init__(self, offset_attrib : Union[pdarray,np.ndarray], from either the offset_attrib or bytes_attrib parameter """ if isinstance(offset_attrib, pdarray): +# print("In Strings init 1 offset_attrib={}".format(offset_attrib)) self.offsets = offset_attrib else: try: +# print("In Strings init 2 offset_attrib={}".format(offset_attrib)) self.offsets = create_pdarray(offset_attrib) except Exception as e: raise RuntimeError(e) if isinstance(bytes_attrib, pdarray): +# print("In Strings init 1 bytes_attrib={}".format(bytes_attrib)) self.bytes = bytes_attrib else: try: +# print("In Strings init 1 bytes_attrib={}".format(bytes_attrib)) self.bytes = create_pdarray(bytes_attrib) except Exception as e: raise RuntimeError(e) @@ -241,6 +245,7 @@ def get_lengths(self) -> pdarray: msg = "segmentLengths {} {} {}".\ format(self.objtype, self.offsets.name, self.bytes.name) repMsg = generic_msg(msg) + print("executed get_lengths and repMsg={}".format(repMsg)) return create_pdarray(repMsg) def contains(self, substr : Union[str, bytes]) -> pdarray: @@ -671,6 +676,7 @@ def group(self) -> pdarray: repMsg = generic_msg(msg) return create_pdarray(repMsg) + def to_ndarray(self) -> np.ndarray: """ Convert the array to a np.ndarray, transferring array data from the @@ -719,6 +725,9 @@ def to_ndarray(self) -> np.ndarray: res[i] = np.str_(''.join(chr(b) for b in npvalues[o:o+l])) return res + + + def save(self, prefix_path : str, dataset : str='strings_array', mode : str='truncate') -> None: """ @@ -778,3 +787,300 @@ def attach(user_defined_name : str) -> 'Strings': return Strings(pdarray.attach(user_defined_name+'_offsets'), pdarray.attach(user_defined_name+'_bytes')) + +class Pdarrays: + """ + Represents an array of (suffix) arrays whose data resides on the + arkouda server. The user should not call this class directly; + rather its instances are created by other arkouda functions. It is + very similar to Strings and the difference is that its content is + an int arrays instead of strings. + + Attributes + ---------- + offsets : pdarray + The starting indices for each suffix array + bytes : pdarray + The raw integer indices of all suffix arrays + size : int + The number of suffix arrays in the array + nbytes : int + The total number of indices in all suffix arrays + ndim : int + The rank of the array (currently only rank 1 arrays supported) + shape : tuple + The sizes of each dimension of the array + dtype : dtype + The dtype is np.int + logger : ArkoudaLogger + Used for all logging operations + + Notes + ----- + Pdarrays is composed of two pdarrays: (1) offsets, which contains the + starting indices for each string's suffix array and (2) bytes, which contains the + raw indices of all suffix arrays,no any spliter between the arrays. + """ + + BinOps = frozenset(["==", "!="]) + objtype = "int" + + def __init__(self, offset_attrib : Union[pdarray,np.ndarray], + bytes_attrib : Union[pdarray,np.ndarray]) -> None: + """ + Initializes the Pdarrays instance by setting all instance + attributes, some of which are derived from the array parameters. + + Parameters + ---------- + offset_attrib : Union[pdarray, np.ndarray,array] + the array containing the offsets + bytes_attrib : Union[pdarray, np.ndarray,array] + the array containing the suffix array values + + Returns + ------- + None + + Raises + ------ + RuntimeError + Raised if there's an error converting a Numpy array or standard + Python array to either the offset_attrib or bytes_attrib + ValueError + Raised if there's an error in generating instance attributes + from either the offset_attrib or bytes_attrib parameter + """ + if isinstance(offset_attrib, pdarray): + self.offsets = offset_attrib + else: + try: + self.offsets = create_pdarray(offset_attrib) + except Exception as e: + raise RuntimeError(e) + if isinstance(bytes_attrib, pdarray): + self.bytes = bytes_attrib + else: + try: + self.bytes = create_pdarray(bytes_attrib) + except Exception as e: + raise RuntimeError(e) + try: + self.size = self.offsets.size + self.nbytes = self.bytes.size + self.ndim = self.offsets.ndim + self.shape = self.offsets.shape + except Exception as e: + raise ValueError(e) + self.dtype = np.str + self.logger = getArkoudaLogger(name=__class__.__name__) + + def __iter__(self): + raise NotImplementedError('Strings does not support iteration') + + def __len__(self) -> int: + return self.shape[0] + + def __str__(self) -> str: + if self.size <= pdarrayIterThresh: + vals = ["'{}'".format(self[i]) for i in range(self.size)] + else: + vals = ["'{}'".format(self[i]) for i in range(3)] + vals.append('... ') + vals.extend([self[i] for i in range(self.size-3, self.size)]) + return "[{}]".format(', '.join(vals)) + + def __repr__(self) -> str: + return "array({})".format(self.__str__()) + + def _binop(self, other : Pdarrays, op : str) -> pdarray: + """ + Executes the requested binop on this Pdarrays instance and the + parameter Pdarrays object and returns the results within + a pdarray object. + + Parameters + ---------- + other : Pdarrays + the other object is a Pdarrays object + op : str + name of the binary operation to be performed + + Returns + ------- + pdarray + encapsulating the results of the requested binop + + Raises + - ----- + ValueError + Raised if (1) the op is not in the self.BinOps set, or (2) if the + sizes of this and the other instance don't match, or (3) the other + object is not a Pdarrays object + RuntimeError + Raised if a server-side error is thrown while executing the + binary operation + """ + if op not in self.BinOps: + raise ValueError("Pdarrays: unsupported operator: {}".format(op)) + if isinstance(other, Pdarrays): + if self.size != other.size: + raise ValueError("Pdarrays: size mismatch {} {}".\ + format(self.size, other.size)) + msg = "segmentedBinopvv {} {} {} {} {} {} {}".format(op, + self.objtype, + self.offsets.name, + self.bytes.name, + other.objtype, + other.offsets.name, + other.bytes.name) + elif resolve_scalar_dtype(other) == 'int': + msg = "segmentedBinopvs {} {} {} {} {} {}".format(op, + self.objtype, + self.offsets.name, + self.bytes.name, + self.objtype, + json.dumps([other])) + else: + raise ValueError("Pdarrays: {} not supported between Pdarrays and {}"\ + .format(op, other.__class__.__name__)) + repMsg = generic_msg(msg) + return create_pdarray(repMsg) + + def __eq__(self, other) -> bool: + return self._binop(other, "==") + + def __ne__(self, other : object) -> bool: + return self._binop(other, "!=") + + def __getitem__(self, key): + if np.isscalar(key) and resolve_scalar_dtype(key) == 'int64': + orig_key = key + if key < 0: + # Interpret negative key as offset from end of array + key += self.size + if (key >= 0 and key < self.size): + msg = "segmentedIndex {} {} {} {} {}".format('intIndex', + self.objtype, + self.offsets.name, + self.bytes.name, + key) + repMsg = generic_msg(msg) + _, value = repMsg.split(maxsplit=1) + return parse_single_int_array_value(value) + else: + raise IndexError("[int] {} is out of bounds with size {}".\ + format(orig_key,self.size)) + elif isinstance(key, slice): + (start,stop,stride) = key.indices(self.size) + self.logger.debug('start: {}; stop: {}; stride: {}'.format(start,stop,stride)) + msg = "segmentedIndex {} {} {} {} {} {} {}".format('sliceIndex', + self.objtype, + self.offsets.name, + self.bytes.name, + start, + stop, + stride) + repMsg = generic_msg(msg) + offsets, values = repMsg.split('+') + return Pdarrays(offsets, values); + elif isinstance(key, pdarray): + kind, _ = translate_np_dtype(key.dtype) + if kind not in ("bool", "int"): + raise TypeError("unsupported pdarray index type {}".format(key.dtype)) + if kind == "bool" and self.size != key.size: + raise ValueError("size mismatch {} {}".format(self.size,key.size)) + msg = "segmentedIndex {} {} {} {} {}".format('pdarrayIndex', + self.objtype, + self.offsets.name, + self.bytes.name, + key.name) + repMsg = generic_msg(msg) + offsets, values = repMsg.split('+') + return Pdarrays(offsets, values) + else: + raise TypeError("unsupported pdarray index type {}".format(key.__class__.__name__)) + + def get_lengths(self) -> pdarray: + """ + Return the length of each suffix array in the array. + + Returns + ------- + pdarray, int + The length of each string + + Raises + ------ + RuntimeError + Raised if there is a server-side error thrown + """ + msg = "segmentLengths {} {} {}".\ + format(self.objtype, self.offsets.name, self.bytes.name) + repMsg = generic_msg(msg) + return create_pdarray(repMsg) + + def __add__(self, other : Pdarrays) -> Pdarrays: + return self.stick(other) + + + def save(self, prefix_path : str, dataset : str='int_array', + mode : str='truncate') -> None: + """ + Save the Pdarrays object to HDF5. The result is a collection of HDF5 files, + one file per locale of the arkouda server, where each filename starts + with prefix_path. Each locale saves its chunk of the array to its + corresponding file. + + Parameters + ---------- + prefix_path : str + Directory and filename prefix that all output files share + dataset : str + The name of the Pdarrays dataset to be written, defaults to int_array + mode : str {'truncate' | 'append'} + By default, truncate (overwrite) output files, if they exist. + If 'append', create a new Pdarrays dataset within existing files. + + Returns + ------- + None + + Raises + ------ + ValueError + Raised if the lengths of columns and values differ, or the mode is + neither 'truncate' nor 'append' + + See Also + -------- + pdarrayIO.save + + Notes + ----- + Important implementation notes: (1) Pdarrays state is saved as two datasets + within an hdf5 group, (2) the hdf5 group is named via the dataset parameter, + (3) the hdf5 group encompasses the two pdarrays composing a Pdarrays object: + segments and values and (4) save logic is delegated to pdarray.save + """ + self.bytes.save(prefix_path=prefix_path, + dataset='{}/values'.format(dataset), mode=mode) + + @classmethod + def register_helper(cls, offsets, bytes): + return cls(offsets, bytes) + + def register(self, user_defined_name : str) -> 'Pdarrays': + return self.register_helper(self.offsets.register(user_defined_name+'_offsets'), + self.bytes.register(user_defined_name+'_bytes')) + + def unregister(self) -> None: + self.offsets.unregister() + self.bytes.unregister() + + @staticmethod + def attach(user_defined_name : str) -> 'Pdarrays': + return Strings(pdarray.attach(user_defined_name+'_offsets'), + pdarray.attach(user_defined_name+'_bytes')) + + diff --git a/benchmarks/gather.py b/benchmarks/gather.py index ecfddcd5c4..ec40c6056a 100755 --- a/benchmarks/gather.py +++ b/benchmarks/gather.py @@ -26,12 +26,26 @@ def time_ak_gather(isize, vsize, trials, dtype, random): v = ak.random_strings_uniform(1, 16, Nv) else: v = ak.ones(Nv, dtype=dtype) - + print("v={}".format(v)) + print("v.offsets={}".format(v.offsets)) + print("v.nbytes={}".format(v.nbytes)) + print("v[1]={}".format(v[1])) + print("In Gather size={}".format(v.size)) + print("In Gather nbytes={}".format(v.nbytes)) + print("In Gather ndim={}".format(v.ndim)) + print("In Gather shape={}".format(v.shape)) + print("In Gather offsets name ={}".format(v.offsets.name)) + print("In Gather offsets size={}".format(v.offsets.size)) + print("In Gather bytes name ={}".format(v.bytes.name)) + print("In Gather bytes size={}".format(v.bytes.size)) timings = [] for _ in range(trials): + print("In Gather loop i={}".format(i)) + print("In Gather v[i]={}".format(v[i])) start = time.time() c = v[i] end = time.time() + print("In Gather loop c={}".format(c)) timings.append(end - start) tavg = sum(timings) / trials diff --git a/benchmarks/sa.py b/benchmarks/sa.py new file mode 100755 index 0000000000..b33d175984 --- /dev/null +++ b/benchmarks/sa.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +import time, argparse +import numpy as np +import arkouda as ak + +TYPES = ('int64', 'float64', 'bool', 'str') + +def time_ak_sa( vsize, trials, dtype): + print(">>> arkouda suffix array") + cfg = ak.get_config() + Nv = vsize * cfg["numLocales"] + print("numLocales = {}, num of strings = {:,}".format(cfg["numLocales"], Nv)) + v = ak.random_strings_uniform(90000000, 100000000, Nv) +# print("All the random strings are as follows") +# print(v) + c=ak.suffix_array(v) + print("size of suffix array={}".format(c.bytes.size)) +# for k in range(vsize): +# print("the {} th suffix array ={}".format(k,c[k])) + timings = [] + for _ in range(trials): + start = time.time() + ak.suffix_array(v) + end = time.time() + timings.append(end - start) + tavg = sum(timings) / trials + + print("Average time = {:.4f} sec".format(tavg)) + if dtype == 'str': + offsets_transferred = 3 * c.offsets.size * c.offsets.itemsize + bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size) + bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg + else: + bytes_per_sec = (c.size * c.itemsize * 3) / tavg +# print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) + +def time_np_sa(Ni, Nv, trials, dtype, random): + print("to be done") + +def check_correctness(dtype, random): + print("to be done") + +def create_parser(): + parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= V") + parser.add_argument('hostname', help='Hostname of arkouda server') + parser.add_argument('port', type=int, help='Port of arkouda server') + parser.add_argument('-v', '--value-size', type=int, help='Length of array from which values are gathered') + parser.add_argument('-t', '--trials', type=int, default=6, help='Number of times to run the benchmark') + parser.add_argument('-d', '--dtype', default='str', help='Dtype of value array ({})'.format(', '.join(TYPES))) + return parser + +if __name__ == "__main__": + import sys + parser = create_parser() + args = parser.parse_args() + args.value_size = args.size if args.value_size is None else args.value_size + if args.dtype not in TYPES: + raise ValueError("Dtype must be {}, not {}".format('/'.join(TYPES), args.dtype)) + ak.verbose = False + ak.connect(args.hostname, args.port) + + print("size of values array = {:,}".format(args.value_size)) + print("number of trials = ", args.trials) + time_ak_sa( args.value_size, args.trials, args.dtype) + + sys.exit(0) diff --git a/benchmarks/scatter.py b/benchmarks/scatter.py index 932b7ef360..f3fc8fa7b7 100755 --- a/benchmarks/scatter.py +++ b/benchmarks/scatter.py @@ -25,9 +25,13 @@ def time_ak_scatter(isize, vsize, trials, dtype, random): timings = [] for _ in range(trials): + print("i={},c[i]={}".format(i, c[i])) + print("v={}".format(v)) start = time.time() c[i] = v end = time.time() + print("i={},c[i]={}".format(i, c[i])) + print("v={}".format(v)) timings.append(end - start) tavg = sum(timings) / trials diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl index 148fc98b18..80c25d869b 100644 --- a/src/SegmentedArray.chpl +++ b/src/SegmentedArray.chpl @@ -134,7 +134,14 @@ module SegmentedArray { end = offsets.a[idx+1] - 1; } // Take the slice of the bytearray and "cast" it to a chpl string - var s = interpretAsString(values.a[start..end]); +// var s = interpretAsString(values.a[start..end]); + var tmp=values.a[start..end]; + var s: string; + var i:int; + s=""; + for i in tmp do { + s=s+" "+ i:string; + } return s; } @@ -661,6 +668,430 @@ module SegmentedArray { } // class SegString + + /** + * Represents an array of arrays, implemented as a segmented array of integers. + * Instances are ephemeral, not stored in the symbol table. Instead, attributes + * of this class refer to symbol table entries that persist. This class is a + * convenience for bundling those persistent objects and defining suffix array-relevant + * operations. + */ + class SegArray { + + /** + * The name of the SymEntry corresponding to the pdarray containing + * the offsets, which are start indices for each suffix array + */ + var offsetName: string; + + /** + * The pdarray containing the offsets, which are the start indices of + * the integerarrays, each of whichs corresponds to an individual suffix array. + */ + var offsets: borrowed SymEntry(int); + + /** + * The name of the SymEntry corresponding to the pdarray containing + * the suffix array values where each value is integer array. + */ + var valueName: string; + + /** + * The pdaray containing the complete byte array composed of bytes + * corresponding to each string, joined by nulls. Note: the null byte + * is int value of zero. + */ + var values: borrowed SymEntry(int); + + /** + * The number of strings in the segmented array + */ + var size: int; + + /** + * The total number of integer in the entire segmented array including + * the integer index corresonding to the suffix array. + */ + var nBytes: int; + + /* + * This version of the init method is the most common and is only used + * when the names of the segments (offsets) and values SymEntries are known. + */ + proc init(segName: string, valName: string, st: borrowed SymTab) { + offsetName = segName; + // The try! is needed here because init cannot throw + var gs = try! st.lookup(segName); + // I want this to be borrowed, but that throws a lifetime error + var segs = toSymEntry(gs, int): unmanaged SymEntry(int); + offsets = segs; + valueName = valName; + + var vs = try! st.lookup(valName); + var vals = toSymEntry(vs, int): unmanaged SymEntry(int); + values = vals; + size = segs.size; + nBytes = vals.size; + } + + /* + * This version of init method takes segments and values arrays as + * inputs, generates the SymEntry objects for each and passes the + * offset and value SymTab lookup names to the alternate init method + */ + proc init(segments: [] int, values: [] int, st: borrowed SymTab) { + var oName = st.nextName(); + var segEntry = new shared SymEntry(segments); + try! st.addEntry(oName, segEntry); + var vName = st.nextName(); + var valEntry = new shared SymEntry(values); + try! st.addEntry(vName, valEntry); + this.init(oName, vName, st); + } + + proc show(n: int = 3) throws { + if (size >= 2*n) { + for i in 0..#n { + writeln(this[i]); + } + writeln("..."); + for i in size-n..#n { + writeln(this[i]); + } + } else { + for i in 0..#size { + writeln(this[i]); + } + } + } + + /* Retrieve one string from the array */ + proc this(idx: int): string throws { + if (idx < offsets.aD.low) || (idx > offsets.aD.high) { + throw new owned OutOfBoundsError(); + } + // Start index of the string + var start = offsets.a[idx]; + // Index of last (null) byte in string + var end: int; + if (idx == size - 1) { + end = nBytes - 1; + } else { + end = offsets.a[idx+1] - 1; + } + // Take the slice of the bytearray and "cast" it to a chpl string + var s = interpretAsString(values.a[start..end]); + return s; + } + + /* Take a slice of strings from the array. The slice must be a + Chapel range, i.e. low..high by stride, not a Python slice. + Returns arrays for the segment offsets and bytes of the slice.*/ + proc this(const slice: range(stridable=true)) throws { + if (slice.low < offsets.aD.low) || (slice.high > offsets.aD.high) { + throw new owned OutOfBoundsError(); + } + // Early return for zero-length result + if (size == 0) || (slice.size == 0) { + return (makeDistArray(0, int), makeDistArray(0, int)); + } + // Start of bytearray slice + var start = offsets.a[slice.low]; + // End of bytearray slice + var end: int; + if (slice.high == offsets.aD.high) { + // if slice includes the last string, go to the end of values + end = values.aD.high; + } else { + end = offsets.a[slice.high+1] - 1; + } + // Segment offsets of the new slice + var newSegs = makeDistArray(slice.size, int); + ref oa = offsets.a; + // newSegs = offsets.a[slice] - start; + forall (i, ns) in zip(newSegs.domain, newSegs) with (var agg = newSrcAggregator(int)) { + agg.copy(ns, oa[slice.low + i]); + } + // Offsets need to be re-zeroed + newSegs -= start; + // Bytearray of the new slice + var newVals = makeDistArray(end - start + 1, int); + ref va = values.a; + // newVals = values.a[start..end]; + forall (i, nv) in zip(newVals.domain, newVals) with (var agg = newSrcAggregator(int)) { + agg.copy(nv, va[start + i]); + } + return (newSegs, newVals); + } + + /* Gather strings by index. Returns arrays for the segment offsets + and bytes of the gathered strings.*/ + proc this(iv: [?D] int) throws { + // Early return for zero-length result + if (D.size == 0) { + return (makeDistArray(0, int), makeDistArray(0, int)); + } + // Check all indices within bounds + var ivMin = min reduce iv; + var ivMax = max reduce iv; + if (ivMin < 0) || (ivMax >= offsets.size) { + throw new owned OutOfBoundsError(); + } + if v {writeln("Computing lengths and offsets"); stdout.flush();} + var t1 = getCurrentTime(); + ref oa = offsets.a; + const low = offsets.aD.low, high = offsets.aD.high; + // Gather the right and left boundaries of the indexed strings + // NOTE: cannot compute lengths inside forall because agg.copy will + // experience race condition with loop-private variable + var right: [D] int, left: [D] int; + forall (r, l, idx) in zip(right, left, iv) with (var agg = newSrcAggregator(int)) { + if (idx == high) { + agg.copy(r, values.size); + } else { + agg.copy(r, oa[idx+1]); + } + agg.copy(l, oa[idx]); + } + // Lengths of segments including null bytes + var gatheredLengths: [D] int = right - left; + // The returned offsets are the 0-up cumulative lengths + var gatheredOffsets = (+ scan gatheredLengths); + // The total number of bytes in the gathered strings + var retBytes = gatheredOffsets[D.high]; + gatheredOffsets -= gatheredLengths; + if v { + writeln(getCurrentTime() - t1, " seconds"); + writeln("Copying values"); stdout.flush(); + t1 = getCurrentTime(); + } + var gatheredVals = makeDistArray(retBytes, int); + // Multi-locale requires some extra localization work that is not needed + // in CHPL_COMM=none + if CHPL_COMM != 'none' { + // Compute the src index for each byte in gatheredVals + /* For performance, we will do this with a scan, so first we need an array + with the difference in index between the current and previous byte. For + the interior of a segment, this is just one, but at the segment boundary, + it is the difference between the src offset of the current segment ("left") + and the src index of the last byte in the previous segment (right - 1). + */ + var srcIdx = makeDistArray(retBytes, int); + srcIdx = 1; + var diffs: [D] int; + diffs[D.low] = left[D.low]; // first offset is not affected by scan + diffs[D.interior(D.size-1)] = left[D.interior(D.size-1)] - (right[D.interior(-(D.size-1))] - 1); + // Set srcIdx to diffs at segment boundaries + forall (go, d) in zip(gatheredOffsets, diffs) with (var agg = newDstAggregator(int)) { + agg.copy(srcIdx[go], d); + } + srcIdx = + scan srcIdx; + // Now srcIdx has a dst-local copy of the source index and vals can be efficiently gathered + ref va = values.a; + forall (v, si) in zip(gatheredVals, srcIdx) with (var agg = newSrcAggregator(int)) { + agg.copy(v, va[si]); + } + } else { + ref va = values.a; + // Copy string data to gathered result + forall (go, gl, idx) in zip(gatheredOffsets, gatheredLengths, iv) { + for pos in 0..#gl { + gatheredVals[go+pos] = va[oa[idx]+pos]; + } + } + } + if v {writeln(getCurrentTime() - t1, " seconds"); stdout.flush();} + return (gatheredOffsets, gatheredVals); + } + + /* Logical indexing (compress) of strings. */ + proc this(iv: [?D] bool) throws { + // Index vector must be same domain as array + if (D != offsets.aD) { + throw new owned OutOfBoundsError(); + } + if v {writeln("Computing lengths and offsets"); stdout.flush();} + var t1 = getCurrentTime(); + ref oa = offsets.a; + const low = offsets.aD.low, high = offsets.aD.high; + // Calculate the destination indices + var steps = + scan iv; + var newSize = steps[high]; + steps -= iv; + // Early return for zero-length result + if (newSize == 0) { + return (makeDistArray(0, int), makeDistArray(0,int)); + } + var segInds = makeDistArray(newSize, int); + forall (t, dst, idx) in zip(iv, steps, D) with (var agg = newDstAggregator(int)) { + if t { + agg.copy(segInds[dst], idx); + } + } + return this[segInds]; + + /* // Lengths of dest segments including null bytes */ + /* var gatheredLengths = makeDistArray(newSize, int); */ + /* forall (idx, present, i) in zip(D, iv, steps) { */ + /* if present { */ + /* segInds[i-1] = idx; */ + /* if (idx == high) { */ + /* gatheredLengths[i-1] = values.size - oa[high]; */ + /* } else { */ + /* gatheredLengths[i-1] = oa[idx+1] - oa[idx]; */ + /* } */ + /* } */ + /* } */ + /* // Make dest offsets from lengths */ + /* var gatheredOffsets = (+ scan gatheredLengths); */ + /* var retBytes = gatheredOffsets[newSize-1]; */ + /* gatheredOffsets -= gatheredLengths; */ + /* if v { */ + /* writeln(getCurrentTime() - t1, " seconds"); */ + /* writeln("Copying values"); stdout.flush(); */ + /* t1 = getCurrentTime(); */ + /* } */ + /* var gatheredVals = makeDistArray(retBytes, uint(8)); */ + /* ref va = values.a; */ + /* if DEBUG { */ + /* printAry("gatheredOffsets: ", gatheredOffsets); */ + /* printAry("gatheredLengths: ", gatheredLengths); */ + /* printAry("segInds: ", segInds); */ + /* } */ + /* // Copy string bytes from src to dest */ + /* forall (go, gl, idx) in zip(gatheredOffsets, gatheredLengths, segInds) { */ + /* gatheredVals[{go..#gl}] = va[{oa[idx]..#gl}]; */ + /* } */ + /* if v {writeln(getCurrentTime() - t1, " seconds"); stdout.flush();} */ + /* return (gatheredOffsets, gatheredVals); */ + } + + /* Apply a hash function to all strings. This is useful for grouping + and set membership. The hash used is SipHash128.*/ + proc hash() throws { + // 128-bit hash values represented as 2-tuples of uint(64) + var hashes: [offsets.aD] 2*uint(64); + // Early exit for zero-length result + if (size == 0) { + return hashes; + } + ref oa = offsets.a; + ref va = values.a; + // Compute lengths of strings + var lengths = getLengths(); + // Hash each string + // TO DO: test on clause with aggregator + forall (o, l, h) in zip(oa, lengths, hashes) { + const myRange = o..#l; + h = sipHash128(va, myRange); + /* // localize the string bytes */ + /* const myBytes = va[{o..#l}]; */ + /* h = sipHash128(myBytes, hashKey); */ + /* // Perf Note: localizing string bytes is ~3x faster on IB multilocale than this: */ + /* // h = sipHash128(va[{o..#l}]); */ + } + return hashes; + } + + /* Return a permutation that groups the strings. Because hashing is used, + this permutation will not sort the strings, but all equivalent strings + will fall in one contiguous block. */ + proc argGroup() throws { + var t = new Timer(); + if useHash { + // Hash all strings + if v { writeln("Hashing strings"); stdout.flush(); t.start(); } + var hashes = this.hash(); + if v { t.stop(); writeln("hashing took %t seconds\nSorting hashes".format(t.elapsed())); stdout.flush(); t.clear(); t.start(); } + // Return the permutation that sorts the hashes + var iv = radixSortLSD_ranks(hashes); + if v { t.stop(); writeln("sorting took %t seconds".format(t.elapsed())); stdout.flush(); } + if DEBUG { + var sortedHashes = [i in iv] hashes[i]; + var diffs = sortedHashes[(iv.domain.low+1)..#(iv.size-1)] - sortedHashes[(iv.domain.low)..#(iv.size-1)]; + printAry("diffs = ", diffs); + var nonDecreasing = [(d0,d1) in diffs] ((d0 > 0) || ((d0 == 0) && (d1 >= 0))); + writeln("Are hashes sorted? ", && reduce nonDecreasing); + } + return iv; + } else { + var iv = argsort(); + return iv; + } + } + + /* Return lengths of all strings, including null terminator. */ + proc getLengths() { + var lengths: [offsets.aD] int; + if (size == 0) { + return lengths; + } + ref oa = offsets.a; + const low = offsets.aD.low; + const high = offsets.aD.high; + forall (i, o, l) in zip(offsets.aD, oa, lengths) { + if (i == high) { + l = values.size - o; + } else { + l = oa[i+1] - o; + } + } + /* lengths[low..high-1] = (oa[low+1..high] - oa[low..high-1]); */ + /* lengths[high] = values.size - oa[high]; */ + return lengths; + } + + + + proc ediff():[offsets.aD] int { + var diff: [offsets.aD] int; + if (size < 2) { + return diff; + } + ref oa = offsets.a; + ref va = values.a; + const high = offsets.aD.high; + forall (i, a) in zip(offsets.aD, diff) { + if (i < high) { + var asc: bool; + const left = oa[i]..oa[i+1]-1; + if (i < high - 1) { + const right = oa[i+1]..oa[i+2]-1; + a = -memcmp(va, left, va, right); + } else { // i == high - 1 + const right = oa[i+1]..values.aD.high; + a = -memcmp(va, left, va, right); + } + } else { // i == high + a = 0; + } + } + return diff; + } + + proc isSorted():bool { + if (size < 2) { + return true; + } + return (&& reduce (ediff() >= 0)); + } + + proc argsort(checkSorted:bool=true): [offsets.aD] int throws { + const ref D = offsets.aD; + const ref va = values.a; + if checkSorted && isSorted() { + if DEBUG { writeln("argsort called on already sorted array"); stdout.flush(); } + var ranks: [D] int = [i in D] i; + return ranks; + } + var ranks = twoPhaseStringSort(this); + return ranks; + } + + } // class SegArray + + + inline proc memcmp(const ref x: [] uint(8), const xinds, const ref y: [] uint(8), const yinds): int { const l = min(xinds.size, yinds.size); var ret: int = 0; @@ -924,19 +1355,26 @@ module SegmentedArray { } /* Convert an array of raw bytes into a Chapel string. */ - inline proc interpretAsString(bytearray: [?D] uint(8)): string { + inline proc interpretAsString(bytearray: [?D] int(64)): string { // Byte buffer must be local in order to make a C pointer - var localBytes: [{0..#D.size}] uint(8) = bytearray; + var localBytes: [{0..#D.size}] int = bytearray; var cBytes = c_ptrTo(localBytes); // Byte buffer is null-terminated, so length is buffer.size - 1 // The contents of the buffer should be copied out because cBytes will go out of scope // var s = new string(cBytes, D.size-1, D.size, isowned=false, needToCopy=true); var s: string; + var i:int; + s=""; + for i in bytearray do { + s=s+" "+ i:string; + } +/* try { s = createStringWithNewBuffer(cBytes, D.size-1, D.size); } catch { s = ""; } +*/ return s; } } diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 853f4f9316..94fa69eb24 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -10,6 +10,10 @@ module SegmentedMsg { use IO; use GenSymIO only jsonToPdArray; + use SymArrayDmap; + use SACA; + + private config const DEBUG = false; proc randomStringsMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { @@ -59,6 +63,8 @@ module SegmentedMsg { return repMsg; } + + proc segmentLengthsMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { var pn = Reflection.getRoutineName(); @@ -367,6 +373,16 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var s = strings[idx]; return "item %s %jt".format("str", s); } + when "int" { + // Make a temporary strings array + var arrays = new owned SegArray(args[1], args[2], st); + // Parse the index + var idx = args[3]:int; + // TO DO: in the future, we will force the client to handle this + idx = convertPythonIndexToChapel(idx, arrays.size); + var s = arrays[idx]; + return "item %s %jt".format("int", s); + } otherwise { var errorMsg = notImplementedError(pn, objtype); writeln(generateErrorContext( @@ -658,4 +674,76 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } return "created " + st.attrib(rname); } + + + + proc segSuffixArrayMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { + var pn = Reflection.getRoutineName(); + var (objtype, segName, valName) = payload.decode().splitMsgToTuple(3); + var repMsg: string; + var x:int; + var y:int(32); + + // check to make sure symbols defined + st.check(segName); + st.check(valName); + + var strings = new owned SegString(segName, valName, st); + var size=strings.size; + var nBytes = strings.nBytes; + var length=strings.getLengths(); + var offsegs = (+ scan length) - length; + var startposition:int; + var endposition:int; + + select (objtype) { + when "str" { + // To be checked, I am not sure if this formula can estimate the total memory requirement + // Lengths + 2*segs + 2*vals (copied to SymTab) + overMemLimit(8*size + 16*size + nBytes); + + //allocate an offset array + var sasoff = offsegs; + //allocate an values array + var sasval:[0..(nBytes-1)] int; + + var i:int; + for i in 0..(size-1) do { + // the start position of ith string in value array + startposition = offsegs[i]; + endposition = startposition+length[i]-1; + var sasize=length[i]:int(32); + ref strArray=strings.values.a[startposition..endposition]; + var tmparray:[1..sasize] int(32); + divsufsort(strArray,tmparray,sasize); + for (x, y) in zip(sasval[startposition..endposition], tmparray[1..sasize]) do + x = y; + } + + var segName2 = st.nextName(); + var valName2 = st.nextName(); + + var segEntry = new shared SymEntry(sasoff); + var valEntry = new shared SymEntry(sasval); + + st.addEntry(segName2, segEntry); + st.addEntry(valName2, valEntry); + repMsg = 'created ' + st.attrib(segName2) + '+created ' + st.attrib(valName2); + return repMsg; + + } + otherwise { + var errorMsg = notImplementedError(pn, "("+objtype+")"); + writeln(generateErrorContext( + msg=errorMsg, + lineNumber=getLineNumber(), + moduleName=getModuleName(), + routineName=getRoutineName(), + errorClass="NotImplementedError")); + return errorMsg; + } + } + + } } + diff --git a/src/arkouda_server.chpl b/src/arkouda_server.chpl index a829ee9148..1404e465bd 100644 --- a/src/arkouda_server.chpl +++ b/src/arkouda_server.chpl @@ -219,6 +219,7 @@ proc main() { when "segmentedBinopvv" {repMsg = segBinopvvMsg(cmd, payload, st);} when "segmentedBinopvs" {repMsg = segBinopvsMsg(cmd, payload, st);} when "segmentedGroup" {repMsg = segGroupMsg(cmd, payload, st);} + when "segmentedSuffixAry"{repMsg = segSuffixArrayMsg(cmd, payload, st);} when "segmentedIn1d" {repMsg = segIn1dMsg(cmd, payload, st);} when "lshdf" {repMsg = lshdfMsg(cmd, payload, st);} when "readhdf" {repMsg = readhdfMsg(cmd, payload, st);} From d288c107cbe5a7f5927176f25f041d431d5427aa Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Tue, 24 Nov 2020 23:37:55 -0500 Subject: [PATCH 03/68] add read file suffix array function and all libdivsufsort files --- arkouda/pdarrayclass.py | 3 ++- benchmarks/sa.py | 13 ++++++++----- src/SegmentedMsg.chpl | 10 +++++----- thirdparty/SA/libdivsufsort | 1 + 4 files changed, 16 insertions(+), 11 deletions(-) create mode 160000 thirdparty/SA/libdivsufsort diff --git a/arkouda/pdarrayclass.py b/arkouda/pdarrayclass.py index f7724d8e07..d8a0c5f3ae 100755 --- a/arkouda/pdarrayclass.py +++ b/arkouda/pdarrayclass.py @@ -89,7 +89,8 @@ def parse_single_int_array_value(msg : str) -> object: else: raise ValueError(("unsupported value from server {} {}".\ format(mydtype.name, value))) - return fields[1] + nfields = fields[1].split("\"") + return nfields[1] # class for the pdarray class pdarray: diff --git a/benchmarks/sa.py b/benchmarks/sa.py index b33d175984..41cc3ada92 100755 --- a/benchmarks/sa.py +++ b/benchmarks/sa.py @@ -11,13 +11,16 @@ def time_ak_sa( vsize, trials, dtype): cfg = ak.get_config() Nv = vsize * cfg["numLocales"] print("numLocales = {}, num of strings = {:,}".format(cfg["numLocales"], Nv)) - v = ak.random_strings_uniform(90000000, 100000000, Nv) -# print("All the random strings are as follows") -# print(v) +# v = ak.random_strings_uniform(90000000, 100000000, Nv) + v = ak.random_strings_uniform(1, 16, Nv) c=ak.suffix_array(v) print("size of suffix array={}".format(c.bytes.size)) -# for k in range(vsize): -# print("the {} th suffix array ={}".format(k,c[k])) +# print("All the random strings are as follows") + for k in range(vsize): + print("the {} th random tring ={}".format(k,v[k])) + print("the {} th suffix array ={}".format(k,c[k])) + print("") +# print(v) timings = [] for _ in range(trials): start = time.time() diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 94fa69eb24..ffaded812a 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -681,8 +681,6 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var pn = Reflection.getRoutineName(); var (objtype, segName, valName) = payload.decode().splitMsgToTuple(3); var repMsg: string; - var x:int; - var y:int(32); // check to make sure symbols defined st.check(segName); @@ -693,8 +691,6 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var nBytes = strings.nBytes; var length=strings.getLengths(); var offsegs = (+ scan length) - length; - var startposition:int; - var endposition:int; select (objtype) { when "str" { @@ -708,14 +704,18 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var sasval:[0..(nBytes-1)] int; var i:int; - for i in 0..(size-1) do { + forall i in 0..(size-1) do { // the start position of ith string in value array + var startposition:int; + var endposition:int; startposition = offsegs[i]; endposition = startposition+length[i]-1; var sasize=length[i]:int(32); ref strArray=strings.values.a[startposition..endposition]; var tmparray:[1..sasize] int(32); divsufsort(strArray,tmparray,sasize); + var x:int; + var y:int(32); for (x, y) in zip(sasval[startposition..endposition], tmparray[1..sasize]) do x = y; } diff --git a/thirdparty/SA/libdivsufsort b/thirdparty/SA/libdivsufsort new file mode 160000 index 0000000000..5f60d6f026 --- /dev/null +++ b/thirdparty/SA/libdivsufsort @@ -0,0 +1 @@ +Subproject commit 5f60d6f026c30fb4ac296f696b3c8b0eb71bd428 From 9a2270484269ff439858c75c9e6e2e44dfbbd9ae Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Wed, 25 Nov 2020 20:18:20 -0500 Subject: [PATCH 04/68] change name --- arkouda/pdarraycreation.py | 56 ++++++++++++++++++++++++--- arkouda/strings.py | 58 ++++++++++++++-------------- src/SACA.chpl | 18 ++++----- src/SegmentedArray.chpl | 4 +- src/SegmentedMsg.chpl | 78 +++++++++++++++++++++++++++++++++++++- src/arkouda_server.chpl | 1 + 6 files changed, 169 insertions(+), 46 deletions(-) diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index e827c7a33c..c7410189ec 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -8,7 +8,8 @@ from arkouda.dtypes import dtype as akdtype from arkouda.pdarrayclass import pdarray, create_pdarray from arkouda.strings import Strings -from arkouda.strings import Pdarrays +from arkouda.strings import SArrays +from multipledispatch import dispatch __all__ = ["array", "zeros", "ones", "zeros_like", "ones_like", "arange", "linspace", "randint", "uniform", "standard_normal", @@ -673,12 +674,13 @@ def random_strings_lognormal(logmean : Union[float, int], logstd : float, -@typechecked -def suffix_array( strings : Strings) -> Pdarrays: +#@typechecked +@dispatch(Strings) +def suffix_array( strings : Strings) -> SArrays: """ Return the suffix arrays of given strings. The size/shape of each suffix arrays is the same as the corresponding strings. - A simple example of suffix array is as follow. Given string "banana$", + A simple example of suffix array is as follow. Given a string "banana$", all the suffixes are as follows. s[0]="banana$" s[1]="anana$" @@ -718,6 +720,50 @@ def suffix_array( strings : Strings) -> Pdarrays: strings.offsets.name, strings.bytes.name) repMsg = generic_msg(msg) - pdarrays= Pdarrays(*(repMsg.split('+'))) + pdarrays= SArrays(*(repMsg.split('+'))) return pdarrays +@dispatch(str) +def suffix_array(filename: str) -> SArrays: + """ + This function is major used for testing correctness and performance + Return the suffix array of given file name's content as a string. + A simple example of suffix array is as follow. Given string "banana$", + all the suffixes are as follows. + s[0]="banana$" + s[1]="anana$" + s[2]="nana$" + s[3]="ana$" + s[4]="na$" + s[5]="a$" + s[6]="$" + The suffix array of string "banana$" is the array of indices of sorted suffixes. + s[6]="$" + s[5]="a$" + s[3]="ana$" + s[1]="anana$" + s[0]="banana$" + s[4]="na$" + s[2]="nana$" + so sa=[6,5,3,1,0,4,2] + + Returns + ------- + pdarray + The suffix arrays of the given strings + See Also + -------- + + Notes + ----- + + Raises + ------ + RuntimeError + Raised if there is a server-side error in executing group request or + creating the pdarray encapsulating the return message + """ + msg = "segmentedSAFile {}".format( filename ) + repMsg = generic_msg(msg) + pdarrays= SArrays(*(repMsg.split('+'))) + return pdarrays diff --git a/arkouda/strings.py b/arkouda/strings.py index eaca43bfae..39e387bb3b 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -11,7 +11,7 @@ global verbose global pdarrayIterThresh -__all__ = ['Strings','Pdarrays'] +__all__ = ['Strings','SArrays'] class Strings: """ @@ -788,13 +788,13 @@ def attach(user_defined_name : str) -> 'Strings': pdarray.attach(user_defined_name+'_bytes')) -class Pdarrays: +class SArrays: """ Represents an array of (suffix) arrays whose data resides on the arkouda server. The user should not call this class directly; rather its instances are created by other arkouda functions. It is very similar to Strings and the difference is that its content is - an int arrays instead of strings. + int arrays instead of strings. Attributes ---------- @@ -806,6 +806,7 @@ class Pdarrays: The number of suffix arrays in the array nbytes : int The total number of indices in all suffix arrays + We have the same number indices as the number of characters/suffixes in strings ndim : int The rank of the array (currently only rank 1 arrays supported) shape : tuple @@ -817,9 +818,9 @@ class Pdarrays: Notes ----- - Pdarrays is composed of two pdarrays: (1) offsets, which contains the + SArrays is composed of two pdarrays: (1) offsets, which contains the starting indices for each string's suffix array and (2) bytes, which contains the - raw indices of all suffix arrays,no any spliter between the arrays. + indices of all suffix arrays, no any spliter between two index arrays. """ BinOps = frozenset(["==", "!="]) @@ -828,7 +829,7 @@ class Pdarrays: def __init__(self, offset_attrib : Union[pdarray,np.ndarray], bytes_attrib : Union[pdarray,np.ndarray]) -> None: """ - Initializes the Pdarrays instance by setting all instance + Initializes the SArrays instance by setting all instance attributes, some of which are derived from the array parameters. Parameters @@ -836,7 +837,7 @@ def __init__(self, offset_attrib : Union[pdarray,np.ndarray], offset_attrib : Union[pdarray, np.ndarray,array] the array containing the offsets bytes_attrib : Union[pdarray, np.ndarray,array] - the array containing the suffix array values + the array containing the suffix array indices Returns ------- @@ -872,11 +873,12 @@ def __init__(self, offset_attrib : Union[pdarray,np.ndarray], self.shape = self.offsets.shape except Exception as e: raise ValueError(e) +# maybe we need to change the dtype into int later self.dtype = np.str self.logger = getArkoudaLogger(name=__class__.__name__) def __iter__(self): - raise NotImplementedError('Strings does not support iteration') + raise NotImplementedError('SArrays does not support iteration now') def __len__(self) -> int: return self.shape[0] @@ -893,16 +895,16 @@ def __str__(self) -> str: def __repr__(self) -> str: return "array({})".format(self.__str__()) - def _binop(self, other : Pdarrays, op : str) -> pdarray: + def _binop(self, other : SArrays, op : str) -> pdarray: """ - Executes the requested binop on this Pdarrays instance and the - parameter Pdarrays object and returns the results within + Executes the requested binop on this SArrays instance and the + parameter SArrays object and returns the results within a pdarray object. Parameters ---------- - other : Pdarrays - the other object is a Pdarrays object + other : SArrays + the other object is a SArrays object op : str name of the binary operation to be performed @@ -916,16 +918,16 @@ def _binop(self, other : Pdarrays, op : str) -> pdarray: ValueError Raised if (1) the op is not in the self.BinOps set, or (2) if the sizes of this and the other instance don't match, or (3) the other - object is not a Pdarrays object + object is not a SArrays object RuntimeError Raised if a server-side error is thrown while executing the binary operation """ if op not in self.BinOps: - raise ValueError("Pdarrays: unsupported operator: {}".format(op)) - if isinstance(other, Pdarrays): + raise ValueError("SArrays: unsupported operator: {}".format(op)) + if isinstance(other, SArrays): if self.size != other.size: - raise ValueError("Pdarrays: size mismatch {} {}".\ + raise ValueError("SArrays: size mismatch {} {}".\ format(self.size, other.size)) msg = "segmentedBinopvv {} {} {} {} {} {} {}".format(op, self.objtype, @@ -942,7 +944,7 @@ def _binop(self, other : Pdarrays, op : str) -> pdarray: self.objtype, json.dumps([other])) else: - raise ValueError("Pdarrays: {} not supported between Pdarrays and {}"\ + raise ValueError("SArrays: {} not supported between SArrays and {}"\ .format(op, other.__class__.__name__)) repMsg = generic_msg(msg) return create_pdarray(repMsg) @@ -983,7 +985,7 @@ def __getitem__(self, key): stride) repMsg = generic_msg(msg) offsets, values = repMsg.split('+') - return Pdarrays(offsets, values); + return SArrays(offsets, values); elif isinstance(key, pdarray): kind, _ = translate_np_dtype(key.dtype) if kind not in ("bool", "int"): @@ -997,7 +999,7 @@ def __getitem__(self, key): key.name) repMsg = generic_msg(msg) offsets, values = repMsg.split('+') - return Pdarrays(offsets, values) + return SArrays(offsets, values) else: raise TypeError("unsupported pdarray index type {}".format(key.__class__.__name__)) @@ -1020,14 +1022,14 @@ def get_lengths(self) -> pdarray: repMsg = generic_msg(msg) return create_pdarray(repMsg) - def __add__(self, other : Pdarrays) -> Pdarrays: + def __add__(self, other : SArrays) -> SArrays: return self.stick(other) def save(self, prefix_path : str, dataset : str='int_array', mode : str='truncate') -> None: """ - Save the Pdarrays object to HDF5. The result is a collection of HDF5 files, + Save the SArrays object to HDF5. The result is a collection of HDF5 files, one file per locale of the arkouda server, where each filename starts with prefix_path. Each locale saves its chunk of the array to its corresponding file. @@ -1037,10 +1039,10 @@ def save(self, prefix_path : str, dataset : str='int_array', prefix_path : str Directory and filename prefix that all output files share dataset : str - The name of the Pdarrays dataset to be written, defaults to int_array + The name of the SArrays dataset to be written, defaults to int_array mode : str {'truncate' | 'append'} By default, truncate (overwrite) output files, if they exist. - If 'append', create a new Pdarrays dataset within existing files. + If 'append', create a new SArrays dataset within existing files. Returns ------- @@ -1058,9 +1060,9 @@ def save(self, prefix_path : str, dataset : str='int_array', Notes ----- - Important implementation notes: (1) Pdarrays state is saved as two datasets + Important implementation notes: (1) SArrays state is saved as two datasets within an hdf5 group, (2) the hdf5 group is named via the dataset parameter, - (3) the hdf5 group encompasses the two pdarrays composing a Pdarrays object: + (3) the hdf5 group encompasses the two pdarrays composing a SArrays object: segments and values and (4) save logic is delegated to pdarray.save """ self.bytes.save(prefix_path=prefix_path, @@ -1070,7 +1072,7 @@ def save(self, prefix_path : str, dataset : str='int_array', def register_helper(cls, offsets, bytes): return cls(offsets, bytes) - def register(self, user_defined_name : str) -> 'Pdarrays': + def register(self, user_defined_name : str) -> 'SArrays': return self.register_helper(self.offsets.register(user_defined_name+'_offsets'), self.bytes.register(user_defined_name+'_bytes')) @@ -1079,7 +1081,7 @@ def unregister(self) -> None: self.bytes.unregister() @staticmethod - def attach(user_defined_name : str) -> 'Pdarrays': + def attach(user_defined_name : str) -> 'SArrays': return Strings(pdarray.attach(user_defined_name+'_offsets'), pdarray.attach(user_defined_name+'_bytes')) diff --git a/src/SACA.chpl b/src/SACA.chpl index b3b53267f6..ace4bf3168 100644 --- a/src/SACA.chpl +++ b/src/SACA.chpl @@ -2,16 +2,16 @@ module SACA{ // In this module, different algorithms to construct suffix array are provided //Nov.15, 2020 -// The first algorithm is divsufsort which is the fastest sequential and OpenMP c codes on suffix array -require "../../../SA/libdivsufsort/include/config.h"; -require "../../../SA/libdivsufsort/include/divsufsort.h"; -require "../../../SA/libdivsufsort/include/divsufsort_private.h"; -require "../../../SA/libdivsufsort/include/lfs.h"; +// The first algorithm divsufsort is the fastest C codes on suffix array +require "../thirdparty/SA/libdivsufsort/include/config.h"; +require "../thirdparty/SA/libdivsufsort/include/divsufsort.h"; +require "../thirdparty/SA/libdivsufsort/include/divsufsort_private.h"; +require "../thirdparty/SA/libdivsufsort/include/lfs.h"; -require "../../../SA/libdivsufsort/lib/divsufsort.c"; -require "../../../SA/libdivsufsort/lib/sssort.c"; -require "../../../SA/libdivsufsort/lib/trsort.c"; -require "../../../SA/libdivsufsort/lib/utils.c"; +require "../thirdparty/SA/libdivsufsort/lib/divsufsort.c"; +require "../thirdparty/SA/libdivsufsort/lib/sssort.c"; +require "../thirdparty/SA/libdivsufsort/lib/trsort.c"; +require "../thirdparty/SA/libdivsufsort/lib/utils.c"; /* require "/home/z/zd4/SA/nong/saca-k-tois-20130413/saca-k/saca-k.cc"; */ diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl index 80c25d869b..1a8e09daeb 100644 --- a/src/SegmentedArray.chpl +++ b/src/SegmentedArray.chpl @@ -676,7 +676,7 @@ module SegmentedArray { * convenience for bundling those persistent objects and defining suffix array-relevant * operations. */ - class SegArray { + class SegSArray { /** * The name of the SymEntry corresponding to the pdarray containing @@ -1088,7 +1088,7 @@ module SegmentedArray { return ranks; } - } // class SegArray + } // class SegSArray diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index ffaded812a..49a375ad44 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -375,7 +375,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } when "int" { // Make a temporary strings array - var arrays = new owned SegArray(args[1], args[2], st); + var arrays = new owned SegSArray(args[1], args[2], st); // Parse the index var idx = args[3]:int; // TO DO: in the future, we will force the client to handle this @@ -691,7 +691,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var nBytes = strings.nBytes; var length=strings.getLengths(); var offsegs = (+ scan length) - length; - + writeln("offsegs="); + writeln(offsegs); select (objtype) { when "str" { // To be checked, I am not sure if this formula can estimate the total memory requirement @@ -745,5 +746,78 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } } + + proc segSAFileMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { + var pn = Reflection.getRoutineName(); +// var (FileName) = payload.decode().splitMsgToTuple(1); + var FileName = payload.decode(); + var repMsg: string; + + var filesize:int(32); + var f = open(FileName, iomode.r); + var size:int=1; + var nBytes = f.size; + var length:[0..0] int =nBytes; + var offsegs:[0..0] int =0 ; + + select ("str") { + when "str" { + // To be checked, I am not sure if this formula can estimate the total memory requirement + // Lengths + 2*segs + 2*vals (copied to SymTab) + overMemLimit(8*size + 16*size + nBytes); + + //allocate an offset array + var sasoff = offsegs; + //allocate an values array + var sasval:[0..(nBytes-1)] int; + + var i:int; + forall i in 0..(size-1) do { + // the start position of ith string in value array + var startposition:int; + var endposition:int; + startposition = 0; + endposition = nBytes-1; + var sasize=nBytes:int(32); + var strArray:[startposition..endposition]uint(8); + var r = f.reader(kind=ionative); + r.read(strArray); + var tmparray:[1..sasize] int(32); + divsufsort(strArray,tmparray,sasize); + var x:int; + var y:int(32); + for (x, y) in zip(sasval[startposition..endposition], tmparray[1..sasize]) do + x = y; + } + + var segName2 = st.nextName(); + var valName2 = st.nextName(); + + var segEntry = new shared SymEntry(sasoff); + var valEntry = new shared SymEntry(sasval); + + st.addEntry(segName2, segEntry); + st.addEntry(valName2, valEntry); + repMsg = 'created ' + st.attrib(segName2) + '+created ' + st.attrib(valName2); + return repMsg; + + } + otherwise { + var errorMsg = notImplementedError(pn, "("+FileName+")"); + writeln(generateErrorContext( + msg=errorMsg, + lineNumber=getLineNumber(), + moduleName=getModuleName(), + routineName=getRoutineName(), + errorClass="NotImplementedError")); + return errorMsg; + } + } + + } + } + + + diff --git a/src/arkouda_server.chpl b/src/arkouda_server.chpl index 1404e465bd..22af3127bc 100644 --- a/src/arkouda_server.chpl +++ b/src/arkouda_server.chpl @@ -220,6 +220,7 @@ proc main() { when "segmentedBinopvs" {repMsg = segBinopvsMsg(cmd, payload, st);} when "segmentedGroup" {repMsg = segGroupMsg(cmd, payload, st);} when "segmentedSuffixAry"{repMsg = segSuffixArrayMsg(cmd, payload, st);} + when "segmentedSAFile" {repMsg = segSAFileMsg(cmd, payload, st);} when "segmentedIn1d" {repMsg = segIn1dMsg(cmd, payload, st);} when "lshdf" {repMsg = lshdfMsg(cmd, payload, st);} when "readhdf" {repMsg = readhdfMsg(cmd, payload, st);} From c81d755e7cbd9d25527a7dcd6380e6d1d71e4568 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Thu, 26 Nov 2020 17:47:45 -0500 Subject: [PATCH 05/68] add suffix_array Python test --- tests/suffixarray_test.py | 448 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 448 insertions(+) create mode 100644 tests/suffixarray_test.py diff --git a/tests/suffixarray_test.py b/tests/suffixarray_test.py new file mode 100644 index 0000000000..f7c9049bac --- /dev/null +++ b/tests/suffixarray_test.py @@ -0,0 +1,448 @@ +import numpy as np +from collections import Counter +from context import arkouda as ak +from base_test import ArkoudaTest +import pytest +import random +import string +ak.verbose = False + +N = 100 +UNIQUE = N//2 + +# test_strings = np.array(['These are', 'some', 'interesting', +# '~!@#$%^&*()_+', 'sarrays', '8675309.', +# 'These are', 'some', 'duplicates.', +# 'hello', 'world']) + +# test_suffix array = np.array([9, 5, 0, 6, 8, 4, 2, 1, 7, 3], +# [4, 3, 2, 1, 0], [11, 3, 5, 10, 8, 0, 9, 1, 4, 6, 2, 7] +# [13, 1, 3, 4, 5, 7, 9, 10, 8, 12, 2, 6, 11, 0], +# [7, 5, 3, 4, 2, 6, 0, 1],[8, 7, 5, 4, 3, 1, 2, 0, 6], +# [9, 5, 0, 6, 8, 4, 2, 1, 7, 3],[4, 3, 2, 1, 0], +# [10, 6, 5, 0, 8, 4, 3, 2, 9, 7, 1],[5, 1, 0, 2, 3, 4] +# [5, 4, 3, 1, 2, 0] +def compare_sas(a, b): + return all(x == y for x, y in zip(a, b)) + +errors = False + +def run_test_argsort(sarrays, test_sas, cat): + akperm = ak.argsort(sarrays) + aksorted = sarrays[akperm].to_ndarray() + npsorted = np.sort(test_sas) + assert((aksorted == npsorted).all()) + catperm = ak.argsort(cat) + catsorted = cat[catperm].to_ndarray() + assert((catsorted == npsorted).all()) + +def run_test_unique(sarrays, test_sas, cat): + # unique + akuniq = ak.unique(sarrays) + catuniq = ak.unique(cat) + akset = set(akuniq.to_ndarray()) + catset = set(catuniq.to_ndarray()) + assert(akset == catset) + # There should be no duplicates + assert(akuniq.size == len(akset)) + npset = set(np.unique(test_sas)) + # When converted to a set, should agree with numpy + assert(akset == npset) + return akset + +def run_test_index(sarrays, test_sas, cat): + # int index + assert(sarrays[N//3] == test_sas[N//3]) + #assert(cat[N//3] == test_sas[N//3]) + print("int index passed") + +def run_test_slice(sarrays, test_sas, cat): + assert(compare_sas(sarrays[N//4:N//3], + test_sas[N//4:N//3])) + #assert(compare_sas(cat[N//4:N//3].to_ndarray(), + # test_sas[N//4:N//3])) + +def run_test_pdarray_index(sarrays, test_sas, cat): + inds = ak.arange(0, len(sarrays), 10) + assert(compare_sas(sarrays[inds].to_ndarray(), test_sas[inds.to_ndarray()])) + #assert(compare_sas(cat[inds].to_ndarray(), test_sas[inds.to_ndarray()])) + +def run_comparison_test(sarrays, test_sas, cat): + akinds = (sarrays == test_sas[N//4]) + #catinds = (cat == test_sas[N//4]) + npinds = (test_sas == test_sas[N//4]) + assert(np.allclose(akinds, npinds)) + +def run_test_in1d(sarrays, cat, base_words): + more_choices = ak.randint(0, UNIQUE, 100) + #akwords = base_words[more_choices] + #more_words = akwords.to_ndarray() + matches = ak.in1d(sarrays, akwords) + catmatches = ak.in1d(cat, akwords) + assert((matches == catmatches).all()) + # Every word in matches should be in the target set + for word in sarrays[matches].to_ndarray(): + assert(word in more_words) + # Exhaustively find all matches to make sure we didn't miss any + inds = ak.zeros(sarrays.size, dtype=ak.bool) + for word in more_words: + inds |= (sarrays == word) + assert((inds == matches).all()) + +def run_test_groupby(sarrays, cat, akset): + g = ak.GroupBy(sarrays) + gc = ak.GroupBy(cat) + # Unique keys should be same result as ak.unique + assert(akset == set(g.unique_keys.to_ndarray())) + assert(akset == set(gc.unique_keys.to_ndarray())) + assert((gc.permutation == g.permutation).all()) + permStrings = sarrays[g.permutation].to_ndarray() + # Check each group individually + lengths = np.diff(np.hstack((g.segments.to_ndarray(), np.array([g.size])))) + for uk, s, l in zip(g.unique_keys.to_ndarray(), + g.segments.to_ndarray(), + lengths): + # All values in group should equal key + assert((permStrings[s:s+l] == uk).all()) + # Key should not appear anywhere outside of group + assert(not (permStrings[:s] == uk).any()) + assert(not (permStrings[s+l:] == uk).any()) + + +def run_test_contains(sarrays, test_sas, delim): + found = sarrays.contains(delim).to_ndarray() + npfound = np.array([s.count(delim) > 0 for s in test_sas]) + assert((found == npfound).all()) + +def run_test_starts_with(sarrays, test_sas, delim): + found = sarrays.startswith(delim).to_ndarray() + npfound = np.array([s.startswith(delim) for s in test_sas]) + assert((found == npfound).all()) + +def run_test_ends_with(sarrays, test_sas, delim): + found = sarrays.endswith(delim).to_ndarray() + npfound = np.array([s.endswith(delim) for s in test_sas]) + assert((found == npfound).all()) + +def run_test_peel(sarrays, test_sas, delim): + import itertools as it + tf = (True, False) + def munge(triple, inc, part): + ret = [] + for h, s, t in triple: + if not part and s == '': + ret.append(('', h)) + else: + if inc: + ret.append((h + s, t)) + else: + ret.append((h, t)) + l, r = tuple(zip(*ret)) + return np.array(l), np.array(r) + + def rmunge(triple, inc, part): + ret = [] + for h, s, t in triple: + if not part and s == '': + ret.append((t, '')) + else: + if inc: + ret.append((h, s + t)) + else: + ret.append((h, t)) + l, r = tuple(zip(*ret)) + return np.array(l), np.array(r) + + def slide(triple, delim): + h, s, t = triple + h2, s2, t2 = t.partition(delim) + newh = h + s + h2 + return newh, s2, t2 + + def rslide(triple, delim): + h, s, t = triple + h2, s2, t2 = h.rpartition(delim) + newt = t2 + s + t + return h2, s2, newt + + for times, inc, part in it.product(range(1,4), tf, tf): + ls, rs = sarrays.peel(delim, times=times, includeDelimiter=inc, keepPartial=part) + triples = [s.partition(delim) for s in test_sas] + for i in range(times-1): + triples = [slide(t, delim) for t in triples] + ltest, rtest = munge(triples, inc, part) + assert((ltest == ls.to_ndarray()).all() and (rtest == rs.to_ndarray()).all()) + + for times, inc, part in it.product(range(1,4), tf, tf): + ls, rs = sarrays.rpeel(delim, times=times, includeDelimiter=inc, keepPartial=part) + triples = [s.rpartition(delim) for s in test_sas] + for i in range(times-1): + triples = [rslide(t, delim) for t in triples] + ltest, rtest = rmunge(triples, inc, part) + assert((ltest == ls.to_ndarray()).all() and (rtest == rs.to_ndarray()).all()) + +def run_test_stick(sarrays, test_sas, base_words, delim): + test_sas2 = np.random.choice(base_words.to_ndarray(), N, replace=True) + sarrays2 = ak.array(test_sas2) + stuck = sarrays.stick(sarrays2, delimiter=delim).to_ndarray() + tstuck = np.array([delim.join((a, b)) for a, b in zip(test_sas, test_sas2)]) + assert ((stuck == tstuck).all()) + assert ((sarrays + sarrays2) == sarrays.stick(sarrays2, delimiter="")).all() + + lstuck = sarrays.lstick(sarrays2, delimiter=delim).to_ndarray() + tlstuck = np.array([delim.join((b, a)) for a, b in zip(test_sas, test_sas2)]) + assert ((lstuck == tlstuck).all()) + assert ((sarrays2 + sarrays) == sarrays.lstick(sarrays2, delimiter="")).all() + +def suffixArray(s): + suffixes = [(s[i:], i) for i in range(len(s))] + suffixes.sort(key=lambda x: x[0]) + sa= [s[1] for s in suffixes] + #sa.insert(0,len(sa)) + return sa + +def get_random_string(length): + letters = string.ascii_lowercase + result_str = ''.join(random.choice(letters) for i in range(length)) + return result_str +# print("Random string of length", length, "is:", result_str) + +def ascill_to_string(ini_list): + res="" + for val in ini_list: + res = res + chr(int(val)) + return res + + +def string_to_int(sa_str): + ary=[] + for val in sa_str: + ary.append(int(val)) + return ary + +def akstrings_to_suffix_array(ak_str): + ary=[] + for val in ak_str: + x=val.split(" ",1) + y=x[1] + z=y.split(" ") + s=ascill_to_string(z) + sa=suffixArray(s) + ary.append(sa) + return ary + +def aksa_to_int_array(ak_str): + ary=[] + for val in ak_str: + x=val.split(" ",1) + y=x[1] + z=y.split(" ") + intz= [int(z[i]) for i in range(len(z))] + ary.append(intz) + return ary +if __name__ == '__main__': + import sys + if len(sys.argv) > 1: + ak.connect(server=sys.argv[1], port=sys.argv[2]) + else: + ak.connect() + + # with open(__file__, 'r') as f: + # base_words = np.array(f.read().split()) + # test_sas = np.random.choice(base_words, N, replace=True) + # sarrays = ak.array(test_sas) + # generate a Strings object + base_words1 = ak.random_strings_uniform(1, 10, UNIQUE, characters='printable') + # get the real strings + strings1 = [base_words1[i] for i in range(len(base_words1))] + # generate a Strings object + base_words2 = ak.random_strings_lognormal(2, 0.25, UNIQUE, characters='printable') + # get the real strings + strings2 = [base_words2[i] for i in range(len(base_words2))] + #Generate suffix array locally + sa_ori1=akstrings_to_suffix_array(strings1) + #Generate suffix array locally + sa_ori2=akstrings_to_suffix_array(strings2) + #Generate suffix array remotely + sa1=ak.suffix_array(base_words1) + #Generate suffix array remotely + sa2=ak.suffix_array(base_words2) + #get the suffix array from SArray object + suffixarray1=[sa1[i] for i in range(len(sa1))] + #transfer the string suffix array to real int suffix array + sa_test1=aksa_to_int_array(suffixarray1) + #get the suffix array from SArray object + suffixarray2=[sa2[i] for i in range(len(sa2))] + #transfer the string suffix array to real int suffix array + sa_test2=aksa_to_int_array(suffixarray2) + + cat=0 + # int index + run_test_index(sa_ori1, sa_test1, cat) + run_test_index(sa_ori2, sa_test2, cat) + print("int index passed") + + # slice + run_test_slice(sa_ori1, sa_test1, cat) + run_test_slice(sa_ori2, sa_test2, cat) + print("slice passed") + + # pdarray int index + #run_test_pdarray_index(sa_ori1, sa_test1, cat) + #run_test_pdarray_index(sa_ori2, sa_test2, cat) + #print("pdarray int index passed") + + # comparison + run_comparison_test(sa_ori1, sa_test1, cat) + run_comparison_test(sa_ori2, sa_test2, cat) + print("comparison passed") + + # pdarray bool index + #run_test_pdarray_index(sarrays, test_sas, cat) + #print("pdarray bool index passed") + + # in1d and iter + # more_words = np.random.choice(base_words, 100) + # akwords = ak.array(more_words) + #run_test_in1d(sa_ori1, sa_test1, cat) + #run_test_in1d(sa_ori2, sa_test2, cat) + #print("in1d and iter passed") + + # argsort + #run_test_argsort(sa_ori1, sa_test1, cat) + + # unique + #akset = run_test_unique(sarrays, test_sas, cat) + ''' + # groupby + run_test_groupby(sarrays, cat, akset) + print("groupby passed") + + # substring functions + x, w = tuple(zip(*Counter(''.join(base_words.to_ndarray())).items())) + delim = np.random.choice(x, p=(np.array(w)/sum(w))) + + # contains + run_test_contains(sarrays, test_sas, delim) + print("contains passed") + + # startswith + run_test_starts_with(sarrays, test_sas, delim) + print("startswith passed") + + # endswith + run_test_ends_with(sarrays, test_sas, delim) + print("endswith passed") + + # peel + run_test_peel(sarrays, test_sas, delim) + print("peel passed") + + # stick + run_test_stick(sarrays, test_sas, base_words, delim) + print("stick passed") + ''' +class SuffixArrayTest(ArkoudaTest): + + def setUp(self): + ArkoudaTest.setUp(self) + base_words1 = ak.random_strings_uniform(1, 10, UNIQUE, characters='printable') + base_words2 = ak.random_strings_lognormal(2, 0.25, UNIQUE, characters='printable') + base_sas1 = ak.suffix_array(base_words1) + base_sas2 = ak.suffix_array(base_words2) + ''' + gremlins = ak.array([' ', '']) + self.base_words = ak.concatenate((base_words1, base_words2, gremlins)) + self.np_base_words = np.hstack((base_words1.to_ndarray(), base_words2.to_ndarray())) + choices = ak.randint(0, self.base_words.size, N) + self.sarrays = ak.concatenate((self.base_words[choices], gremlins)) + self.test_sas = self.sarrays.to_ndarray() + self.cat = ak.Categorical(self.sarrays) + x, w = tuple(zip(*Counter(''.join(self.base_words.to_ndarray())).items())) + self.delim = np.random.choice(x, p=(np.array(w)/sum(w))) + self.akset = set(ak.unique(self.sarrays).to_ndarray()) + ''' + + def test_compare_sarrays(self): + assert compare_sarrays(self.base_words.to_ndarray(), self.np_base_words) + + def test_argsort(self): + run_test_argsort(self.sarrays, self.test_sas, self.cat) + + def test_in1d(self): + run_test_in1d(self.sarrays, self.cat, self.base_words) + + def test_unique(self): + run_test_unique(self.sarrays, self.test_sas, self.cat) + + def test_groupby(self): + run_test_groupby(self.sarrays, self.cat, self.akset) + + @pytest.mark.skip(reason="awaiting bug fix.") + def test_index(self): + run_test_index(self.sarrays, self.test_sas, self.cat) + + def test_slice(self): + run_test_slice(self.sarrays, self.test_sas, self.cat) + + def test_pdarray_index(self): + run_test_pdarray_index(self.sarrays, self.test_sas, self.cat) + + def test_contains(self): + run_test_contains(self.sarrays, self.test_sas, self.delim) + + def test_starts_with(self): + run_test_starts_with(self.sarrays, self.test_sas, self.delim) + + @pytest.mark.skip(reason="awaiting bug fix.") + def test_ends_with(self): + run_test_ends_with(self.sarrays, self.test_sas, self.delim) + + def test_error_handling(self): + sarraysOne = ak.random_sarrays_uniform(1, 10, UNIQUE, + characters='printable') + sarraysTwo = ak.random_sarrays_uniform(1, 10, UNIQUE, + characters='printable') + + with self.assertRaises(TypeError) as cm: + sarraysOne.lstick(sarraysTwo, delimiter=1) + self.assertEqual('Delimiter must be a string, not int', + cm.exception.args[0]) + + with self.assertRaises(TypeError) as cm: + sarraysOne.lstick([1], 1) + self.assertEqual('stick: not supported between String and list', + cm.exception.args[0]) + + with self.assertRaises(TypeError) as cm: + sarraysOne.startswith(1) + self.assertEqual('Substring must be a string, not int', + cm.exception.args[0]) + + with self.assertRaises(TypeError) as cm: + sarraysOne.endswith(1) + self.assertEqual('Substring must be a string, not int', + cm.exception.args[0]) + + with self.assertRaises(TypeError) as cm: + sarraysOne.contains(1) + self.assertEqual('Substring must be a string, not int', + cm.exception.args[0]) + + with self.assertRaises(TypeError) as cm: + sarraysOne.peel(1) + self.assertEqual('Delimiter must be a string, not int', + cm.exception.args[0]) + + with self.assertRaises(ValueError) as cm: + sarraysOne.peel("",-5) + self.assertEqual('Times must be >= 1', + cm.exception.args[0]) + + @pytest.mark.skip(reason="awaiting bug fix.") + def test_peel(self): + run_test_peel(self.sarrays, self.test_sas, self.delim) + + @pytest.mark.skip(reson="awaiting bug fix.") + def test_stick(self): + run_test_stick(self.sarrays, self.test_sas, self.base_words, self.delim) From dbd6d96aa13a4b94fcfc48bbd502c0ac6648c99e Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Mon, 7 Dec 2020 19:35:43 -0500 Subject: [PATCH 06/68] add Chapel skew suffix array algorithm --- src/SACA.chpl | 228 +++++++++++++++++++++++++++++++++++++++++- src/SegmentedMsg.chpl | 28 ++++-- 2 files changed, 245 insertions(+), 11 deletions(-) diff --git a/src/SACA.chpl b/src/SACA.chpl index ace4bf3168..c3e9fb4ce8 100644 --- a/src/SACA.chpl +++ b/src/SACA.chpl @@ -1,7 +1,7 @@ module SACA{ // In this module, different algorithms to construct suffix array are provided //Nov.15, 2020 - +//Algorithm 1 // The first algorithm divsufsort is the fastest C codes on suffix array require "../thirdparty/SA/libdivsufsort/include/config.h"; require "../thirdparty/SA/libdivsufsort/include/divsufsort.h"; @@ -12,8 +12,228 @@ require "../thirdparty/SA/libdivsufsort/lib/divsufsort.c"; require "../thirdparty/SA/libdivsufsort/lib/sssort.c"; require "../thirdparty/SA/libdivsufsort/lib/trsort.c"; require "../thirdparty/SA/libdivsufsort/lib/utils.c"; -/* -require "/home/z/zd4/SA/nong/saca-k-tois-20130413/saca-k/saca-k.cc"; -*/ extern proc divsufsort(inputstr:[] uint(8),suffixarray:[] int(32),totallen:int(32)); + +//this is another saca algorithm +//require "../thirdparty/SA/SACA-K/saca-k.c"; + +//extern proc SACA_K(inputstr:[] uint(8), suffixarray:[] uint, n:uint, K:uint,m:uint, level:int); +//void SACA_K(unsigned char *s, unsigned int *SA, +// unsigned int n, unsigned int K, +// unsigned int m, int level) ; + + +//Algorithm 2 + +// The Chapel version of suffix array construction algorithm using skew algorithm +// Rewrite the algorithm and codes in paper +// "Simple Linear Work Suffix Array Construction" by Juha Karkkainen and Peter Sanders (2003) +// Dec.7, 2020 + +inline proc leq(a1 :int, a2:int, b1:int, b2:int) // lexicographic order +{ return(a1 < b1 || a1 == b1 && a2 <= b2); +} // for pairs + +inline proc leq(a1 :int, a2:int,a3:int, b1:int, b2:int, b3:int) // lexicographic order +{ return(a1 < b1 || a1 == b1 && leq(a2,a3, b2,b3)); +} // for pairs + +//stably sort a[0..n-1] to b[0..n-1] with keys in 0..K from r +proc radixPass(a:[] int, b:[] int, r:[] uint(8), n:int, K:int ) +{ // count occurrences + var c:[0..K] uint(8); // counter array + var x:uint(8); + var i=0:int; + var sum=0:int; + forall x in c do x=0; + for i in 0..n-1 do c[r[a[i]]]=c[r[a[i]]]+1; + var t:uint(8); + for i in 0..K do { + t=c[i]; + c[i]=sum; + sum+=t; + } + for i in 0..n-1 do { + b[c[r[a[i]]]] = a[i]; + c[r[a[i]]]=c[r[a[i]]]+1; + } +} + + +//stably sort a[0..n-1] to b[0..n-1] with keys in 0..K from r +//element a[i] is mapping to r[a[i]] and r is the alphabets with K+1 characters. +// a and b are bounded by n in calculation +proc radixPass(a:[] int, b:[] int, r:[] int, n:int, K:int ) +{// count occurrences + var c:[0..K] int; // counter array + var x:int; + var i:int; + var t:int; + var sum=0:int; + forall x in c do x=0; + // calculate the number of different characters in a + for i in 0..n-1 do c[r[a[i]]]=c[r[a[i]]]+1; + // calculate the presum of c, so c[i] will be the starting position of different characters + for i in 0..K do { + t=c[i]; + c[i]=sum; + sum+=t; + } + // let b[j] stores the position of each a[i] based on their order. + //The same character but following the previous suffix will be put at the next position. + for i in 0..n-1 do { + b[c[r[a[i]]]] = a[i]; + c[r[a[i]]]=c[r[a[i]]]+1; + } + +} +//stably sort a[0..n-1] to b[0..n-1] with keys in 0..K from r + +// find the suffix array SA of s[0..n-1] in {1..K}^n +// require s[n]=s[n+1]=s[n+2]=0, n>=2. So the size of s should be n+3 +proc SuffixArraySkew(s:[] int, SA: [] int, n:int, K: int) { + var n0=(n+2)/3:int; + var n1=(n+1)/3:int; + var n2=n/3:int; + var n02=n0+n2:int; + var n12=n1+n2:int; +//number of elements meet i %3 =0,1, and 2. +//s[i] is the ith suffix, i in 0..n-1 + var s12: [0..n02+2] int; + s12[n02]= 0; + s12[n02+1]= 0; + s12[n02+2]=0; +// Here n02 instead of n12=n1+n2 is used for the later s0 building based on n1 elements + var SA12:[0..n02 + 2] int; + SA12[n02]=0; + SA12[n02+1]=0; + SA12[n02+2]=0; + + var s0:[0.. n0+2] int; + var SA0:[0..n0+2] int; + var i=0:int; + var j=0:int; + var k=0:int; + +// generate positions of mod 1 and mod 2 suffixes +// n0-n1 is used for building s0, s1 has the same number of elements as s0 + for i in 0.. n+(n0-n1)-1 do { + if (i%3 != 0) { + s12[j] = i; + j=j+1; + } + } +// lsb radix sort the mod 1 and mod 2 triples + var tmps:[0..n+2] int; + forall i in 0..n-2 do tmps[i]=s[i+2]; + radixPass(s12 , SA12, tmps, n02, K); + forall i in 0..n-1 do tmps[i]=s[i+1]; + radixPass(SA12, s12 , tmps, n02, K); + radixPass(s12 , SA12, s , n02, K); + +// find lexicographic names of triples + + var name = 0:int, c0 = -1:int, c1 = -1:int, c2 = -1:int; + + for i in 0..n02-1 do { + if (s[SA12[i]] != c0 || s[SA12[i]+1] != c1 || s[SA12[i]+2] != c2) + { name=name+1; + c0 = s[SA12[i]]; + c1 = s[SA12[i]+1]; + c2 = s[SA12[i]+2]; + } + if (SA12[i] % 3 == 1) { + s12[SA12[i]/3] = name; + // mapping the suffix to small alphabets + } // left half + else { + s12[SA12[i]/3 + n0] = name; + } // right half + } + +// recurse if names are not yet unique + if (name < n02) { + SuffixArraySkew(s12, SA12, n02, name); +// store unique names in s12 using the suffix array + for i in 0..n02-1 do s12[SA12[i]] = i + 1; + //restore the value of s12 since we will change its values during the procedure + } else // generate the suffix array of s12 directly + for i in 0..n02-1 do SA12[s12[i] - 1] = i; + // here SA12 is in fact the ISA array. + +// stably sort the mod 0 suffixes from SA12 by their first character + j=0; + for i in 0..n02-1 do { +// here in fact we take advantage of the sorted SA12 the just sort s0 once to get its sorted array +// at first we think the postion i%3=1 is the position + if (SA12[i] < n0) { + s0[j] = 3*SA12[i]; + j=j+1; + } + } + radixPass(s0, SA0, s, n0, K); + +// merge sorted SA0 suffixes and sorted SA12 suffixes + var p=0:int;// first s0 position + var t=n0-n1:int;//first s1 position + k=0; + var i1:int , j1:int; + var tmpk:int; + for tmpk in 0..n-1 do { +//#define GetI() (SA12[t] < n0 ? SA12[t] * 3 + 1 : (SA12[t] - n0) * 3 + 2) + proc GetI():int { + if (SA12[t] < n0 ) { return SA12[t] * 3 + 1 ; + } + else { + return (SA12[t] - n0) * 3 + 2; + } + } + i = GetI(); // pos of current offset 12 suffix + j = SA0[p]; // pos of current offset 0 suffix + var flag:bool; + if (SA12[t] < n0) { + // different compares for mod 1 and mod 2 suffixes + // i % 3 =1 + flag=leq(s[i], s12[SA12[t] + n0], s[j], s12[j/3]); + } else { + // i % 3 =2 + flag=leq(s[i],s[i+1],s12[SA12[t]-n0+1], s[j],s[j+1],s12[j/3+n0]); +// flag=leq(s[i],s[i+1],s12[SA12[t]-n0], s[j],s[j+1],s12[j/3+n0]); + } + if (flag) + {// suffix from SA12 is smaller + SA[k] = i; + k=k+1; + t=t+1; + if (t == n02) {// done --- only SA0 suffixes left + forall (i1,j1) in zip (k..n-1,p..p+n-k-1) do SA[i1] = SA0[j1]; + break; + } + } else {// suffix from SA0 is smaller + SA[k] = j; + k=k+1; + p=p+1; + var tmpt=t:int; + if (p == n0) { // done --- only SA12 suffixes left + for i1 in tmpt..n02-1 do { + SA[k] = GetI(); + t=t+1; + k=k+1; + } + break; + } + } + } +} + + + + + + + + + + + } diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 49a375ad44..fd18752d68 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -691,8 +691,6 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var nBytes = strings.nBytes; var length=strings.getLengths(); var offsegs = (+ scan length) - length; - writeln("offsegs="); - writeln(offsegs); select (objtype) { when "str" { // To be checked, I am not sure if this formula can estimate the total memory requirement @@ -705,19 +703,35 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var sasval:[0..(nBytes-1)] int; var i:int; + var j:int; forall i in 0..(size-1) do { +// for i in 0..(size-1) do { // the start position of ith string in value array var startposition:int; var endposition:int; startposition = offsegs[i]; endposition = startposition+length[i]-1; - var sasize=length[i]:int(32); +// var sasize=length[i]:int(32); +// ref strArray=strings.values.a[startposition..endposition]; +// var tmparray:[1..sasize] int(32); +// divsufsort(strArray,tmparray,sasize); +// var x:int; +// var y:int(32); +// for (x, y) in zip(sasval[startposition..endposition], tmparray[1..sasize]) do +// x = y; + + var sasize=length[i]:int; ref strArray=strings.values.a[startposition..endposition]; - var tmparray:[1..sasize] int(32); - divsufsort(strArray,tmparray,sasize); + var tmparray:[0..sasize+2] int; + var intstrArray:[0..sasize+2] int; + forall (x,y) in zip ( intstrArray[0..sasize-1],strings.values.a[startposition..endposition]) do x=y; + intstrArray[sasize]=0; + intstrArray[sasize+1]=0; + intstrArray[sasize+2]=0; + SuffixArraySkew(intstrArray,tmparray,sasize,256); var x:int; var y:int(32); - for (x, y) in zip(sasval[startposition..endposition], tmparray[1..sasize]) do + for (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do x = y; } @@ -755,7 +769,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var filesize:int(32); var f = open(FileName, iomode.r); - var size:int=1; + var size=1:int; var nBytes = f.size; var length:[0..0] int =nBytes; var offsegs:[0..0] int =0 ; From 21df3595c2ec21a355a2b90ece1e78583caf95a6 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Tue, 8 Dec 2020 14:19:37 -0500 Subject: [PATCH 07/68] add the lcp array method --- src/MultiTypeSymEntry.chpl | 4 +++ src/SegmentedMsg.chpl | 72 +++++++++++++++++++++++++++++++++----- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/src/MultiTypeSymEntry.chpl b/src/MultiTypeSymEntry.chpl index ab7b68b7c9..a5a49da381 100644 --- a/src/MultiTypeSymEntry.chpl +++ b/src/MultiTypeSymEntry.chpl @@ -78,6 +78,10 @@ module MultiTypeSymEntry :arg etype: type to be instantiated :type etype: type */ + var EnhancedInfo:string; + /* this entry is used to described the LCP (longest common prefix) array + of suffix array or any other information closely related to this entry + */ proc init(len: int, type etype) { super.init(etype, len); this.etype = etype; diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index fd18752d68..6252062ae4 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -701,6 +701,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var sasoff = offsegs; //allocate an values array var sasval:[0..(nBytes-1)] int; + var lcpval:[0..(nBytes-1)] int; var i:int; var j:int; @@ -724,28 +725,50 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string ref strArray=strings.values.a[startposition..endposition]; var tmparray:[0..sasize+2] int; var intstrArray:[0..sasize+2] int; + var x:int; + var y:int(32); +// var y:int; forall (x,y) in zip ( intstrArray[0..sasize-1],strings.values.a[startposition..endposition]) do x=y; intstrArray[sasize]=0; intstrArray[sasize+1]=0; intstrArray[sasize+2]=0; SuffixArraySkew(intstrArray,tmparray,sasize,256); - var x:int; - var y:int(32); for (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do x = y; +// Here we calculate the lcp(Longest Common Prefix) array value + forall j in startposition+1..endposition do{ + var tmpcount=0:int; + var tmpbefore=sasval[j-1]:int; + var tmpcur=sasval[j]:int; + var tmplen=min(sasize-tmpcur, sasize-tmpbefore); + var tmpi:int; + for tmpi in 0..tmplen-1 do { + if (intstrArray[tmpbefore]!=intstrArray[tmpcur]) { + break; + } + tmpcount+=1; + } + lcpval[j]=tmpcount; + } } var segName2 = st.nextName(); var valName2 = st.nextName(); + var lcpvalName = st.nextName(); var segEntry = new shared SymEntry(sasoff); var valEntry = new shared SymEntry(sasval); + var lcpvalEntry = new shared SymEntry(lcpval); + valEntry.EnhancedInfo=lcpvalName; + lcpvalEntry.EnhancedInfo=valName2; st.addEntry(segName2, segEntry); st.addEntry(valName2, valEntry); + st.addEntry(lcpvalName, lcpvalEntry); repMsg = 'created ' + st.attrib(segName2) + '+created ' + st.attrib(valName2); return repMsg; + } otherwise { var errorMsg = notImplementedError(pn, "("+objtype+")"); @@ -761,13 +784,15 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } +// directly read a string from given file and generate its suffix array proc segSAFileMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { var pn = Reflection.getRoutineName(); // var (FileName) = payload.decode().splitMsgToTuple(1); var FileName = payload.decode(); var repMsg: string; - var filesize:int(32); +// var filesize:int(32); + var filesize:int; var f = open(FileName, iomode.r); var size=1:int; var nBytes = f.size; @@ -782,8 +807,9 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string //allocate an offset array var sasoff = offsegs; - //allocate an values array + //allocate a suffix array values array and lcp array var sasval:[0..(nBytes-1)] int; + var lcpval:[0..(nBytes-1)] int; var i:int; forall i in 0..(size-1) do { @@ -792,26 +818,54 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var endposition:int; startposition = 0; endposition = nBytes-1; - var sasize=nBytes:int(32); +// var sasize=nBytes:int(32); + var sasize=nBytes:int; var strArray:[startposition..endposition]uint(8); var r = f.reader(kind=ionative); r.read(strArray); - var tmparray:[1..sasize] int(32); - divsufsort(strArray,tmparray,sasize); +// var tmparray:[1..sasize] int(32); + var tmparray:[0..sasize+2] int; + var intstrArray:[0..sasize+2] int; var x:int; - var y:int(32); - for (x, y) in zip(sasval[startposition..endposition], tmparray[1..sasize]) do + var y:int; + forall (x,y) in zip ( intstrArray[0..sasize-1],strArray[startposition..endposition]) do x=y; + intstrArray[sasize]=0; + intstrArray[sasize+1]=0; + intstrArray[sasize+2]=0; + SuffixArraySkew(intstrArray,tmparray,sasize,256); +// divsufsort(strArray,tmparray,sasize); + forall (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do x = y; +// Here we calculate the lcp(Longest Common Prefix) array value + forall j in startposition+1..endposition do{ + var tmpcount=0:int; + var tmpbefore=sasval[j-1]:int; + var tmpcur=sasval[j]:int; + var tmplen=min(sasize-tmpcur, sasize-tmpbefore); + var tmpi:int; + for tmpi in 0..tmplen-1 do { + if (intstrArray[tmpbefore]!=intstrArray[tmpcur]) { + break; + } + tmpcount+=1; + } + lcpval[j]=tmpcount; + } } var segName2 = st.nextName(); var valName2 = st.nextName(); + var lcpvalName = st.nextName(); var segEntry = new shared SymEntry(sasoff); var valEntry = new shared SymEntry(sasval); + var lcpvalEntry = new shared SymEntry(lcpval); + valEntry.EnhancedInfo=lcpvalName; + lcpvalEntry.EnhancedInfo=valName2; st.addEntry(segName2, segEntry); st.addEntry(valName2, valEntry); + st.addEntry(lcpvalName, lcpvalEntry); repMsg = 'created ' + st.attrib(segName2) + '+created ' + st.attrib(valName2); return repMsg; From 940cb792d62c3a3872a2415376ea23b691717cb9 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Wed, 9 Dec 2020 15:01:34 -0500 Subject: [PATCH 08/68] confirm submit all changes --- src/SACA.chpl | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/SACA.chpl b/src/SACA.chpl index c3e9fb4ce8..f305598d05 100644 --- a/src/SACA.chpl +++ b/src/SACA.chpl @@ -228,12 +228,4 @@ proc SuffixArraySkew(s:[] int, SA: [] int, n:int, K: int) { - - - - - - - - } From 3a220dc286ebfdd57e4941eb5e3ebbdd591cf6ef Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 13 Dec 2020 21:29:48 -0500 Subject: [PATCH 09/68] solve conflict --- arkouda/pdarrayclass.py | 1 - src/SegmentedArray.chpl | 38 ++++++++++++++++++++++++-------------- src/SegmentedMsg.chpl | 9 +++------ 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/arkouda/pdarrayclass.py b/arkouda/pdarrayclass.py index 9e668161d8..d530bc14d9 100755 --- a/arkouda/pdarrayclass.py +++ b/arkouda/pdarrayclass.py @@ -95,7 +95,6 @@ def parse_single_int_array_value(msg : str) -> object: raise ValueError(("unsupported value from server {} {}".\ format(mydtype.name, value))) nfields = fields[1].split("\"") - print(nfields) return nfields[1] # class for the pdarray diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl index ad15b6bb2f..8ba193d6f8 100644 --- a/src/SegmentedArray.chpl +++ b/src/SegmentedArray.chpl @@ -703,11 +703,11 @@ module SegmentedArray { var valueName: string; /** - * The pdaray containing the complete byte array composed of bytes - * corresponding to each string, joined by nulls. Note: the null byte - * is uint(8) value of zero. + * The pdaray containing the complete int array composed of integer index + * corresponding to each string, */ - var values: borrowed SymEntry(uint(8)); +// var values: borrowed SymEntry(uint(8)); + var values: borrowed SymEntry(int); /** * The number of strings in the segmented array @@ -735,7 +735,8 @@ module SegmentedArray { valueName = valName; var vs = try! st.lookup(valName); - var vals = toSymEntry(vs, uint(8)): unmanaged SymEntry(uint(8)); +// var vals = toSymEntry(vs, uint(8)): unmanaged SymEntry(uint(8)); + var vals = toSymEntry(vs, int): unmanaged SymEntry(int); values = vals; size = segs.size; nBytes = vals.size; @@ -746,7 +747,8 @@ module SegmentedArray { * inputs, generates the SymEntry objects for each and passes the * offset and value SymTab lookup names to the alternate init method */ - proc init(segments: [] int, values: [] uint(8), st: borrowed SymTab) { +// proc init(segments: [] int, values: [] uint(8), st: borrowed SymTab) { + proc init(segments: [] int, values: [] int, st: borrowed SymTab) { var oName = st.nextName(); var segEntry = new shared SymEntry(segments); try! st.addEntry(oName, segEntry); @@ -829,10 +831,12 @@ module SegmentedArray { // Offsets need to be re-zeroed newSegs -= start; // Bytearray of the new slice - var newVals = makeDistArray(end - start + 1, uint(8)); +// var newVals = makeDistArray(end - start + 1, uint(8)); + var newVals = makeDistArray(end - start + 1, int); ref va = values.a; // newVals = values.a[start..end]; - forall (i, nv) in zip(newVals.domain, newVals) with (var agg = newSrcAggregator(uint(8))) { +// forall (i, nv) in zip(newVals.domain, newVals) with (var agg = newSrcAggregator(uint(8))) { + forall (i, nv) in zip(newVals.domain, newVals) with (var agg = newSrcAggregator(int)) { agg.copy(nv, va[start + i]); } return (newSegs, newVals); @@ -843,7 +847,8 @@ module SegmentedArray { proc this(iv: [?D] int) throws { // Early return for zero-length result if (D.size == 0) { - return (makeDistArray(0, int), makeDistArray(0, uint(8))); +// return (makeDistArray(0, int), makeDistArray(0, uint(8))); + return (makeDistArray(0, int), makeDistArray(0, int)); } // Check all indices within bounds var ivMin = min reduce iv; @@ -879,7 +884,8 @@ module SegmentedArray { writeln("Copying values"); stdout.flush(); t1 = getCurrentTime(); } - var gatheredVals = makeDistArray(retBytes, uint(8)); +// var gatheredVals = makeDistArray(retBytes, uint(8)); + var gatheredVals = makeDistArray(retBytes, int); // Multi-locale requires some extra localization work that is not needed // in CHPL_COMM=none if CHPL_COMM != 'none' { @@ -906,7 +912,8 @@ module SegmentedArray { srcIdx = + scan srcIdx; // Now srcIdx has a dst-local copy of the source index and vals can be efficiently gathered ref va = values.a; - forall (v, si) in zip(gatheredVals, srcIdx) with (var agg = newSrcAggregator(uint(8))) { +// forall (v, si) in zip(gatheredVals, srcIdx) with (var agg = newSrcAggregator(uint(8))) { + forall (v, si) in zip(gatheredVals, srcIdx) with (var agg = newSrcAggregator(int)) { agg.copy(v, va[si]); } } else { @@ -938,7 +945,8 @@ module SegmentedArray { steps -= iv; // Early return for zero-length result if (newSize == 0) { - return (makeDistArray(0, int), makeDistArray(0, uint(8))); +// return (makeDistArray(0, int), makeDistArray(0, uint(8))); + return (makeDistArray(0, int), makeDistArray(0, int)); } var segInds = makeDistArray(newSize, int); forall (t, dst, idx) in zip(iv, steps, D) with (var agg = newDstAggregator(int)) { @@ -1205,8 +1213,10 @@ module SegmentedArray { const leftOffsets = (+ scan leftLengths) - leftLengths; const rightOffsets = (+ scan rightLengths) - rightLengths; // Allocate values and fill - var leftVals = makeDistArray((+ reduce leftLengths), uint(8)); - var rightVals = makeDistArray((+ reduce rightLengths), uint(8)); +// var leftVals = makeDistArray((+ reduce leftLengths), uint(8)); +// var rightVals = makeDistArray((+ reduce rightLengths), uint(8)); + var leftVals = makeDistArray((+ reduce leftLengths), int); + var rightVals = makeDistArray((+ reduce rightLengths), int); ref va = values.a; // Fill left values forall (srcStart, dstStart, len) in zip(oa, leftOffsets, leftLengths) { diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 0e1ba2a626..cdfc256c79 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -375,15 +375,12 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } when "int" { // Make a temporary int array - writeln(args[1]); - writeln(args[2]); - writeln(args[3]); - var tmparrays = new owned SegSArray(args[1], args[2], st); + var arrays = new owned SegSArray(args[1], args[2], st); // Parse the index var idx = args[3]:int; // TO DO: in the future, we will force the client to handle this - idx = convertPythonIndexToChapel(idx, tmparrays.size); - var s = tmparrays[idx]; + idx = convertPythonIndexToChapel(idx, arrays.size); + var s = arrays[idx]; return "item %s %jt".format("int", s); } otherwise { From 413182023d3f6466f4705859502dcbce293de2cc Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 13 Dec 2020 21:56:00 -0500 Subject: [PATCH 10/68] add thirdparty files --- src/SACA.chpl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/SACA.chpl b/src/SACA.chpl index 0aaadfac92..eac9823b35 100644 --- a/src/SACA.chpl +++ b/src/SACA.chpl @@ -3,7 +3,6 @@ module SACA{ //Nov.15, 2020 //Algorithm 1 // The first algorithm divsufsort is the fastest C codes on suffix array -/* require "../thirdparty/SA/libdivsufsort/include/config.h"; require "../thirdparty/SA/libdivsufsort/include/divsufsort.h"; require "../thirdparty/SA/libdivsufsort/include/divsufsort_private.h"; @@ -22,7 +21,6 @@ extern proc divsufsort(inputstr:[] uint(8),suffixarray:[] int(32),totallen:int(3 //void SACA_K(unsigned char *s, unsigned int *SA, // unsigned int n, unsigned int K, // unsigned int m, int level) ; -*/ //Algorithm 2 From c9e03fb1d8f7d0d1633b80b50353346ad0ffd43f Mon Sep 17 00:00:00 2001 From: David Bader Date: Sun, 13 Dec 2020 22:21:31 -0500 Subject: [PATCH 11/68] updated --- arkouda/strings.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arkouda/strings.py b/arkouda/strings.py index cfad108e40..0cae48767f 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -781,11 +781,10 @@ def attach(user_defined_name : str) -> Strings: class SArrays: """ - Represents an array of (suffix) arrays whose data resides on the - arkouda server. The user should not call this class directly; - rather its instances are created by other arkouda functions. It is - very similar to Strings and the difference is that its content is - int arrays instead of strings. + Represents an array of (suffix) arrays whose data resides on the arkouda server. + The user should not call this class directly; rather its instances are created + by other arkouda functions. It is very similar to Strings and the difference is + that its content is int arrays instead of strings. Attributes ---------- From d184048544a07c21a9a06ccca47b5787f38abca7 Mon Sep 17 00:00:00 2001 From: Oliver Alvarado Rodriguez <41132909+alvaradoo@users.noreply.github.com> Date: Mon, 14 Dec 2020 13:39:23 -0500 Subject: [PATCH 12/68] Update SACA.chpl comments reworded and added some comments --- src/SACA.chpl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/SACA.chpl b/src/SACA.chpl index eac9823b35..21a3354b26 100644 --- a/src/SACA.chpl +++ b/src/SACA.chpl @@ -14,7 +14,7 @@ require "../thirdparty/SA/libdivsufsort/lib/trsort.c"; require "../thirdparty/SA/libdivsufsort/lib/utils.c"; extern proc divsufsort(inputstr:[] uint(8),suffixarray:[] int(32),totallen:int(32)); -//this is another saca algorithm +//Another possible SACA algorithm to utilize. //require "../thirdparty/SA/SACA-K/saca-k.c"; //extern proc SACA_K(inputstr:[] uint(8), suffixarray:[] uint, n:uint, K:uint,m:uint, level:int); @@ -78,7 +78,7 @@ proc radixPass(a:[] int, b:[] int, r:[] int, n:int, K:int ) c[i]=sum; sum+=t; } - // let b[j] stores the position of each a[i] based on their order. + // let b[j] store the position of each a[i] based on their order. //The same character but following the previous suffix will be put at the next position. for i in 0..n-1 do { b[c[r[a[i]]]] = a[i]; @@ -150,7 +150,7 @@ proc SuffixArraySkew(s:[] int, SA: [] int, n:int, K: int) { } // right half } -// recurse if names are not yet unique +// recurse if names are not unique if (name < n02) { SuffixArraySkew(s12, SA12, n02, name); // store unique names in s12 using the suffix array @@ -163,7 +163,7 @@ proc SuffixArraySkew(s:[] int, SA: [] int, n:int, K: int) { // stably sort the mod 0 suffixes from SA12 by their first character j=0; for i in 0..n02-1 do { -// here in fact we take advantage of the sorted SA12 the just sort s0 once to get its sorted array +// here in fact we take advantage of the sorted SA12 to just sort s0 once to get its sorted array // at first we think the postion i%3=1 is the position if (SA12[i] < n0) { s0[j] = 3*SA12[i]; @@ -197,7 +197,7 @@ proc SuffixArraySkew(s:[] int, SA: [] int, n:int, K: int) { } else { // i % 3 =2 flag=leq(s[i],s[i+1],s12[SA12[t]-n0+1], s[j],s[j+1],s12[j/3+n0]); -// flag=leq(s[i],s[i+1],s12[SA12[t]-n0], s[j],s[j+1],s12[j/3+n0]); + // flag=leq(s[i],s[i+1],s12[SA12[t]-n0], s[j],s[j+1],s12[j/3+n0]); } if (flag) {// suffix from SA12 is smaller From b6228e5b9863434b4ef6a7853d97ba930d4db5f1 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Mon, 14 Dec 2020 23:17:34 -0500 Subject: [PATCH 13/68] remove tab, remove unused codes --- arkouda/pdarrayclass.py | 10 +- arkouda/strings.py | 6 +- src/SACA.chpl | 224 ++++++++++++++++++++-------------------- src/SegmentedMsg.chpl | 98 +++++++++--------- 4 files changed, 164 insertions(+), 174 deletions(-) diff --git a/arkouda/pdarrayclass.py b/arkouda/pdarrayclass.py index d530bc14d9..1d62407dce 100755 --- a/arkouda/pdarrayclass.py +++ b/arkouda/pdarrayclass.py @@ -85,15 +85,7 @@ def parse_single_int_array_value(msg : str) -> object: """ fields = msg.split(" ",1) dtname=fields[0] - mydtype = dtype(dtname) - if mydtype == bool: - if value == "True": - return bool(True) - elif value == "False": - return bool(False) - else: - raise ValueError(("unsupported value from server {} {}".\ - format(mydtype.name, value))) +# mydtype = dtype(dtname) nfields = fields[1].split("\"") return nfields[1] diff --git a/arkouda/strings.py b/arkouda/strings.py index 0cae48767f..efcb7c46f6 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -942,7 +942,7 @@ def _binop(self, other : SArrays, op : str) -> pdarray: def __eq__(self, other) -> bool: return self._binop(other, "==") - def __ne__(self, other : object) -> bool: + def __ne__(self, other : SArrays) -> bool: return self._binop(other, "!=") def __getitem__(self, key): @@ -1012,8 +1012,8 @@ def get_lengths(self) -> pdarray: repMsg = generic_msg(msg) return create_pdarray(repMsg) - def __add__(self, other : SArrays) -> SArrays: - return self.stick(other) +# def __add__(self, other : SArrays) -> SArrays: +# return self.stick(other) def save(self, prefix_path : str, dataset : str='int_array', diff --git a/src/SACA.chpl b/src/SACA.chpl index 21a3354b26..3aa2ba5bd2 100644 --- a/src/SACA.chpl +++ b/src/SACA.chpl @@ -30,17 +30,17 @@ extern proc divsufsort(inputstr:[] uint(8),suffixarray:[] int(32),totallen:int(3 // Dec.7, 2020 inline proc leq(a1 :int, a2:int, b1:int, b2:int) // lexicographic order -{ return(a1 < b1 || a1 == b1 && a2 <= b2); +{ return(a1 < b1 || a1 == b1 && a2 <= b2); } // for pairs inline proc leq(a1 :int, a2:int,a3:int, b1:int, b2:int, b3:int) // lexicographic order -{ return(a1 < b1 || a1 == b1 && leq(a2,a3, b2,b3)); +{ return(a1 < b1 || a1 == b1 && leq(a2,a3, b2,b3)); } // for pairs //stably sort a[0..n-1] to b[0..n-1] with keys in 0..K from r proc radixPass(a:[] int, b:[] int, r:[] uint(8), n:int, K:int ) -{ // count occurrences - var c:[0..K] uint(8); // counter array +{ // count occurrences + var c:[0..K] uint(8); // counter array var x:uint(8); var i=0:int; var sum=0:int; @@ -64,7 +64,7 @@ proc radixPass(a:[] int, b:[] int, r:[] uint(8), n:int, K:int ) // a and b are bounded by n in calculation proc radixPass(a:[] int, b:[] int, r:[] int, n:int, K:int ) {// count occurrences - var c:[0..K] int; // counter array + var c:[0..K] int; // counter array var x:int; var i:int; var t:int; @@ -91,138 +91,138 @@ proc radixPass(a:[] int, b:[] int, r:[] int, n:int, K:int ) // find the suffix array SA of s[0..n-1] in {1..K}^n // require s[n]=s[n+1]=s[n+2]=0, n>=2. So the size of s should be n+3 proc SuffixArraySkew(s:[] int, SA: [] int, n:int, K: int) { - var n0=(n+2)/3:int; - var n1=(n+1)/3:int; - var n2=n/3:int; - var n02=n0+n2:int; - var n12=n1+n2:int; + var n0=(n+2)/3:int; + var n1=(n+1)/3:int; + var n2=n/3:int; + var n02=n0+n2:int; + var n12=n1+n2:int; //number of elements meet i %3 =0,1, and 2. //s[i] is the ith suffix, i in 0..n-1 - var s12: [0..n02+2] int; - s12[n02]= 0; - s12[n02+1]= 0; - s12[n02+2]=0; + var s12: [0..n02+2] int; + s12[n02]= 0; + s12[n02+1]= 0; + s12[n02+2]=0; // Here n02 instead of n12=n1+n2 is used for the later s0 building based on n1 elements - var SA12:[0..n02 + 2] int; - SA12[n02]=0; - SA12[n02+1]=0; - SA12[n02+2]=0; + var SA12:[0..n02 + 2] int; + SA12[n02]=0; + SA12[n02+1]=0; + SA12[n02+2]=0; - var s0:[0.. n0+2] int; - var SA0:[0..n0+2] int; - var i=0:int; - var j=0:int; - var k=0:int; + var s0:[0.. n0+2] int; + var SA0:[0..n0+2] int; + var i=0:int; + var j=0:int; + var k=0:int; // generate positions of mod 1 and mod 2 suffixes // n0-n1 is used for building s0, s1 has the same number of elements as s0 - for i in 0.. n+(n0-n1)-1 do { - if (i%3 != 0) { - s12[j] = i; - j=j+1; - } - } + for i in 0.. n+(n0-n1)-1 do { + if (i%3 != 0) { + s12[j] = i; + j=j+1; + } + } // lsb radix sort the mod 1 and mod 2 triples - var tmps:[0..n+2] int; - forall i in 0..n-2 do tmps[i]=s[i+2]; - radixPass(s12 , SA12, tmps, n02, K); - forall i in 0..n-1 do tmps[i]=s[i+1]; - radixPass(SA12, s12 , tmps, n02, K); - radixPass(s12 , SA12, s , n02, K); + var tmps:[0..n+2] int; + forall i in 0..n-2 do tmps[i]=s[i+2]; + radixPass(s12 , SA12, tmps, n02, K); + forall i in 0..n-1 do tmps[i]=s[i+1]; + radixPass(SA12, s12 , tmps, n02, K); + radixPass(s12 , SA12, s , n02, K); // find lexicographic names of triples - var name = 0:int, c0 = -1:int, c1 = -1:int, c2 = -1:int; - - for i in 0..n02-1 do { - if (s[SA12[i]] != c0 || s[SA12[i]+1] != c1 || s[SA12[i]+2] != c2) - { name=name+1; - c0 = s[SA12[i]]; - c1 = s[SA12[i]+1]; - c2 = s[SA12[i]+2]; - } - if (SA12[i] % 3 == 1) { - s12[SA12[i]/3] = name; + var name = 0:int, c0 = -1:int, c1 = -1:int, c2 = -1:int; + + for i in 0..n02-1 do { + if (s[SA12[i]] != c0 || s[SA12[i]+1] != c1 || s[SA12[i]+2] != c2) + { name=name+1; + c0 = s[SA12[i]]; + c1 = s[SA12[i]+1]; + c2 = s[SA12[i]+2]; + } + if (SA12[i] % 3 == 1) { + s12[SA12[i]/3] = name; // mapping the suffix to small alphabets - } // left half - else { - s12[SA12[i]/3 + n0] = name; - } // right half - } + } // left half + else { + s12[SA12[i]/3 + n0] = name; + } // right half + } // recurse if names are not unique - if (name < n02) { - SuffixArraySkew(s12, SA12, n02, name); + if (name < n02) { + SuffixArraySkew(s12, SA12, n02, name); // store unique names in s12 using the suffix array - for i in 0..n02-1 do s12[SA12[i]] = i + 1; + for i in 0..n02-1 do s12[SA12[i]] = i + 1; //restore the value of s12 since we will change its values during the procedure - } else // generate the suffix array of s12 directly - for i in 0..n02-1 do SA12[s12[i] - 1] = i; - // here SA12 is in fact the ISA array. - + } else // generate the suffix array of s12 directly + { for i in 0..n02-1 do SA12[s12[i] - 1] = i; + // here SA12 is in fact the ISA array. + } // stably sort the mod 0 suffixes from SA12 by their first character - j=0; - for i in 0..n02-1 do { + j=0; + for i in 0..n02-1 do { // here in fact we take advantage of the sorted SA12 to just sort s0 once to get its sorted array // at first we think the postion i%3=1 is the position - if (SA12[i] < n0) { + if (SA12[i] < n0) { s0[j] = 3*SA12[i]; j=j+1; } - } - radixPass(s0, SA0, s, n0, K); + } + radixPass(s0, SA0, s, n0, K); // merge sorted SA0 suffixes and sorted SA12 suffixes - var p=0:int;// first s0 position - var t=n0-n1:int;//first s1 position - k=0; - var i1:int , j1:int; - var tmpk:int; - for tmpk in 0..n-1 do { + var p=0:int;// first s0 position + var t=n0-n1:int;//first s1 position + k=0; + var i1:int , j1:int; + var tmpk:int; + for tmpk in 0..n-1 do { //#define GetI() (SA12[t] < n0 ? SA12[t] * 3 + 1 : (SA12[t] - n0) * 3 + 2) - proc GetI():int { - if (SA12[t] < n0 ) { return SA12[t] * 3 + 1 ; - } - else { - return (SA12[t] - n0) * 3 + 2; - } - } - i = GetI(); // pos of current offset 12 suffix - j = SA0[p]; // pos of current offset 0 suffix - var flag:bool; - if (SA12[t] < n0) { - // different compares for mod 1 and mod 2 suffixes - // i % 3 =1 - flag=leq(s[i], s12[SA12[t] + n0], s[j], s12[j/3]); - } else { - // i % 3 =2 - flag=leq(s[i],s[i+1],s12[SA12[t]-n0+1], s[j],s[j+1],s12[j/3+n0]); - // flag=leq(s[i],s[i+1],s12[SA12[t]-n0], s[j],s[j+1],s12[j/3+n0]); - } - if (flag) - {// suffix from SA12 is smaller - SA[k] = i; - k=k+1; - t=t+1; - if (t == n02) {// done --- only SA0 suffixes left - forall (i1,j1) in zip (k..n-1,p..p+n-k-1) do SA[i1] = SA0[j1]; - break; - } - } else {// suffix from SA0 is smaller - SA[k] = j; - k=k+1; - p=p+1; - var tmpt=t:int; - if (p == n0) { // done --- only SA12 suffixes left - for i1 in tmpt..n02-1 do { - SA[k] = GetI(); - t=t+1; - k=k+1; - } - break; - } - } - } + proc GetI():int { + if (SA12[t] < n0 ) { return SA12[t] * 3 + 1 ; + } + else { + return (SA12[t] - n0) * 3 + 2; + } + } + i = GetI(); // pos of current offset 12 suffix + j = SA0[p]; // pos of current offset 0 suffix + var flag:bool; + if (SA12[t] < n0) { + // different compares for mod 1 and mod 2 suffixes + // i % 3 =1 + flag=leq(s[i], s12[SA12[t] + n0], s[j], s12[j/3]); + } else { + // i % 3 =2 + flag=leq(s[i],s[i+1],s12[SA12[t]-n0+1], s[j],s[j+1],s12[j/3+n0]); + // flag=leq(s[i],s[i+1],s12[SA12[t]-n0], s[j],s[j+1],s12[j/3+n0]); + } + if (flag) + {// suffix from SA12 is smaller + SA[k] = i; + k=k+1; + t=t+1; + if (t == n02) {// done --- only SA0 suffixes left + forall (i1,j1) in zip (k..n-1,p..p+n-k-1) do SA[i1] = SA0[j1]; + break; + } + } else {// suffix from SA0 is smaller + SA[k] = j; + k=k+1; + p=p+1; + var tmpt=t:int; + if (p == n0) { // done --- only SA12 suffixes left + for i1 in tmpt..n02-1 do { + SA[k] = GetI(); + t=t+1; + k=k+1; + } + break; + } + } + } } diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index cdfc256c79..692dd8473f 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -698,16 +698,16 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string overMemLimit(8*size + 16*size + nBytes); //allocate an offset array - var sasoff = offsegs; + var sasoff = offsegs; //allocate an values array var sasval:[0..(nBytes-1)] int; var lcpval:[0..(nBytes-1)] int; - var i:int; - var j:int; + var i:int; + var j:int; forall i in 0..(size-1) do { // for i in 0..(size-1) do { - // the start position of ith string in value array + // the start position of ith string in value array var startposition:int; var endposition:int; startposition = offsegs[i]; @@ -728,39 +728,38 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var x:int; var y:int(32); // var y:int; - forall (x,y) in zip ( intstrArray[0..sasize-1],strings.values.a[startposition..endposition]) do x=y; - intstrArray[sasize]=0; - intstrArray[sasize+1]=0; - intstrArray[sasize+2]=0; - SuffixArraySkew(intstrArray,tmparray,sasize,256); + forall (x,y) in zip ( intstrArray[0..sasize-1],strings.values.a[startposition..endposition]) do x=y; + intstrArray[sasize]=0; + intstrArray[sasize+1]=0; + intstrArray[sasize+2]=0; + SuffixArraySkew(intstrArray,tmparray,sasize,256); for (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do x = y; // Here we calculate the lcp(Longest Common Prefix) array value forall j in startposition+1..endposition do{ - var tmpcount=0:int; - var tmpbefore=sasval[j-1]:int; - var tmpcur=sasval[j]:int; - var tmplen=min(sasize-tmpcur, sasize-tmpbefore); - var tmpi:int; + var tmpcount=0:int; + var tmpbefore=sasval[j-1]:int; + var tmpcur=sasval[j]:int; + var tmplen=min(sasize-tmpcur, sasize-tmpbefore); + var tmpi:int; for tmpi in 0..tmplen-1 do { - if (intstrArray[tmpbefore]!=intstrArray[tmpcur]) { - break; - } - tmpcount+=1; - } - lcpval[j]=tmpcount; + if (intstrArray[tmpbefore]!=intstrArray[tmpcur]) { + break; + } + tmpcount+=1; + } + lcpval[j]=tmpcount; } - } - + } var segName2 = st.nextName(); var valName2 = st.nextName(); var lcpvalName = st.nextName(); - var segEntry = new shared SymEntry(sasoff); + var segEntry = new shared SymEntry(sasoff); var valEntry = new shared SymEntry(sasval); var lcpvalEntry = new shared SymEntry(lcpval); - valEntry.EnhancedInfo=lcpvalName; - lcpvalEntry.EnhancedInfo=valName2; + valEntry.EnhancedInfo=lcpvalName; + lcpvalEntry.EnhancedInfo=valName2; st.addEntry(segName2, segEntry); st.addEntry(valName2, valEntry); @@ -806,14 +805,14 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string overMemLimit(8*size + 16*size + nBytes); //allocate an offset array - var sasoff = offsegs; + var sasoff = offsegs; //allocate a suffix array values array and lcp array var sasval:[0..(nBytes-1)] int; var lcpval:[0..(nBytes-1)] int; - var i:int; + var i:int; forall i in 0..(size-1) do { - // the start position of ith string in value array + // the start position of ith string in value array var startposition:int; var endposition:int; startposition = 0; @@ -828,40 +827,39 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var intstrArray:[0..sasize+2] int; var x:int; var y:int; - forall (x,y) in zip ( intstrArray[0..sasize-1],strArray[startposition..endposition]) do x=y; - intstrArray[sasize]=0; - intstrArray[sasize+1]=0; - intstrArray[sasize+2]=0; - SuffixArraySkew(intstrArray,tmparray,sasize,256); + forall (x,y) in zip ( intstrArray[0..sasize-1],strArray[startposition..endposition]) do x=y; + intstrArray[sasize]=0; + intstrArray[sasize+1]=0; + intstrArray[sasize+2]=0; + SuffixArraySkew(intstrArray,tmparray,sasize,256); // divsufsort(strArray,tmparray,sasize); forall (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do x = y; // Here we calculate the lcp(Longest Common Prefix) array value forall j in startposition+1..endposition do{ - var tmpcount=0:int; - var tmpbefore=sasval[j-1]:int; - var tmpcur=sasval[j]:int; - var tmplen=min(sasize-tmpcur, sasize-tmpbefore); - var tmpi:int; + var tmpcount=0:int; + var tmpbefore=sasval[j-1]:int; + var tmpcur=sasval[j]:int; + var tmplen=min(sasize-tmpcur, sasize-tmpbefore); + var tmpi:int; for tmpi in 0..tmplen-1 do { - if (intstrArray[tmpbefore]!=intstrArray[tmpcur]) { - break; - } - tmpcount+=1; - } - lcpval[j]=tmpcount; - } - } - + if (intstrArray[tmpbefore]!=intstrArray[tmpcur]) { + break; + } + tmpcount+=1; + } + lcpval[j]=tmpcount; + } + } var segName2 = st.nextName(); var valName2 = st.nextName(); var lcpvalName = st.nextName(); - var segEntry = new shared SymEntry(sasoff); + var segEntry = new shared SymEntry(sasoff); var valEntry = new shared SymEntry(sasval); var lcpvalEntry = new shared SymEntry(lcpval); - valEntry.EnhancedInfo=lcpvalName; - lcpvalEntry.EnhancedInfo=valName2; + valEntry.EnhancedInfo=lcpvalName; + lcpvalEntry.EnhancedInfo=valName2; st.addEntry(segName2, segEntry); st.addEntry(valName2, valEntry); From 42e3ba7e37fe1626b9a60c939bd5d4a1596b5542 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Thu, 17 Dec 2020 18:13:07 -0500 Subject: [PATCH 14/68] change to relative directory --- benchmarks/run_benchmarks.py | 2 +- thirdparty/SA/libdivsufsort/lib/divsufsort.c | 2 +- thirdparty/SA/libdivsufsort/lib/sssort.c | 2 +- thirdparty/SA/libdivsufsort/lib/trsort.c | 2 +- thirdparty/SA/libdivsufsort/lib/utils.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/run_benchmarks.py b/benchmarks/run_benchmarks.py index 42ec3a28d2..a116c7f182 100755 --- a/benchmarks/run_benchmarks.py +++ b/benchmarks/run_benchmarks.py @@ -19,7 +19,7 @@ logging.basicConfig(level=logging.INFO) -BENCHMARKS = ['stream', 'argsort', 'coargsort', 'gather', 'scatter', 'reduce', 'scan', 'noop', 'setops'] +BENCHMARKS = ['stream', 'argsort', 'coargsort', 'gather', 'scatter', 'reduce', 'scan', 'noop', 'setops','sa'] def get_chpl_util_dir(): """ Get the Chapel directory that contains graph generation utilities. """ diff --git a/thirdparty/SA/libdivsufsort/lib/divsufsort.c b/thirdparty/SA/libdivsufsort/lib/divsufsort.c index 3087ac3065..4835351f61 100644 --- a/thirdparty/SA/libdivsufsort/lib/divsufsort.c +++ b/thirdparty/SA/libdivsufsort/lib/divsufsort.c @@ -25,7 +25,7 @@ */ //#include "divsufsort_private.h" -#include "/home/z/zd4/SA/libdivsufsort/include/divsufsort_private.h" +#include "../include/divsufsort_private.h" #ifdef _OPENMP # include #endif diff --git a/thirdparty/SA/libdivsufsort/lib/sssort.c b/thirdparty/SA/libdivsufsort/lib/sssort.c index 025df4022e..9542b6dbb8 100644 --- a/thirdparty/SA/libdivsufsort/lib/sssort.c +++ b/thirdparty/SA/libdivsufsort/lib/sssort.c @@ -25,7 +25,7 @@ */ //#include "divsufsort_private.h" -#include "/home/z/zd4/SA/libdivsufsort/include/divsufsort_private.h" +#include "../include/divsufsort_private.h" /*- Private Functions -*/ diff --git a/thirdparty/SA/libdivsufsort/lib/trsort.c b/thirdparty/SA/libdivsufsort/lib/trsort.c index 327aac17fa..88123f5627 100644 --- a/thirdparty/SA/libdivsufsort/lib/trsort.c +++ b/thirdparty/SA/libdivsufsort/lib/trsort.c @@ -25,7 +25,7 @@ */ //#include "divsufsort_private.h" -#include "/home/z/zd4/SA/libdivsufsort/include/divsufsort_private.h" +#include "../include/divsufsort_private.h" /*- Private Functions -*/ diff --git a/thirdparty/SA/libdivsufsort/lib/utils.c b/thirdparty/SA/libdivsufsort/lib/utils.c index 697dbef10f..681d541596 100644 --- a/thirdparty/SA/libdivsufsort/lib/utils.c +++ b/thirdparty/SA/libdivsufsort/lib/utils.c @@ -25,7 +25,7 @@ */ //#include "divsufsort_private.h" -#include "/home/z/zd4/SA/libdivsufsort/include/divsufsort_private.h" +#include "../include/divsufsort_private.h" /*- Private Function -*/ From 9702d46e075fdab2f9cd468092bfa002f5a66283 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 20 Dec 2020 13:57:26 -0500 Subject: [PATCH 15/68] include sa.py into run_benchmarks.py --- benchmarks/sa.py | 49 +++++++++++++++++++++++++++++------------------- pytest.ini | 1 + 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/benchmarks/sa.py b/benchmarks/sa.py index 41cc3ada92..02750554f5 100755 --- a/benchmarks/sa.py +++ b/benchmarks/sa.py @@ -6,37 +6,38 @@ TYPES = ('int64', 'float64', 'bool', 'str') -def time_ak_sa( vsize, trials, dtype): +def time_ak_sa( vsize,strlen, trials, dtype): print(">>> arkouda suffix array") cfg = ak.get_config() Nv = vsize * cfg["numLocales"] print("numLocales = {}, num of strings = {:,}".format(cfg["numLocales"], Nv)) -# v = ak.random_strings_uniform(90000000, 100000000, Nv) - v = ak.random_strings_uniform(1, 16, Nv) + v = ak.random_strings_uniform(1, strlen, Nv) c=ak.suffix_array(v) - print("size of suffix array={}".format(c.bytes.size)) +# print("size of suffix array={}".format(c.bytes.size)) +# print("offset/number of suffix array={}".format(c.offsets.size)) +# print("itemsize of suffix array={}".format(c.offsets.itemsize)) # print("All the random strings are as follows") - for k in range(vsize): - print("the {} th random tring ={}".format(k,v[k])) - print("the {} th suffix array ={}".format(k,c[k])) - print("") -# print(v) +# for k in range(vsize): +# print("the {} th random tring ={}".format(k,v[k])) +# print("the {} th suffix array ={}".format(k,c[k])) +# print("") timings = [] for _ in range(trials): start = time.time() - ak.suffix_array(v) + c=ak.suffix_array(v) end = time.time() timings.append(end - start) tavg = sum(timings) / trials print("Average time = {:.4f} sec".format(tavg)) if dtype == 'str': - offsets_transferred = 3 * c.offsets.size * c.offsets.itemsize - bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size) + offsets_transferred = 0 * c.offsets.size * c.offsets.itemsize +# bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size) + bytes_transferred = (c.bytes.size * c.offsets.itemsize) + (0 * c.bytes.size) bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg else: - bytes_per_sec = (c.size * c.itemsize * 3) / tavg -# print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) + print("Wrong data type") + print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) def time_np_sa(Ni, Nv, trials, dtype, random): print("to be done") @@ -45,26 +46,36 @@ def check_correctness(dtype, random): print("to be done") def create_parser(): - parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= V") + parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= suffix_array(V)") parser.add_argument('hostname', help='Hostname of arkouda server') parser.add_argument('port', type=int, help='Port of arkouda server') - parser.add_argument('-v', '--value-size', type=int, help='Length of array from which values are gathered') + parser.add_argument('-n', '--size', type=int, default=10**4, help='Problem size: length of strings') + parser.add_argument('-v', '--number', type=int, default=10,help='Number of strings') parser.add_argument('-t', '--trials', type=int, default=6, help='Number of times to run the benchmark') parser.add_argument('-d', '--dtype', default='str', help='Dtype of value array ({})'.format(', '.join(TYPES))) +# parser.add_argument('--numpy', default=False, action='store_true', help='Run the same operation in NumPy to compare performance.') +# parser.add_argument('--correctness-only', default=False, action='store_true', help='Only check correctness, not performance.') return parser + + + return parser + + + + if __name__ == "__main__": import sys parser = create_parser() args = parser.parse_args() - args.value_size = args.size if args.value_size is None else args.value_size if args.dtype not in TYPES: raise ValueError("Dtype must be {}, not {}".format('/'.join(TYPES), args.dtype)) ak.verbose = False ak.connect(args.hostname, args.port) - print("size of values array = {:,}".format(args.value_size)) + print("length of strings = {:,}".format(args.size)) + print("number of strings = {:,}".format(args.number)) print("number of trials = ", args.trials) - time_ak_sa( args.value_size, args.trials, args.dtype) + time_ak_sa( args.number, args.size, args.trials, args.dtype) sys.exit(0) diff --git a/pytest.ini b/pytest.ini index dfa3e99a40..e9ea5428a4 100644 --- a/pytest.ini +++ b/pytest.ini @@ -20,6 +20,7 @@ testpaths = tests/string_test.py tests/where_test.py tests/extrema_test.py + tests/suffixarray_test.py norecursedirs = .git dist build *egg* tests/deprecated/* python_functions = test* env = From 7a0b1978f85f85f098fe744957daaea5378e71df Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 20 Dec 2020 14:44:20 -0500 Subject: [PATCH 16/68] suffix_arry_file updated --- arkouda/pdarraycreation.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 030a7e9e0f..c49f662191 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -10,7 +10,7 @@ from arkouda.pdarrayclass import pdarray, create_pdarray from arkouda.strings import Strings from arkouda.strings import SArrays -from multipledispatch import dispatch +#from multipledispatch import dispatch __all__ = ["array", "zeros", "ones", "zeros_like", "ones_like", "arange", "linspace", "randint", "uniform", "standard_normal", @@ -785,8 +785,7 @@ def random_strings_lognormal(logmean : Union[float, int], logstd : Union[float, -#@typechecked -@dispatch(Strings) +@typechecked def suffix_array( strings : Strings) -> SArrays: """ Return the suffix arrays of given strings. The size/shape of each suffix @@ -833,8 +832,8 @@ def suffix_array( strings : Strings) -> SArrays: repMsg = generic_msg(msg) pdarrays= SArrays(*(repMsg.split('+'))) return pdarrays -@dispatch(str) -def suffix_array(filename: str) -> SArrays: +@typechecked +def suffix_array_file(filename: str) -> SArrays: """ This function is major used for testing correctness and performance Return the suffix array of given file name's content as a string. From e90ca27a57a8f7f288134d446effd99cf84406fe Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 20 Dec 2020 14:56:33 -0500 Subject: [PATCH 17/68] remove tab in MultiTypeSymEntry.chpl --- src/MultiTypeSymEntry.chpl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/MultiTypeSymEntry.chpl b/src/MultiTypeSymEntry.chpl index a5a49da381..a8a32c4214 100644 --- a/src/MultiTypeSymEntry.chpl +++ b/src/MultiTypeSymEntry.chpl @@ -78,8 +78,8 @@ module MultiTypeSymEntry :arg etype: type to be instantiated :type etype: type */ - var EnhancedInfo:string; - /* this entry is used to described the LCP (longest common prefix) array + var EnhancedInfo:string; + /* this entry is used to described the LCP (longest common prefix) array of suffix array or any other information closely related to this entry */ proc init(len: int, type etype) { From 2db17d80a582a12ae2394da20b2b669ed98a2cca Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 20 Dec 2020 15:47:18 -0500 Subject: [PATCH 18/68] datatype in string.py --- arkouda/pdarraycreation.py | 12 +++++++----- arkouda/strings.py | 8 ++++---- src/MultiTypeSymEntry.chpl | 2 +- src/SegmentedMsg.chpl | 8 ++++---- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index c49f662191..5ae0e20912 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -609,7 +609,7 @@ def randint(low : Union[int,float], high : Union[int,float], size : int, dtype=i sizestr = NUMBER_FORMAT_STRINGS['int64'].format(size) repMsg = generic_msg("randint {} {} {} {} {}".\ format(sizestr, dtype.name, lowstr, highstr, seed)) - return create_pdarray(repMsg) + return create_pdarray(cast(str,repMsg)) @typechecked def uniform(size : int, low : float=0.0, high : float=1.0, seed: Union[None, int]=None) -> pdarray: @@ -830,8 +830,9 @@ def suffix_array( strings : Strings) -> SArrays: strings.offsets.name, strings.bytes.name) repMsg = generic_msg(msg) - pdarrays= SArrays(*(repMsg.split('+'))) - return pdarrays + return SArrays(*(cast(str,repMsg).split('+'))) +# pdarrays= SArrays(*(repMsg.split('+'))) + @typechecked def suffix_array_file(filename: str) -> SArrays: """ @@ -875,5 +876,6 @@ def suffix_array_file(filename: str) -> SArrays: """ msg = "segmentedSAFile {}".format( filename ) repMsg = generic_msg(msg) - pdarrays= SArrays(*(repMsg.split('+'))) - return pdarrays + return SArrays(*(cast(str,repMsg).split('+'))) +# pdarrays= SArrays(*(repMsg.split('+'))) +# return pdarrays diff --git a/arkouda/strings.py b/arkouda/strings.py index efcb7c46f6..1e83680a05 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -8,6 +8,7 @@ from arkouda.logger import getArkoudaLogger import numpy as np # type: ignore from arkouda.dtypes import str as akstr +from arkouda.dtypes import int64 as akint from arkouda.dtypes import NUMBER_FORMAT_STRINGS, resolve_scalar_dtype, \ translate_np_dtype import json @@ -863,9 +864,8 @@ def __init__(self, offset_attrib : Union[pdarray,np.ndarray], self.shape = self.offsets.shape except Exception as e: raise ValueError(e) -# maybe we need to change the dtype into int later - self.dtype = np.str - self.logger = getArkoudaLogger(name=__class__.__name__) + self.dtype = akint + self.logger = getArkoudaLogger(name=__class__.__name__) # type: ignore def __iter__(self): raise NotImplementedError('SArrays does not support iteration now') @@ -942,7 +942,7 @@ def _binop(self, other : SArrays, op : str) -> pdarray: def __eq__(self, other) -> bool: return self._binop(other, "==") - def __ne__(self, other : SArrays) -> bool: + def __ne__(self, other) -> bool: return self._binop(other, "!=") def __getitem__(self, key): diff --git a/src/MultiTypeSymEntry.chpl b/src/MultiTypeSymEntry.chpl index a8a32c4214..5af8af1152 100644 --- a/src/MultiTypeSymEntry.chpl +++ b/src/MultiTypeSymEntry.chpl @@ -78,7 +78,7 @@ module MultiTypeSymEntry :arg etype: type to be instantiated :type etype: type */ - var EnhancedInfo:string; + var enhancedInfo:string; /* this entry is used to described the LCP (longest common prefix) array of suffix array or any other information closely related to this entry */ diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 692dd8473f..edfdbeb246 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -758,8 +758,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var segEntry = new shared SymEntry(sasoff); var valEntry = new shared SymEntry(sasval); var lcpvalEntry = new shared SymEntry(lcpval); - valEntry.EnhancedInfo=lcpvalName; - lcpvalEntry.EnhancedInfo=valName2; + valEntry.enhancedInfo=lcpvalName; + lcpvalEntry.enhancedInfo=valName2; st.addEntry(segName2, segEntry); st.addEntry(valName2, valEntry); @@ -858,8 +858,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var segEntry = new shared SymEntry(sasoff); var valEntry = new shared SymEntry(sasval); var lcpvalEntry = new shared SymEntry(lcpval); - valEntry.EnhancedInfo=lcpvalName; - lcpvalEntry.EnhancedInfo=valName2; + valEntry.enhancedInfo=lcpvalName; + lcpvalEntry.enhancedInfo=valName2; st.addEntry(segName2, segEntry); st.addEntry(valName2, valEntry); From da30cfa2738f53428187f9e4ea8deb8d0c5f310d Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 20 Dec 2020 17:15:09 -0500 Subject: [PATCH 19/68] remove suffixarray_test.py --- arkouda/strings.py | 2 +- pytest.ini | 1 - tests/suffixarray_test.py | 448 -------------------------------------- 3 files changed, 1 insertion(+), 450 deletions(-) delete mode 100644 tests/suffixarray_test.py diff --git a/arkouda/strings.py b/arkouda/strings.py index 1e83680a05..7d403d9395 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -1072,7 +1072,7 @@ def unregister(self) -> None: @staticmethod def attach(user_defined_name : str) -> 'SArrays': - return Strings(pdarray.attach(user_defined_name+'_offsets'), + return SArrays(pdarray.attach(user_defined_name+'_offsets'), pdarray.attach(user_defined_name+'_bytes')) diff --git a/pytest.ini b/pytest.ini index e9ea5428a4..dfa3e99a40 100644 --- a/pytest.ini +++ b/pytest.ini @@ -20,7 +20,6 @@ testpaths = tests/string_test.py tests/where_test.py tests/extrema_test.py - tests/suffixarray_test.py norecursedirs = .git dist build *egg* tests/deprecated/* python_functions = test* env = diff --git a/tests/suffixarray_test.py b/tests/suffixarray_test.py deleted file mode 100644 index f7c9049bac..0000000000 --- a/tests/suffixarray_test.py +++ /dev/null @@ -1,448 +0,0 @@ -import numpy as np -from collections import Counter -from context import arkouda as ak -from base_test import ArkoudaTest -import pytest -import random -import string -ak.verbose = False - -N = 100 -UNIQUE = N//2 - -# test_strings = np.array(['These are', 'some', 'interesting', -# '~!@#$%^&*()_+', 'sarrays', '8675309.', -# 'These are', 'some', 'duplicates.', -# 'hello', 'world']) - -# test_suffix array = np.array([9, 5, 0, 6, 8, 4, 2, 1, 7, 3], -# [4, 3, 2, 1, 0], [11, 3, 5, 10, 8, 0, 9, 1, 4, 6, 2, 7] -# [13, 1, 3, 4, 5, 7, 9, 10, 8, 12, 2, 6, 11, 0], -# [7, 5, 3, 4, 2, 6, 0, 1],[8, 7, 5, 4, 3, 1, 2, 0, 6], -# [9, 5, 0, 6, 8, 4, 2, 1, 7, 3],[4, 3, 2, 1, 0], -# [10, 6, 5, 0, 8, 4, 3, 2, 9, 7, 1],[5, 1, 0, 2, 3, 4] -# [5, 4, 3, 1, 2, 0] -def compare_sas(a, b): - return all(x == y for x, y in zip(a, b)) - -errors = False - -def run_test_argsort(sarrays, test_sas, cat): - akperm = ak.argsort(sarrays) - aksorted = sarrays[akperm].to_ndarray() - npsorted = np.sort(test_sas) - assert((aksorted == npsorted).all()) - catperm = ak.argsort(cat) - catsorted = cat[catperm].to_ndarray() - assert((catsorted == npsorted).all()) - -def run_test_unique(sarrays, test_sas, cat): - # unique - akuniq = ak.unique(sarrays) - catuniq = ak.unique(cat) - akset = set(akuniq.to_ndarray()) - catset = set(catuniq.to_ndarray()) - assert(akset == catset) - # There should be no duplicates - assert(akuniq.size == len(akset)) - npset = set(np.unique(test_sas)) - # When converted to a set, should agree with numpy - assert(akset == npset) - return akset - -def run_test_index(sarrays, test_sas, cat): - # int index - assert(sarrays[N//3] == test_sas[N//3]) - #assert(cat[N//3] == test_sas[N//3]) - print("int index passed") - -def run_test_slice(sarrays, test_sas, cat): - assert(compare_sas(sarrays[N//4:N//3], - test_sas[N//4:N//3])) - #assert(compare_sas(cat[N//4:N//3].to_ndarray(), - # test_sas[N//4:N//3])) - -def run_test_pdarray_index(sarrays, test_sas, cat): - inds = ak.arange(0, len(sarrays), 10) - assert(compare_sas(sarrays[inds].to_ndarray(), test_sas[inds.to_ndarray()])) - #assert(compare_sas(cat[inds].to_ndarray(), test_sas[inds.to_ndarray()])) - -def run_comparison_test(sarrays, test_sas, cat): - akinds = (sarrays == test_sas[N//4]) - #catinds = (cat == test_sas[N//4]) - npinds = (test_sas == test_sas[N//4]) - assert(np.allclose(akinds, npinds)) - -def run_test_in1d(sarrays, cat, base_words): - more_choices = ak.randint(0, UNIQUE, 100) - #akwords = base_words[more_choices] - #more_words = akwords.to_ndarray() - matches = ak.in1d(sarrays, akwords) - catmatches = ak.in1d(cat, akwords) - assert((matches == catmatches).all()) - # Every word in matches should be in the target set - for word in sarrays[matches].to_ndarray(): - assert(word in more_words) - # Exhaustively find all matches to make sure we didn't miss any - inds = ak.zeros(sarrays.size, dtype=ak.bool) - for word in more_words: - inds |= (sarrays == word) - assert((inds == matches).all()) - -def run_test_groupby(sarrays, cat, akset): - g = ak.GroupBy(sarrays) - gc = ak.GroupBy(cat) - # Unique keys should be same result as ak.unique - assert(akset == set(g.unique_keys.to_ndarray())) - assert(akset == set(gc.unique_keys.to_ndarray())) - assert((gc.permutation == g.permutation).all()) - permStrings = sarrays[g.permutation].to_ndarray() - # Check each group individually - lengths = np.diff(np.hstack((g.segments.to_ndarray(), np.array([g.size])))) - for uk, s, l in zip(g.unique_keys.to_ndarray(), - g.segments.to_ndarray(), - lengths): - # All values in group should equal key - assert((permStrings[s:s+l] == uk).all()) - # Key should not appear anywhere outside of group - assert(not (permStrings[:s] == uk).any()) - assert(not (permStrings[s+l:] == uk).any()) - - -def run_test_contains(sarrays, test_sas, delim): - found = sarrays.contains(delim).to_ndarray() - npfound = np.array([s.count(delim) > 0 for s in test_sas]) - assert((found == npfound).all()) - -def run_test_starts_with(sarrays, test_sas, delim): - found = sarrays.startswith(delim).to_ndarray() - npfound = np.array([s.startswith(delim) for s in test_sas]) - assert((found == npfound).all()) - -def run_test_ends_with(sarrays, test_sas, delim): - found = sarrays.endswith(delim).to_ndarray() - npfound = np.array([s.endswith(delim) for s in test_sas]) - assert((found == npfound).all()) - -def run_test_peel(sarrays, test_sas, delim): - import itertools as it - tf = (True, False) - def munge(triple, inc, part): - ret = [] - for h, s, t in triple: - if not part and s == '': - ret.append(('', h)) - else: - if inc: - ret.append((h + s, t)) - else: - ret.append((h, t)) - l, r = tuple(zip(*ret)) - return np.array(l), np.array(r) - - def rmunge(triple, inc, part): - ret = [] - for h, s, t in triple: - if not part and s == '': - ret.append((t, '')) - else: - if inc: - ret.append((h, s + t)) - else: - ret.append((h, t)) - l, r = tuple(zip(*ret)) - return np.array(l), np.array(r) - - def slide(triple, delim): - h, s, t = triple - h2, s2, t2 = t.partition(delim) - newh = h + s + h2 - return newh, s2, t2 - - def rslide(triple, delim): - h, s, t = triple - h2, s2, t2 = h.rpartition(delim) - newt = t2 + s + t - return h2, s2, newt - - for times, inc, part in it.product(range(1,4), tf, tf): - ls, rs = sarrays.peel(delim, times=times, includeDelimiter=inc, keepPartial=part) - triples = [s.partition(delim) for s in test_sas] - for i in range(times-1): - triples = [slide(t, delim) for t in triples] - ltest, rtest = munge(triples, inc, part) - assert((ltest == ls.to_ndarray()).all() and (rtest == rs.to_ndarray()).all()) - - for times, inc, part in it.product(range(1,4), tf, tf): - ls, rs = sarrays.rpeel(delim, times=times, includeDelimiter=inc, keepPartial=part) - triples = [s.rpartition(delim) for s in test_sas] - for i in range(times-1): - triples = [rslide(t, delim) for t in triples] - ltest, rtest = rmunge(triples, inc, part) - assert((ltest == ls.to_ndarray()).all() and (rtest == rs.to_ndarray()).all()) - -def run_test_stick(sarrays, test_sas, base_words, delim): - test_sas2 = np.random.choice(base_words.to_ndarray(), N, replace=True) - sarrays2 = ak.array(test_sas2) - stuck = sarrays.stick(sarrays2, delimiter=delim).to_ndarray() - tstuck = np.array([delim.join((a, b)) for a, b in zip(test_sas, test_sas2)]) - assert ((stuck == tstuck).all()) - assert ((sarrays + sarrays2) == sarrays.stick(sarrays2, delimiter="")).all() - - lstuck = sarrays.lstick(sarrays2, delimiter=delim).to_ndarray() - tlstuck = np.array([delim.join((b, a)) for a, b in zip(test_sas, test_sas2)]) - assert ((lstuck == tlstuck).all()) - assert ((sarrays2 + sarrays) == sarrays.lstick(sarrays2, delimiter="")).all() - -def suffixArray(s): - suffixes = [(s[i:], i) for i in range(len(s))] - suffixes.sort(key=lambda x: x[0]) - sa= [s[1] for s in suffixes] - #sa.insert(0,len(sa)) - return sa - -def get_random_string(length): - letters = string.ascii_lowercase - result_str = ''.join(random.choice(letters) for i in range(length)) - return result_str -# print("Random string of length", length, "is:", result_str) - -def ascill_to_string(ini_list): - res="" - for val in ini_list: - res = res + chr(int(val)) - return res - - -def string_to_int(sa_str): - ary=[] - for val in sa_str: - ary.append(int(val)) - return ary - -def akstrings_to_suffix_array(ak_str): - ary=[] - for val in ak_str: - x=val.split(" ",1) - y=x[1] - z=y.split(" ") - s=ascill_to_string(z) - sa=suffixArray(s) - ary.append(sa) - return ary - -def aksa_to_int_array(ak_str): - ary=[] - for val in ak_str: - x=val.split(" ",1) - y=x[1] - z=y.split(" ") - intz= [int(z[i]) for i in range(len(z))] - ary.append(intz) - return ary -if __name__ == '__main__': - import sys - if len(sys.argv) > 1: - ak.connect(server=sys.argv[1], port=sys.argv[2]) - else: - ak.connect() - - # with open(__file__, 'r') as f: - # base_words = np.array(f.read().split()) - # test_sas = np.random.choice(base_words, N, replace=True) - # sarrays = ak.array(test_sas) - # generate a Strings object - base_words1 = ak.random_strings_uniform(1, 10, UNIQUE, characters='printable') - # get the real strings - strings1 = [base_words1[i] for i in range(len(base_words1))] - # generate a Strings object - base_words2 = ak.random_strings_lognormal(2, 0.25, UNIQUE, characters='printable') - # get the real strings - strings2 = [base_words2[i] for i in range(len(base_words2))] - #Generate suffix array locally - sa_ori1=akstrings_to_suffix_array(strings1) - #Generate suffix array locally - sa_ori2=akstrings_to_suffix_array(strings2) - #Generate suffix array remotely - sa1=ak.suffix_array(base_words1) - #Generate suffix array remotely - sa2=ak.suffix_array(base_words2) - #get the suffix array from SArray object - suffixarray1=[sa1[i] for i in range(len(sa1))] - #transfer the string suffix array to real int suffix array - sa_test1=aksa_to_int_array(suffixarray1) - #get the suffix array from SArray object - suffixarray2=[sa2[i] for i in range(len(sa2))] - #transfer the string suffix array to real int suffix array - sa_test2=aksa_to_int_array(suffixarray2) - - cat=0 - # int index - run_test_index(sa_ori1, sa_test1, cat) - run_test_index(sa_ori2, sa_test2, cat) - print("int index passed") - - # slice - run_test_slice(sa_ori1, sa_test1, cat) - run_test_slice(sa_ori2, sa_test2, cat) - print("slice passed") - - # pdarray int index - #run_test_pdarray_index(sa_ori1, sa_test1, cat) - #run_test_pdarray_index(sa_ori2, sa_test2, cat) - #print("pdarray int index passed") - - # comparison - run_comparison_test(sa_ori1, sa_test1, cat) - run_comparison_test(sa_ori2, sa_test2, cat) - print("comparison passed") - - # pdarray bool index - #run_test_pdarray_index(sarrays, test_sas, cat) - #print("pdarray bool index passed") - - # in1d and iter - # more_words = np.random.choice(base_words, 100) - # akwords = ak.array(more_words) - #run_test_in1d(sa_ori1, sa_test1, cat) - #run_test_in1d(sa_ori2, sa_test2, cat) - #print("in1d and iter passed") - - # argsort - #run_test_argsort(sa_ori1, sa_test1, cat) - - # unique - #akset = run_test_unique(sarrays, test_sas, cat) - ''' - # groupby - run_test_groupby(sarrays, cat, akset) - print("groupby passed") - - # substring functions - x, w = tuple(zip(*Counter(''.join(base_words.to_ndarray())).items())) - delim = np.random.choice(x, p=(np.array(w)/sum(w))) - - # contains - run_test_contains(sarrays, test_sas, delim) - print("contains passed") - - # startswith - run_test_starts_with(sarrays, test_sas, delim) - print("startswith passed") - - # endswith - run_test_ends_with(sarrays, test_sas, delim) - print("endswith passed") - - # peel - run_test_peel(sarrays, test_sas, delim) - print("peel passed") - - # stick - run_test_stick(sarrays, test_sas, base_words, delim) - print("stick passed") - ''' -class SuffixArrayTest(ArkoudaTest): - - def setUp(self): - ArkoudaTest.setUp(self) - base_words1 = ak.random_strings_uniform(1, 10, UNIQUE, characters='printable') - base_words2 = ak.random_strings_lognormal(2, 0.25, UNIQUE, characters='printable') - base_sas1 = ak.suffix_array(base_words1) - base_sas2 = ak.suffix_array(base_words2) - ''' - gremlins = ak.array([' ', '']) - self.base_words = ak.concatenate((base_words1, base_words2, gremlins)) - self.np_base_words = np.hstack((base_words1.to_ndarray(), base_words2.to_ndarray())) - choices = ak.randint(0, self.base_words.size, N) - self.sarrays = ak.concatenate((self.base_words[choices], gremlins)) - self.test_sas = self.sarrays.to_ndarray() - self.cat = ak.Categorical(self.sarrays) - x, w = tuple(zip(*Counter(''.join(self.base_words.to_ndarray())).items())) - self.delim = np.random.choice(x, p=(np.array(w)/sum(w))) - self.akset = set(ak.unique(self.sarrays).to_ndarray()) - ''' - - def test_compare_sarrays(self): - assert compare_sarrays(self.base_words.to_ndarray(), self.np_base_words) - - def test_argsort(self): - run_test_argsort(self.sarrays, self.test_sas, self.cat) - - def test_in1d(self): - run_test_in1d(self.sarrays, self.cat, self.base_words) - - def test_unique(self): - run_test_unique(self.sarrays, self.test_sas, self.cat) - - def test_groupby(self): - run_test_groupby(self.sarrays, self.cat, self.akset) - - @pytest.mark.skip(reason="awaiting bug fix.") - def test_index(self): - run_test_index(self.sarrays, self.test_sas, self.cat) - - def test_slice(self): - run_test_slice(self.sarrays, self.test_sas, self.cat) - - def test_pdarray_index(self): - run_test_pdarray_index(self.sarrays, self.test_sas, self.cat) - - def test_contains(self): - run_test_contains(self.sarrays, self.test_sas, self.delim) - - def test_starts_with(self): - run_test_starts_with(self.sarrays, self.test_sas, self.delim) - - @pytest.mark.skip(reason="awaiting bug fix.") - def test_ends_with(self): - run_test_ends_with(self.sarrays, self.test_sas, self.delim) - - def test_error_handling(self): - sarraysOne = ak.random_sarrays_uniform(1, 10, UNIQUE, - characters='printable') - sarraysTwo = ak.random_sarrays_uniform(1, 10, UNIQUE, - characters='printable') - - with self.assertRaises(TypeError) as cm: - sarraysOne.lstick(sarraysTwo, delimiter=1) - self.assertEqual('Delimiter must be a string, not int', - cm.exception.args[0]) - - with self.assertRaises(TypeError) as cm: - sarraysOne.lstick([1], 1) - self.assertEqual('stick: not supported between String and list', - cm.exception.args[0]) - - with self.assertRaises(TypeError) as cm: - sarraysOne.startswith(1) - self.assertEqual('Substring must be a string, not int', - cm.exception.args[0]) - - with self.assertRaises(TypeError) as cm: - sarraysOne.endswith(1) - self.assertEqual('Substring must be a string, not int', - cm.exception.args[0]) - - with self.assertRaises(TypeError) as cm: - sarraysOne.contains(1) - self.assertEqual('Substring must be a string, not int', - cm.exception.args[0]) - - with self.assertRaises(TypeError) as cm: - sarraysOne.peel(1) - self.assertEqual('Delimiter must be a string, not int', - cm.exception.args[0]) - - with self.assertRaises(ValueError) as cm: - sarraysOne.peel("",-5) - self.assertEqual('Times must be >= 1', - cm.exception.args[0]) - - @pytest.mark.skip(reason="awaiting bug fix.") - def test_peel(self): - run_test_peel(self.sarrays, self.test_sas, self.delim) - - @pytest.mark.skip(reson="awaiting bug fix.") - def test_stick(self): - run_test_stick(self.sarrays, self.test_sas, self.base_words, self.delim) From 6b6e41a3868631d478cb6a78a96dca28568f2e4e Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 20 Dec 2020 18:08:04 -0500 Subject: [PATCH 20/68] update third party config --- .../include/divsufsort_private.h | 149 +++++++++--------- thirdparty/SA/libdivsufsort/lib/utils.c | 1 - 2 files changed, 76 insertions(+), 74 deletions(-) diff --git a/thirdparty/SA/libdivsufsort/include/divsufsort_private.h b/thirdparty/SA/libdivsufsort/include/divsufsort_private.h index 7e261c19d4..a6d630d78e 100644 --- a/thirdparty/SA/libdivsufsort/include/divsufsort_private.h +++ b/thirdparty/SA/libdivsufsort/include/divsufsort_private.h @@ -24,106 +24,109 @@ * OTHER DEALINGS IN THE SOFTWARE. */ +#define HAVE_CONFIG_H 1 +#undef _DIVSUFSORT_PRIVATE_H + #ifndef _DIVSUFSORT_PRIVATE_H -#define _DIVSUFSORT_PRIVATE_H 1 + #define _DIVSUFSORT_PRIVATE_H 1 -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ + #ifdef __cplusplus + extern "C" { + #endif /* __cplusplus */ -#if HAVE_CONFIG_H -# include "config.h" -#endif -#include -#include -#if HAVE_STRING_H -# include -#endif -#if HAVE_STDLIB_H -# include -#endif -#if HAVE_MEMORY_H -# include -#endif -#if HAVE_STDDEF_H -# include -#endif -#if HAVE_STRINGS_H -# include -#endif -#if HAVE_INTTYPES_H -# include -#else -# if HAVE_STDINT_H -# include -# endif + #if HAVE_CONFIG_H + # include "config.h" + #endif + #include + #include + #if HAVE_STRING_H + # include + #endif + #if HAVE_STDLIB_H + # include + #endif + #if HAVE_MEMORY_H + # include + #endif + #if HAVE_STDDEF_H + # include + #endif + #if HAVE_STRINGS_H + # include + #endif + #if HAVE_INTTYPES_H + # include + #else + # if HAVE_STDINT_H + # include + # endif #endif #if defined(BUILD_DIVSUFSORT64) -# include "divsufsort64.h" -# ifndef SAIDX_T -# define SAIDX_T -# define saidx_t saidx64_t -# endif /* SAIDX_T */ -# ifndef PRIdSAIDX_T -# define PRIdSAIDX_T PRIdSAIDX64_T -# endif /* PRIdSAIDX_T */ -# define divsufsort divsufsort64 -# define divbwt divbwt64 -# define divsufsort_version divsufsort64_version -# define bw_transform bw_transform64 -# define inverse_bw_transform inverse_bw_transform64 -# define sufcheck sufcheck64 -# define sa_search sa_search64 -# define sa_simplesearch sa_simplesearch64 -# define sssort sssort64 -# define trsort trsort64 + # include "divsufsort64.h" + # ifndef SAIDX_T + # define SAIDX_T + # define saidx_t saidx64_t + # endif /* SAIDX_T */ + # ifndef PRIdSAIDX_T + # define PRIdSAIDX_T PRIdSAIDX64_T + # endif /* PRIdSAIDX_T */ + # define divsufsort divsufsort64 + # define divbwt divbwt64 + # define divsufsort_version divsufsort64_version + # define bw_transform bw_transform64 + # define inverse_bw_transform inverse_bw_transform64 + # define sufcheck sufcheck64 + # define sa_search sa_search64 + # define sa_simplesearch sa_simplesearch64 + # define sssort sssort64 + # define trsort trsort64 #else -# include "divsufsort.h" + # include "divsufsort.h" #endif /*- Constants -*/ #if !defined(UINT8_MAX) -# define UINT8_MAX (255) + # define UINT8_MAX (255) #endif /* UINT8_MAX */ #if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1) -# undef ALPHABET_SIZE + # undef ALPHABET_SIZE #endif #if !defined(ALPHABET_SIZE) -# define ALPHABET_SIZE (UINT8_MAX + 1) + # define ALPHABET_SIZE (UINT8_MAX + 1) #endif /* for divsufsort.c */ #define BUCKET_A_SIZE (ALPHABET_SIZE) #define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE) /* for sssort.c */ #if defined(SS_INSERTIONSORT_THRESHOLD) -# if SS_INSERTIONSORT_THRESHOLD < 1 -# undef SS_INSERTIONSORT_THRESHOLD -# define SS_INSERTIONSORT_THRESHOLD (1) -# endif -#else -# define SS_INSERTIONSORT_THRESHOLD (8) + # if SS_INSERTIONSORT_THRESHOLD < 1 + # undef SS_INSERTIONSORT_THRESHOLD + # define SS_INSERTIONSORT_THRESHOLD (1) + # endif + #else + # define SS_INSERTIONSORT_THRESHOLD (8) #endif #if defined(SS_BLOCKSIZE) -# if SS_BLOCKSIZE < 0 -# undef SS_BLOCKSIZE -# define SS_BLOCKSIZE (0) -# elif 32768 <= SS_BLOCKSIZE -# undef SS_BLOCKSIZE -# define SS_BLOCKSIZE (32767) -# endif + # if SS_BLOCKSIZE < 0 + # undef SS_BLOCKSIZE + # define SS_BLOCKSIZE (0) + # elif 32768 <= SS_BLOCKSIZE + # undef SS_BLOCKSIZE + # define SS_BLOCKSIZE (32767) + # endif #else -# define SS_BLOCKSIZE (1024) + # define SS_BLOCKSIZE (1024) #endif /* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */ #if SS_BLOCKSIZE == 0 -# if defined(BUILD_DIVSUFSORT64) -# define SS_MISORT_STACKSIZE (96) -# else -# define SS_MISORT_STACKSIZE (64) -# endif -#elif SS_BLOCKSIZE <= 4096 -# define SS_MISORT_STACKSIZE (16) + # if defined(BUILD_DIVSUFSORT64) + # define SS_MISORT_STACKSIZE (96) + # else + # define SS_MISORT_STACKSIZE (64) + # endif + #elif SS_BLOCKSIZE <= 4096 + # define SS_MISORT_STACKSIZE (16) #else # define SS_MISORT_STACKSIZE (24) #endif diff --git a/thirdparty/SA/libdivsufsort/lib/utils.c b/thirdparty/SA/libdivsufsort/lib/utils.c index 681d541596..75f1f71fab 100644 --- a/thirdparty/SA/libdivsufsort/lib/utils.c +++ b/thirdparty/SA/libdivsufsort/lib/utils.c @@ -27,7 +27,6 @@ //#include "divsufsort_private.h" #include "../include/divsufsort_private.h" - /*- Private Function -*/ /* Binary search for inverse bwt. */ From 38818a359a42cc036bddc2d7abee6f85f47298d9 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Mon, 21 Dec 2020 09:44:13 -0500 Subject: [PATCH 21/68] follow suggestions from community --- arkouda/pdarrayclass.py | 19 ++++++++++++++----- arkouda/strings.py | 4 ++-- benchmarks/sa.py | 12 +++++------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/arkouda/pdarrayclass.py b/arkouda/pdarrayclass.py index 1d62407dce..8188a65b1b 100755 --- a/arkouda/pdarrayclass.py +++ b/arkouda/pdarrayclass.py @@ -69,7 +69,7 @@ def unescape(s): @typechecked -def parse_single_int_array_value(msg : str) -> object: +def _parse_single_int_array_value(msg : str) -> object: """ Attempt to convert a scalar return value from the arkouda server to a numpy string in Python. The user should not call this function directly. @@ -84,10 +84,19 @@ def parse_single_int_array_value(msg : str) -> object: object numpy scalar """ fields = msg.split(" ",1) - dtname=fields[0] -# mydtype = dtype(dtname) - nfields = fields[1].split("\"") - return nfields[1] + dtname, value = msg.split(maxsplit=1) + mydtype = dtype(dtname) + try: + if mydtype == akint64: + nfields = value.split("\"") + return nfields[1] + else: + raise ValueError(("not correct int data type from server {} {}".\ + format(mydtype.name, value))) + except: + raise ValueError(("unsupported value from server {} {}".\ + format(mydtype.name, value))) + # class for the pdarray class pdarray: diff --git a/arkouda/strings.py b/arkouda/strings.py index 7d403d9395..8dc9a73d4d 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -2,7 +2,7 @@ from typing import cast, Tuple, Union from typeguard import typechecked from arkouda.client import generic_msg, pdarrayIterThresh -from arkouda.pdarrayclass import pdarray, create_pdarray, parse_single_value,parse_single_int_array_value +from arkouda.pdarrayclass import pdarray, create_pdarray, parse_single_value,_parse_single_int_array_value from arkouda.dtypes import * from arkouda.dtypes import NUMBER_FORMAT_STRINGS from arkouda.logger import getArkoudaLogger @@ -959,7 +959,7 @@ def __getitem__(self, key): key) repMsg = generic_msg(msg) _, value = repMsg.split(maxsplit=1) - return parse_single_int_array_value(value) + return _parse_single_int_array_value(value) else: raise IndexError("[int] {} is out of bounds with size {}".\ format(orig_key,self.size)) diff --git a/benchmarks/sa.py b/benchmarks/sa.py index 02750554f5..a525bd886d 100755 --- a/benchmarks/sa.py +++ b/benchmarks/sa.py @@ -11,7 +11,11 @@ def time_ak_sa( vsize,strlen, trials, dtype): cfg = ak.get_config() Nv = vsize * cfg["numLocales"] print("numLocales = {}, num of strings = {:,}".format(cfg["numLocales"], Nv)) - v = ak.random_strings_uniform(1, strlen, Nv) + + if dtype == 'str': + v = ak.random_strings_uniform(1, strlen, Nv) + else: + print("Wrong data type") c=ak.suffix_array(v) # print("size of suffix array={}".format(c.bytes.size)) # print("offset/number of suffix array={}".format(c.offsets.size)) @@ -32,7 +36,6 @@ def time_ak_sa( vsize,strlen, trials, dtype): print("Average time = {:.4f} sec".format(tavg)) if dtype == 'str': offsets_transferred = 0 * c.offsets.size * c.offsets.itemsize -# bytes_transferred = (c.offsets.size * c.offsets.itemsize) + (2 * c.bytes.size) bytes_transferred = (c.bytes.size * c.offsets.itemsize) + (0 * c.bytes.size) bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg else: @@ -58,11 +61,6 @@ def create_parser(): return parser - return parser - - - - if __name__ == "__main__": import sys From a6c536ee4308b893e62a148a8c05c6e42a14411a Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Tue, 22 Dec 2020 11:18:51 -0500 Subject: [PATCH 22/68] update the SegSArray --- src/SegmentedArray.chpl | 62 ++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl index aecbc0b72d..0455b8d567 100644 --- a/src/SegmentedArray.chpl +++ b/src/SegmentedArray.chpl @@ -1076,18 +1076,32 @@ module SegmentedArray { var t = new Timer(); if useHash { // Hash all strings - if v { writeln("Hashing strings"); stdout.flush(); t.start(); } + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), "Hashing strings"); + if v { t.start(); } var hashes = this.hash(); - if v { t.stop(); writeln("hashing took %t seconds\nSorting hashes".format(t.elapsed())); stdout.flush(); t.clear(); t.start(); } + + if v { + t.stop(); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "hashing took %t seconds\nSorting hashes".format(t.elapsed())); + t.clear(); t.start(); + } + // Return the permutation that sorts the hashes var iv = radixSortLSD_ranks(hashes); - if v { t.stop(); writeln("sorting took %t seconds".format(t.elapsed())); stdout.flush(); } - if DEBUG { + if v { + t.stop(); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "sorting took %t seconds".format(t.elapsed())); + } + if v{ var sortedHashes = [i in iv] hashes[i]; - var diffs = sortedHashes[(iv.domain.low+1)..#(iv.size-1)] - sortedHashes[(iv.domain.low)..#(iv.size-1)]; + var diffs = sortedHashes[(iv.domain.low+1)..#(iv.size-1)] - + sortedHashes[(iv.domain.low)..#(iv.size-1)]; printAry("diffs = ", diffs); var nonDecreasing = [(d0,d1) in diffs] ((d0 > 0) || ((d0 == 0) && (d1 >= 0))); - writeln("Are hashes sorted? ", && reduce nonDecreasing); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "Are hashes sorted? %i".format(&& reduce nonDecreasing)); } return iv; } else { @@ -1136,15 +1150,32 @@ module SegmentedArray { return hits; } var t = new Timer(); - if DEBUG {writeln("Checking bytes of substr"); stdout.flush(); t.start();} + + if v { + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "Checking bytes of substr"); + t.start(); + } const truth = findSubstringInBytes(substr); const D = truth.domain; - if DEBUG {t.stop(); writeln("took %t seconds\nTranslating to segments...".format(t.elapsed())); stdout.flush(); t.clear(); t.start();} + if v { + t.stop(); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "took %t seconds\nTranslating to segments...".format(t.elapsed())); + t.clear(); + t.start(); + } // Need to ignore segment(s) at the end of the array that are too short to contain substr const tail = + reduce (offsets.a > D.high); // oD is the right-truncated domain representing segments that are candidates for containing substr var oD: subdomain(offsets.aD) = offsets.aD[offsets.aD.low..#(offsets.size - tail)]; - if DEBUG {t.stop(); writeln("took %t seconds\ndetermining answer...".format(t.elapsed())); stdout.flush(); t.clear(); t.start();} + if v { + t.stop(); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "took %t seconds\ndetermining answer...".format(t.elapsed())); + t.clear(); + t.start(); + } ref oa = offsets.a; if mode == SearchMode.contains { // Determine whether each segment contains a hit @@ -1162,7 +1193,11 @@ module SegmentedArray { hits[oD.interior(-(oD.size-1))] = truth[oa[oD.interior(oD.size-1)] - substr.numBytes - 1]; hits[oD.high] = truth[D.high]; } - if DEBUG {t.stop(); writeln("took %t seconds".format(t.elapsed())); stdout.flush();} + if v { + t.stop(); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "took %t seconds".format(t.elapsed())); + } return hits; } @@ -1374,9 +1409,10 @@ module SegmentedArray { const ref D = offsets.aD; const ref va = values.a; if checkSorted && isSorted() { - if DEBUG { writeln("argsort called on already sorted array"); stdout.flush(); } - var ranks: [D] int = [i in D] i; - return ranks; + saLogger.warn(getModuleName(),getRoutineName(),getLineNumber(), + "argsort called on already sorted array"); + var ranks: [D] int = [i in D] i; + return ranks; } var ranks = twoPhaseStringSort(this); return ranks; From 16cca77d483fb59ffc368ce76274adb4d5d6a1e0 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Wed, 23 Dec 2020 21:49:28 -0500 Subject: [PATCH 23/68] remove unused import --- arkouda/pdarraycreation.py | 11 +++-------- arkouda/strings.py | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 5ae0e20912..5ea5c65c83 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -8,14 +8,12 @@ from arkouda.dtypes import structDtypeCodes, NUMBER_FORMAT_STRINGS from arkouda.dtypes import dtype as akdtype from arkouda.pdarrayclass import pdarray, create_pdarray -from arkouda.strings import Strings -from arkouda.strings import SArrays -#from multipledispatch import dispatch +from arkouda.strings import Strings, SArrays __all__ = ["array", "zeros", "ones", "zeros_like", "ones_like", "arange", "linspace", "randint", "uniform", "standard_normal", "random_strings_uniform", "random_strings_lognormal", "from_series", - "suffix_array"] + "suffix_array","suffix_array_file"] numericDTypes = frozenset(["bool", "int64", "float64"]) @@ -786,7 +784,7 @@ def random_strings_lognormal(logmean : Union[float, int], logstd : Union[float, @typechecked -def suffix_array( strings : Strings) -> SArrays: +def suffix_array(strings : Strings) -> SArrays: """ Return the suffix arrays of given strings. The size/shape of each suffix arrays is the same as the corresponding strings. @@ -831,7 +829,6 @@ def suffix_array( strings : Strings) -> SArrays: strings.bytes.name) repMsg = generic_msg(msg) return SArrays(*(cast(str,repMsg).split('+'))) -# pdarrays= SArrays(*(repMsg.split('+'))) @typechecked def suffix_array_file(filename: str) -> SArrays: @@ -877,5 +874,3 @@ def suffix_array_file(filename: str) -> SArrays: msg = "segmentedSAFile {}".format( filename ) repMsg = generic_msg(msg) return SArrays(*(cast(str,repMsg).split('+'))) -# pdarrays= SArrays(*(repMsg.split('+'))) -# return pdarrays diff --git a/arkouda/strings.py b/arkouda/strings.py index d6dd0e473b..47ae738633 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -1,6 +1,19 @@ from __future__ import annotations from typing import cast, Tuple, Union from typeguard import typechecked +from arkouda.client import generic_msg +from arkouda.pdarrayclass import pdarray, create_pdarray, parse_single_value,_parse_single_int_array_value +from arkouda.logger import getArkoudaLogger +import numpy as np # type: ignore +from arkouda.dtypes import str as akstr +from arkouda.dtypes import int64 as akint +from arkouda.dtypes import NUMBER_FORMAT_STRINGS, resolve_scalar_dtype, \ + translate_np_dtype +import json +''' +from __future__ import annotations +from typing import cast, Tuple, Union +from typeguard import typechecked from arkouda.client import generic_msg, pdarrayIterThresh from arkouda.pdarrayclass import pdarray, create_pdarray, parse_single_value,_parse_single_int_array_value from arkouda.dtypes import * @@ -15,7 +28,7 @@ global verbose global pdarrayIterThresh - +''' __all__ = ['Strings','SArrays'] class Strings: From 9d6056399937133c15245d86ed4d695e936ed8cd Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Wed, 23 Dec 2020 22:20:13 -0500 Subject: [PATCH 24/68] align with strings function --- arkouda/strings.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arkouda/strings.py b/arkouda/strings.py index 47ae738633..87ebbc37ba 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -887,6 +887,7 @@ def __len__(self) -> int: return self.shape[0] def __str__(self) -> str: + from arkouda.client import pdarrayIterThres if self.size <= pdarrayIterThresh: vals = ["'{}'".format(self[i]) for i in range(self.size)] else: @@ -950,13 +951,13 @@ def _binop(self, other : SArrays, op : str) -> pdarray: raise ValueError("SArrays: {} not supported between SArrays and {}"\ .format(op, other.__class__.__name__)) repMsg = generic_msg(msg) - return create_pdarray(repMsg) + return create_pdarray(cast(str,repMsg)) def __eq__(self, other) -> bool: return self._binop(other, "==") def __ne__(self, other) -> bool: - return self._binop(other, "!=") + return self._binop(cast(Strings, other), "!=") def __getitem__(self, key): if np.isscalar(key) and resolve_scalar_dtype(key) == 'int64': @@ -1023,7 +1024,7 @@ def get_lengths(self) -> pdarray: msg = "segmentLengths {} {} {}".\ format(self.objtype, self.offsets.name, self.bytes.name) repMsg = generic_msg(msg) - return create_pdarray(repMsg) + return create_pdarray(cast(str,repMsg)) # def __add__(self, other : SArrays) -> SArrays: # return self.stick(other) From 59174ac1b21bbb116691da399297ef1df3b6cb42 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Wed, 23 Dec 2020 22:47:58 -0500 Subject: [PATCH 25/68] correct a typo --- arkouda/strings.py | 2 +- tests/string_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arkouda/strings.py b/arkouda/strings.py index 87ebbc37ba..c5b2dc59ff 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -887,7 +887,7 @@ def __len__(self) -> int: return self.shape[0] def __str__(self) -> str: - from arkouda.client import pdarrayIterThres + from arkouda.client import pdarrayIterThresh if self.size <= pdarrayIterThresh: vals = ["'{}'".format(self[i]) for i in range(self.size)] else: diff --git a/tests/string_test.py b/tests/string_test.py index f10c337c85..700f7a7d95 100644 --- a/tests/string_test.py +++ b/tests/string_test.py @@ -450,4 +450,4 @@ def test_str_output(self): strings = ak.array(['string {}'.format(i) for i in range (0,101)]) print(str(strings)) self.assertEqual("['string 0', 'string 1', 'string 2', ... , 'string 98', 'string 99', 'string 100']", - str(strings)) \ No newline at end of file + str(strings)) From 0cda91dfafaf7c43f5a5629e601e81c6c53d9267 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Wed, 23 Dec 2020 23:31:10 -0500 Subject: [PATCH 26/68] type match --- arkouda/strings.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arkouda/strings.py b/arkouda/strings.py index c5b2dc59ff..d8a1d3e212 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -899,7 +899,7 @@ def __str__(self) -> str: def __repr__(self) -> str: return "array({})".format(self.__str__()) - def _binop(self, other : SArrays, op : str) -> pdarray: + def _binop(self, other : Union[SArrays,np.int_], op : str) -> pdarray: """ Executes the requested binop on this SArrays instance and the parameter SArrays object and returns the results within @@ -954,10 +954,11 @@ def _binop(self, other : SArrays, op : str) -> pdarray: return create_pdarray(cast(str,repMsg)) def __eq__(self, other) -> bool: - return self._binop(other, "==") +# return self._binop(other, "==") + return self._binop(cast(SArrays, other), "==") def __ne__(self, other) -> bool: - return self._binop(cast(Strings, other), "!=") + return self._binop(cast(SArrays, other), "!=") def __getitem__(self, key): if np.isscalar(key) and resolve_scalar_dtype(key) == 'int64': From e1c31736f6e37d34fc9f789dc9a869f1c83ab966 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Thu, 24 Dec 2020 00:01:20 -0500 Subject: [PATCH 27/68] data type --- arkouda/strings.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arkouda/strings.py b/arkouda/strings.py index d8a1d3e212..c9432f9030 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -953,11 +953,13 @@ def _binop(self, other : Union[SArrays,np.int_], op : str) -> pdarray: repMsg = generic_msg(msg) return create_pdarray(cast(str,repMsg)) - def __eq__(self, other) -> bool: +# def __eq__(self, other) -> bool: + def __eq__(self, other) -> pdarray: # return self._binop(other, "==") return self._binop(cast(SArrays, other), "==") - def __ne__(self, other) -> bool: +# def __ne__(self, other) -> bool: + def __ne__(self, other) -> pdarray: return self._binop(cast(SArrays, other), "!=") def __getitem__(self, key): From 3861e622572483be7ae76160ba34fb4e5491ae76 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Thu, 24 Dec 2020 00:07:05 -0500 Subject: [PATCH 28/68] bool or pdarray --- arkouda/strings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arkouda/strings.py b/arkouda/strings.py index c9432f9030..5cf8c2bfec 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -955,11 +955,11 @@ def _binop(self, other : Union[SArrays,np.int_], op : str) -> pdarray: # def __eq__(self, other) -> bool: def __eq__(self, other) -> pdarray: -# return self._binop(other, "==") - return self._binop(cast(SArrays, other), "==") + return self._binop(other, "==") +# return self._binop(cast(SArrays, other), "==") -# def __ne__(self, other) -> bool: - def __ne__(self, other) -> pdarray: + def __ne__(self, other) -> bool: +# def __ne__(self, other) -> pdarray: return self._binop(cast(SArrays, other), "!=") def __getitem__(self, key): From edc3f63a2d57a0ccd4e7bb0ae288f4b95aa69f85 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Thu, 24 Dec 2020 00:16:22 -0500 Subject: [PATCH 29/68] remove binary op --- arkouda/strings.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arkouda/strings.py b/arkouda/strings.py index 5cf8c2bfec..b586b06934 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -899,6 +899,7 @@ def __str__(self) -> str: def __repr__(self) -> str: return "array({})".format(self.__str__()) + ''' def _binop(self, other : Union[SArrays,np.int_], op : str) -> pdarray: """ Executes the requested binop on this SArrays instance and the @@ -953,14 +954,15 @@ def _binop(self, other : Union[SArrays,np.int_], op : str) -> pdarray: repMsg = generic_msg(msg) return create_pdarray(cast(str,repMsg)) -# def __eq__(self, other) -> bool: - def __eq__(self, other) -> pdarray: + def __eq__(self, other) -> bool: +# def __eq__(self, other) -> pdarray: return self._binop(other, "==") # return self._binop(cast(SArrays, other), "==") def __ne__(self, other) -> bool: # def __ne__(self, other) -> pdarray: return self._binop(cast(SArrays, other), "!=") + ''' def __getitem__(self, key): if np.isscalar(key) and resolve_scalar_dtype(key) == 'int64': From 6074b60ba48b50f861efcede3d3bd92d524fa142 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Thu, 24 Dec 2020 00:20:55 -0500 Subject: [PATCH 30/68] remove binary op --- arkouda/strings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arkouda/strings.py b/arkouda/strings.py index b586b06934..59ca529818 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -928,6 +928,7 @@ def _binop(self, other : Union[SArrays,np.int_], op : str) -> pdarray: Raised if a server-side error is thrown while executing the binary operation """ + ''' if op not in self.BinOps: raise ValueError("SArrays: unsupported operator: {}".format(op)) if isinstance(other, SArrays): @@ -953,7 +954,7 @@ def _binop(self, other : Union[SArrays,np.int_], op : str) -> pdarray: .format(op, other.__class__.__name__)) repMsg = generic_msg(msg) return create_pdarray(cast(str,repMsg)) - + def __eq__(self, other) -> bool: # def __eq__(self, other) -> pdarray: return self._binop(other, "==") From 4f773e2724a4ff3b0543383c6e94ab1465643a9a Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sat, 26 Dec 2020 00:38:10 -0500 Subject: [PATCH 31/68] remove the bug causing wrong return string value --- arkouda/pdarraysetops.py | 77 ++++++++++++++++ arkouda/strings.py | 71 +++++++-------- src/GenSymIO.chpl | 17 +++- src/SegmentedArray.chpl | 177 +++++++++++++++++++++++++++++++++--- src/SegmentedMsg.chpl | 190 ++++++++++++++++++++++++++++++++++++++- src/SipHash.chpl | 49 ++++++++++ src/arkouda_server.chpl | 3 + tests/string_test.py | 43 +++++++++ 8 files changed, 577 insertions(+), 50 deletions(-) diff --git a/arkouda/pdarraysetops.py b/arkouda/pdarraysetops.py index edf86e9fea..42c06f7631 100644 --- a/arkouda/pdarraysetops.py +++ b/arkouda/pdarraysetops.py @@ -161,6 +161,83 @@ def in1d(pda1 : Union[pdarray,Strings,'Categorical'], pda2 : Union[pdarray,Strin else: raise TypeError('Both pda1 and pda2 must be pdarray, Strings, or Categorical') + + +def in1d_int(pda1 : Union[pdarray,SArrays,'Categorical'], pda2 : Union[pdarray,SArrays,'Categorical'], #type: ignore + invert : bool=False) -> pdarray: #type: ignore + """ + Test whether each element of a 1-D array is also present in a second array. + + Returns a boolean array the same length as `pda1` that is True + where an element of `pda1` is in `pda2` and False otherwise. + + Parameters + ---------- + pda1 : pdarray or SArrays or Categorical + Input array. + pda2 : pdarray or SArrays or Categorical + The values against which to test each value of `pda1`. Must be the + same type as `pda1`. + invert : bool, optional + If True, the values in the returned array are inverted (that is, + False where an element of `pda1` is in `pda2` and True otherwise). + Default is False. ``ak.in1d(a, b, invert=True)`` is equivalent + to (but is faster than) ``~ak.in1d(a, b)``. + + Returns + ------- + pdarray, bool + The values `pda1[in1d]` are in `pda2`. + + Raises + ------ + TypeError + Raised if either pda1 or pda2 is not a pdarray, Strings, or + Categorical object or if invert is not a bool + RuntimeError + Raised if the dtype of either array is not supported + + See Also + -------- + unique, intersect1d, union1d + + Notes + ----- + `in1d` can be considered as an element-wise function version of the + python keyword `in`, for 1-D sequences. ``in1d(a, b)`` is logically + equivalent to ``ak.array([item in b for item in a])``, but is much + faster and scales to arbitrarily large ``a``. + + ak.in1d is not supported for bool or float64 pdarrays + + Examples + -------- + >>> ak.in1d(ak.array([-1, 0, 1]), ak.array([-2, 0, 2])) + array([False, True, False]) + + >>> ak.in1d(ak.array(['one','two']),ak.array(['two', 'three','four','five'])) + array([False, True]) + """ + from arkouda.categorical import Categorical as Categorical_ + if hasattr(pda1, 'in1d'): + return cast(Categorical_,pda1).in1d(pda2) + elif isinstance(pda1, pdarray) and isinstance(pda2, pdarray): + repMsg = generic_msg("in1d {} {} {}".\ + format(pda1.name, pda2.name, invert)) + return create_pdarray(cast(str,repMsg)) + elif isinstance(pda1, SArrays) and isinstance(pda2, SArrays): + repMsg = generic_msg("segmentedIn1dInt {} {} {} {} {} {} {}".\ + format(pda1.objtype, + pda1.offsets.name, + pda1.bytes.name, + pda2.objtype, + pda2.offsets.name, + pda2.bytes.name, + invert)) + return create_pdarray(cast(str,repMsg)) + else: + raise TypeError('Both pda1 and pda2 must be pdarray, SArrays, or Categorical') + @typechecked def concatenate(arrays : Sequence[Union[pdarray,Strings]]) -> Union[pdarray,Strings]: """ diff --git a/arkouda/strings.py b/arkouda/strings.py index 59ca529818..aaad771619 100755 --- a/arkouda/strings.py +++ b/arkouda/strings.py @@ -10,25 +10,7 @@ from arkouda.dtypes import NUMBER_FORMAT_STRINGS, resolve_scalar_dtype, \ translate_np_dtype import json -''' -from __future__ import annotations -from typing import cast, Tuple, Union -from typeguard import typechecked -from arkouda.client import generic_msg, pdarrayIterThresh -from arkouda.pdarrayclass import pdarray, create_pdarray, parse_single_value,_parse_single_int_array_value -from arkouda.dtypes import * -from arkouda.dtypes import NUMBER_FORMAT_STRINGS -from arkouda.logger import getArkoudaLogger -import numpy as np # type: ignore -from arkouda.dtypes import str as akstr -from arkouda.dtypes import int64 as akint -from arkouda.dtypes import NUMBER_FORMAT_STRINGS, resolve_scalar_dtype, \ - translate_np_dtype -import json -global verbose -global pdarrayIterThresh -''' __all__ = ['Strings','SArrays'] class Strings: @@ -93,20 +75,16 @@ def __init__(self, offset_attrib : Union[pdarray,np.ndarray], from either the offset_attrib or bytes_attrib parameter """ if isinstance(offset_attrib, pdarray): -# print("In Strings init 1 offset_attrib={}".format(offset_attrib)) self.offsets = offset_attrib else: try: -# print("In Strings init 2 offset_attrib={}".format(offset_attrib)) self.offsets = create_pdarray(offset_attrib) except Exception as e: raise RuntimeError(e) if isinstance(bytes_attrib, pdarray): -# print("In Strings init 1 bytes_attrib={}".format(bytes_attrib)) self.bytes = bytes_attrib else: try: -# print("In Strings init 1 bytes_attrib={}".format(bytes_attrib)) self.bytes = create_pdarray(bytes_attrib) except Exception as e: raise RuntimeError(e) @@ -681,7 +659,6 @@ def group(self) -> pdarray: repMsg = generic_msg(msg) return create_pdarray(cast(str,repMsg)) - def to_ndarray(self) -> np.ndarray: """ Convert the array to a np.ndarray, transferring array data from the @@ -792,7 +769,6 @@ def attach(user_defined_name : str) -> Strings: return Strings(pdarray.attach(user_defined_name+'_offsets'), pdarray.attach(user_defined_name+'_bytes')) - class SArrays: """ Represents an array of (suffix) arrays whose data resides on the arkouda server. @@ -899,7 +875,7 @@ def __str__(self) -> str: def __repr__(self) -> str: return "array({})".format(self.__str__()) - ''' + @typechecked def _binop(self, other : Union[SArrays,np.int_], op : str) -> pdarray: """ Executes the requested binop on this SArrays instance and the @@ -928,14 +904,13 @@ def _binop(self, other : Union[SArrays,np.int_], op : str) -> pdarray: Raised if a server-side error is thrown while executing the binary operation """ - ''' if op not in self.BinOps: raise ValueError("SArrays: unsupported operator: {}".format(op)) - if isinstance(other, SArrays): + if isinstance(other, Strings): if self.size != other.size: raise ValueError("SArrays: size mismatch {} {}".\ format(self.size, other.size)) - msg = "segmentedBinopvv {} {} {} {} {} {} {}".format(op, + msg = "segmentedBinopvvInt {} {} {} {} {} {} {}".format(op, self.objtype, self.offsets.name, self.bytes.name, @@ -943,7 +918,7 @@ def _binop(self, other : Union[SArrays,np.int_], op : str) -> pdarray: other.offsets.name, other.bytes.name) elif resolve_scalar_dtype(other) == 'int': - msg = "segmentedBinopvs {} {} {} {} {} {}".format(op, + msg = "segmentedBinopvsInt {} {} {} {} {} {}".format(op, self.objtype, self.offsets.name, self.bytes.name, @@ -954,16 +929,12 @@ def _binop(self, other : Union[SArrays,np.int_], op : str) -> pdarray: .format(op, other.__class__.__name__)) repMsg = generic_msg(msg) return create_pdarray(cast(str,repMsg)) - + def __eq__(self, other) -> bool: -# def __eq__(self, other) -> pdarray: return self._binop(other, "==") -# return self._binop(cast(SArrays, other), "==") def __ne__(self, other) -> bool: -# def __ne__(self, other) -> pdarray: return self._binop(cast(SArrays, other), "!=") - ''' def __getitem__(self, key): if np.isscalar(key) and resolve_scalar_dtype(key) == 'int64': @@ -1000,7 +971,7 @@ def __getitem__(self, key): kind, _ = translate_np_dtype(key.dtype) if kind not in ("bool", "int"): raise TypeError("unsupported pdarray index type {}".format(key.dtype)) - if kind == "bool" and self.size != key.size: + if kind == "int" and self.size != key.size: raise ValueError("size mismatch {} {}".format(self.size,key.size)) msg = "segmentedIndex {} {} {} {} {}".format('pdarrayIndex', self.objtype, @@ -1032,10 +1003,36 @@ def get_lengths(self) -> pdarray: repMsg = generic_msg(msg) return create_pdarray(cast(str,repMsg)) -# def __add__(self, other : SArrays) -> SArrays: -# return self.stick(other) + ''' + def __add__(self, other : SArrays) -> SArrays: + return self.stick(other) + def hash(self) -> Tuple[pdarray,pdarray]: + """ + Compute a 128-bit hash of each suffix array. + + Returns + ------- + Tuple[pdarray,pdarray] + A tuple of two int64 pdarrays. The ith hash value is the concatenation + of the ith values from each array. + + Notes + ----- + The implementation uses SipHash128, a fast and balanced hash function (used + by Python for dictionaries and sets). For realistic numbers of suffix array (up + to about 10**15), the probability of a collision between two 128-bit hash + values is negligible. + """ + msg = "segmentedHash {} {} {}".format(self.objtype, self.offsets.name, + self.bytes.name) + repMsg = generic_msg(msg) + h1, h2 = cast(str,repMsg).split('+') + return create_pdarray(cast(str,h1)), create_pdarray(cast(str,h2)) + + ''' + def save(self, prefix_path : str, dataset : str='int_array', mode : str='truncate') -> None: """ diff --git a/src/GenSymIO.chpl b/src/GenSymIO.chpl index 83d4639aaf..f04e5301ab 100644 --- a/src/GenSymIO.chpl +++ b/src/GenSymIO.chpl @@ -152,7 +152,7 @@ module GenSymIO { } /* - * Converts the JSON array to a pdarray + * Converts the JSON array to a string pdarray */ proc jsonToPdArray(json: string, size: int) throws { var f = opentmp(); @@ -167,6 +167,21 @@ module GenSymIO { return array; } + /* + * Converts the JSON array to a integer pdarray + */ + proc jsonToPdArrayInt(json: string, size: int) throws { + var f = opentmp(); + var w = f.writer(); + w.write(json); + w.close(); + var r = f.reader(start=0); + var array: [0..#size] int; + r.readf("%jt", array); + r.close(); + f.close(); + return array; + } /* * Spawns a separate Chapel process that executes and returns the * result of the h5ls command diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl index 0455b8d567..cc7cfab23e 100644 --- a/src/SegmentedArray.chpl +++ b/src/SegmentedArray.chpl @@ -141,14 +141,7 @@ module SegmentedArray { end = offsets.a[idx+1] - 1; } // Take the slice of the bytearray and "cast" it to a chpl string -// var s = interpretAsString(values.a[start..end]); - var tmp=values.a[start..end]; - var s: string; - var i:int; - s=""; - for i in tmp do { - s=s+" "+ i:string; - } + var s = interpretAsString(values.a[start..end]); return s; } @@ -372,6 +365,32 @@ module SegmentedArray { return hashes; } + /* Apply a hash function to all suffix array. This is useful for grouping + and set membership. The hash used is SipHash128.*/ + proc hashInt() throws { + // 128-bit hash values represented as 2-tuples of uint(64) + var hashes: [offsets.aD] 2*uint(64); + // Early exit for zero-length result + if (size == 0) { + return hashes; + } + ref oa = offsets.a; + ref va = values.a; + // Compute lengths of strings + var lengths = getLengths(); + // Hash each string + // TO DO: test on clause with aggregator + forall (o, l, h) in zip(oa, lengths, hashes) { + const myRange = o..#l; + h = sipHash128(va, myRange); + /* // localize the string bytes */ + /* const myBytes = va[{o..#l}]; */ + /* h = sipHash128(myBytes, hashKey); */ + /* // Perf Note: localizing string bytes is ~3x faster on IB multilocale than this: */ + /* // h = sipHash128(va[{o..#l}]); */ + } + return hashes; + } /* Return a permutation that groups the strings. Because hashing is used, this permutation will not sort the strings, but all equivalent strings will fall in one contiguous block. */ @@ -850,7 +869,7 @@ module SegmentedArray { return s; } - /* Take a slice of strings from the array. The slice must be a + /* Take a slice of indices from the array. The slice must be a Chapel range, i.e. low..high by stride, not a Python slice. Returns arrays for the segment offsets and bytes of the slice.*/ proc this(const slice: range(stridable=true)) throws { @@ -859,7 +878,8 @@ module SegmentedArray { } // Early return for zero-length result if (size == 0) || (slice.size == 0) { - return (makeDistArray(0, int), makeDistArray(0, uint(8))); +// return (makeDistArray(0, int), makeDistArray(0, uint(8))); + return (makeDistArray(0, int), makeDistArray(0, int)); } // Start of bytearray slice var start = offsets.a[slice.low]; @@ -1563,6 +1583,7 @@ module SegmentedArray { return truth; } + /* Test array of strings for membership in another array (set) of strings. Returns a boolean vector the same size as the first array. */ proc in1d(mainStr: SegString, testStr: SegString, invert=false) throws where useHash { @@ -1619,6 +1640,62 @@ module SegmentedArray { return truth; } + /* Test array of strings for membership in another array (set) of strings. Returns + a boolean vector the same size as the first array. */ + proc in1d_Int(mainSar: SegSArray, testSar: SegSArray, invert=false) throws where useHash { + var truth: [mainSar.offsets.aD] bool; + // Early exit for zero-length result + if (mainSar.size == 0) { + return truth; + } + // Hash all suffix array for fast comparison + var t = new Timer(); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),"Hashing strings"); + if v { t.start(); } + const hashes = mainSar.hash(); + if v { + t.stop(); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "%t seconds".format(t.elapsed())); + t.clear(); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "Making associative domains for test set on each locale"); + t.start(); + } + // On each locale, make an associative domain with the hashes of the second array + // parSafe=false because we are adding in serial and it's faster + var localTestHashes: [PrivateSpace] domain(2*uint(64), parSafe=false); + coforall loc in Locales { + on loc { + // Local hashes of second array + ref mySet = localTestHashes[here.id]; + mySet.requestCapacity(testSar.size); + const testHashes = testSar.hash(); + for h in testHashes { + mySet += h; + } + /* // Check membership of hashes in this locale's chunk of the array */ + /* [i in truth.localSubdomain()] truth[i] = mySet.contains(hashes[i]); */ + } + } + if v { + t.stop(); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "%t seconds".format(t.elapsed())); + t.clear(); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "Testing membership"); + t.start(); + } + [i in truth.domain] truth[i] = localTestHashes[here.id].contains(hashes[i]); + if v { + t.stop(); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "%t seconds".format(t.elapsed())); + } + return truth; + } + proc concat(s1: [] int, v1: [] uint(8), s2: [] int, v2: [] uint(8)) throws { // TO DO: extend to axis == 1 var segs = makeDistArray(s1.size + s2.size, int); @@ -1712,6 +1789,86 @@ module SegmentedArray { } } + + proc in1d_Int(mainSar: SegSArray, testSar: SegSArray, invert=false) throws where !useHash { + var truth: [mainSar.offsets.aD] bool; + // Early exit for zero-length result + if (mainSar.size == 0) { + return truth; + } + if (testSar.size <= in1dSortThreshold) { + for i in 0..#testSar.size { + truth |= (mainSar == testSar[i]); + } + return truth; + } else { + // This is inspired by numpy in1d + const (uoMain, uvMain, cMain, revIdx) = uniqueGroup(mainSar, returnInverse=true); + const (uoTest, uvTest, cTest, revTest) = uniqueGroup(testSar); + const (segs, vals) = concat(uoMain, uvMain, uoTest, uvTest); + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "Unique strings in first array: %t\nUnique strings in second array: %t\nConcat length: %t".format( + uoMain.size, uoTest.size, segs.size)); + var st = new owned SymTab(); + const ar = new owned SegSArray(segs, vals, st); + const order = ar.argsort(); + const (sortedSegs, sortedVals) = ar[order]; + const sar = new owned SegSArray(sortedSegs, sortedVals, st); + if v { + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "Sorted concatenated unique strings:"); + sar.show(10); + stdout.flush(); + } + const D = sortedSegs.domain; + // First compare lengths and only check pairs whose lengths are equal (because gathering them is expensive) + var flag: [D] bool; + const lengths = sar.getLengths(); + const ref saro = sar.offsets.a; + const ref sarv = sar.values.a; + const high = D.high; + forall (i, f, o, l) in zip(D, flag, saro, lengths) { + if (i < high) && (l == lengths[i+1]) { + const left = o..saro[i+1]-1; + var eq: bool; + if (i < high - 1) { + const right = saro[i+1]..saro[i+2]-1; + eq = (memcmp(sarv, left, sarv, right) == 0); + } else { + const ref right = saro[i+1]..sar.values.aD.high; + eq = (memcmp(sarv, left, sarv, right) == 0); + } + if eq { + f = true; + flag[i+1] = true; + } + } + } + + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "Flag pop: %t".format(+ reduce flag)); + + // Now flag contains true for both elements of duplicate pairs + if invert {flag = !flag;} + // Permute back to unique order + var ret: [D] bool; + forall (o, f) in zip(order, flag) with (var agg = newDstAggregator(bool)) { + agg.copy(ret[o], f); + } + if v { + saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), + "Ret pop: %t".format(+ reduce ret)); + } + // Broadcast back to original (pre-unique) order + var truth: [mainSar.offsets.aD] bool; + forall (t, i) in zip(truth, revIdx) with (var agg = newSrcAggregator(bool)) { + agg.copy(t, ret[i]); + } + return truth; + } + } + + /* Convert an array of raw bytes into a Chapel string. */ inline proc interpretAsString(bytearray: [?D] uint(8)): string { // Byte buffer must be local in order to make a C pointer diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 61a8e6f593..6a69beb754 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -9,7 +9,7 @@ module SegmentedMsg { use MultiTypeSymEntry; use RandArray; use IO; - use GenSymIO only jsonToPdArray; + use GenSymIO only jsonToPdArray,jsonToPdArrayInt; use SymArrayDmap; use SACA; @@ -85,6 +85,12 @@ module SegmentedMsg { // Do not include the null terminator in the length lengths.a = strings.getLengths() - 1; } + when "int" { + var sarrays = new owned SegSArray(segName, valName, st); + var lengths = st.addEntry(rname, sarrays.size, int); + // Do not include the null terminator in the length + lengths.a = sarrays.getLengths() - 1; + } otherwise { var errorMsg = notImplementedError(pn, "%s".format(objtype)); smLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg); @@ -258,6 +264,20 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } return "created " + st.attrib(name1) + "+created " + st.attrib(name2); } +/* + when "int" { + var sarrays = new owned SegSArray(segName, valName, st); + var hashes = sarrays.hash(); + var name1 = st.nextName(); + var hash1 = st.addEntry(name1, hashes.size, int); + var name2 = st.nextName(); + var hash2 = st.addEntry(name2, hashes.size, int); + forall (h, h1, h2) in zip(hashes, hash1.a, hash2.a) { + (h1,h2) = h:(int,int); + } + return "created " + st.attrib(name1) + "+created " + st.attrib(name2); + } +*/ otherwise { var errorMsg = notImplementedError(pn, objtype); smLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg); @@ -393,6 +413,33 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var newValName = st.nextName(); // Compute the slice var (newSegs, newVals) = strings[slice]; + + // Store the resulting offsets and bytes arrays + var newSegsEntry = new shared SymEntry(newSegs); + var newValsEntry = new shared SymEntry(newVals); + st.addEntry(newSegName, newSegsEntry); + st.addEntry(newValName, newValsEntry); + return "created " + st.attrib(newSegName) + " +created " + st.attrib(newValName); + } + when "int" { + // Make a temporary integer array + var sarrays = new owned SegSArray(args[1], args[2], st); + // Parse the slice parameters + var start = args[3]:int; + var stop = args[4]:int; + var stride = args[5]:int; + // Only stride-1 slices are allowed for now + if (stride != 1) { + var errorMsg = notImplementedError(pn, "stride != 1"); + smLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg); + return errorMsg; + } + // TO DO: in the future, we will force the client to handle this + var slice: range(stridable=true) = convertPythonSliceToChapel(start, stop, stride); + var newSegName = st.nextName(); + var newValName = st.nextName(); + // Compute the slice + var (newSegs, newVals) = sarrays[slice]; // Store the resulting offsets and bytes arrays var newSegsEntry = new shared SymEntry(newSegs); var newValsEntry = new shared SymEntry(newVals); @@ -455,6 +502,32 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string "("+objtype+","+dtype2str(gIV.dtype)+")");} } } + when "int" { + var sarrays = new owned SegSArray(args[1], args[2], st); + var iname = args[3]; + var gIV: borrowed GenSymEntry = st.lookup(iname); + select gIV.dtype { + when DType.Int64 { + var iv = toSymEntry(gIV, int); + var (newSegs, newVals) = sarrays[iv.a]; + var newSegsEntry = new shared SymEntry(newSegs); + var newValsEntry = new shared SymEntry(newVals); + st.addEntry(newSegName, newSegsEntry); + st.addEntry(newValName, newValsEntry); + } + when DType.Bool { + var iv = toSymEntry(gIV, bool); + var (newSegs, newVals) = sarrays[iv.a]; + var newSegsEntry = new shared SymEntry(newSegs); + var newValsEntry = new shared SymEntry(newVals); + st.addEntry(newSegName, newSegsEntry); + st.addEntry(newValName, newValsEntry); + } + otherwise {return notImplementedError(pn, + "("+objtype+","+dtype2str(gIV.dtype)+")");} + } + } + otherwise {return notImplementedError(pn, objtype);} } return "created " + st.attrib(newSegName) + "+created " + st.attrib(newValName); @@ -519,6 +592,47 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string return repMsg; } + proc segBinopvvIntMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { + var pn = Reflection.getRoutineName(); + var repMsg: string; + var (op, + // Type and attrib names of left segmented array + ltype, lsegName, lvalName, + // Type and attrib names of right segmented array + rtype, rsegName, rvalName, leftStr, jsonStr) + = payload.decode().splitMsgToTuple(9); + + // check to make sure symbols defined + st.check(lsegName); + st.check(lvalName); + st.check(rsegName); + st.check(rvalName); + + select (ltype, rtype) { + when ("int", "int") { + var lsa = new owned SegSArray(lsegName, lvalName, st); + var rsa = new owned SegString(rsegName, rvalName, st); + select op { + when "==" { + var rname = st.nextName(); + var e = st.addEntry(rname, lsa.size, bool); + e.a = (lsa == rsa); + repMsg = "created " + st.attrib(rname); + } + when "!=" { + var rname = st.nextName(); + var e = st.addEntry(rname, lsa.size, bool); + e.a = (lsa != rsa); + repMsg = "created " + st.attrib(rname); + } + otherwise {return notImplementedError(pn, ltype, op, rtype);} + } + } + otherwise {return unrecognizedTypeError(pn, "("+ltype+", "+rtype+")");} + } + return repMsg; + } + proc segBinopvsMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { var pn = Reflection.getRoutineName(); var repMsg: string; @@ -528,7 +642,6 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string // check to make sure symbols defined st.check(segName); st.check(valName); - var json = jsonToPdArray(encodedVal, 1); var value = json[json.domain.low]; var rname = st.nextName(); @@ -552,6 +665,42 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string return "created " + st.attrib(rname); } + proc segBinopvsIntMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { + var pn = Reflection.getRoutineName(); + var repMsg: string; + var (op, objtype, segName, valName, valtype, encodedVal) + = payload.decode().splitMsgToTuple(6); + + // check to make sure symbols defined + st.check(segName); + st.check(valName); + var json = jsonToPdArrayInt(encodedVal, 1); + var value = json[json.domain.low]; + var rname = st.nextName(); + select (objtype, valtype) { + when ("int", "int") { + var sarrays = new owned SegSArray(segName, valName, st); + select op { + when "==" { + var e = st.addEntry(rname, sarrays.size, bool); + var tmp=sarrays[sarrays.offsets.aD.low]:int; + e.a = (tmp == value); +// e.a = (sarrays == value); + } + when "!=" { + var e = st.addEntry(rname, sarrays.size, bool); + var tmp=sarrays[sarrays.offsets.aD.low]:int; + e.a = (tmp != value); +// e.a = (sarrays != value); + } + otherwise {return notImplementedError(pn, objtype, op, valtype);} + } + } + otherwise {return unrecognizedTypeError(pn, "("+objtype+", "+valtype+")");} + } + return "created " + st.attrib(rname); + } + proc segIn1dMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { var pn = Reflection.getRoutineName(); var repMsg: string; @@ -590,6 +739,43 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string return "created " + st.attrib(rname); } + proc segIn1dIntMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { + var pn = Reflection.getRoutineName(); + var repMsg: string; + var (mainObjtype, mainSegName, mainValName, testObjtype, testSegName, + testValName, invertStr) = payload.decode().splitMsgToTuple(7); + + // check to make sure symbols defined + st.check(mainSegName); + st.check(mainValName); + st.check(testSegName); + st.check(testValName); + + var invert: bool; + if invertStr == "True" {invert = true;} + else if invertStr == "False" {invert = false;} + else {return "Error: Invalid argument in %s: %s (expected True or False)".format(pn, invertStr);} + + var rname = st.nextName(); + select (mainObjtype, testObjtype) { + when ("int", "int") { + var mainSA = new owned SegSArray(mainSegName, mainValName, st); + var testSA = new owned SegSArray(testSegName, testValName, st); + var e = st.addEntry(rname, mainSA.size, bool); + if invert { + e.a = !in1d_Int(mainSA, testSA); + } else { + e.a = in1d_Int(mainSA, testSA); + } + } + otherwise { + var errorMsg = unrecognizedTypeError(pn, "("+mainObjtype+", "+testObjtype+")"); + smLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg); + return errorMsg; + } + } + return "created " + st.attrib(rname); + } proc segGroupMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { var pn = Reflection.getRoutineName(); var (objtype, segName, valName) = payload.decode().splitMsgToTuple(3); diff --git a/src/SipHash.chpl b/src/SipHash.chpl index a64622bd40..6ffc8819ac 100644 --- a/src/SipHash.chpl +++ b/src/SipHash.chpl @@ -58,6 +58,27 @@ module SipHash { (p[7]: uint(64) << 56)); } + private inline proc U8TO64_LE(p: [] int, D): uint(64) { + return ((p[D.low]: uint(64)) | + (p[D.low+1]: uint(64) << 8) | + (p[D.low+2]: uint(64) << 16) | + (p[D.low+3]: uint(64) << 24) | + (p[D.low+4]: uint(64) << 32) | + (p[D.low+5]: uint(64) << 40) | + (p[D.low+6]: uint(64) << 48) | + (p[D.low+7]: uint(64) << 56)); + } + + private inline proc U8TO64_LE(p: c_ptr(int)): uint(64) { + return ((p[0]: uint(64)) | + (p[1]: uint(64) << 8) | + (p[2]: uint(64) << 16) | + (p[3]: uint(64) << 24) | + (p[4]: uint(64) << 32) | + (p[5]: uint(64) << 40) | + (p[6]: uint(64) << 48) | + (p[7]: uint(64) << 56)); + } private inline proc byte_reverse(b: uint(64)): uint(64) { var c: uint(64); @@ -80,6 +101,9 @@ module SipHash { proc sipHash128(msg: [] uint(8), D): 2*uint(64) { return computeSipHashLocalized(msg, D, 16); } + proc sipHash128(msg: [] int, D): 2*uint(64) { + return computeSipHashLocalized(msg, D, 16); + } private proc computeSipHashLocalized(msg: [] uint(8), D, param outlen: int) { if contiguousIndices(msg) { @@ -107,6 +131,31 @@ module SipHash { return computeSipHash(msg, D, outlen); } + private proc computeSipHashLocalized(msg: [] int, D, param outlen: int) { + if contiguousIndices(msg) { + ref start = msg[D.low]; + if D.high < D.low { + return computeSipHash(c_ptrTo(start), 0..#0, outlen); + } + ref end = msg[D.high]; + const startLocale = start.locale.id; + const endLocale = end.locale.id; + const hereLocale = here.id; + const l = D.size; + if startLocale == endLocale { + if startLocale == hereLocale { + return computeSipHash(c_ptrTo(start), 0..#l, outlen); + } else { + var a = c_malloc(msg.eltType, l); + GET(a, startLocale, getAddr(start), l); + var h = computeSipHash(a, 0..#l, outlen); + c_free(a); + return h; + } + } + } + return computeSipHash(msg, D, outlen); + } private proc computeSipHash(msg, D, param outlen: int) { if !((outlen == 8) || (outlen == 16)) { compilerError("outlen must be 8 or 16"); diff --git a/src/arkouda_server.chpl b/src/arkouda_server.chpl index 1fc1da6a7f..524929c699 100644 --- a/src/arkouda_server.chpl +++ b/src/arkouda_server.chpl @@ -235,10 +235,13 @@ proc main() { when "segmentedIndex" {repMsg = segmentedIndexMsg(cmd, payload, st);} when "segmentedBinopvv" {repMsg = segBinopvvMsg(cmd, payload, st);} when "segmentedBinopvs" {repMsg = segBinopvsMsg(cmd, payload, st);} + when "segmentedBinopvvInt" {repMsg = segBinopvvIntMsg(cmd, payload, st);} + when "segmentedBinopvsInt" {repMsg = segBinopvsIntMsg(cmd, payload, st);} when "segmentedGroup" {repMsg = segGroupMsg(cmd, payload, st);} when "segmentedSuffixAry"{repMsg = segSuffixArrayMsg(cmd, payload, st);} when "segmentedSAFile" {repMsg = segSAFileMsg(cmd, payload, st);} when "segmentedIn1d" {repMsg = segIn1dMsg(cmd, payload, st);} + when "segmentedIn1dInt" {repMsg = segIn1dIntMsg(cmd, payload, st);} when "lshdf" {repMsg = lshdfMsg(cmd, payload, st);} when "readhdf" {repMsg = readhdfMsg(cmd, payload, st);} when "readAllHdf" {repMsg = readAllHdfMsg(cmd, payload, st);} diff --git a/tests/string_test.py b/tests/string_test.py index 700f7a7d95..aa72e4f8e4 100644 --- a/tests/string_test.py +++ b/tests/string_test.py @@ -42,6 +42,15 @@ def run_test_unique(strings, test_strings, cat): def run_test_index(strings, test_strings, cat, specificInds): # int index + print("===============in run=============================") + print("The passed strints="); + print(str(strings)) + print("The passed test_strints="); + print(str(test_strings)) + print("The passed strints[N//3]="); + print(str(strings[N//3])) + print("The passed test_strints[N//3]="); + print(str(test_strings[N//3])) assert(strings[N//3] == test_strings[N//3]) assert(cat[N//3] == test_strings[N//3]) for i in specificInds: @@ -218,10 +227,17 @@ def run_test_stick(strings, test_strings, base_words, delim): # test_strings = np.random.choice(base_words, N, replace=True) # strings = ak.array(test_strings) + print("===============main=============================") base_words1 = ak.random_strings_uniform(1, 10, UNIQUE, characters='printable') + print("base_words1=") + print(str(base_word1)) base_words2 = ak.random_strings_lognormal(2, 0.25, UNIQUE, characters='printable') + print("base_words2=") + print(str(base_word2)) gremlins = ak.array(['"', ' ', '']) base_words = ak.concatenate((base_words1, base_words2)) + print("base_words=") + print(str(base_word)) np_base_words = np.hstack((base_words1.to_ndarray(), base_words2.to_ndarray())) assert(compare_strings(base_words.to_ndarray(), np_base_words)) choices = ak.randint(0, base_words.size, N) @@ -233,6 +249,15 @@ def run_test_stick(strings, test_strings, base_words, delim): print("Generation and concatenate passed") # int index + print("") + print(str(strings)) + print("") + print(str(test_strings)) + print("") + print(str(strings[N//3])) + print("") + print(str(test_strings[N//3])) + print("") run_test_index(strings, test_strings, cat, range(-len(gremlins), 0)) print("int index passed") @@ -314,6 +339,22 @@ def setUp(self): self.gremlins_strings = ak.concatenate((base_words[choices], gremlins)) self.gremlins_test_strings = self.gremlins_strings.to_ndarray() self.gremlins_cat = ak.Categorical(self.gremlins_strings) + print("=================In Class will check===========================") + print("") + print(str(base_words1)) + print("After base_word1 ") + print("") + print(str(self.strings)) + print("After Print strings") + print(str(self.test_strings)) + print("") + print("After Print teststrings") + print(str(self.strings[N//3])) + print("") + print("After Print strings[N//3]") + print(str(self.test_strings[N//3])) + print("") + print("After Print test_strings[N//3]") def test_compare_strings(self): print('starting test_compare_Strings') @@ -342,6 +383,7 @@ def test_groupby(self): def test_index(self): print('starting test_index') + print("") run_test_index(self.strings, self.test_strings, self.cat, range(-len(self.gremlins), 0)) run_test_index(self.gremlins_strings, self.gremlins_test_strings, self.gremlins_cat, range(-len(self.gremlins), 0)) @@ -448,6 +490,7 @@ def test_stick(self): def test_str_output(self): strings = ak.array(['string {}'.format(i) for i in range (0,101)]) + print("============================================") print(str(strings)) self.assertEqual("['string 0', 'string 1', 'string 2', ... , 'string 98', 'string 99', 'string 100']", str(strings)) From 267238cb8bc0c3bdbe94a7f653472193f6e46d9d Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sat, 26 Dec 2020 00:55:43 -0500 Subject: [PATCH 32/68] import SArrays class in pdarraysetops.py --- arkouda/pdarraysetops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arkouda/pdarraysetops.py b/arkouda/pdarraysetops.py index 42c06f7631..1d73fa1098 100644 --- a/arkouda/pdarraysetops.py +++ b/arkouda/pdarraysetops.py @@ -5,7 +5,7 @@ from arkouda.pdarrayclass import pdarray, create_pdarray from arkouda.pdarraycreation import zeros_like, array from arkouda.sorting import argsort -from arkouda.strings import Strings +from arkouda.strings import Strings,SArrays from arkouda.logger import getArkoudaLogger Categorical = ForwardRef('Categorical') From 189c32e0220edfee157f7f9878dc00230a70820f Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sat, 26 Dec 2020 01:15:26 -0500 Subject: [PATCH 33/68] add an empty correctness function --- benchmarks/sa.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/benchmarks/sa.py b/benchmarks/sa.py index a525bd886d..20b4f51c91 100755 --- a/benchmarks/sa.py +++ b/benchmarks/sa.py @@ -47,6 +47,7 @@ def time_np_sa(Ni, Nv, trials, dtype, random): def check_correctness(dtype, random): print("to be done") + assert( 1==1) def create_parser(): parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= suffix_array(V)") @@ -57,7 +58,8 @@ def create_parser(): parser.add_argument('-t', '--trials', type=int, default=6, help='Number of times to run the benchmark') parser.add_argument('-d', '--dtype', default='str', help='Dtype of value array ({})'.format(', '.join(TYPES))) # parser.add_argument('--numpy', default=False, action='store_true', help='Run the same operation in NumPy to compare performance.') -# parser.add_argument('--correctness-only', default=False, action='store_true', help='Only check correctness, not performance.') + parser.add_argument('-r', '--randomize', default=False, action='store_true', help='Use random values instead of ones') + parser.add_argument('--correctness-only', default=False, action='store_true', help='Only check correctness, not performance.') return parser @@ -71,6 +73,12 @@ def create_parser(): ak.verbose = False ak.connect(args.hostname, args.port) + if args.correctness_only: + for dtype in TYPES: + check_correctness(dtype, args.randomize) + sys.exit(0) + + print("length of strings = {:,}".format(args.size)) print("number of strings = {:,}".format(args.number)) print("number of trials = ", args.trials) From 0aa835cc2813e15d4708b096830432628accf000 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sat, 26 Dec 2020 01:33:12 -0500 Subject: [PATCH 34/68] copy master gather.py --- benchmarks/gather.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/benchmarks/gather.py b/benchmarks/gather.py index ec40c6056a..ecfddcd5c4 100755 --- a/benchmarks/gather.py +++ b/benchmarks/gather.py @@ -26,26 +26,12 @@ def time_ak_gather(isize, vsize, trials, dtype, random): v = ak.random_strings_uniform(1, 16, Nv) else: v = ak.ones(Nv, dtype=dtype) - print("v={}".format(v)) - print("v.offsets={}".format(v.offsets)) - print("v.nbytes={}".format(v.nbytes)) - print("v[1]={}".format(v[1])) - print("In Gather size={}".format(v.size)) - print("In Gather nbytes={}".format(v.nbytes)) - print("In Gather ndim={}".format(v.ndim)) - print("In Gather shape={}".format(v.shape)) - print("In Gather offsets name ={}".format(v.offsets.name)) - print("In Gather offsets size={}".format(v.offsets.size)) - print("In Gather bytes name ={}".format(v.bytes.name)) - print("In Gather bytes size={}".format(v.bytes.size)) + timings = [] for _ in range(trials): - print("In Gather loop i={}".format(i)) - print("In Gather v[i]={}".format(v[i])) start = time.time() c = v[i] end = time.time() - print("In Gather loop c={}".format(c)) timings.append(end - start) tavg = sum(timings) / trials From ed98498132256bb5b80d5c6e980ea0cb91c2d289 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sat, 26 Dec 2020 01:47:35 -0500 Subject: [PATCH 35/68] make sa.py check easy --- benchmarks/sa.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/sa.py b/benchmarks/sa.py index 20b4f51c91..de8d40edba 100755 --- a/benchmarks/sa.py +++ b/benchmarks/sa.py @@ -46,8 +46,8 @@ def time_np_sa(Ni, Nv, trials, dtype, random): print("to be done") def check_correctness(dtype, random): - print("to be done") - assert( 1==1) +# print("to be done") + assert True def create_parser(): parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= suffix_array(V)") From 9f5c3d3db65a5081042ccf5def49f4fc25fb1d12 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sat, 26 Dec 2020 10:28:59 -0500 Subject: [PATCH 36/68] check test/*.chpl --- test/UnitTestPeelStick.chpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/UnitTestPeelStick.chpl b/test/UnitTestPeelStick.chpl index 7fc2901d2d..108afea5f4 100644 --- a/test/UnitTestPeelStick.chpl +++ b/test/UnitTestPeelStick.chpl @@ -129,7 +129,7 @@ proc testMessageLayer(substr, n, minLen, maxLen) throws { d.stop("make_strings"); var reqMsg = "peel str %s %s str 1 True True True %jt".format(strings.offsetName, strings.valueName, [substr]); writeReq(reqMsg); - var repMsg = segmentedPeelMsg(cmd="segmentedPeel", payload=reqMsg.encode(), st); + var repMsg = segmentedEfuncMsg(cmd="segmentedEfunc", payload=reqMsg.encode(), st); writeRep(repMsg); var (loAttribs,lvAttribs,roAttribs,rvAttribs) = repMsg.splitMsgToTuple('+', 4); var loname = parseName(loAttribs); From ac7e2098b84518e7d133d72decb8ac6fdd922a69 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 27 Dec 2020 16:40:20 -0500 Subject: [PATCH 37/68] add corectness check in sa.py --- benchmarks/sa.py | 52 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/benchmarks/sa.py b/benchmarks/sa.py index de8d40edba..6c61bf0844 100755 --- a/benchmarks/sa.py +++ b/benchmarks/sa.py @@ -3,6 +3,8 @@ import time, argparse import numpy as np import arkouda as ak +import random +import string TYPES = ('int64', 'float64', 'bool', 'str') @@ -42,12 +44,47 @@ def time_ak_sa( vsize,strlen, trials, dtype): print("Wrong data type") print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) -def time_np_sa(Ni, Nv, trials, dtype, random): - print("to be done") + +def suffixArray(s): + suffixes = [(s[i:], i) for i in range(len(s))] + suffixes.sort(key=lambda x: x[0]) + sa= [s[1] for s in suffixes] + #sa.insert(0,len(sa)) + return sa + +def time_np_sa(vsize, strlen, trials, dtype): + s=''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(strlen)) + timings = [] + for _ in range(trials): + start = time.time() + sa=suffixArray(s) + end = time.time() + timings.append(end - start) + tavg = sum(timings) / trials + print("Average time = {:.4f} sec".format(tavg)) + if dtype == 'str': + offsets_transferred = 0 + bytes_transferred = len(s) + bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg + else: + print("Wrong data type") + print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) def check_correctness(dtype, random): -# print("to be done") - assert True + Ni = 10*4 + Nv = 100 + + v = ak.random_strings_uniform(1, Ni, Nv) + c=ak.suffix_array(v) + for k in range(Nv): + s=v[k] + sa=suffixArray(s) + aksa=c[k] + _,tmp=c[k].split(maxsplit=1) + aksa=tmp.split() + intaksa = [int(numeric_string) for numeric_string in aksa] + assert (sa==intaksa) + def create_parser(): parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= suffix_array(V)") @@ -57,7 +94,7 @@ def create_parser(): parser.add_argument('-v', '--number', type=int, default=10,help='Number of strings') parser.add_argument('-t', '--trials', type=int, default=6, help='Number of times to run the benchmark') parser.add_argument('-d', '--dtype', default='str', help='Dtype of value array ({})'.format(', '.join(TYPES))) -# parser.add_argument('--numpy', default=False, action='store_true', help='Run the same operation in NumPy to compare performance.') + parser.add_argument('--numpy', default=False, action='store_true', help='Run the same operation in NumPy to compare performance.') parser.add_argument('-r', '--randomize', default=False, action='store_true', help='Use random values instead of ones') parser.add_argument('--correctness-only', default=False, action='store_true', help='Only check correctness, not performance.') return parser @@ -82,6 +119,7 @@ def create_parser(): print("length of strings = {:,}".format(args.size)) print("number of strings = {:,}".format(args.number)) print("number of trials = ", args.trials) - time_ak_sa( args.number, args.size, args.trials, args.dtype) - + time_ak_sa(args.number, args.size, args.trials, args.dtype) + if args.numpy: + time_np_sa(args.number, args.size, args.trials, args.dtype) sys.exit(0) From 6fd3b05e58d8a645d0b28f823692c06223d6389a Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 27 Dec 2020 18:31:09 -0500 Subject: [PATCH 38/68] change suffix array return as an int array --- arkouda/pdarrayclass.py | 9 ++++++++- benchmarks/sa.py | 30 ++++++++++++++++-------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/arkouda/pdarrayclass.py b/arkouda/pdarrayclass.py index 244ae6b9b6..9212ea3399 100755 --- a/arkouda/pdarrayclass.py +++ b/arkouda/pdarrayclass.py @@ -89,7 +89,14 @@ def _parse_single_int_array_value(msg : str) -> object: try: if mydtype == akint64: nfields = value.split("\"") - return nfields[1] +# return nfields[1] +# original we return a string include the last ending 0 + + _,sastr=nfields[1].split(maxsplit=1) + tmpstr=sastr.split() + intary = [int(numeric_string) for numeric_string in tmpstr] + return intary +# now we return a suffix array and not include the last ending 0 else: raise ValueError(("not correct int data type from server {} {}".\ format(mydtype.name, value))) diff --git a/benchmarks/sa.py b/benchmarks/sa.py index 6c61bf0844..d021a36a5c 100755 --- a/benchmarks/sa.py +++ b/benchmarks/sa.py @@ -22,11 +22,11 @@ def time_ak_sa( vsize,strlen, trials, dtype): # print("size of suffix array={}".format(c.bytes.size)) # print("offset/number of suffix array={}".format(c.offsets.size)) # print("itemsize of suffix array={}".format(c.offsets.itemsize)) -# print("All the random strings are as follows") -# for k in range(vsize): -# print("the {} th random tring ={}".format(k,v[k])) -# print("the {} th suffix array ={}".format(k,c[k])) -# print("") + print("All the random strings are as follows") + for k in range(vsize): + print("the {} th random tring ={}".format(k,v[k])) + print("the {} th suffix array ={}".format(k,c[k])) + print("") timings = [] for _ in range(trials): start = time.time() @@ -70,9 +70,9 @@ def time_np_sa(vsize, strlen, trials, dtype): print("Wrong data type") print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) -def check_correctness(dtype, random): - Ni = 10*4 - Nv = 100 +def check_correctness( vsize,strlen, trials, dtype): + Ni = strlen + Nv = vsize v = ak.random_strings_uniform(1, Ni, Nv) c=ak.suffix_array(v) @@ -80,10 +80,13 @@ def check_correctness(dtype, random): s=v[k] sa=suffixArray(s) aksa=c[k] - _,tmp=c[k].split(maxsplit=1) - aksa=tmp.split() - intaksa = [int(numeric_string) for numeric_string in aksa] - assert (sa==intaksa) +# _,tmp=c[k].split(maxsplit=1) +# aksa=tmp.split() +# intaksa = [int(numeric_string) for numeric_string in aksa] +# intaksa = aksa[1:-1] +# print(sa) +# print(intaksa) + assert (sa==aksa) def create_parser(): @@ -111,8 +114,7 @@ def create_parser(): ak.connect(args.hostname, args.port) if args.correctness_only: - for dtype in TYPES: - check_correctness(dtype, args.randomize) + check_correctness(args.number, args.size, args.trials, args.dtype) sys.exit(0) From 50be2e3be5ece8211175e5857bc2134a9d76be5c Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 27 Dec 2020 22:25:52 -0500 Subject: [PATCH 39/68] copy string_test.py --- tests/string_test.py | 43 ------------------------------------------- 1 file changed, 43 deletions(-) diff --git a/tests/string_test.py b/tests/string_test.py index aa72e4f8e4..700f7a7d95 100644 --- a/tests/string_test.py +++ b/tests/string_test.py @@ -42,15 +42,6 @@ def run_test_unique(strings, test_strings, cat): def run_test_index(strings, test_strings, cat, specificInds): # int index - print("===============in run=============================") - print("The passed strints="); - print(str(strings)) - print("The passed test_strints="); - print(str(test_strings)) - print("The passed strints[N//3]="); - print(str(strings[N//3])) - print("The passed test_strints[N//3]="); - print(str(test_strings[N//3])) assert(strings[N//3] == test_strings[N//3]) assert(cat[N//3] == test_strings[N//3]) for i in specificInds: @@ -227,17 +218,10 @@ def run_test_stick(strings, test_strings, base_words, delim): # test_strings = np.random.choice(base_words, N, replace=True) # strings = ak.array(test_strings) - print("===============main=============================") base_words1 = ak.random_strings_uniform(1, 10, UNIQUE, characters='printable') - print("base_words1=") - print(str(base_word1)) base_words2 = ak.random_strings_lognormal(2, 0.25, UNIQUE, characters='printable') - print("base_words2=") - print(str(base_word2)) gremlins = ak.array(['"', ' ', '']) base_words = ak.concatenate((base_words1, base_words2)) - print("base_words=") - print(str(base_word)) np_base_words = np.hstack((base_words1.to_ndarray(), base_words2.to_ndarray())) assert(compare_strings(base_words.to_ndarray(), np_base_words)) choices = ak.randint(0, base_words.size, N) @@ -249,15 +233,6 @@ def run_test_stick(strings, test_strings, base_words, delim): print("Generation and concatenate passed") # int index - print("") - print(str(strings)) - print("") - print(str(test_strings)) - print("") - print(str(strings[N//3])) - print("") - print(str(test_strings[N//3])) - print("") run_test_index(strings, test_strings, cat, range(-len(gremlins), 0)) print("int index passed") @@ -339,22 +314,6 @@ def setUp(self): self.gremlins_strings = ak.concatenate((base_words[choices], gremlins)) self.gremlins_test_strings = self.gremlins_strings.to_ndarray() self.gremlins_cat = ak.Categorical(self.gremlins_strings) - print("=================In Class will check===========================") - print("") - print(str(base_words1)) - print("After base_word1 ") - print("") - print(str(self.strings)) - print("After Print strings") - print(str(self.test_strings)) - print("") - print("After Print teststrings") - print(str(self.strings[N//3])) - print("") - print("After Print strings[N//3]") - print(str(self.test_strings[N//3])) - print("") - print("After Print test_strings[N//3]") def test_compare_strings(self): print('starting test_compare_Strings') @@ -383,7 +342,6 @@ def test_groupby(self): def test_index(self): print('starting test_index') - print("") run_test_index(self.strings, self.test_strings, self.cat, range(-len(self.gremlins), 0)) run_test_index(self.gremlins_strings, self.gremlins_test_strings, self.gremlins_cat, range(-len(self.gremlins), 0)) @@ -490,7 +448,6 @@ def test_stick(self): def test_str_output(self): strings = ak.array(['string {}'.format(i) for i in range (0,101)]) - print("============================================") print(str(strings)) self.assertEqual("['string 0', 'string 1', 'string 2', ... , 'string 98', 'string 99', 'string 100']", str(strings)) From f1781e8a51c0d3d78a4d1753a6a70e7a41db8e7b Mon Sep 17 00:00:00 2001 From: Bill Reus Date: Mon, 4 Jan 2021 16:18:51 -0500 Subject: [PATCH 40/68] Fixed bug in UnitTestPeelStick --- test/UnitTestPeelStick.chpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/UnitTestPeelStick.chpl b/test/UnitTestPeelStick.chpl index 108afea5f4..7fc2901d2d 100644 --- a/test/UnitTestPeelStick.chpl +++ b/test/UnitTestPeelStick.chpl @@ -129,7 +129,7 @@ proc testMessageLayer(substr, n, minLen, maxLen) throws { d.stop("make_strings"); var reqMsg = "peel str %s %s str 1 True True True %jt".format(strings.offsetName, strings.valueName, [substr]); writeReq(reqMsg); - var repMsg = segmentedEfuncMsg(cmd="segmentedEfunc", payload=reqMsg.encode(), st); + var repMsg = segmentedPeelMsg(cmd="segmentedPeel", payload=reqMsg.encode(), st); writeRep(repMsg); var (loAttribs,lvAttribs,roAttribs,rvAttribs) = repMsg.splitMsgToTuple('+', 4); var loname = parseName(loAttribs); From 221679e6bba34815ca796d3c6345f183a8c7c688 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Mon, 4 Jan 2021 17:23:31 -0500 Subject: [PATCH 41/68] update lcp related code --- arkouda/pdarraycreation.py | 32 +++++++++- src/MultiTypeSymEntry.chpl | 4 +- src/SegmentedArray.chpl | 3 +- src/SegmentedMsg.chpl | 124 +++++++++++++++++++++++++++++++++---- src/arkouda_server.chpl | 1 + 5 files changed, 149 insertions(+), 15 deletions(-) diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 5ea5c65c83..59be44cce2 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -13,7 +13,7 @@ __all__ = ["array", "zeros", "ones", "zeros_like", "ones_like", "arange", "linspace", "randint", "uniform", "standard_normal", "random_strings_uniform", "random_strings_lognormal", "from_series", - "suffix_array","suffix_array_file"] + "suffix_array","lcp_array","suffix_array_file"] numericDTypes = frozenset(["bool", "int64", "float64"]) @@ -830,6 +830,36 @@ def suffix_array(strings : Strings) -> SArrays: repMsg = generic_msg(msg) return SArrays(*(cast(str,repMsg).split('+'))) + +@typechecked +def lcp_array(suffixarrays : SArrays, strings : Strings) -> SArrays: + """ + Return the longest common prefix of given suffix arrays. The size/shape of each lcp + arrays is the same as the corresponding suffix array. + ------- + SArrays + The LCP arrays of the given suffix arrays + + See Also + -------- + + Notes + ----- + + Raises + ------ + RuntimeError + Raised if there is a server-side error in executing group request or + creating the pdarray encapsulating the return message + """ + msg = "segmentedLCP {} {} {} {} {}".format( suffixarrays.objtype, + suffixarrays.offsets.name, + suffixarrays.bytes.name, + strings.offsets.name, + strings.bytes.name) + repMsg = generic_msg(msg) + return SArrays(*(cast(str,repMsg).split('+'))) + @typechecked def suffix_array_file(filename: str) -> SArrays: """ diff --git a/src/MultiTypeSymEntry.chpl b/src/MultiTypeSymEntry.chpl index 5af8af1152..ef9ff8321d 100644 --- a/src/MultiTypeSymEntry.chpl +++ b/src/MultiTypeSymEntry.chpl @@ -78,9 +78,11 @@ module MultiTypeSymEntry :arg etype: type to be instantiated :type etype: type */ + /* var enhancedInfo:string; - /* this entry is used to described the LCP (longest common prefix) array + this entry is used to described the LCP (longest common prefix) array of suffix array or any other information closely related to this entry + This is commented because of the confliction with other parts. */ proc init(len: int, type etype) { super.init(etype, len); diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl index cc7cfab23e..b7f84020be 100644 --- a/src/SegmentedArray.chpl +++ b/src/SegmentedArray.chpl @@ -1151,6 +1151,7 @@ module SegmentedArray { return lengths; } +/* proc findSubstringInBytes(const substr: string) { // Find the start position of every occurence of substr in the flat bytes array // Start by making a right-truncated subdomain representing all valid starting positions for substr of given length @@ -1391,7 +1392,7 @@ module SegmentedArray { } return (newOffsets, newVals); } - +*/ proc ediff():[offsets.aD] int { var diff: [offsets.aD] int; if (size < 2) { diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 6a69beb754..e6f8ebed7e 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -826,7 +826,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var sasoff = offsegs; //allocate an values array var sasval:[0..(nBytes-1)] int; - var lcpval:[0..(nBytes-1)] int; +// var lcpval:[0..(nBytes-1)] int; var i:int; var j:int; @@ -851,8 +851,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var tmparray:[0..sasize+2] int; var intstrArray:[0..sasize+2] int; var x:int; - var y:int(32); -// var y:int; +// var y:int(32); + var y:int; forall (x,y) in zip ( intstrArray[0..sasize-1],strings.values.a[startposition..endposition]) do x=y; intstrArray[sasize]=0; intstrArray[sasize+1]=0; @@ -860,6 +860,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string SuffixArraySkew(intstrArray,tmparray,sasize,256); for (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do x = y; +/* // Here we calculate the lcp(Longest Common Prefix) array value forall j in startposition+1..endposition do{ var tmpcount=0:int; @@ -875,20 +876,23 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } lcpval[j]=tmpcount; } +*/ } var segName2 = st.nextName(); var valName2 = st.nextName(); - var lcpvalName = st.nextName(); +// var lcpvalName = st.nextName(); var segEntry = new shared SymEntry(sasoff); var valEntry = new shared SymEntry(sasval); - var lcpvalEntry = new shared SymEntry(lcpval); +// var lcpvalEntry = new shared SymEntry(lcpval); + /* valEntry.enhancedInfo=lcpvalName; lcpvalEntry.enhancedInfo=valName2; - + we have removed enchancedInfo. + */ st.addEntry(segName2, segEntry); st.addEntry(valName2, valEntry); - st.addEntry(lcpvalName, lcpvalEntry); +// st.addEntry(lcpvalName, lcpvalEntry); repMsg = 'created ' + st.attrib(segName2) + '+created ' + st.attrib(valName2); return repMsg; @@ -908,6 +912,98 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } + proc segLCPMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { + var pn = Reflection.getRoutineName(); + var (objtype, segName1, valName1,segName2,valName2) = payload.decode().splitMsgToTuple(5); + var repMsg: string; + + // check to make sure symbols defined + st.check(segName1); + st.check(valName1); + st.check(segName2); + st.check(valName2); + + var suffixarrays = new owned SegSArray(segName1, valName1, st); + var size=suffixarrays.size; + var nBytes = suffixarrays.nBytes; + var length=suffixarrays.getLengths(); + var offsegs = (+ scan length) - length; + + + var strings = new owned SegString(segName2, valName2, st); + + select (objtype) { + when "int" { + // To be checked, I am not sure if this formula can estimate the total memory requirement + // Lengths + 2*segs + 2*vals (copied to SymTab) + overMemLimit(8*size + 16*size + nBytes); + + //allocate an offset array + var sasoff = offsegs; + //allocate an values array + var lcpval:[0..(nBytes-1)] int; + + var i:int; + var j:int; + forall i in 0..(size-1) do { + // the start position of ith surrix array in value array + var startposition:int; + var endposition:int; + startposition = offsegs[i]; + endposition = startposition+length[i]-1; + + var sasize=length[i]:int; + ref sufArray=suffixarrays.values.a[startposition..endposition]; + ref strArray=strings.values.a[startposition..endposition]; +// Here we calculate the lcp(Longest Common Prefix) array value + forall j in startposition+1..endposition do{ + var tmpcount=0:int; + var tmpbefore=sufArray[j-1]:int; + var tmpcur=sufArray[j]:int; + var tmplen=min(sasize-tmpcur, sasize-tmpbefore); + var tmpi:int; + for tmpi in 0..tmplen-1 do { + if (strArray[tmpbefore]!=strArray[tmpcur]) { + break; + } + tmpbefore+=1; + tmpcur+=1; + tmpcount+=1; + } + lcpval[j]=tmpcount; + } + } + var lcpsegName = st.nextName(); + var lcpvalName = st.nextName(); + + var lcpsegEntry = new shared SymEntry(sasoff); + var lcpvalEntry = new shared SymEntry(lcpval); + /* + valEntry.enhancedInfo=lcpvalName; + lcpvalEntry.enhancedInfo=valName2; + we have removed enchancedInfo. + */ + st.addEntry(lcpsegName, lcpsegEntry); + st.addEntry(lcpvalName, lcpvalEntry); + repMsg = 'created ' + st.attrib(lcpsegName) + '+created ' + st.attrib(lcpvalName); + return repMsg; + + + } + otherwise { + var errorMsg = notImplementedError(pn, "("+objtype+")"); + writeln(generateErrorContext( + msg=errorMsg, + lineNumber=getLineNumber(), + moduleName=getModuleName(), + routineName=getRoutineName(), + errorClass="NotImplementedError")); + return errorMsg; + } + } + + } + // directly read a string from given file and generate its suffix array proc segSAFileMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { var pn = Reflection.getRoutineName(); @@ -933,7 +1029,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var sasoff = offsegs; //allocate a suffix array values array and lcp array var sasval:[0..(nBytes-1)] int; - var lcpval:[0..(nBytes-1)] int; +// var lcpval:[0..(nBytes-1)] int; var i:int; forall i in 0..(size-1) do { @@ -960,6 +1056,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string // divsufsort(strArray,tmparray,sasize); forall (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do x = y; +/* // Here we calculate the lcp(Longest Common Prefix) array value forall j in startposition+1..endposition do{ var tmpcount=0:int; @@ -975,20 +1072,23 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } lcpval[j]=tmpcount; } +*/ } var segName2 = st.nextName(); var valName2 = st.nextName(); - var lcpvalName = st.nextName(); +// var lcpvalName = st.nextName(); var segEntry = new shared SymEntry(sasoff); var valEntry = new shared SymEntry(sasval); - var lcpvalEntry = new shared SymEntry(lcpval); +// var lcpvalEntry = new shared SymEntry(lcpval); + /* valEntry.enhancedInfo=lcpvalName; lcpvalEntry.enhancedInfo=valName2; - + We have removed enhancedInfo. + */ st.addEntry(segName2, segEntry); st.addEntry(valName2, valEntry); - st.addEntry(lcpvalName, lcpvalEntry); +// st.addEntry(lcpvalName, lcpvalEntry); repMsg = 'created ' + st.attrib(segName2) + '+created ' + st.attrib(valName2); return repMsg; diff --git a/src/arkouda_server.chpl b/src/arkouda_server.chpl index 524929c699..25dc8c73cc 100644 --- a/src/arkouda_server.chpl +++ b/src/arkouda_server.chpl @@ -239,6 +239,7 @@ proc main() { when "segmentedBinopvsInt" {repMsg = segBinopvsIntMsg(cmd, payload, st);} when "segmentedGroup" {repMsg = segGroupMsg(cmd, payload, st);} when "segmentedSuffixAry"{repMsg = segSuffixArrayMsg(cmd, payload, st);} + when "segmentedLCP" {repMsg = segLCPMsg(cmd, payload, st);} when "segmentedSAFile" {repMsg = segSAFileMsg(cmd, payload, st);} when "segmentedIn1d" {repMsg = segIn1dMsg(cmd, payload, st);} when "segmentedIn1dInt" {repMsg = segIn1dIntMsg(cmd, payload, st);} From 0bff3e46766e2f0ac534759d149909073225cb3e Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Mon, 4 Jan 2021 18:45:28 -0500 Subject: [PATCH 42/68] remove the enhenced attribute in sym table --- src/MultiTypeSymEntry.chpl | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/MultiTypeSymEntry.chpl b/src/MultiTypeSymEntry.chpl index ef9ff8321d..3004183d76 100644 --- a/src/MultiTypeSymEntry.chpl +++ b/src/MultiTypeSymEntry.chpl @@ -78,12 +78,7 @@ module MultiTypeSymEntry :arg etype: type to be instantiated :type etype: type */ - /* - var enhancedInfo:string; - this entry is used to described the LCP (longest common prefix) array - of suffix array or any other information closely related to this entry - This is commented because of the confliction with other parts. - */ + proc init(len: int, type etype) { super.init(etype, len); this.etype = etype; From 2af5ce88d6e80cb67032e0cb9d6ff626f35c901a Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Tue, 5 Jan 2021 09:57:42 -0500 Subject: [PATCH 43/68] check the comments to remove docs CI check error --- src/SegmentedArray.chpl | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl index b7f84020be..2bffa171e2 100644 --- a/src/SegmentedArray.chpl +++ b/src/SegmentedArray.chpl @@ -775,7 +775,7 @@ module SegmentedArray { * The pdaray containing the complete int array composed of integer index * corresponding to each string, */ -// var values: borrowed SymEntry(uint(8)); + // var values: borrowed SymEntry(uint(8)); var values: borrowed SymEntry(int); /** @@ -804,7 +804,7 @@ module SegmentedArray { valueName = valName; var vs = try! st.lookup(valName); -// var vals = toSymEntry(vs, uint(8)): unmanaged SymEntry(uint(8)); + // var vals = toSymEntry(vs, uint(8)): unmanaged SymEntry(uint(8)); var vals = toSymEntry(vs, int): unmanaged SymEntry(int); values = vals; size = segs.size; @@ -816,7 +816,7 @@ module SegmentedArray { * inputs, generates the SymEntry objects for each and passes the * offset and value SymTab lookup names to the alternate init method */ -// proc init(segments: [] int, values: [] uint(8), st: borrowed SymTab) { + // proc init(segments: [] int, values: [] uint(8), st: borrowed SymTab) { proc init(segments: [] int, values: [] int, st: borrowed SymTab) { var oName = st.nextName(); var segEntry = new shared SymEntry(segments); @@ -858,7 +858,7 @@ module SegmentedArray { end = offsets.a[idx+1] - 1; } // Take the slice of the bytearray and "cast" it to a chpl string -// var s = interpretAsString(values.a[start..end]); + //var s = interpretAsString(values.a[start..end]); var tmp=values.a[start..end]; var s: string; var i:int; @@ -878,7 +878,7 @@ module SegmentedArray { } // Early return for zero-length result if (size == 0) || (slice.size == 0) { -// return (makeDistArray(0, int), makeDistArray(0, uint(8))); + //return (makeDistArray(0, int), makeDistArray(0, uint(8))); return (makeDistArray(0, int), makeDistArray(0, int)); } // Start of bytearray slice @@ -901,11 +901,11 @@ module SegmentedArray { // Offsets need to be re-zeroed newSegs -= start; // Bytearray of the new slice -// var newVals = makeDistArray(end - start + 1, uint(8)); + //var newVals = makeDistArray(end - start + 1, uint(8)); var newVals = makeDistArray(end - start + 1, int); ref va = values.a; // newVals = values.a[start..end]; -// forall (i, nv) in zip(newVals.domain, newVals) with (var agg = newSrcAggregator(uint(8))) { + //forall (i, nv) in zip(newVals.domain, newVals) with (var agg = newSrcAggregator(uint(8))) { forall (i, nv) in zip(newVals.domain, newVals) with (var agg = newSrcAggregator(int)) { agg.copy(nv, va[start + i]); } @@ -917,7 +917,7 @@ module SegmentedArray { proc this(iv: [?D] int) throws { // Early return for zero-length result if (D.size == 0) { -// return (makeDistArray(0, int), makeDistArray(0, uint(8))); + //return (makeDistArray(0, int), makeDistArray(0, uint(8))); return (makeDistArray(0, int), makeDistArray(0, int)); } // Check all indices within bounds @@ -954,7 +954,7 @@ module SegmentedArray { writeln("Copying values"); stdout.flush(); t1 = getCurrentTime(); } -// var gatheredVals = makeDistArray(retBytes, uint(8)); + //var gatheredVals = makeDistArray(retBytes, uint(8)); var gatheredVals = makeDistArray(retBytes, int); // Multi-locale requires some extra localization work that is not needed // in CHPL_COMM=none @@ -982,7 +982,7 @@ module SegmentedArray { srcIdx = + scan srcIdx; // Now srcIdx has a dst-local copy of the source index and vals can be efficiently gathered ref va = values.a; -// forall (v, si) in zip(gatheredVals, srcIdx) with (var agg = newSrcAggregator(uint(8))) { + //forall (v, si) in zip(gatheredVals, srcIdx) with (var agg = newSrcAggregator(uint(8))) { forall (v, si) in zip(gatheredVals, srcIdx) with (var agg = newSrcAggregator(int)) { agg.copy(v, va[si]); } @@ -1015,7 +1015,7 @@ module SegmentedArray { steps -= iv; // Early return for zero-length result if (newSize == 0) { -// return (makeDistArray(0, int), makeDistArray(0, uint(8))); + //return (makeDistArray(0, int), makeDistArray(0, uint(8))); return (makeDistArray(0, int), makeDistArray(0, int)); } var segInds = makeDistArray(newSize, int); @@ -1151,7 +1151,7 @@ module SegmentedArray { return lengths; } -/* + /* proc findSubstringInBytes(const substr: string) { // Find the start position of every occurence of substr in the flat bytes array // Start by making a right-truncated subdomain representing all valid starting positions for substr of given length @@ -1319,8 +1319,8 @@ module SegmentedArray { const leftOffsets = (+ scan leftLengths) - leftLengths; const rightOffsets = (+ scan rightLengths) - rightLengths; // Allocate values and fill -// var leftVals = makeDistArray((+ reduce leftLengths), uint(8)); -// var rightVals = makeDistArray((+ reduce rightLengths), uint(8)); + // var leftVals = makeDistArray((+ reduce leftLengths), uint(8)); + // var rightVals = makeDistArray((+ reduce rightLengths), uint(8)); var leftVals = makeDistArray((+ reduce leftLengths), int); var rightVals = makeDistArray((+ reduce rightLengths), int); ref va = values.a; @@ -1392,7 +1392,8 @@ module SegmentedArray { } return (newOffsets, newVals); } -*/ + */ + proc ediff():[offsets.aD] int { var diff: [offsets.aD] int; if (size < 2) { From 2d40c0e67ddd9e3369c08a6c23900557f23fa877 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Tue, 5 Jan 2021 14:14:36 -0500 Subject: [PATCH 44/68] solve the sphinx error --- src/SegmentedArray.chpl | 63 +++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 37 deletions(-) diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl index 2bffa171e2..92e849ac92 100644 --- a/src/SegmentedArray.chpl +++ b/src/SegmentedArray.chpl @@ -145,9 +145,12 @@ module SegmentedArray { return s; } + /* Take a slice of strings from the array. The slice must be a - Chapel range, i.e. low..high by stride, not a Python slice. - Returns arrays for the segment offsets and bytes of the slice.*/ + * Chapel range, i.e. low..high by stride, not a Python slice. + * Returns arrays for the segment offsets and bytes of the slice. + */ + proc this(const slice: range(stridable=true)) throws { if (slice.low < offsets.aD.low) || (slice.high > offsets.aD.high) { throw new owned OutOfBoundsError(); @@ -186,7 +189,8 @@ module SegmentedArray { } /* Gather strings by index. Returns arrays for the segment offsets - and bytes of the gathered strings.*/ + * and bytes of the gathered strings. + */ proc this(iv: [?D] int) throws { // Early return for zero-length result if (D.size == 0) { @@ -235,11 +239,11 @@ module SegmentedArray { if CHPL_COMM != 'none' { // Compute the src index for each byte in gatheredVals /* For performance, we will do this with a scan, so first we need an array - with the difference in index between the current and previous byte. For - the interior of a segment, this is just one, but at the segment boundary, - it is the difference between the src offset of the current segment ("left") - and the src index of the last byte in the previous segment (right - 1). - */ + * with the difference in index between the current and previous byte. For + * the interior of a segment, this is just one, but at the segment boundary, + * it is the difference between the src offset of the current segment ("left") + * and the src index of the last byte in the previous segment (right - 1). + */ var srcIdx = makeDistArray(retBytes, int); srcIdx = 1; var diffs: [D] int; @@ -336,6 +340,7 @@ module SegmentedArray { /* saLogger.debug(getModuleName(),getRoutineName(),getLineNumber(), "%i seconds".format(getCurrentTime() - t1));*/ /* return (gatheredOffsets, gatheredVals); */ + } /* Apply a hash function to all strings. This is useful for grouping @@ -365,32 +370,6 @@ module SegmentedArray { return hashes; } - /* Apply a hash function to all suffix array. This is useful for grouping - and set membership. The hash used is SipHash128.*/ - proc hashInt() throws { - // 128-bit hash values represented as 2-tuples of uint(64) - var hashes: [offsets.aD] 2*uint(64); - // Early exit for zero-length result - if (size == 0) { - return hashes; - } - ref oa = offsets.a; - ref va = values.a; - // Compute lengths of strings - var lengths = getLengths(); - // Hash each string - // TO DO: test on clause with aggregator - forall (o, l, h) in zip(oa, lengths, hashes) { - const myRange = o..#l; - h = sipHash128(va, myRange); - /* // localize the string bytes */ - /* const myBytes = va[{o..#l}]; */ - /* h = sipHash128(myBytes, hashKey); */ - /* // Perf Note: localizing string bytes is ~3x faster on IB multilocale than this: */ - /* // h = sipHash128(va[{o..#l}]); */ - } - return hashes; - } /* Return a permutation that groups the strings. Because hashing is used, this permutation will not sort the strings, but all equivalent strings will fall in one contiguous block. */ @@ -741,7 +720,6 @@ module SegmentedArray { } // class SegString - /** * Represents an array of arrays, implemented as a segmented array of integers. * Instances are ephemeral, not stored in the symbol table. Instead, attributes @@ -752,7 +730,6 @@ module SegmentedArray { */ class SegSArray { - /** * The name of the SymEntry corresponding to the pdarray containing * the offsets, which are start indices for each string bytearray @@ -816,6 +793,7 @@ module SegmentedArray { * inputs, generates the SymEntry objects for each and passes the * offset and value SymTab lookup names to the alternate init method */ + // proc init(segments: [] int, values: [] uint(8), st: borrowed SymTab) { proc init(segments: [] int, values: [] int, st: borrowed SymTab) { var oName = st.nextName(); @@ -959,7 +937,9 @@ module SegmentedArray { // Multi-locale requires some extra localization work that is not needed // in CHPL_COMM=none if CHPL_COMM != 'none' { + // Compute the src index for each byte in gatheredVals + /* For performance, we will do this with a scan, so first we need an array with the difference in index between the current and previous byte. For the interior of a segment, this is just one, but at the segment boundary, @@ -1151,10 +1131,14 @@ module SegmentedArray { return lengths; } + /* + proc findSubstringInBytes(const substr: string) { // Find the start position of every occurence of substr in the flat bytes array - // Start by making a right-truncated subdomain representing all valid starting positions for substr of given length + // Start by making a right-truncated subdomain representing all valid starting positions + // for substr of given length + var D: subdomain(values.aD) = values.aD[values.aD.low..#(values.size - substr.numBytes)]; // Every start position is valid until proven otherwise var truth: [D] bool = true; @@ -1394,6 +1378,11 @@ module SegmentedArray { } */ + /* The comments above is treated as though they were ediff's comment string, which will cause sphinx errors + * It takes me several hours without any idea and thanks Brad help out. He added the following + * line to solve the problem + * dummy chpldoc description for ediff() + */ proc ediff():[offsets.aD] int { var diff: [offsets.aD] int; if (size < 2) { From c782b5a348453e8e124cc04dec0403a404caa72a Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Fri, 8 Jan 2021 13:19:21 -0500 Subject: [PATCH 45/68] add switch betwteen different SA algorithms --- benchmarks/sa.py | 1 + src/SegmentedMsg.chpl | 180 ++++++++++++++++++++++++++---------------- 2 files changed, 112 insertions(+), 69 deletions(-) diff --git a/benchmarks/sa.py b/benchmarks/sa.py index d021a36a5c..ae59178180 100755 --- a/benchmarks/sa.py +++ b/benchmarks/sa.py @@ -115,6 +115,7 @@ def create_parser(): if args.correctness_only: check_correctness(args.number, args.size, args.trials, args.dtype) + print("CORRECT") sys.exit(0) diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index e6f8ebed7e..277f74c7df 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -816,6 +816,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var nBytes = strings.nBytes; var length=strings.getLengths(); var offsegs = (+ scan length) - length; + var algorithmNum=2:int; //2:"divsufsort";1:SuffixArraySkew select (objtype) { when "str" { // To be checked, I am not sure if this formula can estimate the total memory requirement @@ -826,40 +827,47 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var sasoff = offsegs; //allocate an values array var sasval:[0..(nBytes-1)] int; -// var lcpval:[0..(nBytes-1)] int; + // var lcpval:[0..(nBytes-1)] int; now we will not build the LCP array at the same time var i:int; var j:int; forall i in 0..(size-1) do { -// for i in 0..(size-1) do { // the start position of ith string in value array + var startposition:int; var endposition:int; startposition = offsegs[i]; endposition = startposition+length[i]-1; -// var sasize=length[i]:int(32); -// ref strArray=strings.values.a[startposition..endposition]; -// var tmparray:[1..sasize] int(32); -// divsufsort(strArray,tmparray,sasize); -// var x:int; -// var y:int(32); -// for (x, y) in zip(sasval[startposition..endposition], tmparray[1..sasize]) do -// x = y; + // what we do in the select structure is filling the sasval array with correct index + select (algorithmNum) { + when 1 { + var sasize=length[i]:int; + ref strArray=strings.values.a[startposition..endposition]; + var tmparray:[0..sasize+2] int; + var intstrArray:[0..sasize+2] int; + var x:int; + var y:int; + forall (x,y) in zip ( intstrArray[0..sasize-1], + strings.values.a[startposition..endposition]) do x=y; + intstrArray[sasize]=0; + intstrArray[sasize+1]=0; + intstrArray[sasize+2]=0; + SuffixArraySkew(intstrArray,tmparray,sasize,256); + for (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do + x = y; + } + when 2 { + var sasize=length[i]:int(32); + ref strArray=strings.values.a[startposition..endposition]; + var tmparray:[1..sasize] int(32); + divsufsort(strArray,tmparray,sasize); + var x:int; + var y:int(32); + for (x, y) in zip(sasval[startposition..endposition], tmparray[1..sasize]) do + x = y; + } + } - var sasize=length[i]:int; - ref strArray=strings.values.a[startposition..endposition]; - var tmparray:[0..sasize+2] int; - var intstrArray:[0..sasize+2] int; - var x:int; -// var y:int(32); - var y:int; - forall (x,y) in zip ( intstrArray[0..sasize-1],strings.values.a[startposition..endposition]) do x=y; - intstrArray[sasize]=0; - intstrArray[sasize+1]=0; - intstrArray[sasize+2]=0; - SuffixArraySkew(intstrArray,tmparray,sasize,256); - for (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do - x = y; /* // Here we calculate the lcp(Longest Common Prefix) array value forall j in startposition+1..endposition do{ @@ -880,11 +888,11 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } var segName2 = st.nextName(); var valName2 = st.nextName(); -// var lcpvalName = st.nextName(); + // var lcpvalName = st.nextName(); var segEntry = new shared SymEntry(sasoff); var valEntry = new shared SymEntry(sasval); -// var lcpvalEntry = new shared SymEntry(lcpval); + // var lcpvalEntry = new shared SymEntry(lcpval); /* valEntry.enhancedInfo=lcpvalName; lcpvalEntry.enhancedInfo=valName2; @@ -952,6 +960,9 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string startposition = offsegs[i]; endposition = startposition+length[i]-1; + + + var sasize=length[i]:int; ref sufArray=suffixarrays.values.a[startposition..endposition]; ref strArray=strings.values.a[startposition..endposition]; @@ -1019,6 +1030,17 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var length:[0..0] int =nBytes; var offsegs:[0..0] int =0 ; + var sasize=nBytes:int; + var startposition:int; + var endposition:int; + startposition = 0; + endposition = nBytes-1; + var strArray:[startposition..endposition]uint(8); + var r = f.reader(kind=ionative); + r.read(strArray); + + var algorithmNum=2:int; //2:"divsufsort";1:SuffixArraySkew + select ("str") { when "str" { // To be checked, I am not sure if this formula can estimate the total memory requirement @@ -1034,46 +1056,33 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var i:int; forall i in 0..(size-1) do { // the start position of ith string in value array - var startposition:int; - var endposition:int; - startposition = 0; - endposition = nBytes-1; -// var sasize=nBytes:int(32); - var sasize=nBytes:int; - var strArray:[startposition..endposition]uint(8); - var r = f.reader(kind=ionative); - r.read(strArray); -// var tmparray:[1..sasize] int(32); - var tmparray:[0..sasize+2] int; - var intstrArray:[0..sasize+2] int; - var x:int; - var y:int; - forall (x,y) in zip ( intstrArray[0..sasize-1],strArray[startposition..endposition]) do x=y; - intstrArray[sasize]=0; - intstrArray[sasize+1]=0; - intstrArray[sasize+2]=0; - SuffixArraySkew(intstrArray,tmparray,sasize,256); -// divsufsort(strArray,tmparray,sasize); - forall (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do - x = y; -/* -// Here we calculate the lcp(Longest Common Prefix) array value - forall j in startposition+1..endposition do{ - var tmpcount=0:int; - var tmpbefore=sasval[j-1]:int; - var tmpcur=sasval[j]:int; - var tmplen=min(sasize-tmpcur, sasize-tmpbefore); - var tmpi:int; - for tmpi in 0..tmplen-1 do { - if (intstrArray[tmpbefore]!=intstrArray[tmpcur]) { - break; - } - tmpcount+=1; - } - lcpval[j]=tmpcount; - } -*/ - } + select (algorithmNum) { + when 1 { + var sasize=length[i]:int; + var tmparray:[0..sasize+2] int; + var intstrArray:[0..sasize+2] int; + var x:int; + var y:int; + forall (x,y) in zip ( intstrArray[0..sasize-1],strArray[startposition..endposition]) do x=y; + intstrArray[sasize]=0; + intstrArray[sasize+1]=0; + intstrArray[sasize+2]=0; + SuffixArraySkew(intstrArray,tmparray,sasize,256); + for (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do + x = y; + } + when 2 { + var sasize=length[i]:int(32); + //ref strArray=strings.values.a[startposition..endposition]; + var tmparray:[1..sasize] int(32); + divsufsort(strArray,tmparray,sasize); + var x:int; + var y:int(32); + for (x, y) in zip(sasval[startposition..endposition], tmparray[1..sasize]) do + x = y; + } + }// end of select + } // end of forall var segName2 = st.nextName(); var valName2 = st.nextName(); // var lcpvalName = st.nextName(); @@ -1109,6 +1118,39 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } - - - +/* + var sasize=nBytes:int; + var strArray:[startposition..endposition]uint(8); + var r = f.reader(kind=ionative); + r.read(strArray); +// var tmparray:[1..sasize] int(32); + var tmparray:[0..sasize+2] int; + var intstrArray:[0..sasize+2] int; + var x:int; + var y:int; + forall (x,y) in zip ( intstrArray[0..sasize-1],strArray[startposition..endposition]) do x=y; + intstrArray[sasize]=0; + intstrArray[sasize+1]=0; + intstrArray[sasize+2]=0; + SuffixArraySkew(intstrArray,tmparray,sasize,256); +// divsufsort(strArray,tmparray,sasize); + forall (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do + x = y; +*/ +/* +// Here we calculate the lcp(Longest Common Prefix) array value + forall j in startposition+1..endposition do{ + var tmpcount=0:int; + var tmpbefore=sasval[j-1]:int; + var tmpcur=sasval[j]:int; + var tmplen=min(sasize-tmpcur, sasize-tmpbefore); + var tmpi:int; + for tmpi in 0..tmplen-1 do { + if (intstrArray[tmpbefore]!=intstrArray[tmpcur]) { + break; + } + tmpcount+=1; + } + lcpval[j]=tmpcount; + } +*/ From 00b3579b915bd799ffff03ac5caa28eb2b3536ea Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Fri, 8 Jan 2021 15:00:43 -0500 Subject: [PATCH 46/68] single locales for C code --- src/SegmentedMsg.chpl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 277f74c7df..961916d05d 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -858,9 +858,15 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } when 2 { var sasize=length[i]:int(32); + var localstrArray:[0..endposition-startposition] uint(8); + var a:int(8); + var b:int(8); ref strArray=strings.values.a[startposition..endposition]; + localstrArray=strArray; + //for all (a,b) in zip (localstrArray[0..sasize-1],strArray) do a=b; var tmparray:[1..sasize] int(32); - divsufsort(strArray,tmparray,sasize); + divsufsort(localstrArray,tmparray,sasize); + //divsufsort(strArray,tmparray,sasize); var x:int; var y:int(32); for (x, y) in zip(sasval[startposition..endposition], tmparray[1..sasize]) do From 010a446e24cc2329b0c6b608fb4b1a4e6259e163 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 10 Jan 2021 19:35:09 -0500 Subject: [PATCH 47/68] return the string for suffix_array_file --- arkouda/pdarraycreation.py | 10 ++++++++-- src/SegmentedMsg.chpl | 12 +++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 8574a2f5bf..5a5da0ac3a 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -890,7 +890,7 @@ def lcp_array(suffixarrays : SArrays, strings : Strings) -> SArrays: return SArrays(*(cast(str,repMsg).split('+'))) @typechecked -def suffix_array_file(filename: str) -> SArrays: +def suffix_array_file(filename: str) -> (SArrays,Strings): """ This function is major used for testing correctness and performance Return the suffix array of given file name's content as a string. @@ -932,4 +932,10 @@ def suffix_array_file(filename: str) -> SArrays: """ msg = "segmentedSAFile {}".format( filename ) repMsg = generic_msg(msg) - return SArrays(*(cast(str,repMsg).split('+'))) + tmpmsg=cast(str,repMsg).split('+') + sastr=tmpmsg[0:2] + strstr=tmpmsg[2:4] + suffixarray=SArrays(*(cast(str,sastr))) + originalstr=Strings(*(cast(str,strstr))) + return suffixarray,originalstr +# return SArrays(*(cast(str,repMsg).split('+'))) diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index b8a7d16eed..e18dd59de5 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -1112,6 +1112,15 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var strArray:[startposition..endposition]uint(8); var r = f.reader(kind=ionative); r.read(strArray); + r.close(); + + var segName = st.nextName(); + var valName = st.nextName(); + + var segEntry = new shared SymEntry(offsegs); + var valEntry = new shared SymEntry(strArray); + st.addEntry(segName, segEntry); + st.addEntry(valName, valEntry); var algorithmNum=2:int; //2:"divsufsort";1:SuffixArraySkew @@ -1172,7 +1181,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string st.addEntry(segName2, segEntry); st.addEntry(valName2, valEntry); // st.addEntry(lcpvalName, lcpvalEntry); - repMsg = 'created ' + st.attrib(segName2) + '+created ' + st.attrib(valName2); + repMsg = 'created ' + st.attrib(segName2) + '+created ' + st.attrib(valName2) + + '+created ' + st.attrib(segName) + '+created ' + st.attrib(valName); return repMsg; } From f5ca67a2f43c9110ab2c4137d54834b95eaee89b Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 10 Jan 2021 20:44:09 -0500 Subject: [PATCH 48/68] remove mypy CI check error --- arkouda/pdarraycreation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 5a5da0ac3a..565ad55a5a 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -890,7 +890,7 @@ def lcp_array(suffixarrays : SArrays, strings : Strings) -> SArrays: return SArrays(*(cast(str,repMsg).split('+'))) @typechecked -def suffix_array_file(filename: str) -> (SArrays,Strings): +def suffix_array_file(filename: str) -> [SArrays,Strings]: """ This function is major used for testing correctness and performance Return the suffix array of given file name's content as a string. From 6d65335f8e6b99ae49f361d0e1986fdbe599ff7e Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 10 Jan 2021 22:18:31 -0500 Subject: [PATCH 49/68] tuple data type --- arkouda/pdarraycreation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 565ad55a5a..7ce17d40cd 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -890,7 +890,8 @@ def lcp_array(suffixarrays : SArrays, strings : Strings) -> SArrays: return SArrays(*(cast(str,repMsg).split('+'))) @typechecked -def suffix_array_file(filename: str) -> [SArrays,Strings]: +def suffix_array_file(filename: str) -> tuple: +#def suffix_array_file(filename: str) -> tuple[SArrays,Strings]: """ This function is major used for testing correctness and performance Return the suffix array of given file name's content as a string. From 45346d080214d2431cbc1094895f4e4d7cdf916c Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Wed, 20 Jan 2021 10:28:22 -0500 Subject: [PATCH 50/68] update the partial results in graph --- arkouda/pdarraycreation.py | 2 + src/SegmentedMsg.chpl | 146 +++++++++++++++++++++++++++---------- src/arkouda_server.chpl | 1 + toys/np_rmat.py | 4 +- 4 files changed, 114 insertions(+), 39 deletions(-) diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 7ce17d40cd..2df5d028b2 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -9,6 +9,7 @@ from arkouda.dtypes import dtype as akdtype from arkouda.pdarrayclass import pdarray, create_pdarray from arkouda.strings import Strings, SArrays +from arkouda.graph import Graph __all__ = ["array", "zeros", "ones", "zeros_like", "ones_like", "arange", "linspace", "randint", "uniform", "standard_normal", @@ -934,6 +935,7 @@ def suffix_array_file(filename: str) -> tuple: msg = "segmentedSAFile {}".format( filename ) repMsg = generic_msg(msg) tmpmsg=cast(str,repMsg).split('+') + print(tmpmsg) sastr=tmpmsg[0:2] strstr=tmpmsg[2:4] suffixarray=SArrays(*(cast(str,sastr))) diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index e18dd59de5..76e1990881 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -1200,41 +1200,113 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } -} -/* - var sasize=nBytes:int; - var strArray:[startposition..endposition]uint(8); - var r = f.reader(kind=ionative); - r.read(strArray); -// var tmparray:[1..sasize] int(32); - var tmparray:[0..sasize+2] int; - var intstrArray:[0..sasize+2] int; - var x:int; - var y:int; - forall (x,y) in zip ( intstrArray[0..sasize-1],strArray[startposition..endposition]) do x=y; - intstrArray[sasize]=0; - intstrArray[sasize+1]=0; - intstrArray[sasize+2]=0; - SuffixArraySkew(intstrArray,tmparray,sasize,256); -// divsufsort(strArray,tmparray,sasize); - forall (x, y) in zip(sasval[startposition..endposition], tmparray[0..sasize-1]) do - x = y; -*/ -/* -// Here we calculate the lcp(Longest Common Prefix) array value - forall j in startposition+1..endposition do{ - var tmpcount=0:int; - var tmpbefore=sasval[j-1]:int; - var tmpcur=sasval[j]:int; - var tmplen=min(sasize-tmpcur, sasize-tmpbefore); - var tmpi:int; - for tmpi in 0..tmplen-1 do { - if (intstrArray[tmpbefore]!=intstrArray[tmpcur]) { - break; - } - tmpcount+=1; - } - lcpval[j]=tmpcount; - } -*/ + proc segrmatgenMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { + var pn = Reflection.getRoutineName(); + var (slgNv, sNe_per_v, sp, sperm ) + = payload.decode().splitMsgToTuple(4); + var lgNv = slgNv: int; + var Ne_per_v = sNe_per_v: int; + var p = sp: real; + var perm = sperm: bool; + + var Nv = 2**lgNv:int; + # number of edges + var Ne = Ne_per_v * Nv:int; + # probabilities + var a = p; + var b = (1.0 - a)/ 3.0; + var c = b; + var d = b; + var src: [0..Ne-1]int; + var dst: [0..Ne-1]int; + var e_weight: [0..Ne-1] real; + var v_weight: [0..Nv-1] real; + var neighbour: [0..Nv-1] int; + var directed:bool; + var n_vertices=Nv; + var n_edges=Ne; + src=1; + dst=1; + var dst [0..Ne-1]:int; + # quantites to use in edge generation loop + var ab = a+b:real; + var c_norm = c / (c + d):real; + var a_norm = a / (a + b):real; + # generate edges + + var src_bit [0..Ne-1]:int; + var src_bit [0..Ne-1]:int; + forall ib in 1..lgNv { + var tmpvar[0..Ne-1]:real; + fillRandom(tmpvar); + src_bit=tmpvar>ab; + fillRandom(tmpvar); + dst_bit=tmpvar>(c_norm * src_bit + a_norm * (~ src_bit)); + src = src + ((2**(ib-1)) * src_bit); + dst = dst + ((2**(ib-1)) * dst_bit); + src=src+(src==dst); + # maybe: remove edges which are self-loops??? + var iv = radixSortLSD_ranks(src); + # permute into sorted order + src = src[iv] # permute first vertex into sorted order + dst = dst[iv] # permute second vertex into sorted order + # to premute/rename vertices + # + var begin, end:int; + begin=0; + var sort=0:int; + while (begin < Ne-2) { + end=begin+1; + while ( end Date: Wed, 20 Jan 2021 20:49:05 -0500 Subject: [PATCH 51/68] update bool parameters --- Makefile | 3 + arkouda/graph.py | 269 +++++++++++++++++++++++++++++++++++++ arkouda/pdarraycreation.py | 125 ++++++++++++++++- src/SegmentedMsg.chpl | 80 +++++------ 4 files changed, 437 insertions(+), 40 deletions(-) create mode 100755 arkouda/graph.py diff --git a/Makefile b/Makefile index a1d98e737e..e936457e03 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,9 @@ default: $(DEFAULT_TARGET) VERBOSE ?= 0 +define ARKOUDA_QUICK_COMPILE +CHPL_FLAGS += --no-checks --no-loop-invariant-code-motion --no-fast-followers --ccflags="-O0" +endef CHPL := chpl CHPL_DEBUG_FLAGS += --print-passes ifdef ARKOUDA_DEVELOPER diff --git a/arkouda/graph.py b/arkouda/graph.py new file mode 100755 index 0000000000..73335d8986 --- /dev/null +++ b/arkouda/graph.py @@ -0,0 +1,269 @@ +from __future__ import annotations +from typing import cast, Tuple, Union +from typeguard import typechecked +from arkouda.client import generic_msg +from arkouda.pdarrayclass import pdarray, create_pdarray +#, parse_single_value,_parse_single_int_array_value +from arkouda.logger import getArkoudaLogger +import numpy as np # type: ignore +from arkouda.dtypes import str as akstr +from arkouda.dtypes import int64 as akint +from arkouda.dtypes import NUMBER_FORMAT_STRINGS, resolve_scalar_dtype, \ + translate_np_dtype +import json + +__all__ = ['Graph'] + +class Vertex: + """ + Represents a vertex of a graph + + Attributes + ---------- + vertex_id : int + The unique identification of the vertex in a graph + weight : int + The weitht information of the current vertex + neighbors : pdarray + all the vertices connected to the current vertex. For directed graph, out edge vertices are given. + logger : ArkoudaLogger + Used for all logging operations + + Notes + ----- + Vertex is composed of one pdarray: the ID value array which + contains the all the ids of the adjacency vertices. + """ + # based on args + def __init__(self, *args) -> None: + + try: + self.vertex_id=args[0] + if len(args) > 2: + if isinstance(args[2],pdarray): + self.neighbours=args[2] + else: + try: + self.neighbours = create_pdarray(args[2]) + except Exception as e: + raise RuntimeError(e) + if len(args) > 1: + self.weight=args[1] + except Exception as e: + raise RuntimeError(e) + + self.dtype = akint + self.logger = getArkoudaLogger(name=__class__.__name__) # type: ignore + + def __iter__(self): + raise NotImplementedError('Graph does not support iteration') + + def __size__(self) -> int: + return self.adjacency.size + + + def __str__(self) -> str: + return "vertex id={},#adjacency={},weight={}".format(self.vertex_id,\ + self.size,self.weight) + + def __repr__(self) -> str: + return "{}".format(self.__str__()) + + +class Edge: + """ + Represents an Edge of a graph + + Attributes + ---------- + vertex_pair : tuple + The unique identification of the edge in a graph + weight : int + The weitht information of the current edge + adjacency : pdarray + all the vertices connected to the current vertex. For directed graph, out edge vertices are given. + logger : ArkoudaLogger + Used for all logging operations + + Notes + ----- + Vertex is composed of one pdarray: the ID value array which + contains the all the ids of the adjacency vertices. + """ + # based on args + def __init__(self, *args) -> None: + try: + self.vertex_pair=args[0] + if len(args) > 2: + if isinstance(args[2],pdarray): + self.adjacency=args[2] + else: + try: + self.adjacency = create_pdarray(args[2]) + except Exception as e: + raise RuntimeError(e) + if len(args) > 1: + self.weight=args[1] + except Exception as e: + raise RuntimeError(e) + self.dtype = akint + self.logger = getArkoudaLogger(name=__class__.__name__) # type: ignore + + def __iter__(self): + raise NotImplementedError('Graph does not support iteration') + + def __size__(self) -> int: + return self.adjacency.size/2 + + + def __str__(self) -> str: + return "vertex pair={},#adjacency={},weight={}".format(self.vertex_pair,\ + self.size,self.weight) + + def __repr__(self) -> str: + return "{}".format(self.__str__()) + + +class Graph: + """ + Represents a graph whose data resides on the + arkouda server. The user should not call this class directly; + rather its instances are created by other arkouda functions. + + Attributes + ---------- + n_vertices : int + The starting indices for each string + n_edges : int + The starting indices for each string + directed : bool + The raw bytes of all strings, joined by nulls + src : pdarray + The source of every edge in the graph + dst : pdarray + The destination of every vertex in the graph + v_weight : pdarray + The weitht of every vertex in the graph + e_weight : pdarray + The weitht of every edge in the graph + neighbour : pdarray + The current vertix id v's (v None: + """ + Initializes the Graph instance by setting all instance + attributes, some of which are derived from the array parameters. + + Parameters + ---------- + n_vertices : must provide + n_edges : must provide + directed : optional + src,dst : optional if no directed + neighbour : optional if no src and dst + v_weight : optional if no neighbour + e_weight : optional if no v_weight + + + Returns + ------- + None + + Raises + ------ + RuntimeError + Raised if there's an error converting a Numpy array or standard + Python array to either the offset_attrib or bytes_attrib + ValueError + Raised if there's an error in generating instance attributes + from either the offset_attrib or bytes_attrib parameter + """ + try: + print("init the graph") + print(args) + if len(args) < 2: + raise ValueError + self.n_vertices=cast(int,args[0]) + self.n_edges=cast(int,args[1]) + if len(args) > 7: + if isinstance(args[7],pdarray): + self.e_weight=args[7] + else: + try: + self.e_weight = create_pdarray(args[7]) + except Exception as e: + raise RuntimeError(e) + if len(args) > 6: + if isinstance(args[6],pdarray): + self.v_weight=args[6] + else: + try: + self.v_weight = create_pdarray(args[6]) + except Exception as e: + raise RuntimeError(e) + if len(args) > 5: + if isinstance(args[5],pdarray): + self.neighbour=args[5] + else: + try: + self.neighbour = create_pdarray(args[5]) + except Exception as e: + raise RuntimeError(e) + if len(args) == 4: + raise ValueError + if len(args) > 4: + if (isinstance(args[4],pdarray) and isinstance(args[3],pdarray)) : + self.src=args[3] + self.dst=args[4] + else: + try: + self.src = create_pdarray(args[3]) + self.dst = create_pdarray(args[4]) + except Exception as e: + raise RuntimeError(e) + if len(args) > 2: + self.directed=cast(int,args[2]) + raise ValueError + except Exception as e: + raise RuntimeError(e) + self.dtype = akint + self.logger = getArkoudaLogger(name=__class__.__name__) # type: ignore + + def __iter__(self): + raise NotImplementedError('Graph does not support iteration') + + def __size__(self) -> int: + return self.n_vertices + + + def add_vertice(self, x: Vertice) : + print() + + def remove_vertice(self, x: int) : + print() + + def neighbours(self, x: int)->pdarray : + print() + + def adjacent(self, x: int, y:int )->pdarray : + print() + + def get_vertice_value(self, x: int) -> Vertice: + print() + + def set_vertice_value(self, x: int, v: Vertice) : + print() + + def add_edge(self, x: int, y: int) : + print() + + def remove_edge(self, x: int, y: int) : + print() + diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 2df5d028b2..1236853c11 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -14,7 +14,8 @@ __all__ = ["array", "zeros", "ones", "zeros_like", "ones_like", "arange", "linspace", "randint", "uniform", "standard_normal", "random_strings_uniform", "random_strings_lognormal", - "from_series", "suffix_array","lcp_array","suffix_array_file"] + "from_series", "suffix_array","lcp_array","suffix_array_file", + "rmat_gen"] numericDTypes = frozenset(["bool", "int64", "float64"]) @@ -942,3 +943,125 @@ def suffix_array_file(filename: str) -> tuple: originalstr=Strings(*(cast(str,strstr))) return suffixarray,originalstr # return SArrays(*(cast(str,repMsg).split('+'))) + + +@typechecked +def graph_file(filename: str) -> Graph: + """ + This function is major for creating a graph from a file + Returns + ------- + Graph + The Graph class to represent the data + + See Also + -------- + + Notes + ----- + + Raises + ------ + RuntimeError + """ + msg = "segmentedGraphFile {}".format( filename ) + repMsg = generic_msg(msg) + return Graph(*(cast(str,repMsg).split('+'))) + +@typechecked +def rmat_gen (lgNv:int, Ne_per_v:int, p:float, perm: int) -> Graph: + """ + This function is for creating a graph using rmat graph generator + Returns + ------- + Graph + The Graph class to represent the data + + See Also + -------- + + Notes + ----- + + Raises + ------ + RuntimeError + """ + msg = "segmentedRMAT {} {} {} {}".format(lgNv, Ne_per_v, p, perm) + repMsg = generic_msg(msg) + print(repMsg) + print(cast(str,repMsg).split('+')) + return Graph(*(cast(str,repMsg).split('+'))) + +@typechecked +def graph_bfs (graph: Graph, root: int ) -> tuple: + """ + This function is generating the breadth-first search vertices sequences in given graph + starting from the given root vertex + Returns + ------- + pdarray + The bfs vertices results + + See Also + -------- + + Notes + ----- + + Raises + ------ + RuntimeError + """ + msg = "segmentedGraphBFS {} {} {}".format(graph.edges.name,graph.vertices.name,root) + repMsg = generic_msg(msg) + return Graph(*(cast(str,repMsg).split('+'))) + + +@typechecked +def graph_dfs (graph: Graph, root: int ) -> pdarray: + """ + This function is generating the depth-first search vertices sequences in given graph + starting from the given root vertex + Returns + ------- + pdarray + The dfs vertices results + + See Also + -------- + + Notes + ----- + + Raises + ------ + RuntimeError + """ + msg = "segmentedGraphDFS {} {} {}".format(graph.edges.name,graph.vertices.name,root) + repMsg = generic_msg(msg) + return Graph(*(cast(str,repMsg).split('+'))) + + +@typechecked +def components (graph: Graph ) -> int : + """ + This function returns the number of components of the given graph + Returns + ------- + int + The total number of components + + See Also + -------- + + Notes + ----- + + Raises + ------ + RuntimeError + """ + msg = "segmentedGraphComponents {} {}".format(graph.edges.name,graph.vertices.name) + repMsg = generic_msg(msg) + return cast(int,repMsg) diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 76e1990881..45c58313a7 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -13,6 +13,8 @@ module SegmentedMsg { use SymArrayDmap; use SACA; + use Random; + use RadixSortLSD only radixSortLSD_ranks; private config const DEBUG = false; @@ -1203,82 +1205,80 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string proc segrmatgenMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { var pn = Reflection.getRoutineName(); + var repMsg: string; var (slgNv, sNe_per_v, sp, sperm ) = payload.decode().splitMsgToTuple(4); + writeln(slgNv, sNe_per_v, sp, sperm); var lgNv = slgNv: int; var Ne_per_v = sNe_per_v: int; var p = sp: real; - var perm = sperm: bool; + var perm = sperm: int; var Nv = 2**lgNv:int; - # number of edges + // number of edges var Ne = Ne_per_v * Nv:int; - # probabilities + // probabilities var a = p; - var b = (1.0 - a)/ 3.0; + var b = (1.0 - a)/ 3.0:real; var c = b; var d = b; - var src: [0..Ne-1]int; - var dst: [0..Ne-1]int; + var src: [0..Ne-1] int; + var dst: [0..Ne-1] int; var e_weight: [0..Ne-1] real; var v_weight: [0..Nv-1] real; - var neighbour: [0..Nv-1] int; + var length: [0..Nv-1] int; var directed:bool; var n_vertices=Nv; var n_edges=Ne; src=1; dst=1; - var dst [0..Ne-1]:int; - # quantites to use in edge generation loop + // quantites to use in edge generation loop var ab = a+b:real; var c_norm = c / (c + d):real; var a_norm = a / (a + b):real; - # generate edges - - var src_bit [0..Ne-1]:int; - var src_bit [0..Ne-1]:int; - forall ib in 1..lgNv { - var tmpvar[0..Ne-1]:real; + // generate edges + var src_bit: [0..Ne-1]int; + var dst_bit: [0..Ne-1]int; + for ib in 1..lgNv { + var tmpvar: [0..Ne-1] real; fillRandom(tmpvar); src_bit=tmpvar>ab; fillRandom(tmpvar); dst_bit=tmpvar>(c_norm * src_bit + a_norm * (~ src_bit)); src = src + ((2**(ib-1)) * src_bit); dst = dst + ((2**(ib-1)) * dst_bit); + } src=src+(src==dst); - # maybe: remove edges which are self-loops??? + // maybe: remove edges which are self-loops??? var iv = radixSortLSD_ranks(src); - # permute into sorted order - src = src[iv] # permute first vertex into sorted order - dst = dst[iv] # permute second vertex into sorted order - # to premute/rename vertices - # - var begin, end:int; - begin=0; + // permute into sorted order + src = src[iv]; //# permute first vertex into sorted order + dst = dst[iv]; //# permute second vertex into sorted order + //# to premute/rename vertices + var startpos=0, endpos:int; var sort=0:int; - while (begin < Ne-2) { - end=begin+1; - while ( end Date: Wed, 20 Jan 2021 20:50:47 -0500 Subject: [PATCH 52/68] RAMT benchmark --- benchmarks/rmatgen.py | 118 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100755 benchmarks/rmatgen.py diff --git a/benchmarks/rmatgen.py b/benchmarks/rmatgen.py new file mode 100755 index 0000000000..f882294c3b --- /dev/null +++ b/benchmarks/rmatgen.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 + +import time, argparse +import numpy as np +import arkouda as ak +import random +import string + +TYPES = ('int64', 'float64', 'bool', 'str') + +def time_ak_rmat_graph(lgNv, Ne_per_v, p, perm): + print(">>> arkouda rmat graph") + cfg = ak.get_config() + Nv = cfg["numLocales"] + print("numLocales = {}".format(cfg["numLocales"])) + Graph = ak.rmat_gen(lgNv, Ne_per_v, p, perm) + print("number of vertices ={}".format(Graph.n_vertices)) + print("number of edges ={}".format(Graph.n_edges)) + print("directed graph ={}".format(Graph.directed)) + print("source of edges ={}".format(Graph.src)) + print("dest of edges ={}".format(Graph.dst)) + print("vertices weight ={}".format(Graph.v_weight)) + print("edges weight ={}".format(Graph.e_weight)) + print("neighbour ={}".format(Graph.neighbour)) + return + timings = [] + for _ in range(trials): + start = time.time() + c=ak.suffix_array(v) + end = time.time() + timings.append(end - start) + tavg = sum(timings) / trials + + print("Average time = {:.4f} sec".format(tavg)) + if dtype == 'str': + offsets_transferred = 0 * c.offsets.size * c.offsets.itemsize + bytes_transferred = (c.bytes.size * c.offsets.itemsize) + (0 * c.bytes.size) + bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg + else: + print("Wrong data type") + print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) + + +def suffixArray(s): + suffixes = [(s[i:], i) for i in range(len(s))] + suffixes.sort(key=lambda x: x[0]) + sa= [s[1] for s in suffixes] + #sa.insert(0,len(sa)) + return sa + +def time_np_sa(vsize, strlen, trials, dtype): + s=''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(strlen)) + timings = [] + for _ in range(trials): + start = time.time() + sa=suffixArray(s) + end = time.time() + timings.append(end - start) + tavg = sum(timings) / trials + print("Average time = {:.4f} sec".format(tavg)) + if dtype == 'str': + offsets_transferred = 0 + bytes_transferred = len(s) + bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg + else: + print("Wrong data type") + print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) + +def check_correctness( vsize,strlen, trials, dtype): + Ni = strlen + Nv = vsize + + v = ak.random_strings_uniform(1, Ni, Nv) + c=ak.suffix_array(v) + for k in range(Nv): + s=v[k] + sa=suffixArray(s) + aksa=c[k] +# _,tmp=c[k].split(maxsplit=1) +# aksa=tmp.split() +# intaksa = [int(numeric_string) for numeric_string in aksa] +# intaksa = aksa[1:-1] +# print(sa) +# print(intaksa) + assert (sa==aksa) + + +def create_parser(): + parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= suffix_array(V)") + parser.add_argument('hostname', help='Hostname of arkouda server') + parser.add_argument('port', type=int, help='Port of arkouda server') + parser.add_argument('-v', '--logvertices', type=int, default=5, help='Problem size: log number of vertices') + parser.add_argument('-e', '--vedges', type=int, default=4,help='Number of edges per vertex') + parser.add_argument('-p', '--possibility', type=float, default=0.3,help='Possibility ') + parser.add_argument('-t', '--trials', type=int, default=6, help='Number of times to run the benchmark') + parser.add_argument('-m', '--perm', type=int, default=0 , help='if permutation ') + parser.add_argument('--numpy', default=False, action='store_true', help='Run the same operation in NumPy to compare performance.') + parser.add_argument('--correctness-only', default=False, action='store_true', help='Only check correctness, not performance.') + return parser + + + +if __name__ == "__main__": + import sys + parser = create_parser() + args = parser.parse_args() + ak.verbose = False + ak.connect(args.hostname, args.port) + + ''' + if args.correctness_only: + check_correctness(args.number, args.size, args.trials, args.dtype) + print("CORRECT") + sys.exit(0) + ''' + + time_ak_rmat_graph(args.logvertices, args.vedges, args.possibility, args.perm) + sys.exit(0) From d06e6f3deaa7a7d032108d1c3ff1e00676d7e669 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Wed, 20 Jan 2021 22:02:03 -0500 Subject: [PATCH 53/68] remove print information --- arkouda/graph.py | 7 ++----- arkouda/pdarraycreation.py | 2 -- benchmarks/rmatgen.py | 6 +++--- src/SegmentedMsg.chpl | 9 +++------ 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/arkouda/graph.py b/arkouda/graph.py index 73335d8986..b0329df2a3 100755 --- a/arkouda/graph.py +++ b/arkouda/graph.py @@ -126,7 +126,7 @@ def __repr__(self) -> str: class Graph: """ - Represents a graph whose data resides on the + This is an array based graph representation. The graph data resides on the arkouda server. The user should not call this class directly; rather its instances are created by other arkouda functions. @@ -186,8 +186,6 @@ def __init__(self, *args) -> None: from either the offset_attrib or bytes_attrib parameter """ try: - print("init the graph") - print(args) if len(args) < 2: raise ValueError self.n_vertices=cast(int,args[0]) @@ -229,8 +227,7 @@ def __init__(self, *args) -> None: except Exception as e: raise RuntimeError(e) if len(args) > 2: - self.directed=cast(int,args[2]) - raise ValueError + self.directed=cast(bool,args[2]) except Exception as e: raise RuntimeError(e) self.dtype = akint diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 1236853c11..8bb368ebbf 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -989,8 +989,6 @@ def rmat_gen (lgNv:int, Ne_per_v:int, p:float, perm: int) -> Graph: """ msg = "segmentedRMAT {} {} {} {}".format(lgNv, Ne_per_v, p, perm) repMsg = generic_msg(msg) - print(repMsg) - print(cast(str,repMsg).split('+')) return Graph(*(cast(str,repMsg).split('+'))) @typechecked diff --git a/benchmarks/rmatgen.py b/benchmarks/rmatgen.py index f882294c3b..b35bc0136d 100755 --- a/benchmarks/rmatgen.py +++ b/benchmarks/rmatgen.py @@ -89,9 +89,9 @@ def create_parser(): parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= suffix_array(V)") parser.add_argument('hostname', help='Hostname of arkouda server') parser.add_argument('port', type=int, help='Port of arkouda server') - parser.add_argument('-v', '--logvertices', type=int, default=5, help='Problem size: log number of vertices') - parser.add_argument('-e', '--vedges', type=int, default=4,help='Number of edges per vertex') - parser.add_argument('-p', '--possibility', type=float, default=0.3,help='Possibility ') + parser.add_argument('-v', '--logvertices', type=int, default=7, help='Problem size: log number of vertices') + parser.add_argument('-e', '--vedges', type=int, default=2,help='Number of edges per vertex') + parser.add_argument('-p', '--possibility', type=float, default=0.03,help='Possibility ') parser.add_argument('-t', '--trials', type=int, default=6, help='Number of times to run the benchmark') parser.add_argument('-m', '--perm', type=int, default=0 , help='if permutation ') parser.add_argument('--numpy', default=False, action='store_true', help='Run the same operation in NumPy to compare performance.') diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 45c58313a7..51f45b8b26 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -1208,7 +1208,6 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var repMsg: string; var (slgNv, sNe_per_v, sp, sperm ) = payload.decode().splitMsgToTuple(4); - writeln(slgNv, sNe_per_v, sp, sperm); var lgNv = slgNv: int; var Ne_per_v = sNe_per_v: int; var p = sp: real; @@ -1268,11 +1267,9 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string ref p=dst[startpos..endpos]; var ivx=radixSortLSD_ranks(p); dst[startpos..endpos]=dst[ivx]; - } else { - startpos+=1; - break; - - } + } + startpos+=1; + break; } }//end of while endpos }//end of while startpos From 380668c6589ca1cad58a6c7b676ba5d6a3e4b01b Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Thu, 21 Jan 2021 10:38:01 -0500 Subject: [PATCH 54/68] comment some graph functions --- arkouda/graph.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/arkouda/graph.py b/arkouda/graph.py index b0329df2a3..4ac23eff57 100755 --- a/arkouda/graph.py +++ b/arkouda/graph.py @@ -59,11 +59,11 @@ def __iter__(self): raise NotImplementedError('Graph does not support iteration') def __size__(self) -> int: - return self.adjacency.size + return self.neighbours.size def __str__(self) -> str: - return "vertex id={},#adjacency={},weight={}".format(self.vertex_id,\ + return "vertex id={},#neighbours={},weight={}".format(self.vertex_id,\ self.size,self.weight) def __repr__(self) -> str: @@ -112,13 +112,9 @@ def __init__(self, *args) -> None: def __iter__(self): raise NotImplementedError('Graph does not support iteration') - def __size__(self) -> int: - return self.adjacency.size/2 - - def __str__(self) -> str: - return "vertex pair={},#adjacency={},weight={}".format(self.vertex_pair,\ - self.size,self.weight) + return "vertex pair={},weight={},#adjacency".format(self.vertex_pair,\ + self.weight,self.adjacency.size) def __repr__(self) -> str: return "{}".format(self.__str__()) @@ -239,18 +235,19 @@ def __iter__(self): def __size__(self) -> int: return self.n_vertices - - def add_vertice(self, x: Vertice) : + ''' + def add_vertice(self, x: Vertice)->None : print() - def remove_vertice(self, x: int) : + def remove_vertice(self, x: int) ->None: print() def neighbours(self, x: int)->pdarray : - print() + return self.neighbour[i] def adjacent(self, x: int, y:int )->pdarray : - print() + neighbours(self,x) + neighbours(self,y) def get_vertice_value(self, x: int) -> Vertice: print() @@ -263,4 +260,4 @@ def add_edge(self, x: int, y: int) : def remove_edge(self, x: int, y: int) : print() - + ''' From c6a3946526a408686fbb16fbaa1371d2ada1d25f Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Thu, 21 Jan 2021 13:54:26 -0500 Subject: [PATCH 55/68] update graph class --- arkouda/graph.py | 6 +- src/SegmentedArray.chpl | 214 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+), 3 deletions(-) diff --git a/arkouda/graph.py b/arkouda/graph.py index 4ac23eff57..1d2a8e021b 100755 --- a/arkouda/graph.py +++ b/arkouda/graph.py @@ -133,7 +133,7 @@ class Graph: n_edges : int The starting indices for each string directed : bool - The raw bytes of all strings, joined by nulls + The graph is directed (True) or undirected (False) src : pdarray The source of every edge in the graph dst : pdarray @@ -143,8 +143,8 @@ class Graph: e_weight : pdarray The weitht of every edge in the graph neighbour : pdarray - The current vertix id v's (v Date: Fri, 22 Jan 2021 23:44:43 -0500 Subject: [PATCH 56/68] vertex model, add start_i --- arkouda/graph.py | 42 +++++---- arkouda/pdarraycreation.py | 4 +- benchmarks/rmatgen.py | 7 +- src/SegmentedArray.chpl | 32 ++++++- src/SegmentedMsg.chpl | 185 +++++++++++++++++++++++++++++++++++++ 5 files changed, 245 insertions(+), 25 deletions(-) diff --git a/arkouda/graph.py b/arkouda/graph.py index 1d2a8e021b..9b42ea777e 100755 --- a/arkouda/graph.py +++ b/arkouda/graph.py @@ -63,8 +63,8 @@ def __size__(self) -> int: def __str__(self) -> str: - return "vertex id={},#neighbours={},weight={}".format(self.vertex_id,\ - self.size,self.weight) + return "vertex id={},weight={},#neighbours={}".format(self.vertex_id,\ + self.weight, self.neighbours.size) def __repr__(self) -> str: return "{}".format(self.__str__()) @@ -113,7 +113,7 @@ def __iter__(self): raise NotImplementedError('Graph does not support iteration') def __str__(self) -> str: - return "vertex pair={},weight={},#adjacency".format(self.vertex_pair,\ + return "vertex pair={},weight={},#adjacency={}".format(self.vertex_pair,\ self.weight,self.adjacency.size) def __repr__(self) -> str: @@ -138,13 +138,15 @@ class Graph: The source of every edge in the graph dst : pdarray The destination of every vertex in the graph + start : pdarray + The starting index of all the vertices in src and dst + neighbour : pdarray + The current vertex id v's (v None: raise ValueError self.n_vertices=cast(int,args[0]) self.n_edges=cast(int,args[1]) - if len(args) > 7: - if isinstance(args[7],pdarray): - self.e_weight=args[7] + if len(args) > 8: + if isinstance(args[8],pdarray): + self.e_weight=args[8] else: try: - self.e_weight = create_pdarray(args[7]) + self.e_weight = create_pdarray(args[8]) except Exception as e: raise RuntimeError(e) - if len(args) > 6: - if isinstance(args[6],pdarray): - self.v_weight=args[6] + if len(args) > 7: + if isinstance(args[7],pdarray): + self.v_weight=args[7] else: try: - self.v_weight = create_pdarray(args[6]) + self.v_weight = create_pdarray(args[7]) except Exception as e: raise RuntimeError(e) - if len(args) > 5: - if isinstance(args[5],pdarray): - self.neighbour=args[5] + if len(args) == 6: + raise ValueError + if len(args) > 6: + if (isinstance(args[6],pdarray) and isinstance(args[5],pdarray)) : + self.start=args[5] + self.neighbour=args[6] else: try: - self.neighbour = create_pdarray(args[5]) + self.start = create_pdarray(args[5]) + self.neighbour = create_pdarray(args[6]) except Exception as e: raise RuntimeError(e) if len(args) == 4: diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 8bb368ebbf..73f2e04713 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -1011,7 +1011,9 @@ def graph_bfs (graph: Graph, root: int ) -> tuple: ------ RuntimeError """ - msg = "segmentedGraphBFS {} {} {}".format(graph.edges.name,graph.vertices.name,root) + msg = "segmentedGraphBFS {} {} {} {} {} {} {} {}".format(graph.n_vertices,graph.n_edges,\ + graph.directed,graph.src.name,graph.dst.name,\ + graph.start.name,graph.neighbour.name,root) repMsg = generic_msg(msg) return Graph(*(cast(str,repMsg).split('+'))) diff --git a/benchmarks/rmatgen.py b/benchmarks/rmatgen.py index b35bc0136d..7f858b7c09 100755 --- a/benchmarks/rmatgen.py +++ b/benchmarks/rmatgen.py @@ -19,9 +19,10 @@ def time_ak_rmat_graph(lgNv, Ne_per_v, p, perm): print("directed graph ={}".format(Graph.directed)) print("source of edges ={}".format(Graph.src)) print("dest of edges ={}".format(Graph.dst)) + print("start ={}".format(Graph.start)) + print("neighbour ={}".format(Graph.neighbour)) print("vertices weight ={}".format(Graph.v_weight)) print("edges weight ={}".format(Graph.e_weight)) - print("neighbour ={}".format(Graph.neighbour)) return timings = [] for _ in range(trials): @@ -89,9 +90,9 @@ def create_parser(): parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= suffix_array(V)") parser.add_argument('hostname', help='Hostname of arkouda server') parser.add_argument('port', type=int, help='Port of arkouda server') - parser.add_argument('-v', '--logvertices', type=int, default=7, help='Problem size: log number of vertices') + parser.add_argument('-v', '--logvertices', type=int, default=5, help='Problem size: log number of vertices') parser.add_argument('-e', '--vedges', type=int, default=2,help='Number of edges per vertex') - parser.add_argument('-p', '--possibility', type=float, default=0.03,help='Possibility ') + parser.add_argument('-p', '--possibility', type=float, default=0.01,help='Possibility ') parser.add_argument('-t', '--trials', type=int, default=6, help='Number of times to run the benchmark') parser.add_argument('-m', '--perm', type=int, default=0 , help='if permutation ') parser.add_argument('--numpy', default=False, action='store_true', help='Run the same operation in NumPy to compare performance.') diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl index 09612b5b6a..6655c7727c 100644 --- a/src/SegmentedArray.chpl +++ b/src/SegmentedArray.chpl @@ -1478,6 +1478,12 @@ module SegmentedArray { var dst: borrowed SymEntry(int); + /* The starting index of every vertex in src and dst the ,name */ + var startName : string; + + /* The starting index of every vertex in src and dst the ,name */ + var start_p: borrowed SymEntry(int); + /* The current vertex id v's (vab; + fillRandom(tmpvar); + dst_bit=tmpvar>(c_norm * src_bit + a_norm * (~ src_bit)); + src = src + ((2**(ib-1)) * src_bit); + dst = dst + ((2**(ib-1)) * dst_bit); + } + src=src%Nv; + dst=dst%Nv; + //src=src+(src==dst); + // maybe: remove edges which are self-loops??? + /* + writeln("before sorting"); + writeln("src="); + writeln(src); + writeln("dst="); + writeln("dst=",dst); + */ + var iv = radixSortLSD_ranks(src); + // permute into sorted order + var src1 = src[iv]; //# permute first vertex into sorted order + var dst1 = dst[iv]; //# permute second vertex into sorted order + /* + writeln("before sorting"); + writeln("src="); + writeln(src); + writeln("dst="); + writeln("dst=",dst); + writeln("iv="); + writeln(iv); + */ + //# to premute/rename vertices + var startpos=0, endpos:int; + var sort=0:int; + while (startpos < Ne-2) { + endpos=startpos+1; + sort=0; + //writeln("startpos=",startpos,"endpos=",endpos); + while (endpos <=Ne-1) { + if (src1[startpos]==src1[endpos]) { + sort=1; + endpos+=1; + continue; + } else { + break; + } + }//end of while endpos + if (sort==1) { + var tmpary:[0..endpos-startpos-1] int; + tmpary=dst1[startpos..endpos-1]; + var ivx=radixSortLSD_ranks(tmpary); + dst1[startpos..endpos-1]=tmpary[ivx]; + //writeln("src1=",src1,"dst1=",dst1,"ivx=",ivx); + sort=0; + } + startpos+=1; + }//end of while startpos + + //writeln("before assignment start_i="); + //writeln(start_i); + //writeln(""); + for i in 0..Ne-1 do { + length[src1[i]]+=1; + if (start_i[src1[i]] ==-1){ + start_i[src1[i]]=i; + //writeln("assign index ",i, " to vertex ",src1[i]); + } + + } + var neighbour = (+ scan length) - length; + /* + writeln("src="); + writeln(src); + writeln("dst="); + writeln(dst); + writeln("src1="); + writeln(src1); + writeln("dst1="); + writeln(dst1); + writeln("start_i="); + writeln(start_i); + writeln("start_i[0]=",start_i[0]); + writeln("neighbour="); + writeln(neighbour); + */ + fillRandom(e_weight); + fillRandom(v_weight); + //writeln("e_weight=",e_weight,"v_weight=",v_weight); + var srcName = st.nextName(); + var dstName = st.nextName(); + var startName = st.nextName(); + var neiName = st.nextName(); + var ewName = st.nextName(); + var vwName = st.nextName(); + var srcEntry = new shared SymEntry(src1); + var dstEntry = new shared SymEntry(dst1); + var ewEntry = new shared SymEntry(e_weight); + var vwEntry = new shared SymEntry(v_weight); + var neiEntry = new shared SymEntry(neighbour); + var startEntry = new shared SymEntry(start_i); + st.addEntry(srcName, srcEntry); + st.addEntry(dstName, dstEntry); + st.addEntry(startName, startEntry); + st.addEntry(neiName, neiEntry); + st.addEntry(vwName, vwEntry); + st.addEntry(ewName, ewEntry); + var sNv=Nv:string; + var sNe=Ne:string; + var sDirected=directed:string; + //repMsg = (Ne:string) + '+ ' + (Nv:string) + '+ ' + (directed:string) + + repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + + '+created ' + st.attrib(srcName) + '+created ' + st.attrib(dstName) + + '+created ' + st.attrib(startName) + '+created ' + st.attrib(neiName) + + '+created ' + st.attrib(vwName) + '+created ' + st.attrib(ewName); + smLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); + return repMsg; + } + + + + + proc segBFSMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { + var pn = Reflection.getRoutineName(); + var repMsg: string; + var (n_verticesN,n_edgesN,directedN,srcN, dstN, neighbourN, rootN ) + = payload.decode().splitMsgToTuple(7); + var Nv=n_verticesN:int; + var Ne=n_edgesN:int; + var Directed=directedN:bool; + var root=rootN:int; + + var agraph = new owned SegGraph(Nv,Ne,Directed,srcN,dstN,neighbourN, st); + + var dep=-1:[0..Nv-1] int; + dep[root]=0; + var level=0; + +/* // probabilities var a = p; var b = (1.0 - a)/ 3.0:real; @@ -1306,6 +1473,24 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string smLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); return repMsg; +*/ + return "test"; } + + + + + + + + + + + + + + + + } From 07906d79144ccbeba2c0f432df438fdbea5f857b Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 24 Jan 2021 00:54:48 -0500 Subject: [PATCH 57/68] implement bfs algorithm --- Makefile | 3 - arkouda/graph.py | 13 +-- arkouda/pdarraycreation.py | 17 ++-- benchmarks/bfs.py | 144 +++++++++++++++++++++++++++++++++ src/SegmentedArray.chpl | 160 +++++-------------------------------- src/SegmentedMsg.chpl | 160 ++++++++++++------------------------- src/arkouda_server.chpl | 1 + 7 files changed, 232 insertions(+), 266 deletions(-) create mode 100755 benchmarks/bfs.py diff --git a/Makefile b/Makefile index e936457e03..a1d98e737e 100644 --- a/Makefile +++ b/Makefile @@ -12,9 +12,6 @@ default: $(DEFAULT_TARGET) VERBOSE ?= 0 -define ARKOUDA_QUICK_COMPILE -CHPL_FLAGS += --no-checks --no-loop-invariant-code-motion --no-fast-followers --ccflags="-O0" -endef CHPL := chpl CHPL_DEBUG_FLAGS += --print-passes ifdef ARKOUDA_DEVELOPER diff --git a/arkouda/graph.py b/arkouda/graph.py index 9b42ea777e..ec02e70e6b 100755 --- a/arkouda/graph.py +++ b/arkouda/graph.py @@ -138,11 +138,12 @@ class Graph: The source of every edge in the graph dst : pdarray The destination of every vertex in the graph - start : pdarray - The starting index of all the vertices in src and dst + start_i : pdarray + The starting index of all the vertices in src neighbour : pdarray - The current vertex id v's (v None: n_edges : must provide directed : optional src,dst : optional if no directed - neighbour : optional if no src and dst + start_i, neighbour : optional if no src and dst v_weight : optional if no neighbour e_weight : optional if no v_weight @@ -208,7 +209,7 @@ def __init__(self, *args) -> None: raise ValueError if len(args) > 6: if (isinstance(args[6],pdarray) and isinstance(args[5],pdarray)) : - self.start=args[5] + self.start_i=args[5] self.neighbour=args[6] else: try: diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 73f2e04713..50a36b5a04 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -15,7 +15,7 @@ "arange", "linspace", "randint", "uniform", "standard_normal", "random_strings_uniform", "random_strings_lognormal", "from_series", "suffix_array","lcp_array","suffix_array_file", - "rmat_gen"] + "rmat_gen","graph_bfs"] numericDTypes = frozenset(["bool", "int64", "float64"]) @@ -936,7 +936,6 @@ def suffix_array_file(filename: str) -> tuple: msg = "segmentedSAFile {}".format( filename ) repMsg = generic_msg(msg) tmpmsg=cast(str,repMsg).split('+') - print(tmpmsg) sastr=tmpmsg[0:2] strstr=tmpmsg[2:4] suffixarray=SArrays(*(cast(str,sastr))) @@ -1013,9 +1012,15 @@ def graph_bfs (graph: Graph, root: int ) -> tuple: """ msg = "segmentedGraphBFS {} {} {} {} {} {} {} {}".format(graph.n_vertices,graph.n_edges,\ graph.directed,graph.src.name,graph.dst.name,\ - graph.start.name,graph.neighbour.name,root) + graph.start.name,graph.neighbour.name,\ + graph.v_weight.name,graph.e_weight.name,root) repMsg = generic_msg(msg) - return Graph(*(cast(str,repMsg).split('+'))) + tmpmsg=cast(str,repMsg).split('+') + levelstr=tmpmsg[0:1] + vertexstr=tmpmsg[1:2] + levelary=create_pdarray(cast(str,levelstr)) + vertexary=create_pdarray(cast(str,vertexstr)) + return levelary,vertexary @typechecked @@ -1038,7 +1043,9 @@ def graph_dfs (graph: Graph, root: int ) -> pdarray: ------ RuntimeError """ - msg = "segmentedGraphDFS {} {} {}".format(graph.edges.name,graph.vertices.name,root) + msg = "segmentedGraphDFS {} {} {} {} {} {} {} {}".format(graph.n_vertices,graph.n_edges,\ + graph.directed,graph.src.name,graph.dst.name,\ + graph.start.name,graph.neighbour.name,root) repMsg = generic_msg(msg) return Graph(*(cast(str,repMsg).split('+'))) diff --git a/benchmarks/bfs.py b/benchmarks/bfs.py new file mode 100755 index 0000000000..e7f414a92b --- /dev/null +++ b/benchmarks/bfs.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 + +import time, argparse +import numpy as np +import arkouda as ak +import random +import string + +TYPES = ('int64', 'float64', 'bool', 'str') + +def time_ak_bfs_graph(): + print("BFS graph") + lgNv=6 + Ne_per_v=3 + p=0.03 + perm=0 + Graph=ak.rmat_gen(lgNv, Ne_per_v, p, perm) + print("number of vertices ={}".format(Graph.n_vertices)) + print("number of edges ={}".format(Graph.n_edges)) + print("directed graph ={}".format(Graph.directed)) + print("source of edges ={}".format(Graph.src)) + print("dest of edges ={}".format(Graph.dst)) + print("start ={}".format(Graph.start)) + print("neighbour ={}".format(Graph.neighbour)) + print("vertices weight ={}".format(Graph.v_weight)) + print("edges weight ={}".format(Graph.e_weight)) + level,vertex = ak.graph_bfs(Graph,0) + print("level=".format(level)) + print("vertex=".format(vertex)) + + return + timings = [] + for _ in range(trials): + start = time.time() + c=ak.suffix_array(v) +def time_ak_rmat_graph(lgNv, Ne_per_v, p, perm): + print(">>> arkouda rmat graph") + cfg = ak.get_config() + Nv = cfg["numLocales"] + print("numLocales = {}".format(cfg["numLocales"])) + Graph = ak.rmat_gen(lgNv, Ne_per_v, p, perm) + print("number of vertices ={}".format(Graph.n_vertices)) + print("number of edges ={}".format(Graph.n_edges)) + print("directed graph ={}".format(Graph.directed)) + print("source of edges ={}".format(Graph.src)) + print("dest of edges ={}".format(Graph.dst)) + print("start ={}".format(Graph.start)) + print("neighbour ={}".format(Graph.neighbour)) + print("vertices weight ={}".format(Graph.v_weight)) + print("edges weight ={}".format(Graph.e_weight)) + return + timings = [] + for _ in range(trials): + start = time.time() + c=ak.suffix_array(v) + end = time.time() + timings.append(end - start) + tavg = sum(timings) / trials + + print("Average time = {:.4f} sec".format(tavg)) + if dtype == 'str': + offsets_transferred = 0 * c.offsets.size * c.offsets.itemsize + bytes_transferred = (c.bytes.size * c.offsets.itemsize) + (0 * c.bytes.size) + bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg + else: + print("Wrong data type") + print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) + + +def suffixArray(s): + suffixes = [(s[i:], i) for i in range(len(s))] + suffixes.sort(key=lambda x: x[0]) + sa= [s[1] for s in suffixes] + #sa.insert(0,len(sa)) + return sa + +def time_np_sa(vsize, strlen, trials, dtype): + s=''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(strlen)) + timings = [] + for _ in range(trials): + start = time.time() + sa=suffixArray(s) + end = time.time() + timings.append(end - start) + tavg = sum(timings) / trials + print("Average time = {:.4f} sec".format(tavg)) + if dtype == 'str': + offsets_transferred = 0 + bytes_transferred = len(s) + bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg + else: + print("Wrong data type") + print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) + +def check_correctness( vsize,strlen, trials, dtype): + Ni = strlen + Nv = vsize + + v = ak.random_strings_uniform(1, Ni, Nv) + c=ak.suffix_array(v) + for k in range(Nv): + s=v[k] + sa=suffixArray(s) + aksa=c[k] +# _,tmp=c[k].split(maxsplit=1) +# aksa=tmp.split() +# intaksa = [int(numeric_string) for numeric_string in aksa] +# intaksa = aksa[1:-1] +# print(sa) +# print(intaksa) + assert (sa==aksa) + + +def create_parser(): + parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= suffix_array(V)") + parser.add_argument('hostname', help='Hostname of arkouda server') + parser.add_argument('port', type=int, help='Port of arkouda server') + parser.add_argument('-v', '--logvertices', type=int, default=5, help='Problem size: log number of vertices') + parser.add_argument('-e', '--vedges', type=int, default=2,help='Number of edges per vertex') + parser.add_argument('-p', '--possibility', type=float, default=0.01,help='Possibility ') + parser.add_argument('-t', '--trials', type=int, default=6, help='Number of times to run the benchmark') + parser.add_argument('-m', '--perm', type=int, default=0 , help='if permutation ') + parser.add_argument('--numpy', default=False, action='store_true', help='Run the same operation in NumPy to compare performance.') + parser.add_argument('--correctness-only', default=False, action='store_true', help='Only check correctness, not performance.') + return parser + + + +if __name__ == "__main__": + import sys + parser = create_parser() + args = parser.parse_args() + ak.verbose = False + ak.connect(args.hostname, args.port) + + ''' + if args.correctness_only: + check_correctness(args.number, args.size, args.trials, args.dtype) + print("CORRECT") + sys.exit(0) + ''' + + time_ak_bfs_graph() + sys.exit(0) diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl index 6655c7727c..853511be92 100644 --- a/src/SegmentedArray.chpl +++ b/src/SegmentedArray.chpl @@ -1482,10 +1482,10 @@ module SegmentedArray { var startName : string; /* The starting index of every vertex in src and dst the ,name */ - var start_p: borrowed SymEntry(int); + var start_i: borrowed SymEntry(int); - /* The current vertex id v's (v0) { + SetNextF.clear(); + forall i in SetCurF { + var numNF=-1 :int; + if (ag.start_i.a[i] <0 ){ + numNF=0; + } else { + if (iab; - fillRandom(tmpvar); - dst_bit=tmpvar>(c_norm * src_bit + a_norm * (~ src_bit)); - src = src + ((2**(ib-1)) * src_bit); - dst = dst + ((2**(ib-1)) * dst_bit); + } + if ((ag.start_i.a[i] >0) && (numNF>0)) { + var NF=ag.dst.a[ag.start_i.a[i]..ag.start_i.a[i]+numNF-1]; + forall j in NF { + if (depth[j]==-1) { + depth[j]=cur_level+1; + SetNextF.add(j); + } + } + } + }//end forall i + cur_level+=1; + numCurF=SetNextF.size; + SetCurF=SetNextF; } - src=src+(src==dst); - // maybe: remove edges which are self-loops??? - var iv = radixSortLSD_ranks(src); - // permute into sorted order - src = src[iv]; //# permute first vertex into sorted order - dst = dst[iv]; //# permute second vertex into sorted order - //# to premute/rename vertices - var startpos=0, endpos:int; - var sort=0:int; - while (startpos < Ne-2) { - endpos=startpos+1; - while (endpos Date: Mon, 25 Jan 2021 12:00:39 -0500 Subject: [PATCH 58/68] add bfs.py, remove bugs --- Makefile | 5 +- arkouda/graph.py | 419 ++++++++++++++++++++++++++- arkouda/pdarraycreation.py | 89 ++++-- benchmarks/bfs.py | 129 +++------ benchmarks/rmatgen.py | 53 +--- src/SegmentedArray.chpl | 303 ++++++++++++++++++++ src/SegmentedMsg.chpl | 560 ++++++++++++++++++++++++++++++++----- 7 files changed, 1326 insertions(+), 232 deletions(-) diff --git a/Makefile b/Makefile index a1d98e737e..7187369b3d 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,9 @@ default: $(DEFAULT_TARGET) VERBOSE ?= 0 +define ARKOUDA_QUICK_COMPILE +CHPL_FLAGS += --no-checks --no-loop-invariant-code-motion --no-fast-followers --ccflags="-O0" +endef CHPL := chpl CHPL_DEBUG_FLAGS += --print-passes ifdef ARKOUDA_DEVELOPER @@ -19,7 +22,7 @@ CHPL_FLAGS += --ccflags="-O1" else ifdef ARKOUDA_QUICK_COMPILE CHPL_FLAGS += --no-checks --no-loop-invariant-code-motion --no-fast-followers --ccflags="-O0" else -CHPL_FLAGS += --fast +#CHPL_FLAGS += --fast endif CHPL_FLAGS += -smemTrack=true CHPL_FLAGS += -lhdf5 -lhdf5_hl -lzmq diff --git a/arkouda/graph.py b/arkouda/graph.py index ec02e70e6b..3cfd9c7da0 100755 --- a/arkouda/graph.py +++ b/arkouda/graph.py @@ -12,7 +12,7 @@ translate_np_dtype import json -__all__ = ['Graph'] +__all__ = ['Graph','GraphD','GraphDW','GraphUD','GraphUDW'] class Vertex: """ @@ -120,6 +120,414 @@ def __repr__(self) -> str: return "{}".format(self.__str__()) +class GraphD: + """ + This is an array based graph representation. The graph data resides on the + arkouda server. The user should not call this class directly; + rather its instances are created by other arkouda functions. + + Attributes + ---------- + n_vertices : int + The starting indices for each string + n_edges : int + The starting indices for each string + directed : int + The graph is directed (True) or undirected (False) + weighted : int + The graph is weighted (True) or not + src : pdarray + The source of every edge in the graph + dst : pdarray + The destination of every vertex in the graph + start_i : pdarray + The starting index of all the vertices in src + neighbour : pdarray + The number of current vertex id v's (v None: + """ + Initializes the Graph instance by setting all instance + attributes, some of which are derived from the array parameters. + + Parameters + ---------- + n_vertices : must provide args[0] + n_edges : must provide args[1] + directed : must provide args[2] + weighted : must provide args[3] + src,dst : optional if no weighted args[4] args[5] + start_i, neighbour : optional if no src and dst args[6] args[7] + v_weight : optional if no neighbour args[8] + e_weight : optional if no v_weight args[9] + + + Returns + ------- + None + + Raises + ------ + RuntimeError + Raised if there's an error converting a Numpy array or standard + Python array to either the offset_attrib or bytes_attrib + ValueError + Raised if there's an error in generating instance attributes + from either the offset_attrib or bytes_attrib parameter + """ + try: + if len(args) < 4: + raise ValueError + self.n_vertices=cast(int,args[0]) + self.n_edges=cast(int,args[1]) + self.directed=cast(int,args[2]) + self.weighted=cast(int,args[3]) + if len(args) == 7: + raise ValueError + if len(args) > 7: + if (isinstance(args[7],pdarray) and isinstance(args[6],pdarray)) : + self.start_i=args[6] + self.neighbour=args[7] + else: + try: + self.start_i = create_pdarray(args[6]) + self.neighbour = create_pdarray(args[7]) + except Exception as e: + raise RuntimeError(e) + if len(args) == 5: + raise ValueError + if len(args) > 5: + if (isinstance(args[5],pdarray) and isinstance(args[4],pdarray)) : + self.src=args[4] + self.dst=args[5] + else: + try: + self.src = create_pdarray(args[4]) + self.dst = create_pdarray(args[5]) + except Exception as e: + raise RuntimeError(e) + except Exception as e: + raise RuntimeError(e) + self.dtype = akint + self.logger = getArkoudaLogger(name=__class__.__name__) # type: ignore + + def __iter__(self): + raise NotImplementedError('Graph does not support iteration') + + def __size__(self) -> int: + return self.n_vertices + + ''' + def add_vertice(self, x: Vertice)->None : + print() + + def remove_vertice(self, x: int) ->None: + print() + + def neighbours(self, x: int)->pdarray : + return self.neighbour[i] + + def adjacent(self, x: int, y:int )->pdarray : + neighbours(self,x) + neighbours(self,y) + + def get_vertice_value(self, x: int) -> Vertice: + print() + + def set_vertice_value(self, x: int, v: Vertice) : + print() + + def add_edge(self, x: int, y: int) : + print() + + def remove_edge(self, x: int, y: int) : + print() + ''' + +class GraphDW(GraphD): + """ + This is an array based graph representation. The graph data resides on the + arkouda server. The user should not call this class directly; + rather its instances are created by other arkouda functions. + + Attributes + ---------- + n_vertices : int + The starting indices for each string + n_edges : int + The starting indices for each string + directed : int + The graph is directed (True) or undirected (False) + weighted : int + The graph is weighted (True) or not + src : pdarray + The source of every edge in the graph + dst : pdarray + The destination of every vertex in the graph + start_i : pdarray + The starting index of all the vertices in src + neighbour : pdarray + The number of current vertex id v's (v None: + """ + Initializes the Graph instance by setting all instance + attributes, some of which are derived from the array parameters. + + Parameters + ---------- + n_vertices : must provide args[0] + n_edges : must provide args[1] + directed : must provide args[2] + weighted : must provide args[3] + src,dst : optional if no weighted args[4] args[5] + start_i, neighbour : optional if no src and dst args[6] args[7] + v_weight : optional if no neighbour args[8] + e_weight : optional if no v_weight args[9] + + + Returns + ------- + None + + Raises + ------ + RuntimeError + Raised if there's an error converting a Numpy array or standard + Python array to either the offset_attrib or bytes_attrib + ValueError + Raised if there's an error in generating instance attributes + from either the offset_attrib or bytes_attrib parameter + """ + super().__init__(*args) + try: + if len(args) > 9: + if isinstance(args[9],pdarray): + self.e_weight=args[9] + else: + try: + self.e_weight = create_pdarray(args[9]) + except Exception as e: + raise RuntimeError(e) + if len(args) > 8: + if isinstance(args[8],pdarray): + self.v_weight=args[8] + else: + try: + self.v_weight = create_pdarray(args[8]) + except Exception as e: + raise RuntimeError(e) + except Exception as e: + raise RuntimeError(e) + + + +class GraphUD(GraphD): + """ + This is an array based graph representation. The graph data resides on the + arkouda server. The user should not call this class directly; + rather its instances are created by other arkouda functions. + + Attributes + ---------- + n_vertices : int + The starting indices for each string + n_edges : int + The starting indices for each string + directed : int + The graph is directed (True) or undirected (False) + weighted : int + The graph is weighted (True) or not + src : pdarray + The source of every edge in the graph + dst : pdarray + The destination of every vertex in the graph + start_i : pdarray + The starting index of all the vertices in src + neighbour : pdarray + The number of current vertex id v's (v None: + """ + Initializes the Graph instance by setting all instance + attributes, some of which are derived from the array parameters. + + Parameters + ---------- + n_vertices : must provide args[0] + n_edges : must provide args[1] + directed : must provide args[2] + weighted : must provide args[3] + src,dst : optional if no weighted args[4] args[5] + start_i, neighbour : optional if no src and dst args[6] args[7] + srcR,dstR : optional if no neighbour args[8] args[9] + start_iR, neighbourR : optional if no dstR args[10] args[11] + v_weight : optional if no neighbouirR args[12] + e_weight : optional if no v_weight args[13] + + Returns + ------- + None + + Raises + ------ + RuntimeError + ValueError + """ + super().__init__(*args) + try: + if len(args) > 11: + if (isinstance(args[11],pdarray) and isinstance(args[10],pdarray)) : + self.start_iR=args[10] + self.neighbourR=args[11] + else: + try: + self.start_iR = create_pdarray(args[10]) + self.neighbourR = create_pdarray(args[11]) + except Exception as e: + raise RuntimeError(e) + if len(args) > 9: + if (isinstance(args[9],pdarray) and isinstance(args[8],pdarray)) : + self.srcR=args[8] + self.dstR=args[9] + else: + try: + self.srcR = create_pdarray(args[8]) + self.dstR = create_pdarray(args[9]) + except Exception as e: + raise RuntimeError(e) + except Exception as e: + raise RuntimeError(e) + + +class GraphUDW(GraphUD): + """ + This is an array based graph representation. The graph data resides on the + arkouda server. The user should not call this class directly; + rather its instances are created by other arkouda functions. + + Attributes + ---------- + n_vertices : int + The starting indices for each string + n_edges : int + The starting indices for each string + directed : int + The graph is directed (True) or undirected (False) + src : pdarray + The source of every edge in the graph + dst : pdarray + The destination of every vertex in the graph + start_i : pdarray + The starting index of all the vertices in src + neighbour : pdarray + The number of current vertex id v's (v None: + """ + Initializes the Graph instance by setting all instance + attributes, some of which are derived from the array parameters. + + Parameters + ---------- + n_vertices : must provide args[0] + n_edges : must provide args[1] + directed : must provide args[2] + weighted : must provide args[3] + src,dst : optional if no weighted args[4] args[5] + start_i, neighbour : optional if no src and dst args[6] args[7] + srcR,dstR : optional if no neighbour args[8] args[9] + start_iR, neighbourR : optional if no dstR args[10] args[11] + v_weight : optional if no neighbouirR args[12] + e_weight : optional if no v_weight args[13] + + + Returns + ------- + None + + Raises + ------ + RuntimeError + ValueError + """ + super().__init__(*args) + try: + if len(args) > 13: + if isinstance(args[13],pdarray): + self.e_weight=args[13] + else: + try: + self.e_weight = create_pdarray(args[13]) + except Exception as e: + raise RuntimeError(e) + if len(args) > 12: + if isinstance(args[12],pdarray): + self.v_weight=args[12] + else: + try: + self.v_weight = create_pdarray(args[12]) + except Exception as e: + raise RuntimeError(e) + except Exception as e: + raise RuntimeError(e) + + class Graph: """ This is an array based graph representation. The graph data resides on the @@ -132,7 +540,7 @@ class Graph: The starting indices for each string n_edges : int The starting indices for each string - directed : bool + directed : int The graph is directed (True) or undirected (False) src : pdarray The source of every edge in the graph @@ -213,7 +621,7 @@ def __init__(self, *args) -> None: self.neighbour=args[6] else: try: - self.start = create_pdarray(args[5]) + self.start_i = create_pdarray(args[5]) self.neighbour = create_pdarray(args[6]) except Exception as e: raise RuntimeError(e) @@ -230,7 +638,7 @@ def __init__(self, *args) -> None: except Exception as e: raise RuntimeError(e) if len(args) > 2: - self.directed=cast(bool,args[2]) + self.directed=cast(int,args[2]) except Exception as e: raise RuntimeError(e) self.dtype = akint @@ -268,3 +676,6 @@ def add_edge(self, x: int, y: int) : def remove_edge(self, x: int, y: int) : print() ''' + + + diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 50a36b5a04..f31d9e7a09 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -9,7 +9,7 @@ from arkouda.dtypes import dtype as akdtype from arkouda.pdarrayclass import pdarray, create_pdarray from arkouda.strings import Strings, SArrays -from arkouda.graph import Graph +from arkouda.graph import GraphD, GraphDW,GraphUD,GraphUDW __all__ = ["array", "zeros", "ones", "zeros_like", "ones_like", "arange", "linspace", "randint", "uniform", "standard_normal", @@ -945,7 +945,7 @@ def suffix_array_file(filename: str) -> tuple: @typechecked -def graph_file(filename: str) -> Graph: +def graph_file(filename: str) -> Union[GraphD,GraphUD,GraphDW,GraphUDW]: """ This function is major for creating a graph from a file Returns @@ -968,7 +968,8 @@ def graph_file(filename: str) -> Graph: return Graph(*(cast(str,repMsg).split('+'))) @typechecked -def rmat_gen (lgNv:int, Ne_per_v:int, p:float, perm: int) -> Graph: +def rmat_gen (lgNv:int, Ne_per_v:int, p:float, directed: int,weighted:int) ->\ + Union[GraphD,GraphUD,GraphDW,GraphUDW]: """ This function is for creating a graph using rmat graph generator Returns @@ -986,12 +987,21 @@ def rmat_gen (lgNv:int, Ne_per_v:int, p:float, perm: int) -> Graph: ------ RuntimeError """ - msg = "segmentedRMAT {} {} {} {}".format(lgNv, Ne_per_v, p, perm) + msg = "segmentedRMAT {} {} {} {} {}".format(lgNv, Ne_per_v, p, directed, weighted) repMsg = generic_msg(msg) - return Graph(*(cast(str,repMsg).split('+'))) + if (directed!=0) : + if (weighted!=0) : + return GraphDW(*(cast(str,repMsg).split('+'))) + else: + return GraphD(*(cast(str,repMsg).split('+'))) + else: + if (weighted!=0) : + return GraphUDW(*(cast(str,repMsg).split('+'))) + else: + return GraphUD(*(cast(str,repMsg).split('+'))) @typechecked -def graph_bfs (graph: Graph, root: int ) -> tuple: +def graph_bfs (graph: Union[GraphD,GraphDW,GraphUD,GraphUDW], root: int ) -> tuple: """ This function is generating the breadth-first search vertices sequences in given graph starting from the given root vertex @@ -1010,21 +1020,60 @@ def graph_bfs (graph: Graph, root: int ) -> tuple: ------ RuntimeError """ - msg = "segmentedGraphBFS {} {} {} {} {} {} {} {}".format(graph.n_vertices,graph.n_edges,\ - graph.directed,graph.src.name,graph.dst.name,\ - graph.start.name,graph.neighbour.name,\ - graph.v_weight.name,graph.e_weight.name,root) + #if (cast(int,graph.directed)!=0) : + if (int(graph.directed)>0) : + if (int(graph.weighted)==0): + # directed unweighted GraphD + msg = "segmentedGraphBFS {} {} {} {} {} {} {} {} {}".format( + graph.n_vertices,graph.n_edges,\ + graph.directed,graph.weighted,\ + graph.src.name,graph.dst.name,\ + graph.start_i.name,graph.neighbour.name,\ + root) + else: + # directed weighted GraphDW + msg = "segmentedGraphBFS {} {} {} {} {} {} {} {} {} {} {}".format( + graph.n_vertices,graph.n_edges,\ + graph.directed,graph.weighted,\ + graph.src.name,graph.dst.name,\ + graph.start_i.name,graph.neighbour.name,\ + graph.v_weight.name,graph.e_weight.name,\ + root) + else: + if (int(graph.weighted)==0): + # undirected unweighted GraphUD + msg = "segmentedGraphBFS {} {} {} {} {} {} {} {} {} {} {} {} {}".format( + graph.n_vertices,graph.n_edges,\ + graph.directed,graph.weighted,\ + graph.src.name,graph.dst.name,\ + graph.start_i.name,graph.neighbour.name,\ + graph.srcR.name,graph.dstR.name,\ + graph.start_iR.name,graph.neighbourR.name,\ + root) + else: + # undirected weighted GraphUDW 15 + msg = "segmentedGraphBFS {} {} {} {} {} {} {} {} {} {} {} {} {} {} {}".format( + graph.n_vertices,graph.n_edges,\ + graph.directed,graph.weighted,\ + graph.src.name,graph.dst.name,\ + graph.start_i.name,graph.neighbour.name,\ + graph.srcR.name,graph.dstR.name,\ + graph.start_iR.name,graph.neighbourR.name,\ + graph.v_weight.name,graph.e_weight.name,\ + root) + repMsg = generic_msg(msg) tmpmsg=cast(str,repMsg).split('+') levelstr=tmpmsg[0:1] vertexstr=tmpmsg[1:2] - levelary=create_pdarray(cast(str,levelstr)) - vertexary=create_pdarray(cast(str,vertexstr)) - return levelary,vertexary + levelary=create_pdarray(*(cast(str,levelstr)) ) + + vertexary=create_pdarray(*(cast(str,vertexstr)) ) + return (levelary,vertexary) @typechecked -def graph_dfs (graph: Graph, root: int ) -> pdarray: +def graph_dfs (graph: Union[GraphD,GraphUD,GraphDW,GraphUDW], root: int ) -> tuple: """ This function is generating the depth-first search vertices sequences in given graph starting from the given root vertex @@ -1045,13 +1094,19 @@ def graph_dfs (graph: Graph, root: int ) -> pdarray: """ msg = "segmentedGraphDFS {} {} {} {} {} {} {} {}".format(graph.n_vertices,graph.n_edges,\ graph.directed,graph.src.name,graph.dst.name,\ - graph.start.name,graph.neighbour.name,root) + graph.start_i.name,graph.neighbour.name,root) repMsg = generic_msg(msg) - return Graph(*(cast(str,repMsg).split('+'))) + tmpmsg=cast(str,repMsg).split('+') + levelstr=tmpmsg[0:1] + vertexstr=tmpmsg[1:2] + levelary=create_pdarray(*(cast(str,levelstr)) ) + + vertexary=create_pdarray(*(cast(str,vertexstr)) ) + return (levelary,vertexary) @typechecked -def components (graph: Graph ) -> int : +def components (graph: Union[GraphD,GraphUD,GraphDW,GraphUDW] ) -> int : """ This function returns the number of components of the given graph Returns diff --git a/benchmarks/bfs.py b/benchmarks/bfs.py index e7f414a92b..31d7e88977 100755 --- a/benchmarks/bfs.py +++ b/benchmarks/bfs.py @@ -8,107 +8,66 @@ TYPES = ('int64', 'float64', 'bool', 'str') -def time_ak_bfs_graph(): - print("BFS graph") - lgNv=6 +def time_ak_bfs_graph(trials:int): + print("Graph BFS") + lgNv=10 Ne_per_v=3 p=0.03 - perm=0 - Graph=ak.rmat_gen(lgNv, Ne_per_v, p, perm) - print("number of vertices ={}".format(Graph.n_vertices)) - print("number of edges ={}".format(Graph.n_edges)) - print("directed graph ={}".format(Graph.directed)) - print("source of edges ={}".format(Graph.src)) - print("dest of edges ={}".format(Graph.dst)) - print("start ={}".format(Graph.start)) - print("neighbour ={}".format(Graph.neighbour)) - print("vertices weight ={}".format(Graph.v_weight)) - print("edges weight ={}".format(Graph.e_weight)) - level,vertex = ak.graph_bfs(Graph,0) - print("level=".format(level)) - print("vertex=".format(vertex)) - - return - timings = [] - for _ in range(trials): - start = time.time() - c=ak.suffix_array(v) -def time_ak_rmat_graph(lgNv, Ne_per_v, p, perm): - print(">>> arkouda rmat graph") - cfg = ak.get_config() - Nv = cfg["numLocales"] - print("numLocales = {}".format(cfg["numLocales"])) - Graph = ak.rmat_gen(lgNv, Ne_per_v, p, perm) + directed=0 + weighted=1 + Graph=ak.rmat_gen(lgNv, Ne_per_v, p, directed, weighted) + ''' print("number of vertices ={}".format(Graph.n_vertices)) print("number of edges ={}".format(Graph.n_edges)) print("directed graph ={}".format(Graph.directed)) + print("weighted graph ={}".format(Graph.weighted)) print("source of edges ={}".format(Graph.src)) + print("R dest of edges ={}".format(Graph.dstR)) print("dest of edges ={}".format(Graph.dst)) - print("start ={}".format(Graph.start)) - print("neighbour ={}".format(Graph.neighbour)) + print("R source of edges ={}".format(Graph.srcR)) + print("start ={}".format(Graph.start_i)) + print("R start ={}".format(Graph.start_iR)) + print(" neighbour ={}".format(Graph.neighbour)) + print("R neighbour ={}".format(Graph.neighbourR)) print("vertices weight ={}".format(Graph.v_weight)) print("edges weight ={}".format(Graph.e_weight)) - return - timings = [] - for _ in range(trials): - start = time.time() - c=ak.suffix_array(v) - end = time.time() - timings.append(end - start) - tavg = sum(timings) / trials - - print("Average time = {:.4f} sec".format(tavg)) - if dtype == 'str': - offsets_transferred = 0 * c.offsets.size * c.offsets.itemsize - bytes_transferred = (c.bytes.size * c.offsets.itemsize) + (0 * c.bytes.size) - bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg - else: - print("Wrong data type") - print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) - - -def suffixArray(s): - suffixes = [(s[i:], i) for i in range(len(s))] - suffixes.sort(key=lambda x: x[0]) - sa= [s[1] for s in suffixes] - #sa.insert(0,len(sa)) - return sa + ''' + ll,ver = ak.graph_bfs(Graph,4) + old=-2; + visit=[] + for i in range(int(Graph.n_vertices)): + cur=ll[i] + if (int(cur)!=int(old)): + if len(visit) >0: + print(visit) + print("current level=",cur,"the vertices at this level are") + old=cur + visit=[] + visit.append(ver[i]) + print(visit) + + ''' + print("total edges are as follows") + for i in range(int(Graph.n_edges)): + print("<",Graph.src[i]," -- ", Graph.dst[i],">") -def time_np_sa(vsize, strlen, trials, dtype): - s=''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(strlen)) + print("total reverse edges are as follows") + for i in range(int(Graph.n_edges)): + print("<",Graph.srcR[i]," -- ", Graph.dstR[i],">") + ''' timings = [] for _ in range(trials): start = time.time() - sa=suffixArray(s) + level,nodes = ak.graph_bfs(Graph,0) end = time.time() timings.append(end - start) tavg = sum(timings) / trials print("Average time = {:.4f} sec".format(tavg)) - if dtype == 'str': - offsets_transferred = 0 - bytes_transferred = len(s) - bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg - else: - print("Wrong data type") - print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) - -def check_correctness( vsize,strlen, trials, dtype): - Ni = strlen - Nv = vsize - - v = ak.random_strings_uniform(1, Ni, Nv) - c=ak.suffix_array(v) - for k in range(Nv): - s=v[k] - sa=suffixArray(s) - aksa=c[k] -# _,tmp=c[k].split(maxsplit=1) -# aksa=tmp.split() -# intaksa = [int(numeric_string) for numeric_string in aksa] -# intaksa = aksa[1:-1] -# print(sa) -# print(intaksa) - assert (sa==aksa) + print("Average Edges = {:.4f} M/s".format(int(Graph.n_edges)/tavg/1024/1024)) + print("Average Vertices = {:.4f} M/s".format(int(Graph.n_vertices)/tavg/1024/1024)) + ''' + #print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) + ''' def create_parser(): @@ -140,5 +99,5 @@ def create_parser(): sys.exit(0) ''' - time_ak_bfs_graph() + time_ak_bfs_graph(args.trials) sys.exit(0) diff --git a/benchmarks/rmatgen.py b/benchmarks/rmatgen.py index 7f858b7c09..3504635074 100755 --- a/benchmarks/rmatgen.py +++ b/benchmarks/rmatgen.py @@ -23,68 +23,17 @@ def time_ak_rmat_graph(lgNv, Ne_per_v, p, perm): print("neighbour ={}".format(Graph.neighbour)) print("vertices weight ={}".format(Graph.v_weight)) print("edges weight ={}".format(Graph.e_weight)) - return timings = [] for _ in range(trials): start = time.time() - c=ak.suffix_array(v) + Graph = ak.rmat_gen(lgNv, Ne_per_v, p, perm) end = time.time() timings.append(end - start) tavg = sum(timings) / trials print("Average time = {:.4f} sec".format(tavg)) - if dtype == 'str': - offsets_transferred = 0 * c.offsets.size * c.offsets.itemsize - bytes_transferred = (c.bytes.size * c.offsets.itemsize) + (0 * c.bytes.size) - bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg - else: - print("Wrong data type") - print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) -def suffixArray(s): - suffixes = [(s[i:], i) for i in range(len(s))] - suffixes.sort(key=lambda x: x[0]) - sa= [s[1] for s in suffixes] - #sa.insert(0,len(sa)) - return sa - -def time_np_sa(vsize, strlen, trials, dtype): - s=''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(strlen)) - timings = [] - for _ in range(trials): - start = time.time() - sa=suffixArray(s) - end = time.time() - timings.append(end - start) - tavg = sum(timings) / trials - print("Average time = {:.4f} sec".format(tavg)) - if dtype == 'str': - offsets_transferred = 0 - bytes_transferred = len(s) - bytes_per_sec = (offsets_transferred + bytes_transferred) / tavg - else: - print("Wrong data type") - print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) - -def check_correctness( vsize,strlen, trials, dtype): - Ni = strlen - Nv = vsize - - v = ak.random_strings_uniform(1, Ni, Nv) - c=ak.suffix_array(v) - for k in range(Nv): - s=v[k] - sa=suffixArray(s) - aksa=c[k] -# _,tmp=c[k].split(maxsplit=1) -# aksa=tmp.split() -# intaksa = [int(numeric_string) for numeric_string in aksa] -# intaksa = aksa[1:-1] -# print(sa) -# print(intaksa) - assert (sa==aksa) - def create_parser(): parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= suffix_array(V)") diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl index 853511be92..a710835b7e 100644 --- a/src/SegmentedArray.chpl +++ b/src/SegmentedArray.chpl @@ -1565,6 +1565,309 @@ module SegmentedArray { + + /** + * We use several arrays and intgers to represent a basic directed graph + * Instances are ephemeral, not stored in the symbol table. Instead, attributes + * of this class refer to symbol table entries that persist. This class is a + * convenience for bundling those persistent objects and defining graph-relevant + * operations. + * Now we copy from SegSArray, we need change more in the future to fit a graph + */ + class SegGraphD { + + /* The starting indices for each string*/ + var n_vertices : int; + + /* The starting indices for each string*/ + var n_edges : int; + + /* The graph is directed (True) or undirected (False)*/ + var directed=1 : int; + + /* The graph is directed (True) or undirected (False)*/ + var weighted=0 : int; + + /* The source of every edge in the graph, name */ + var srcName : string; + + /* The source of every edge in the graph,array value */ + var src: borrowed SymEntry(int); + + /* The destination of every vertex in the graph,name */ + var dstName : string; + + /* The destination of every vertex in the graph,array value */ + var dst: borrowed SymEntry(int); + + + /* The starting index of every vertex in src and dst the ,name */ + var startName : string; + + /* The starting index of every vertex in src and dst the ,name */ + var start_i: borrowed SymEntry(int); + + /* The number of current vertex id v's (v0) { SetNextF.clear(); forall i in SetCurF { var numNF=-1 :int; - if (ag.start_i.a[i] <0 ){ - numNF=0; - } else { - if (i0) { + forall j in NF { + //writeln("current node ",i, " has neibours ",NF); + //for j in NF { + if (depth[j]==-1) { + depth[j]=cur_level+1; + SetNextF.add(j); + //writeln("add ", j, " into level ", cur_level+1); + } + } + } + }//end forall i + cur_level+=1; + //writeln("SetCurF= ", SetCurF, "SetNextF=", SetNextF, " level ", cur_level+1); + numCurF=SetNextF.size; + SetCurF=SetNextF; + } + var vertexValue = radixSortLSD_ranks(depth); + var levelValue=depth[vertexValue]; + + var levelName = st.nextName(); + var vertexName = st.nextName(); + var levelEntry = new shared SymEntry(levelValue); + var vertexEntry = new shared SymEntry(vertexValue); + st.addEntry(levelName, levelEntry); + st.addEntry(vertexName, vertexEntry); + repMsg = 'created ' + st.attrib(levelName) + '+created ' + st.attrib(vertexName) ; + + smLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); + return repMsg; + } + + + + + + + + + proc BFS_UD(Nv:int , Ne:int ,Directed:int ,Weighted:int,restpart:string ,st: borrowed SymTab): string throws { + var pn = Reflection.getRoutineName(); + var repMsg: string; + + + var srcN, dstN, startN, neighbourN, rootN :string; + var srcRN, dstRN, startRN, neighbourRN:string; + + (srcN, dstN, startN, neighbourN,srcRN, dstRN, startRN, neighbourRN, rootN )= + restpart.splitMsgToTuple(9); + var ag = new owned SegGraphUD(Nv,Ne,Directed,Weighted, + srcN,dstN, startN,neighbourN, + srcRN,dstRN, startRN,neighbourRN, + st); + + var root=rootN:int; + var depth=-1: [0..Nv-1] int; + depth[root]=0; + var cur_level=0; + var SetCurF: domain(int); + var SetNextF: domain(int); + SetCurF.add(root); + var numCurF=1:int; + + //writeln("========================BSF_UD=================================="); + while (numCurF>0) { + SetNextF.clear(); + forall i in SetCurF { + var numNF=-1 :int; + ref nf=ag.neighbour.a; + ref sf=ag.start_i.a; + ref df=ag.dst.a; + numNF=nf[i]; + ref NF=df[sf[i]..sf[i]+numNF-1]; + if (numNF>0) { + forall j in NF { + //writeln("current node ",i, " has neibours ",NF); + //for j in NF { + if (depth[j]==-1) { + depth[j]=cur_level+1; + SetNextF.add(j); + //writeln("current node ",i, " add ", j, + // " into level ", cur_level+1, " SetNextF=", SetNextF); + } + } + } + // reverse direction + if (Directed!=1) { + + var numNFR=-1 :int; + ref nfR=ag.neighbourR.a; + ref sfR=ag.start_iR.a; + ref dfR=ag.dstR.a; + numNFR=nfR[i]; + ref NFR=dfR[sfR[i]..sfR[i]+numNFR-1]; + if (numNFR>0) { + //writeln("current node ",i, " has reverse neibours ",NFR); + forall j in NFR { + //for j in NFR { + if (depth[j]==-1) { + depth[j]=cur_level+1; + SetNextF.add(j); + //writeln("current node ",i, " add reverse ", j, + // " into level ", cur_level+1, " SetNextF=", SetNextF); + } + } + } } - if ((ag.start_i.a[i] >0) && (numNF>0)) { - var NF=ag.dst.a[ag.start_i.a[i]..ag.start_i.a[i]+numNF-1]; + + + }//end forall i + cur_level+=1; + //writeln("SetCurF= ", SetCurF, "SetNextF=", SetNextF, " level ", cur_level+1); + numCurF=SetNextF.size; + SetCurF=SetNextF; + } + var vertexValue = radixSortLSD_ranks(depth); + var levelValue=depth[vertexValue]; + + var levelName = st.nextName(); + var vertexName = st.nextName(); + var levelEntry = new shared SymEntry(levelValue); + var vertexEntry = new shared SymEntry(vertexValue); + st.addEntry(levelName, levelEntry); + st.addEntry(vertexName, vertexEntry); + repMsg = 'created ' + st.attrib(levelName) + '+created ' + st.attrib(vertexName) ; + + smLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); + return repMsg; + } + + + + + + + proc BFS_UDW(Nv:int , Ne:int ,Directed:int ,Weighted:int,restpart:string ,st: borrowed SymTab): string throws { + var pn = Reflection.getRoutineName(); + var repMsg: string; + + + var srcN, dstN, startN, neighbourN,vweightN,eweightN, rootN :string; + var srcRN, dstRN, startRN, neighbourRN:string; + + (srcN, dstN, startN, neighbourN,srcRN, dstRN, startRN, neighbourRN,vweightN,eweightN, rootN )= + restpart.splitMsgToTuple(11); + var ag = new owned SegGraphUDW(Nv,Ne,Directed,Weighted, + srcN,dstN, startN,neighbourN, + srcRN,dstRN, startRN,neighbourRN, + vweightN,eweightN, st); + + //writeln("========================BSF_UDW=================================="); + var root=rootN:int; + var depth=-1: [0..Nv-1] int; + depth[root]=0; + var cur_level=0; + var SetCurF: domain(int); + var SetNextF: domain(int); + SetCurF.add(root); + var numCurF=1:int; + + /* + writeln("Fisrt Check if the values are correct"); + writeln("src="); + writeln(ag.src.a); + writeln("dst="); + writeln(ag.dst.a); + writeln("neighbours="); + writeln(ag.neighbour.a); + writeln("start="); + writeln(ag.start_i.a); + + writeln("srcR="); + writeln(ag.srcR.a); + writeln("dstR="); + writeln(ag.dstR.a); + writeln("neighbours="); + writeln(ag.neighbourR.a); + writeln("startR="); + writeln(ag.start_iR.a); + + for i in 0..ag.n_vertices-1 do { + writeln("node ",i, " has ", ag.neighbour.a[i], " neighbours", + " start=",ag.start_i.a[i], " they are ", + ag.dst.a[ag.start_i.a[i]..ag.start_i.a[i]-1+ag.neighbour.a[i]]); + } + for i in 0..ag.n_vertices-1 do { + writeln("reverse node ",i, " has ", ag.neighbourR.a[i], " neighbours", + " start=",ag.start_iR.a[i], " they are ", + ag.dstR.a[ag.start_iR.a[i]..ag.start_iR.a[i]-1+ag.neighbourR.a[i]]); + } + */ + + while (numCurF>0) { + //writeln("start loop SetCurF=", SetCurF); + SetNextF.clear(); + forall i in SetCurF { + var numNF=-1 :int; + ref nf=ag.neighbour.a; + ref sf=ag.start_i.a; + ref df=ag.dst.a; + numNF=nf[i]; + ref NF=df[sf[i]..sf[i]+numNF-1]; + //writeln("current node ",i, " has ", numNF, " neighbours and they are ",NF); + if (numNF>0) { forall j in NF { + //for j in NF { + //writeln("current node ",i, " check neibour ",j, " its depth=",depth[j]); if (depth[j]==-1) { depth[j]=cur_level+1; SetNextF.add(j); + //writeln("current node ",i, " add ", j, " into level ", cur_level+1, " SetNextF=", SetNextF); } } } + // reverse direction + if (Directed!=1) { + var numNFR=-1 :int; + ref nfR=ag.neighbourR.a; + ref sfR=ag.start_iR.a; + ref dfR=ag.dstR.a; + numNFR=nfR[i]; + ref NFR=dfR[sfR[i]..sfR[i]+numNFR-1]; + //writeln("current node ",i, " has ", numNFR ," reverse neighbours and they are ",NFR); + if ( numNFR>0) { + forall j in NFR { + //for j in NFR { + //writeln("current node ",i, " check neibour ",j, " its depth=",depth[j]); + if (depth[j]==-1) { + depth[j]=cur_level+1; + SetNextF.add(j); + //writeln("current node ",i, " add reverse ", j, + // " into level ", cur_level+1, " SetNextF=", SetNextF); + } + } + } + } + + }//end forall i cur_level+=1; + //writeln("SetCurF= ", SetCurF, "SetNextF=", SetNextF, " level ", cur_level+1); numCurF=SetNextF.size; SetCurF=SetNextF; } @@ -1431,4 +1843,6 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string + + } From 73a5bcfbb2e78b2f3800bb06e5f715ace5ebc0ea Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Mon, 25 Jan 2021 14:06:24 -0500 Subject: [PATCH 59/68] check comipling error --- Makefile | 8 ++++---- benchmarks/bfs.py | 4 ++-- src/SegmentedMsg.chpl | 36 ++++-------------------------------- 3 files changed, 10 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index 7187369b3d..74c9804d54 100644 --- a/Makefile +++ b/Makefile @@ -12,9 +12,9 @@ default: $(DEFAULT_TARGET) VERBOSE ?= 0 -define ARKOUDA_QUICK_COMPILE -CHPL_FLAGS += --no-checks --no-loop-invariant-code-motion --no-fast-followers --ccflags="-O0" -endef +#define ARKOUDA_QUICK_COMPILE +#CHPL_FLAGS += --no-checks --no-loop-invariant-code-motion --no-fast-followers --ccflags="-O0" +#endef CHPL := chpl CHPL_DEBUG_FLAGS += --print-passes ifdef ARKOUDA_DEVELOPER @@ -22,7 +22,7 @@ CHPL_FLAGS += --ccflags="-O1" else ifdef ARKOUDA_QUICK_COMPILE CHPL_FLAGS += --no-checks --no-loop-invariant-code-motion --no-fast-followers --ccflags="-O0" else -#CHPL_FLAGS += --fast +CHPL_FLAGS += --fast endif CHPL_FLAGS += -smemTrack=true CHPL_FLAGS += -lhdf5 -lhdf5_hl -lzmq diff --git a/benchmarks/bfs.py b/benchmarks/bfs.py index 31d7e88977..f9e842fb66 100755 --- a/benchmarks/bfs.py +++ b/benchmarks/bfs.py @@ -10,10 +10,10 @@ def time_ak_bfs_graph(trials:int): print("Graph BFS") - lgNv=10 + lgNv=4 Ne_per_v=3 p=0.03 - directed=0 + directed=1 weighted=1 Graph=ak.rmat_gen(lgNv, Ne_per_v, p, directed, weighted) ''' diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index f758562a60..6453397d8c 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -1215,7 +1215,6 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var directed=sdire : int; var weighted=swei : int; - //writeln(lgNv, Ne_per_v, p, directed,weighted); var Nv = 2**lgNv:int; // number of edges var Ne = Ne_per_v * Nv:int; @@ -1293,9 +1292,6 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string startpos+=1; }//end of while startpos - //writeln("before assignment start_i="); - //writeln(start_i); - //writeln(""); for i in 0..Ne-1 do { length[src1[i]]+=1; if (start_i[src1[i]] ==-1){ @@ -1308,13 +1304,6 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var neighbour = length; var neighbourR = neighbour; - /* - for i in 0..Nv-1 do { - writeln("node ",i, " has ", neighbour[i], " neighbours", - " start=",start_i[i], " they are ", dst1[start_i[i]..start_i[i]-1+neighbour[i]]); - } - */ - if (directed==0) { //undirected graph srcR = dst1; @@ -1353,17 +1342,10 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string if (start_iR[srcR1[i]] ==-1){ start_iR[srcR1[i]]=i; } - } //neighbourR = (+ scan lengthR) - lengthR; neighbourR = lengthR; - /* - for i in 0..Nv-1 do { - writeln("rever node ",i, " has ", neighbourR[i], " neighbours", - " start=",start_iR[i], " they are ", dstR1[start_iR[i]..start_iR[i]-1+neighbourR[i]]); - */ - } }//end of undirected @@ -1380,7 +1362,6 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string st.addEntry(vwName, vwEntry); st.addEntry(ewName, ewEntry); } - //writeln("e_weight=",e_weight,"v_weight=",v_weight); var srcName = st.nextName(); var dstName = st.nextName(); var startName = st.nextName(); @@ -1425,16 +1406,6 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string st.addEntry(dstNameR, dstEntryR); st.addEntry(startNameR, startEntryR); st.addEntry(neiNameR, neiEntryR); - /* - writeln("src="); - writeln(src1); - writeln("dstR="); - writeln(dstR1); - writeln("dst="); - writeln(dst1); - writeln("srcR="); - writeln(srcR1); - */ if (weighted!=0) { repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + ' +' + sWeighted + '+created ' + st.attrib(srcName) + '+created ' + st.attrib(dstName) + @@ -1581,21 +1552,22 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string ref df=ag.dst.a; numNF=nf[i]; ref NF=df[sf[i]..sf[i]+numNF-1]; + writeln("current node ",i, " has ", numNF, " neighbours and they are ",NF); if (numNF>0) { forall j in NF { - //writeln("current node ",i, " has neibours ",NF); //for j in NF { + writeln("current node ",i, " check neibour ",j, " its depth=",depth[j]); if (depth[j]==-1) { depth[j]=cur_level+1; SetNextF.add(j); - //writeln("add ", j, " into level ", cur_level+1); + writeln("current node ",i, " add ", j, " into level ", cur_level+1, " SetNextF=", SetNextF); } } } }//end forall i cur_level+=1; - //writeln("SetCurF= ", SetCurF, "SetNextF=", SetNextF, " level ", cur_level+1); + writeln("SetCurF= ", SetCurF, "SetNextF=", SetNextF, " level ", cur_level+1); numCurF=SetNextF.size; SetCurF=SetNextF; } From 031f947ba6aa9f1db0ebd4444067f4a3febbbda3 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Mon, 25 Jan 2021 17:21:54 -0500 Subject: [PATCH 60/68] add graph read function and benchmark --- arkouda/pdarraycreation.py | 36 ++++++- benchmarks/bfs.py | 8 +- benchmarks/graphfilebfs.py | 99 +++++++++++++++++ benchmarks/rmatgen.py | 15 +-- src/SegmentedMsg.chpl | 211 ++++++++++++++++++++++++++++++++++++- src/arkouda_server.chpl | 1 + 6 files changed, 353 insertions(+), 17 deletions(-) create mode 100755 benchmarks/graphfilebfs.py diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index f31d9e7a09..c8c8e9f8ee 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -15,7 +15,7 @@ "arange", "linspace", "randint", "uniform", "standard_normal", "random_strings_uniform", "random_strings_lognormal", "from_series", "suffix_array","lcp_array","suffix_array_file", - "rmat_gen","graph_bfs"] + "rmat_gen","graph_bfs","graph_file_read"] numericDTypes = frozenset(["bool", "int64", "float64"]) @@ -945,9 +945,21 @@ def suffix_array_file(filename: str) -> tuple: @typechecked -def graph_file(filename: str) -> Union[GraphD,GraphUD,GraphDW,GraphUDW]: +def graph_file_read(Ne:int, Nv:int,Ncol:int,directed:int, filename: str) -> Union[GraphD,GraphUD,GraphDW,GraphUDW]: """ - This function is major for creating a graph from a file + This function is used for creating a graph from a file. + The file should like this + 1 5 + 13 9 + 4 8 + 7 6 + This file means the edges are <1,5>,<13,9>,<4,8>,<7,6>. If additional column is added, it is the weight + of each edge. + Ne : the total number of edges of the graph + Nv : the total number of vertices of the graph + Ncol: how many column of the file. Ncol=2 means just edges (so no weight and weighted=0) + and Ncol=3 means there is weight for each edge (so weighted=1). + directed: 0 means undirected graph and 1 means directed graph Returns ------- Graph @@ -963,9 +975,23 @@ def graph_file(filename: str) -> Union[GraphD,GraphUD,GraphDW,GraphUDW]: ------ RuntimeError """ - msg = "segmentedGraphFile {}".format( filename ) + msg = "segmentedGraphFile {} {} {} {} {}".format(Ne, Nv, Ncol,directed, filename); repMsg = generic_msg(msg) - return Graph(*(cast(str,repMsg).split('+'))) + if (int(Ncol) >2) : + weighted=1 + else: + weighted=0 + + if (directed!=0) : + if (weighted!=0) : + return GraphDW(*(cast(str,repMsg).split('+'))) + else: + return GraphD(*(cast(str,repMsg).split('+'))) + else: + if (weighted!=0) : + return GraphUDW(*(cast(str,repMsg).split('+'))) + else: + return GraphUD(*(cast(str,repMsg).split('+'))) @typechecked def rmat_gen (lgNv:int, Ne_per_v:int, p:float, directed: int,weighted:int) ->\ diff --git a/benchmarks/bfs.py b/benchmarks/bfs.py index f9e842fb66..285f13e58b 100755 --- a/benchmarks/bfs.py +++ b/benchmarks/bfs.py @@ -56,15 +56,15 @@ def time_ak_bfs_graph(trials:int): print("<",Graph.srcR[i]," -- ", Graph.dstR[i],">") ''' timings = [] - for _ in range(trials): + for root in range(trials): start = time.time() - level,nodes = ak.graph_bfs(Graph,0) + level,nodes = ak.graph_bfs(Graph,root) end = time.time() timings.append(end - start) tavg = sum(timings) / trials print("Average time = {:.4f} sec".format(tavg)) - print("Average Edges = {:.4f} M/s".format(int(Graph.n_edges)/tavg/1024/1024)) - print("Average Vertices = {:.4f} M/s".format(int(Graph.n_vertices)/tavg/1024/1024)) + print("Average Edges = {:.4f} K/s".format(int(Graph.n_edges)/tavg/1024)) + print("Average Vertices = {:.4f} K/s".format(int(Graph.n_vertices)/tavg/1024)) ''' #print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) ''' diff --git a/benchmarks/graphfilebfs.py b/benchmarks/graphfilebfs.py new file mode 100755 index 0000000000..e664d77634 --- /dev/null +++ b/benchmarks/graphfilebfs.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +import time, argparse +import numpy as np +import arkouda as ak +import random +import string + +TYPES = ('int64', 'float64', 'bool', 'str') + +def time_ak_bfs_graph(trials:int): + lines=91 + vertices=17 + col=3 + directed=0 + weighted=1 + Graph=ak.graph_file_read(lines,vertices,col,directed,"kang.gr") + print("number of vertices ={}".format(Graph.n_vertices)) + print("number of edges ={}".format(Graph.n_edges)) + print("directed graph ={}".format(Graph.directed)) + print("weighted graph ={}".format(Graph.weighted)) + print("source of edges ={}".format(Graph.src)) + print("R dest of edges ={}".format(Graph.dstR)) + print("dest of edges ={}".format(Graph.dst)) + print("R source of edges ={}".format(Graph.srcR)) + print("start ={}".format(Graph.start_i)) + print("R start ={}".format(Graph.start_iR)) + print(" neighbour ={}".format(Graph.neighbour)) + print("R neighbour ={}".format(Graph.neighbourR)) + print("vertices weight ={}".format(Graph.v_weight)) + print("edges weight ={}".format(Graph.e_weight)) + ll,ver = ak.graph_bfs(Graph,4) + old=-2; + visit=[] + for i in range(int(Graph.n_vertices)): + cur=ll[i] + if (int(cur)!=int(old)): + if len(visit) >0: + print(visit) + print("current level=",cur,"the vertices at this level are") + old=cur + visit=[] + visit.append(ver[i]) + print(visit) + + print("total edges are as follows") + for i in range(int(Graph.n_edges)): + print("<",Graph.src[i]," -- ", Graph.dst[i],">") + ''' + print("total reverse edges are as follows") + for i in range(int(Graph.n_edges)): + print("<",Graph.srcR[i]," -- ", Graph.dstR[i],">") + ''' + timings = [] + for root in range(trials): + start = time.time() + level,nodes = ak.graph_bfs(Graph,root) + end = time.time() + timings.append(end - start) + tavg = sum(timings) / trials + print("Average time = {:.4f} sec".format(tavg)) + print("Average Edges = {:.4f} K/s".format(int(Graph.n_edges)/tavg/1024)) + print("Average Vertices = {:.4f} K/s".format(int(Graph.n_vertices)/tavg/1024)) + ''' + #print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) + ''' + + +def create_parser(): + parser = argparse.ArgumentParser(description="Measure the performance of suffix array building: C= suffix_array(V)") + parser.add_argument('hostname', help='Hostname of arkouda server') + parser.add_argument('port', type=int, help='Port of arkouda server') + parser.add_argument('-v', '--logvertices', type=int, default=5, help='Problem size: log number of vertices') + parser.add_argument('-e', '--vedges', type=int, default=2,help='Number of edges per vertex') + parser.add_argument('-p', '--possibility', type=float, default=0.01,help='Possibility ') + parser.add_argument('-t', '--trials', type=int, default=6, help='Number of times to run the benchmark') + parser.add_argument('-m', '--perm', type=int, default=0 , help='if permutation ') + parser.add_argument('--numpy', default=False, action='store_true', help='Run the same operation in NumPy to compare performance.') + parser.add_argument('--correctness-only', default=False, action='store_true', help='Only check correctness, not performance.') + return parser + + + +if __name__ == "__main__": + import sys + parser = create_parser() + args = parser.parse_args() + ak.verbose = False + ak.connect(args.hostname, args.port) + + ''' + if args.correctness_only: + check_correctness(args.number, args.size, args.trials, args.dtype) + print("CORRECT") + sys.exit(0) + ''' + + time_ak_bfs_graph(args.trials) + sys.exit(0) diff --git a/benchmarks/rmatgen.py b/benchmarks/rmatgen.py index 3504635074..3169b02c0d 100755 --- a/benchmarks/rmatgen.py +++ b/benchmarks/rmatgen.py @@ -8,21 +8,21 @@ TYPES = ('int64', 'float64', 'bool', 'str') -def time_ak_rmat_graph(lgNv, Ne_per_v, p, perm): +def time_ak_rmat_graph(lgNv, Ne_per_v, p, directed, weighted): print(">>> arkouda rmat graph") cfg = ak.get_config() Nv = cfg["numLocales"] print("numLocales = {}".format(cfg["numLocales"])) - Graph = ak.rmat_gen(lgNv, Ne_per_v, p, perm) + Graph = ak.rmat_gen(lgNv, Ne_per_v, p, directed, weighted) print("number of vertices ={}".format(Graph.n_vertices)) print("number of edges ={}".format(Graph.n_edges)) print("directed graph ={}".format(Graph.directed)) print("source of edges ={}".format(Graph.src)) print("dest of edges ={}".format(Graph.dst)) - print("start ={}".format(Graph.start)) + print("start ={}".format(Graph.start_i)) print("neighbour ={}".format(Graph.neighbour)) - print("vertices weight ={}".format(Graph.v_weight)) - print("edges weight ={}".format(Graph.e_weight)) + #print("vertices weight ={}".format(Graph.v_weight)) + #print("edges weight ={}".format(Graph.e_weight)) timings = [] for _ in range(trials): start = time.time() @@ -43,7 +43,8 @@ def create_parser(): parser.add_argument('-e', '--vedges', type=int, default=2,help='Number of edges per vertex') parser.add_argument('-p', '--possibility', type=float, default=0.01,help='Possibility ') parser.add_argument('-t', '--trials', type=int, default=6, help='Number of times to run the benchmark') - parser.add_argument('-m', '--perm', type=int, default=0 , help='if permutation ') + parser.add_argument('-d', '--directed', type=int, default=0 , help='if directed ') + parser.add_argument('-w', '--weighted', type=int, default=0 , help='if weighted ') parser.add_argument('--numpy', default=False, action='store_true', help='Run the same operation in NumPy to compare performance.') parser.add_argument('--correctness-only', default=False, action='store_true', help='Only check correctness, not performance.') return parser @@ -64,5 +65,5 @@ def create_parser(): sys.exit(0) ''' - time_ak_rmat_graph(args.logvertices, args.vedges, args.possibility, args.perm) + time_ak_rmat_graph(args.logvertices, args.vedges, args.possibility, args.directed,args.weighted) sys.exit(0) diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 6453397d8c..b532fc9ec0 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -1203,6 +1203,212 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } + + +// directly read a graph from given file and build the SegGraph class in memory + proc segGraphFileMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { + var pn = Reflection.getRoutineName(); + var (NeS,NvS,ColS,DirectedS, FileName) = payload.decode().splitMsgToTuple(5); + //writeln("======================Graph Reading====================="); + //writeln(NeS,NvS,ColS,DirectedS, FileName); + var Ne=NeS:int; + var Nv=NvS:int; + var NumCol=ColS:int; + var directed=DirectedS:int; + var weighted=0:int; + if NumCol>2 { + weighted=1; + } + var src,srcR,src1,srcR1: [0..Ne-1] int; + var dst,dstR,dst1,dstR1: [0..Ne-1] int; + var e_weight: [0..Ne-1] int; + var v_weight: [0..Nv-1] int; + var neighbour: [0..Nv-1] int; + var neighbourR: [0..Nv-1] int; + var start_i: [0..Nv-1] int; + var start_iR: [0..Nv-1] int; + + var linenum=0:int; + + var repMsg: string; + + var filesize:int; + var f = open(FileName, iomode.r); + var r = f.reader(kind=ionative); + var line:string; + var a,b,c:string; + var curline=0:int; + while r.readline(line) { + if NumCol==2 { + (a,b)= line.splitMsgToTuple(2); + } else { + (a,b,c)= line.splitMsgToTuple(3); + e_weight[curline]=c:int; + } + src[curline]=a:int; + dst[curline]=b:int; + curline+=1; + } + + r.close(); + src=src+(src==dst); + src=src%Nv; + dst=dst%Nv; + + var iv = radixSortLSD_ranks(src); + // permute into sorted order + src1 = src[iv]; //# permute first vertex into sorted order + dst1 = dst[iv]; //# permute second vertex into sorted order + var startpos=0, endpos:int; + var sort=0:int; + while (startpos < Ne-2) { + endpos=startpos+1; + sort=0; + while (endpos <=Ne-1) { + if (src1[startpos]==src1[endpos]) { + sort=1; + endpos+=1; + continue; + } else { + break; + } + }//end of while endpos + if (sort==1) { + var tmpary:[0..endpos-startpos-1] int; + tmpary=dst1[startpos..endpos-1]; + var ivx=radixSortLSD_ranks(tmpary); + dst1[startpos..endpos-1]=tmpary[ivx]; + sort=0; + } + startpos+=1; + }//end of while startpos + + for i in 0..Ne-1 do { + neighbour[src1[i]]+=1; + if (start_i[src1[i]] ==-1){ + start_i[src1[i]]=i; + } + + } + + if (directed==0) { //undirected graph + + srcR = dst1; + dstR = src1; + + var ivR = radixSortLSD_ranks(srcR); + srcR1 = srcR[ivR]; //# permute first vertex into sorted order + dstR1 = dstR[ivR]; //# permute second vertex into sorted order + startpos=0; + sort=0; + while (startpos < Ne-2) { + endpos=startpos+1; + sort=0; + while (endpos <=Ne-1) { + if (srcR1[startpos]==srcR1[endpos]) { + sort=1; + endpos+=1; + continue; + } else { + break; + } + }//end of while endpos + if (sort==1) { + var tmparyR:[0..endpos-startpos-1] int; + tmparyR=dstR1[startpos..endpos-1]; + var ivxR=radixSortLSD_ranks(tmparyR); + dstR1[startpos..endpos-1]=tmparyR[ivxR]; + sort=0; + } + startpos+=1; + }//end of while startpos + for i in 0..Ne-1 do { + neighbourR[srcR1[i]]+=1; + if (start_iR[srcR1[i]] ==-1){ + start_iR[srcR1[i]]=i; + } + } + + }//end of undirected + + + var ewName ,vwName:string; + if (weighted!=0) { + fillInt(v_weight,1,1000); + //fillRandom(v_weight,0,100); + ewName = st.nextName(); + vwName = st.nextName(); + var vwEntry = new shared SymEntry(v_weight); + var ewEntry = new shared SymEntry(e_weight); + st.addEntry(vwName, vwEntry); + st.addEntry(ewName, ewEntry); + } + var srcName = st.nextName(); + var dstName = st.nextName(); + var startName = st.nextName(); + var neiName = st.nextName(); + var srcEntry = new shared SymEntry(src1); + var dstEntry = new shared SymEntry(dst1); + var startEntry = new shared SymEntry(start_i); + var neiEntry = new shared SymEntry(neighbour); + st.addEntry(srcName, srcEntry); + st.addEntry(dstName, dstEntry); + st.addEntry(startName, startEntry); + st.addEntry(neiName, neiEntry); + var sNv=Nv:string; + var sNe=Ne:string; + var sDirected=directed:string; + var sWeighted=weighted:string; + + var srcNameR, dstNameR, startNameR, neiNameR:string; + if (directed!=0) {//for directed graph + if (weighted!=0) { + repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + '+ ' + sWeighted + + '+created ' + st.attrib(srcName) + '+created ' + st.attrib(dstName) + + '+created ' + st.attrib(startName) + '+created ' + st.attrib(neiName) + + '+created ' + st.attrib(vwName) + '+created ' + st.attrib(ewName); + } else { + repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + '+ ' + sWeighted + + '+created ' + st.attrib(srcName) + '+created ' + st.attrib(dstName) + + '+created ' + st.attrib(startName) + '+created ' + st.attrib(neiName) ; + + } + } else {//for undirected graph + + srcNameR = st.nextName(); + dstNameR = st.nextName(); + startNameR = st.nextName(); + neiNameR = st.nextName(); + var srcEntryR = new shared SymEntry(srcR1); + var dstEntryR = new shared SymEntry(dstR1); + var startEntryR = new shared SymEntry(start_iR); + var neiEntryR = new shared SymEntry(neighbourR); + st.addEntry(srcNameR, srcEntryR); + st.addEntry(dstNameR, dstEntryR); + st.addEntry(startNameR, startEntryR); + st.addEntry(neiNameR, neiEntryR); + if (weighted!=0) { + repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + ' +' + sWeighted + + '+created ' + st.attrib(srcName) + '+created ' + st.attrib(dstName) + + '+created ' + st.attrib(startName) + '+created ' + st.attrib(neiName) + + '+created ' + st.attrib(srcNameR) + '+created ' + st.attrib(dstNameR) + + '+created ' + st.attrib(startNameR) + '+created ' + st.attrib(neiNameR) + + '+created ' + st.attrib(vwName) + '+created ' + st.attrib(ewName); + } else { + repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + ' +' + sWeighted + + '+created ' + st.attrib(srcName) + '+created ' + st.attrib(dstName) + + '+created ' + st.attrib(startName) + '+created ' + st.attrib(neiName) + + '+created ' + st.attrib(srcNameR) + '+created ' + st.attrib(dstNameR) + + '+created ' + st.attrib(startNameR) + '+created ' + st.attrib(neiNameR) ; + } + + } + smLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); + return repMsg; + } + + + proc segrmatgenMsg(cmd: string, payload: bytes, st: borrowed SymTab): string throws { var pn = Reflection.getRoutineName(); var repMsg: string; @@ -1508,14 +1714,17 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string } var vertexValue = radixSortLSD_ranks(depth); var levelValue=depth[vertexValue]; - + //var depthName =st.nextName(); var levelName = st.nextName(); var vertexName = st.nextName(); var levelEntry = new shared SymEntry(levelValue); var vertexEntry = new shared SymEntry(vertexValue); + //var depthEntry = new shared SymEntry(depth); st.addEntry(levelName, levelEntry); st.addEntry(vertexName, vertexEntry); + //st.addEntry(depthName, depthEntry); repMsg = 'created ' + st.attrib(levelName) + '+created ' + st.attrib(vertexName) ; + //repMsg = 'created ' + st.attrib(depthName); smLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); return repMsg; diff --git a/src/arkouda_server.chpl b/src/arkouda_server.chpl index 0d2722e05f..83efcd57ba 100644 --- a/src/arkouda_server.chpl +++ b/src/arkouda_server.chpl @@ -247,6 +247,7 @@ proc main() { when "segmentedSuffixAry"{repMsg = segSuffixArrayMsg(cmd, payload, st);} when "segmentedLCP" {repMsg = segLCPMsg(cmd, payload, st);} when "segmentedSAFile" {repMsg = segSAFileMsg(cmd, payload, st);} + when "segmentedGraphFile" {repMsg = segGraphFileMsg(cmd, payload, st);} when "segmentedRMAT" {repMsg = segrmatgenMsg(cmd, payload, st);} when "segmentedGraphBFS" {repMsg = segBFSMsg(cmd, payload, st);} when "segmentedIn1d" {repMsg = segIn1dMsg(cmd, payload, st);} From 13e19e82519f468ef11988fa2cd59c99c4bfd67e Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Wed, 27 Jan 2021 23:24:07 -0500 Subject: [PATCH 61/68] bfs return one array --- arkouda/pdarraycreation.py | 7 +++++-- benchmarks/bfs.py | 6 +++++- src/SegmentedMsg.chpl | 29 ++++++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index c8c8e9f8ee..0f298057ea 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -1027,7 +1027,7 @@ def rmat_gen (lgNv:int, Ne_per_v:int, p:float, directed: int,weighted:int) ->\ return GraphUD(*(cast(str,repMsg).split('+'))) @typechecked -def graph_bfs (graph: Union[GraphD,GraphDW,GraphUD,GraphUDW], root: int ) -> tuple: +def graph_bfs (graph: Union[GraphD,GraphDW,GraphUD,GraphUDW], root: int ) -> pdarray: """ This function is generating the breadth-first search vertices sequences in given graph starting from the given root vertex @@ -1089,13 +1089,16 @@ def graph_bfs (graph: Union[GraphD,GraphDW,GraphUD,GraphUDW], root: int ) -> tup root) repMsg = generic_msg(msg) + ''' tmpmsg=cast(str,repMsg).split('+') levelstr=tmpmsg[0:1] vertexstr=tmpmsg[1:2] levelary=create_pdarray(*(cast(str,levelstr)) ) vertexary=create_pdarray(*(cast(str,vertexstr)) ) - return (levelary,vertexary) + ''' + return create_pdarray(repMsg) + #return (levelary,vertexary) @typechecked diff --git a/benchmarks/bfs.py b/benchmarks/bfs.py index 285f13e58b..e0c3f9f01f 100755 --- a/benchmarks/bfs.py +++ b/benchmarks/bfs.py @@ -32,6 +32,9 @@ def time_ak_bfs_graph(trials:int): print("vertices weight ={}".format(Graph.v_weight)) print("edges weight ={}".format(Graph.e_weight)) ''' + deparray = ak.graph_bfs(Graph,4) + print(deparray) + ''' ll,ver = ak.graph_bfs(Graph,4) old=-2; visit=[] @@ -45,6 +48,7 @@ def time_ak_bfs_graph(trials:int): visit=[] visit.append(ver[i]) print(visit) + ''' ''' print("total edges are as follows") @@ -58,7 +62,7 @@ def time_ak_bfs_graph(trials:int): timings = [] for root in range(trials): start = time.time() - level,nodes = ak.graph_bfs(Graph,root) + _ = ak.graph_bfs(Graph,root) end = time.time() timings.append(end - start) tavg = sum(timings) / trials diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index b532fc9ec0..6552da3397 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -1712,6 +1712,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string numCurF=SetNextF.size; SetCurF=SetNextF; } + + /* var vertexValue = radixSortLSD_ranks(depth); var levelValue=depth[vertexValue]; //var depthName =st.nextName(); @@ -1725,7 +1727,12 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string //st.addEntry(depthName, depthEntry); repMsg = 'created ' + st.attrib(levelName) + '+created ' + st.attrib(vertexName) ; //repMsg = 'created ' + st.attrib(depthName); + */ + var depthName = st.nextName(); + var depthEntry = new shared SymEntry(depth); + st.addEntry(depthName, depthEntry); + repMsg = 'created ' + st.attrib(depthName); smLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); return repMsg; } @@ -1780,6 +1787,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string numCurF=SetNextF.size; SetCurF=SetNextF; } + /* var vertexValue = radixSortLSD_ranks(depth); var levelValue=depth[vertexValue]; @@ -1790,7 +1798,11 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string st.addEntry(levelName, levelEntry); st.addEntry(vertexName, vertexEntry); repMsg = 'created ' + st.attrib(levelName) + '+created ' + st.attrib(vertexName) ; - + */ + var depthName = st.nextName(); + var depthEntry = new shared SymEntry(depth); + st.addEntry(depthName, depthEntry); + repMsg = 'created ' + st.attrib(depthName); smLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); return repMsg; } @@ -1878,6 +1890,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string numCurF=SetNextF.size; SetCurF=SetNextF; } + /* var vertexValue = radixSortLSD_ranks(depth); var levelValue=depth[vertexValue]; @@ -1887,7 +1900,12 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var vertexEntry = new shared SymEntry(vertexValue); st.addEntry(levelName, levelEntry); st.addEntry(vertexName, vertexEntry); - repMsg = 'created ' + st.attrib(levelName) + '+created ' + st.attrib(vertexName) ; + + */ + var depthName = st.nextName(); + var depthEntry = new shared SymEntry(depth); + st.addEntry(depthName, depthEntry); + repMsg = 'created ' + st.attrib(depthName); smLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); return repMsg; @@ -2007,6 +2025,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string numCurF=SetNextF.size; SetCurF=SetNextF; } + /* var vertexValue = radixSortLSD_ranks(depth); var levelValue=depth[vertexValue]; @@ -2017,7 +2036,11 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string st.addEntry(levelName, levelEntry); st.addEntry(vertexName, vertexEntry); repMsg = 'created ' + st.attrib(levelName) + '+created ' + st.attrib(vertexName) ; - + */ + var depthName = st.nextName(); + var depthEntry = new shared SymEntry(depth); + st.addEntry(depthName, depthEntry); + repMsg = 'created ' + st.attrib(depthName); smLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); return repMsg; } From eb00b06689d76c6bee79780d7a90f0be6226b3e1 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Wed, 27 Jan 2021 23:40:19 -0500 Subject: [PATCH 62/68] use for instead of forall --- src/SegmentedMsg.chpl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 6552da3397..d73d33792a 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -1697,8 +1697,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string numNF=nf[i]; ref NF=df[sf[i]..sf[i]+numNF-1]; if (numNF>0) { - forall j in NF { - //for j in NF { + //forall j in NF { + for j in NF { if (depth[j]==-1) { depth[j]=cur_level+1; SetNextF.add(j); @@ -1770,8 +1770,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string ref NF=df[sf[i]..sf[i]+numNF-1]; writeln("current node ",i, " has ", numNF, " neighbours and they are ",NF); if (numNF>0) { - forall j in NF { - //for j in NF { + //forall j in NF { + for j in NF { writeln("current node ",i, " check neibour ",j, " its depth=",depth[j]); if (depth[j]==-1) { depth[j]=cur_level+1; @@ -1849,9 +1849,9 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string numNF=nf[i]; ref NF=df[sf[i]..sf[i]+numNF-1]; if (numNF>0) { - forall j in NF { + //forall j in NF { //writeln("current node ",i, " has neibours ",NF); - //for j in NF { + for j in NF { if (depth[j]==-1) { depth[j]=cur_level+1; SetNextF.add(j); @@ -1871,8 +1871,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string ref NFR=dfR[sfR[i]..sfR[i]+numNFR-1]; if (numNFR>0) { //writeln("current node ",i, " has reverse neibours ",NFR); - forall j in NFR { - //for j in NFR { + //forall j in NFR { + for j in NFR { if (depth[j]==-1) { depth[j]=cur_level+1; SetNextF.add(j); @@ -1985,8 +1985,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string ref NF=df[sf[i]..sf[i]+numNF-1]; //writeln("current node ",i, " has ", numNF, " neighbours and they are ",NF); if (numNF>0) { - forall j in NF { - //for j in NF { + //forall j in NF { + for j in NF { //writeln("current node ",i, " check neibour ",j, " its depth=",depth[j]); if (depth[j]==-1) { depth[j]=cur_level+1; From 9142d28f4601be22d8e07ad14db823b739c77d46 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sat, 30 Jan 2021 23:19:26 -0500 Subject: [PATCH 63/68] forall intention --- src/SegmentedMsg.chpl | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index d73d33792a..2f0e57660d 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -1682,14 +1682,14 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var depth=-1: [0..Nv-1] int; depth[root]=0; var cur_level=0; - var SetCurF: domain(int); - var SetNextF: domain(int); + var SetCurF= new set(int,parSafe = true); + var SetNextF= new set(int,parSafe = true); SetCurF.add(root); var numCurF=1:int; while (numCurF>0) { SetNextF.clear(); - forall i in SetCurF { + forall i in SetCurF with (ref SetNextF) { var numNF=-1 :int; ref nf=ag.neighbour.a; ref sf=ag.start_i.a; @@ -1754,36 +1754,36 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var depth=-1: [0..Nv-1] int; depth[root]=0; var cur_level=0; - var SetCurF: domain(int); - var SetNextF: domain(int); + var SetCurF= new set(int,parSafe = true); + var SetNextF= new set(int,parSafe = true); SetCurF.add(root); var numCurF=1:int; while (numCurF>0) { SetNextF.clear(); - forall i in SetCurF { + forall i in SetCurF with (ref SetNextF) { var numNF=-1 :int; ref nf=ag.neighbour.a; ref sf=ag.start_i.a; ref df=ag.dst.a; numNF=nf[i]; ref NF=df[sf[i]..sf[i]+numNF-1]; - writeln("current node ",i, " has ", numNF, " neighbours and they are ",NF); + //writeln("current node ",i, " has ", numNF, " neighbours and they are ",NF); if (numNF>0) { //forall j in NF { for j in NF { - writeln("current node ",i, " check neibour ",j, " its depth=",depth[j]); + //writeln("current node ",i, " check neibour ",j, " its depth=",depth[j]); if (depth[j]==-1) { depth[j]=cur_level+1; SetNextF.add(j); - writeln("current node ",i, " add ", j, " into level ", cur_level+1, " SetNextF=", SetNextF); + //writeln("current node ",i, " add ", j, " into level ", cur_level+1, " SetNextF=", SetNextF); } } } }//end forall i cur_level+=1; - writeln("SetCurF= ", SetCurF, "SetNextF=", SetNextF, " level ", cur_level+1); + //writeln("SetCurF= ", SetCurF, "SetNextF=", SetNextF, " level ", cur_level+1); numCurF=SetNextF.size; SetCurF=SetNextF; } @@ -1833,15 +1833,15 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var depth=-1: [0..Nv-1] int; depth[root]=0; var cur_level=0; - var SetCurF: domain(int); - var SetNextF: domain(int); + var SetCurF= new set(int,parSafe = true); + var SetNextF= new set(int,parSafe = true); SetCurF.add(root); var numCurF=1:int; //writeln("========================BSF_UD=================================="); while (numCurF>0) { SetNextF.clear(); - forall i in SetCurF { + forall i in SetCurF with (ref SetNextF) { var numNF=-1 :int; ref nf=ag.neighbour.a; ref sf=ag.start_i.a; @@ -1936,8 +1936,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var depth=-1: [0..Nv-1] int; depth[root]=0; var cur_level=0; - var SetCurF: domain(int); - var SetNextF: domain(int); + var SetCurF= new set(int,parSafe = true); + var SetNextF= new set(int,parSafe = true); SetCurF.add(root); var numCurF=1:int; @@ -1976,7 +1976,7 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string while (numCurF>0) { //writeln("start loop SetCurF=", SetCurF); SetNextF.clear(); - forall i in SetCurF { + forall i in SetCurF with (ref SetNextF) { var numNF=-1 :int; ref nf=ag.neighbour.a; ref sf=ag.start_i.a; @@ -2005,8 +2005,8 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string ref NFR=dfR[sfR[i]..sfR[i]+numNFR-1]; //writeln("current node ",i, " has ", numNFR ," reverse neighbours and they are ",NFR); if ( numNFR>0) { - forall j in NFR { - //for j in NFR { + //forall j in NFR { + for j in NFR { //writeln("current node ",i, " check neibour ",j, " its depth=",depth[j]); if (depth[j]==-1) { depth[j]=cur_level+1; From b254e5b03011f7a90854a8fe253e987a1cabc8cf Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sat, 30 Jan 2021 23:35:26 -0500 Subject: [PATCH 64/68] add use set --- src/SegmentedMsg.chpl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index 2f0e57660d..d31081db7d 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -15,6 +15,7 @@ module SegmentedMsg { use SACA; use Random; use RadixSortLSD only radixSortLSD_ranks; + use Set; private config const DEBUG = false; From 758ccebbbd21c62047e2aeb53d0d3b8d50b47b1a Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 31 Jan 2021 00:09:49 -0500 Subject: [PATCH 65/68] change bfs.py parameters --- benchmarks/bfs.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/bfs.py b/benchmarks/bfs.py index e0c3f9f01f..d5a704d3c5 100755 --- a/benchmarks/bfs.py +++ b/benchmarks/bfs.py @@ -10,11 +10,11 @@ def time_ak_bfs_graph(trials:int): print("Graph BFS") - lgNv=4 - Ne_per_v=3 + lgNv=14 + Ne_per_v=5 p=0.03 directed=1 - weighted=1 + weighted=0 Graph=ak.rmat_gen(lgNv, Ne_per_v, p, directed, weighted) ''' print("number of vertices ={}".format(Graph.n_vertices)) @@ -67,6 +67,8 @@ def time_ak_bfs_graph(trials:int): timings.append(end - start) tavg = sum(timings) / trials print("Average time = {:.4f} sec".format(tavg)) + print("number of vertices ={}".format(Graph.n_vertices)) + print("number of edges ={}".format(Graph.n_edges)) print("Average Edges = {:.4f} K/s".format(int(Graph.n_edges)/tavg/1024)) print("Average Vertices = {:.4f} K/s".format(int(Graph.n_vertices)/tavg/1024)) ''' From 0448771919232625249bff742cd2a592783ca152 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 31 Jan 2021 22:05:14 -0500 Subject: [PATCH 66/68] add data filtering patterns --- src/MultiTypeSymbolTable.chpl | 76 +++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 7 deletions(-) diff --git a/src/MultiTypeSymbolTable.chpl b/src/MultiTypeSymbolTable.chpl index 731cbb7e96..bf63b7f490 100644 --- a/src/MultiTypeSymbolTable.chpl +++ b/src/MultiTypeSymbolTable.chpl @@ -9,7 +9,13 @@ module MultiTypeSymbolTable use MultiTypeSymEntry; use Map; + use Random; + use RadixSortLSD only radixSortLSD_ranks; + use RandArray; + + + var FilteringPattern=0:int; var mtLogger = new Logger(); if v { mtLogger.level = LogLevel.DEBUG; @@ -351,18 +357,74 @@ module MultiTypeSymbolTable { var e = toSymEntry(u,int); if e.size == 0 {s = "[]";} - else if e.size < thresh || e.size <= 6 { + else if e.size < thresh+4 || e.size <= 6 { s = "["; for i in 0..(e.size-2) {s += try! "%t ".format(e.a[i]);} s += try! "%t]".format(e.a[e.size-1]); } else { - s = try! "[%t %t %t ... %t %t %t]".format(e.a[0],e.a[1],e.a[2], - e.a[e.size-3], - e.a[e.size-2], - e.a[e.size-1]); - } - } + select FilteringPattern + { + when 0 //HeadAndTail + { + var half=thresh/2:int; + s = "["; + for i in 0..(half-2) {s += try! "%t ".format(e.a[i]);} + s += try! "%t ... ".format(e.a[half-1]); + for i in e.size-2-half..(e.size-2) {s += try! "%t ".format(e.a[i]);} + s += try! "%t]".format(e.a[e.size-1]); + + //s = try! "[%t %t %t ... %t %t %t]".format(e.a[0],e.a[1],e.a[2], + // e.a[e.size-3], + // e.a[e.size-2], + // e.a[e.size-1]); + } + when 1 //Head + { + s = "["; + for i in 0..thresh-2 {s += try! "%t ".format(e.a[i]);} + s += try! "%t ...] ".format(e.a[thresh-1]); + } + when 2 //Tail + { + s = "[... "; + for i in e.size-1-thresh..e.size-2 {s += try! "%t ".format(e.a[i]);} + s += try! "%t]".format(e.a[e.size-1]); + } + when 3 //Middle + { + var startM=e.size-1-thresh/2:int; + s = "[... "; + for i in startM..startM+thresh-2 {s += try! "%t ".format(e.a[i]);} + s += try! "%t ...]".format(e.a[startM+thresh-1]); + + } + when 4 //Uniform + { + var stride =(e.size-1)/thresh:int; + s = "[... "; + for i in 0..thresh-2 {s += try! "%t ".format(e.a[i*stride]);} + s += try! "%t ...]".format(e.a[ stride*(thresh-1)]); + } + when 5 //Random + { + var samplearray:[0..thresh-1]int; + var indexarray:[0..thresh-1]int; + fillInt(samplearray,0,e.size-1); + var iv = radixSortLSD_ranks(samplearray); + indexarray=samplearray[iv]:int; + s = "[... "; + for i in 0..thresh-2 { + if (e.a[indexarray[i]]!=e.a[indexarray[i+1]]) { + s += try! "%t ".format(e.a[indexarray[i]]); + } + s += try! "%t ...]".format(e.a[indexarray[thresh-1]]); + } + } + + }//end select + }//end else + }//end DType.Int64 when DType.Float64 { var e = toSymEntry(u,real); From 0fc3f405aae50828a1f51919ede2eb63e02b04c0 Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Sun, 31 Jan 2021 22:30:41 -0500 Subject: [PATCH 67/68] only fillInt --- src/MultiTypeSymbolTable.chpl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/MultiTypeSymbolTable.chpl b/src/MultiTypeSymbolTable.chpl index bf63b7f490..33e52f0008 100644 --- a/src/MultiTypeSymbolTable.chpl +++ b/src/MultiTypeSymbolTable.chpl @@ -9,9 +9,8 @@ module MultiTypeSymbolTable use MultiTypeSymEntry; use Map; - use Random; use RadixSortLSD only radixSortLSD_ranks; - use RandArray; + use RandArray only fillInt; From 08374f6cdb6706a47c32372078d57d8c0b902bea Mon Sep 17 00:00:00 2001 From: Zhihui Du Date: Wed, 3 Feb 2021 09:09:09 -0500 Subject: [PATCH 68/68] reorganize the rmat bfs code --- benchmarks/bfs.py | 10 +- src/SegmentedMsg.chpl | 625 ++++++++++++++++++++++++++++++------------ 2 files changed, 456 insertions(+), 179 deletions(-) diff --git a/benchmarks/bfs.py b/benchmarks/bfs.py index d5a704d3c5..c01abace7f 100755 --- a/benchmarks/bfs.py +++ b/benchmarks/bfs.py @@ -10,8 +10,8 @@ def time_ak_bfs_graph(trials:int): print("Graph BFS") - lgNv=14 - Ne_per_v=5 + lgNv=5 + Ne_per_v=3 p=0.03 directed=1 weighted=0 @@ -48,17 +48,16 @@ def time_ak_bfs_graph(trials:int): visit=[] visit.append(ver[i]) print(visit) - ''' - ''' print("total edges are as follows") for i in range(int(Graph.n_edges)): print("<",Graph.src[i]," -- ", Graph.dst[i],">") + ''' print("total reverse edges are as follows") for i in range(int(Graph.n_edges)): print("<",Graph.srcR[i]," -- ", Graph.dstR[i],">") - ''' + timings = [] for root in range(trials): start = time.time() @@ -71,7 +70,6 @@ def time_ak_bfs_graph(trials:int): print("number of edges ={}".format(Graph.n_edges)) print("Average Edges = {:.4f} K/s".format(int(Graph.n_edges)/tavg/1024)) print("Average Vertices = {:.4f} K/s".format(int(Graph.n_vertices)/tavg/1024)) - ''' #print("Average rate = {:.2f} GiB/sec".format(bytes_per_sec/2**30)) ''' diff --git a/src/SegmentedMsg.chpl b/src/SegmentedMsg.chpl index d31081db7d..895f005e1d 100644 --- a/src/SegmentedMsg.chpl +++ b/src/SegmentedMsg.chpl @@ -14,9 +14,9 @@ module SegmentedMsg { use SymArrayDmap; use SACA; use Random; - use RadixSortLSD only radixSortLSD_ranks; + use RadixSortLSD; use Set; - + public use ArgSortMsg; private config const DEBUG = false; const smLogger = new Logger(); @@ -1426,208 +1426,387 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string // number of edges var Ne = Ne_per_v * Nv:int; // probabilities - var a = p; - var b = (1.0 - a)/ 3.0:real; - var c = b; - var d = b; - var src,srcR,src1,srcR1: [0..Ne-1] int; - var dst,dstR,dst1,dstR1: [0..Ne-1] int; - var e_weight: [0..Ne-1] int; - var v_weight: [0..Nv-1] int; + var src: [0..Ne-1] int; + var dst: [0..Ne-1] int; + var iv: [0..Ne-1] int; + var length: [0..Nv-1] int; - var lengthR: [0..Nv-1] int; var start_i: [0..Nv-1] int; - var start_iR: [0..Nv-1] int; + var neighbour:[0..Nv-1] int; length=0; - lengthR=0; start_i=-1; - start_iR=-1; + neighbour=0; var n_vertices=Nv; var n_edges=Ne; src=1; dst=1; - // quantites to use in edge generation loop - var ab = a+b:real; - var c_norm = c / (c + d):real; - var a_norm = a / (a + b):real; - // generate edges - var src_bit: [0..Ne-1]int; - var dst_bit: [0..Ne-1]int; - for ib in 1..lgNv { - var tmpvar: [0..Ne-1] real; - fillRandom(tmpvar); - src_bit=tmpvar>ab; - fillRandom(tmpvar); - dst_bit=tmpvar>(c_norm * src_bit + a_norm * (~ src_bit)); - src = src + ((2**(ib-1)) * src_bit); - dst = dst + ((2**(ib-1)) * dst_bit); + var srcName:string ; + var dstName:string ; + var startName:string ; + var neiName:string ; + var sNv:string; + var sNe:string; + var sDirected:string; + var sWeighted:string; + + + proc rmat_gen() { + var a = p; + var b = (1.0 - a)/ 3.0:real; + var c = b; + var d = b; + var ab=a+b; + var c_norm = c / (c + d):real; + var a_norm = a / (a + b):real; + // generate edges + var src_bit: [0..Ne-1]int; + var dst_bit: [0..Ne-1]int; + for ib in 1..lgNv { + var tmpvar: [0..Ne-1] real; + fillRandom(tmpvar); + src_bit=tmpvar>ab; + fillRandom(tmpvar); + dst_bit=tmpvar>(c_norm * src_bit + a_norm * (~ src_bit)); + src = src + ((2**(ib-1)) * src_bit); + dst = dst + ((2**(ib-1)) * dst_bit); + } + src=src%Nv; + dst=dst%Nv; + //remove self loop + src=src+(src==dst); + src=src%Nv; } - src=src%Nv; - dst=dst%Nv; - //remove self loop - src=src+(src==dst); - src=src%Nv; - - var iv = radixSortLSD_ranks(src); - // permute into sorted order - src1 = src[iv]; //# permute first vertex into sorted order - dst1 = dst[iv]; //# permute second vertex into sorted order - //# to premute/rename vertices - var startpos=0, endpos:int; - var sort=0:int; - while (startpos < Ne-2) { - endpos=startpos+1; - sort=0; - //writeln("startpos=",startpos,"endpos=",endpos); - while (endpos <=Ne-1) { - if (src1[startpos]==src1[endpos]) { - sort=1; - endpos+=1; - continue; - } else { - break; - } - }//end of while endpos - if (sort==1) { - var tmpary:[0..endpos-startpos-1] int; - tmpary=dst1[startpos..endpos-1]; - var ivx=radixSortLSD_ranks(tmpary); - dst1[startpos..endpos-1]=tmpary[ivx]; - //writeln("src1=",src1,"dst1=",dst1,"ivx=",ivx); - sort=0; - } - startpos+=1; - }//end of while startpos - - for i in 0..Ne-1 do { - length[src1[i]]+=1; - if (start_i[src1[i]] ==-1){ - start_i[src1[i]]=i; - //writeln("assign index ",i, " to vertex ",src1[i]); - } - - } - //var neighbour = (+ scan length) - length; - var neighbour = length; - var neighbourR = neighbour; - - if (directed==0) { //undirected graph - - srcR = dst1; - dstR = src1; - - var ivR = radixSortLSD_ranks(srcR); - srcR1 = srcR[ivR]; //# permute first vertex into sorted order - dstR1 = dstR[ivR]; //# permute second vertex into sorted order - startpos=0; - sort=0; - while (startpos < Ne-2) { - endpos=startpos+1; - sort=0; - while (endpos <=Ne-1) { - if (srcR1[startpos]==srcR1[endpos]) { - sort=1; - endpos+=1; - continue; - } else { - break; - } - }//end of while endpos - if (sort==1) { - var tmparyR:[0..endpos-startpos-1] int; - tmparyR=dstR1[startpos..endpos-1]; - var ivxR=radixSortLSD_ranks(tmparyR); - dstR1[startpos..endpos-1]=tmparyR[ivxR]; - sort=0; - } - startpos+=1; - }//end of while startpos - - - for i in 0..Ne-1 do { - lengthR[srcR1[i]]+=1; - if (start_iR[srcR1[i]] ==-1){ - start_iR[srcR1[i]]=i; - } - } - //neighbourR = (+ scan lengthR) - lengthR; - neighbourR = lengthR; + proc combine_sort(){ + + /* we cannot use the coargsort version because it will break the memory limit */ + // coargsort + param bitsPerDigit = RSLSD_bitsPerDigit; + var bitWidths: [0..1] int; + var negs: [0..1] bool; + var totalDigits: int; + var size=Nv: int; + + for (bitWidth, ary, neg) in zip(bitWidths, [src,dst], negs) { + (bitWidth, neg) = getBitWidth(ary); + totalDigits += (bitWidth + (bitsPerDigit-1)) / bitsPerDigit; + } + proc mergedArgsort(param numDigits) throws { + + //overMemLimit(((4 + 3) * size * (numDigits * bitsPerDigit / 8)) + // + (2 * here.maxTaskPar * numLocales * 2**16 * 8)); + var merged = makeDistArray(size, numDigits*uint(bitsPerDigit)); + var curDigit = RSLSD_tupleLow + numDigits - totalDigits; + for (ary , nBits, neg) in zip([src,dst], bitWidths, negs) { + proc mergeArray(type t) { + ref A = ary; + const r = 0..#nBits by bitsPerDigit; + for rshift in r { + const myDigit = (r.high - rshift) / bitsPerDigit; + const last = myDigit == 0; + forall (m, a) in zip(merged, A) { + m[curDigit+myDigit] = getDigit(a, rshift, last, neg):uint(bitsPerDigit); + } + } + curDigit += r.size; + } + mergeArray(int); + } + var iv = argsortDefault(merged); + return iv; + } - }//end of undirected + if totalDigits <= 4 { + iv = mergedArgsort( 4); + } + if totalDigits <= 8 { + iv = mergedArgsort( 8); + } + if totalDigits <= 16 { + iv = mergedArgsort(16); + } - var ewName ,vwName:string; - if (weighted!=0) { - fillInt(e_weight,1,1000); - //fillRandom(e_weight,0,100); - fillInt(v_weight,1,1000); - //fillRandom(v_weight,0,100); - ewName = st.nextName(); - vwName = st.nextName(); - var vwEntry = new shared SymEntry(v_weight); - var ewEntry = new shared SymEntry(e_weight); - st.addEntry(vwName, vwEntry); - st.addEntry(ewName, ewEntry); } - var srcName = st.nextName(); - var dstName = st.nextName(); - var startName = st.nextName(); - var neiName = st.nextName(); - var srcEntry = new shared SymEntry(src1); - var dstEntry = new shared SymEntry(dst1); - var startEntry = new shared SymEntry(start_i); - var neiEntry = new shared SymEntry(neighbour); - st.addEntry(srcName, srcEntry); - st.addEntry(dstName, dstEntry); - st.addEntry(startName, startEntry); - st.addEntry(neiName, neiEntry); - var sNv=Nv:string; - var sNe=Ne:string; - var sDirected=directed:string; - var sWeighted=weighted:string; - var srcNameR, dstNameR, startNameR, neiNameR:string; - if (directed!=0) {//for directed graph - if (weighted!=0) { - repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + '+ ' + sWeighted + + proc twostep_sort(){ + iv = radixSortLSD_ranks(src); + // permute into sorted order + var tmpedge=src; + tmpedge=src[iv]; + src=tmpedge; + tmpedge=dst[iv]; + dst=tmpedge; + //# to premute/rename vertices + var startpos=0, endpos:int; + var sort=0:int; + while (startpos < Ne-2) { + endpos=startpos+1; + sort=0; + //writeln("startpos=",startpos,"endpos=",endpos); + while (endpos <=Ne-1) { + if (src[startpos]==src[endpos]) { + sort=1; + endpos+=1; + continue; + } else { + break; + } + }//end of while endpos + if (sort==1) { + var tmpary:[0..endpos-startpos-1] int; + tmpary=dst[startpos..endpos-1]; + var ivx=radixSortLSD_ranks(tmpary); + dst[startpos..endpos-1]=tmpary[ivx]; + //writeln("src1=",src1,"dst1=",dst1,"ivx=",ivx); + sort=0; + } + startpos+=1; + }//end of while startpos + } + proc set_neighbour(){ + for i in 0..Ne-1 do { + length[src[i]]+=1; + if (start_i[src[i]] ==-1){ + start_i[src[i]]=i; + //writeln("assign index ",i, " to vertex ",src[i]); + } + + } + neighbour = length; + } + //proc set_common_symtable(): string throws { + proc set_common_symtable() { + srcName = st.nextName(); + dstName = st.nextName(); + startName = st.nextName(); + neiName = st.nextName(); + var srcEntry = new shared SymEntry(src); + var dstEntry = new shared SymEntry(dst); + var startEntry = new shared SymEntry(start_i); + var neiEntry = new shared SymEntry(neighbour); + try! st.addEntry(srcName, srcEntry); + try! st.addEntry(dstName, dstEntry); + try! st.addEntry(startName, startEntry); + try! st.addEntry(neiName, neiEntry); + sNv=Nv:string; + sNe=Ne:string; + sDirected=directed:string; + sWeighted=weighted:string; + } + if (directed!=0) {// for directed graph + if (weighted!=0) { // for weighted graph + var e_weight: [0..Ne-1] int; + var v_weight: [0..Nv-1] int; + rmat_gen(); + twostep_sort(); + set_neighbour(); + + var ewName ,vwName:string; + fillInt(e_weight,1,1000); + //fillRandom(e_weight,0,100); + fillInt(v_weight,1,1000); + //fillRandom(v_weight,0,100); + ewName = st.nextName(); + vwName = st.nextName(); + var vwEntry = new shared SymEntry(v_weight); + var ewEntry = new shared SymEntry(e_weight); + try! st.addEntry(vwName, vwEntry); + try! st.addEntry(ewName, ewEntry); + + set_common_symtable(); + repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + '+ ' + sWeighted + '+created ' + st.attrib(srcName) + '+created ' + st.attrib(dstName) + '+created ' + st.attrib(startName) + '+created ' + st.attrib(neiName) + '+created ' + st.attrib(vwName) + '+created ' + st.attrib(ewName); + } else { - repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + '+ ' + sWeighted + + rmat_gen(); + twostep_sort(); + set_neighbour(); + set_common_symtable(); + repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + '+ ' + sWeighted + '+created ' + st.attrib(srcName) + '+created ' + st.attrib(dstName) + '+created ' + st.attrib(startName) + '+created ' + st.attrib(neiName) ; + } + } + else { + // only for undirected graph, we only declare R variables here + var srcR: [0..Ne-1] int; + var dstR: [0..Ne-1] int; + ref ivR=iv; + var start_iR: [0..Nv-1] int; + var lengthR: [0..Nv-1] int; + var neighbourR: [0..Nv-1] int; + start_iR=-1; + lengthR=0; + neighbourR=0; + var srcNameR, dstNameR, startNameR, neiNameR:string; + + proc combine_sortR(){ + + /* we cannot use the coargsort version because it will break the memory limit */ + param bitsPerDigit = RSLSD_bitsPerDigit; + var bitWidths: [0..1] int; + var negs: [0..1] bool; + var totalDigits: int; + var size=Nv: int; + for (bitWidth, ary, neg) in zip(bitWidths, [srcR,dstR], negs) { + (bitWidth, neg) = getBitWidth(ary); + totalDigits += (bitWidth + (bitsPerDigit-1)) / bitsPerDigit; + + } + proc mergedArgsort(param numDigits) throws { + + //overMemLimit(((4 + 3) * size * (numDigits * bitsPerDigit / 8)) + // + (2 * here.maxTaskPar * numLocales * 2**16 * 8)); + var merged = makeDistArray(size, numDigits*uint(bitsPerDigit)); + var curDigit = RSLSD_tupleLow + numDigits - totalDigits; + for (ary , nBits, neg) in zip([src,dst], bitWidths, negs) { + proc mergeArray(type t) { + ref A = ary; + const r = 0..#nBits by bitsPerDigit; + for rshift in r { + const myDigit = (r.high - rshift) / bitsPerDigit; + const last = myDigit == 0; + forall (m, a) in zip(merged, A) { + m[curDigit+myDigit] = getDigit(a, rshift, last, neg):uint(bitsPerDigit); + } + } + curDigit += r.size; + } + mergeArray(int); + } + var iv = argsortDefault(merged); + return iv; + } + + if totalDigits <= 4 { + ivR = mergedArgsort( 4); + } + + if totalDigits <= 8 { + ivR = mergedArgsort( 8); + } + if totalDigits <= 16 { + ivR = mergedArgsort(16); + } + } - } else {//for undirected graph - srcNameR = st.nextName(); - dstNameR = st.nextName(); - startNameR = st.nextName(); - neiNameR = st.nextName(); - var srcEntryR = new shared SymEntry(srcR1); - var dstEntryR = new shared SymEntry(dstR1); - var startEntryR = new shared SymEntry(start_iR); - var neiEntryR = new shared SymEntry(neighbourR); - st.addEntry(srcNameR, srcEntryR); - st.addEntry(dstNameR, dstEntryR); - st.addEntry(startNameR, startEntryR); - st.addEntry(neiNameR, neiEntryR); + proc twostep_sortR() { + ivR = radixSortLSD_ranks(srcR); + var tmpedges = srcR[ivR]; //# permute first vertex into sorted order + srcR=tmpedges; + tmpedges = dstR[ivR]; //# permute second vertex into sorted order + dstR=tmpedges; + var startpos=0:int; + var endpos:int; + var sort=0; + while (startpos < Ne-2) { + endpos=startpos+1; + sort=0; + while (endpos <=Ne-1) { + if (srcR[startpos]==srcR[endpos]) { + sort=1; + endpos+=1; + continue; + } else { + break; + } + }//end of while endpos + if (sort==1) { + var tmparyR:[0..endpos-startpos-1] int; + tmparyR=dstR[startpos..endpos-1]; + var ivxR=radixSortLSD_ranks(tmparyR); + dstR[startpos..endpos-1]=tmparyR[ivxR]; + sort=0; + } + startpos+=1; + } //end of while startpos + } + proc set_neighbourR(){ + for i in 0..Ne-1 do { + lengthR[srcR[i]]+=1; + if (start_iR[srcR[i]] ==-1){ + start_iR[srcR[i]]=i; + } + } + neighbourR = lengthR; + + } + //proc set_common_symtableR():string throws { + proc set_common_symtableR() { + srcNameR = st.nextName(); + dstNameR = st.nextName(); + startNameR = st.nextName(); + neiNameR = st.nextName(); + var srcEntryR = new shared SymEntry(srcR); + var dstEntryR = new shared SymEntry(dstR); + var startEntryR = new shared SymEntry(start_iR); + var neiEntryR = new shared SymEntry(neighbourR); + try! st.addEntry(srcNameR, srcEntryR); + try! st.addEntry(dstNameR, dstEntryR); + try! st.addEntry(startNameR, startEntryR); + try! st.addEntry(neiNameR, neiEntryR); + } + + if (weighted!=0) { - repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + ' +' + sWeighted + + rmat_gen(); + twostep_sort(); + set_neighbour(); + srcR = dst; + dstR = src; + twostep_sortR(); + set_neighbourR(); + + //only for weighted graph + var ewName ,vwName:string; + var e_weight: [0..Ne-1] int; + var v_weight: [0..Nv-1] int; + + fillInt(e_weight,1,1000); + //fillRandom(e_weight,0,100); + fillInt(v_weight,1,1000); + //fillRandom(v_weight,0,100); + ewName = st.nextName(); + vwName = st.nextName(); + var vwEntry = new shared SymEntry(v_weight); + var ewEntry = new shared SymEntry(e_weight); + st.addEntry(vwName, vwEntry); + st.addEntry(ewName, ewEntry); + // end of weighted!=0 + + set_common_symtable(); + set_common_symtableR(); + + repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + ' +' + sWeighted + '+created ' + st.attrib(srcName) + '+created ' + st.attrib(dstName) + '+created ' + st.attrib(startName) + '+created ' + st.attrib(neiName) + '+created ' + st.attrib(srcNameR) + '+created ' + st.attrib(dstNameR) + '+created ' + st.attrib(startNameR) + '+created ' + st.attrib(neiNameR) + '+created ' + st.attrib(vwName) + '+created ' + st.attrib(ewName); + + } else { - repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + ' +' + sWeighted + + + rmat_gen(); + twostep_sort(); + set_neighbour(); + srcR = dst; + dstR = src; + twostep_sortR(); + set_neighbourR(); + + repMsg = sNv + '+ ' + sNe + '+ ' + sDirected + ' +' + sWeighted + '+created ' + st.attrib(srcName) + '+created ' + st.attrib(dstName) + '+created ' + st.attrib(startName) + '+created ' + st.attrib(neiName) + '+created ' + st.attrib(srcNameR) + '+created ' + st.attrib(dstNameR) + '+created ' + st.attrib(startNameR) + '+created ' + st.attrib(neiNameR) ; - } + + } } smLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg); return repMsg; @@ -1645,20 +1824,120 @@ proc segmentedPeelMsg(cmd: string, payload: bytes, st: borrowed SymTab): string var Ne=n_edgesN:int; var Directed=directedN:int; var Weighted=weightedN:int; + var depthName:string; + var depth=-1: [0..Nv-1] int; + var root:int; + depth[root]=0; + var srcN, dstN, startN, neighbourN,vweightN,eweightN, rootN :string; + var srcRN, dstRN, startRN, neighbourRN:string; + + //proc bfs_kernel(nei:[?D1] int, start_i:[?D2] int,dst:[?D3] int):string throws{ + proc bfs_kernel(nei:[?D1] int, start_i:[?D2] int,dst:[?D3] int){ + root=try! rootN:int; + var cur_level=0; + var SetCurF= try! new set(int,parSafe = true); + var SetNextF=try! new set(int,parSafe = true); + try! SetCurF.add(root); + var numCurF=1:int; + + while (numCurF>0) { + SetNextF.clear(); + forall i in SetCurF with (ref SetNextF) { + var numNF=-1 :int; + ref nf=nei; + ref sf=start_i; + ref df=dst; + numNF=nf[i]; + ref NF=df[sf[i]..sf[i]+numNF-1]; + if (numNF>0) { + //forall j in NF { + for j in NF { + if (depth[j]==-1) { + depth[j]=cur_level+1; + SetNextF.add(j); + } + } + } + + }//end forall i + cur_level+=1; + //writeln("SetCurF= ", SetCurF, "SetNextF=", SetNextF, " level ", cur_level+1); + numCurF=SetNextF.size; + SetCurF=SetNextF; + } + } + + //proc return_depth(): string throws{ + proc return_depth(){ + var depthName = st.nextName(); + var depthEntry = new shared SymEntry(depth); + try! st.addEntry(depthName, depthEntry); + repMsg = 'created ' + (try! st.attrib(depthName)); + } + //proc return_pair():string throws{ + proc return_pair(){ + var vertexValue = radixSortLSD_ranks(depth); + var levelValue=depth[vertexValue]; + var levelName = st.nextName(); + var vertexName = st.nextName(); + var levelEntry = new shared SymEntry(levelValue); + var vertexEntry = new shared SymEntry(vertexValue); + try! st.addEntry(levelName, levelEntry); + try! st.addEntry(vertexName, vertexEntry); + repMsg = 'created ' + st.attrib(levelName) + '+created ' + st.attrib(vertexName) ; + } if (Directed!=0) { if (Weighted!=0) { - repMsg=BFS_DW(Nv, Ne,Directed,Weighted,restpart,st); + //repMsg=BFS_DW(Nv, Ne,Directed,Weighted,restpart,st); + //var pn = Reflection.getRoutineName(); + (srcN, dstN, startN, neighbourN,vweightN,eweightN, rootN)= + restpart.splitMsgToTuple(7); + + var ag = new owned SegGraphDW(Nv,Ne,Directed,Weighted,srcN,dstN, + startN,neighbourN,vweightN,eweightN, st); + bfs_kernel(ag.neighbour.a, ag.start_i.a,ag.dst.a); + return_depth(); + } else { - repMsg=BFS_D(Nv, Ne,Directed,Weighted,restpart,st); + //repMsg=BFS_D(Nv, Ne,Directed,Weighted,restpart,st); + + (srcN, dstN, startN, neighbourN,rootN )=restpart.splitMsgToTuple(5); + var ag = new owned SegGraphD(Nv,Ne,Directed,Weighted,srcN,dstN, + startN,neighbourN,st); + + + bfs_kernel(ag.neighbour.a, ag.start_i.a,ag.dst.a); + return_depth(); + } } else { if (Weighted!=0) { - repMsg=BFS_UDW(Nv, Ne,Directed,Weighted,restpart,st); + //repMsg=BFS_UDW(Nv, Ne,Directed,Weighted,restpart,st); + + (srcN, dstN, startN, neighbourN,srcRN, dstRN, startRN, neighbourRN,vweightN,eweightN, rootN )= + restpart.splitMsgToTuple(11); + var ag = new owned SegGraphUDW(Nv,Ne,Directed,Weighted, + srcN,dstN, startN,neighbourN, + srcRN,dstRN, startRN,neighbourRN, + vweightN,eweightN, st); + bfs_kernel(ag.neighbour.a, ag.start_i.a,ag.dst.a); + return_depth(); + } else { - repMsg=BFS_UD(Nv, Ne,Directed,Weighted,restpart,st); + //repMsg=BFS_UD(Nv, Ne,Directed,Weighted,restpart,st); + + (srcN, dstN, startN, neighbourN,srcRN, dstRN, startRN, neighbourRN, rootN )= + restpart.splitMsgToTuple(9); + var ag = new owned SegGraphUD(Nv,Ne,Directed,Weighted, + srcN,dstN, startN,neighbourN, + srcRN,dstRN, startRN,neighbourRN, + st); + + bfs_kernel(ag.neighbour.a, ag.start_i.a,ag.dst.a); + return_depth(); } } return repMsg;