Skip to content

Commit

Permalink
Closes Bears-R-Us#3183, Bears-R-Us#3364: Add exponential distributi…
Browse files Browse the repository at this point in the history
…on and aggregation to random generator loop (Bears-R-Us#3310)

* Closes Bears-R-Us#3183, Bears-R-Us#3364: Add `exponential` distribution and aggregation to random generation loop

This PR (closes Bears-R-Us#3183) adds `exponential` and `standard_exponential` to random number generators implementing both the inverse cdf and ziggurat methods

Added copy aggregation to `uniformStreamPerElem`. When a chunk a random stream is assigned to overflows onto the next locale, we have the current locale "steal" those indices. This results in some PUTs to write those results to the correct locale after they are generated.

This seems to be the only communication based on my local testing. At first, I created a copy aggregator every time I created a randomStream (every `elemsPerStream` number of elements). I then compared these communication numbers to creating a copy aggregator for only the last chunk on a locale and they didn't change. So I left that implementation, this will hopefully reduce start up costs since we only create `numLocales` aggregators instead of `total_size / elemsPerStream` aggregators

I'd also definitely like feedback from someone who understands aggregation a bit better than me to sanity check that I'm not doing anything dumb since I normally only use them with `forall` loops. I based this off radixSortLSD since it also uses aggregations with coforalls. But that creates an aggregator per task so idk how this would change things

* make a few consts params instead

---------

Co-authored-by: Tess Hayes <stress-tess@users.noreply.github.com>
  • Loading branch information
stress-tess and stress-tess authored Jun 26, 2024
1 parent 4e5d4c6 commit e0cc053
Show file tree
Hide file tree
Showing 7 changed files with 1,039 additions and 18 deletions.
35 changes: 35 additions & 0 deletions PROTO_tests/tests/random_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,22 @@ def test_poissson(self):
assert rng.poisson(lam=scal_lam, size=num_samples).to_list() == scal_sample
assert rng.poisson(lam=arr_lam, size=num_samples).to_list() == arr_sample

def test_exponential(self):
rng = ak.random.default_rng(17)
num_samples = 5
# scalar scale
scal_scale = 2
scal_sample = rng.exponential(scale=scal_scale, size=num_samples).to_list()

# array scale
arr_scale = ak.arange(5)
arr_sample = rng.exponential(scale=arr_scale, size=num_samples).to_list()

# reset rng with same seed and ensure we get same results
rng = ak.random.default_rng(17)
assert rng.exponential(scale=scal_scale, size=num_samples).to_list() == scal_sample
assert rng.exponential(scale=arr_scale, size=num_samples).to_list() == arr_sample

def test_choice_hypothesis_testing(self):
# perform a weighted sample and use chisquare to test
# if the observed frequency matches the expected frequency
Expand Down Expand Up @@ -304,6 +320,25 @@ def test_poisson_hypothesis_testing(self):
_, pval = sp_stats.chisquare(f_obs=obs_counts, f_exp=exp_counts)
assert pval > 0.05

def test_exponential_hypothesis_testing(self):
# I tested this many times without a set seed, but with no seed
# it's expected to fail one out of every ~20 runs given a pval limit of 0.05.
rng = ak.random.default_rng(43)
num_samples = 10**4

scale = rng.uniform(0, 10)
for method in "zig", "inv":
sample = rng.exponential(scale=scale, size=num_samples, method=method)
sample_list = sample.to_list()

# do the Kolmogorov-Smirnov test for goodness of fit
ks_res = sp_stats.kstest(
rvs=sample_list,
cdf=sp_stats.expon.cdf,
args=(0, scale),
)
assert ks_res.pvalue > 0.05

def test_legacy_randint(self):
testArray = ak.random.randint(0, 10, 5)
assert isinstance(testArray, ak.pdarray)
Expand Down
77 changes: 77 additions & 0 deletions arkouda/random/_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,83 @@ def choice(self, a, size=None, replace=True, p=None):
pda = create_pdarray(rep_msg)
return pda if not ret_scalar else pda[0]

def exponential(self, scale=1.0, size=None, method="zig"):
r"""
Draw samples from an exponential distribution.
Its probability density function is
.. math::
f(x; \frac{1}{\beta}) = \frac{1}{\beta} \exp(-\frac{x}{\beta}),
for ``x > 0`` and 0 elsewhere. :math:`\beta` is the scale parameter,
which is the inverse of the rate parameter :math:`\lambda = 1/\beta`.
The rate parameter is an alternative, widely used parameterization
of the exponential distribution.
Parameters
----------
scale: float or pdarray
The scale parameter, :math:`\beta = 1/\lambda`. Must be
non-negative. An array must have the same size as the size argument.
size: numeric_scalars, optional
Output shape. Default is None, in which case a single value is returned.
method : str, optional
Either 'inv' or 'zig'. 'inv' uses the default inverse CDF method.
'zig' uses the Ziggurat method.
Returns
-------
pdarray
Drawn samples from the parameterized exponential distribution.
"""
if isinstance(scale, pdarray):
if (scale < 0).any():
raise ValueError("scale cannot be less then 0")
elif _val_isinstance_of_union(scale, numeric_scalars):
if scale < 0:
raise ValueError("scale cannot be less then 0")
else:
raise TypeError("scale must be either a float scalar or pdarray")
return scale * self.standard_exponential(size, method=method)

def standard_exponential(self, size=None, method="zig"):
"""
Draw samples from the standard exponential distribution.
`standard_exponential` is identical to the exponential distribution
with a scale parameter of 1.
Parameters
----------
size: numeric_scalars, optional
Output shape. Default is None, in which case a single value is returned.
method : str, optional
Either 'inv' or 'zig'. 'inv' uses the default inverse CDF method.
'zig' uses the Ziggurat method.
Returns
-------
pdarray
Drawn samples from the standard exponential distribution.
"""
if size is None:
# delegate to numpy when return size is 1
return self._np_generator.standard_exponential(method=method)

rep_msg = generic_msg(
cmd="standardExponential",
args={
"name": self._name_dict[akfloat64],
"size": size,
"method": method.upper(),
"has_seed": self._seed is not None,
"state": self._state,
},
)
self._state += size if method.upper() == "INV" else 1
return create_pdarray(rep_msg)

def integers(self, low, high=None, size=None, dtype=akint64, endpoint=False):
"""
Return random integers from low (inclusive) to high (exclusive),
Expand Down
8 changes: 8 additions & 0 deletions pydoc/usage/random.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ choice
---------
.. autofunction:: arkouda.random.Generator.choice

exponential
---------
.. autofunction:: arkouda.random.Generator.exponential

integers
---------
.. autofunction:: arkouda.random.Generator.integers
Expand All @@ -45,6 +49,10 @@ random
---------
.. autofunction:: arkouda.random.Generator.random

standard_exponential
---------
.. autofunction:: arkouda.random.Generator.standard_exponential

standard_normal
---------
.. autofunction:: arkouda.random.Generator.standard_normal
Expand Down
110 changes: 109 additions & 1 deletion src/RandMsg.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ module RandMsg
use RandArray;
use RandUtil;
use CommAggregation;
use ZigguratConstants;

use MultiTypeSymbolTable;
use MultiTypeSymEntry;
Expand Down Expand Up @@ -333,6 +334,112 @@ module RandMsg
return new MsgTuple(repMsg, MsgType.NORMAL);
}

/*
Use the ziggurat method (https://en.wikipedia.org/wiki/Ziggurat_algorithm#Theory_of_operation)
to generate exponentially distributed numbers using n (num of rectangles) = 256.
The number of rectangles only impacts how quick the algorithm is, not it's accuracy.
This is relatively fast because the common case is not computationally expensive
In this algorithm we choose uniformly and then decide whether to accept the candidate or not
depending on whether it falls under the pdf of our distribution
A good explaination of the ziggurat algorithm is:
https://blogs.mathworks.com/cleve/2015/05/18/the-ziggurat-random-normal-generator/
This implementation based on numpy's:
https://github.com/numpy/numpy/blob/main/numpy/random/src/distributions/distributions.c
Uses constants from lists of size 256 (number of rectangles) found in ZigguratConstants.chpl
*/
inline proc standardExponentialZig(ref realRng, ref uintRng): real {
// modified from numpy to use while loop instead of recusrsion so we can inline the proc
var count = 0;
// to guarantee completion this should be while true, but limiting to 100 tries will make the chance
// of failure near zero while avoiding the possibility of an infinite loop
// the odds of failing 100 times in a row is (1 - .989)**100 = 1.3781e-196
while count <= 100 {
var ri = uintRng.next();
ri >>= 3;

// AND with 0xFF (255) to get our index into our 256 long const arrays
var idx = ri & 0xFF;
ri >>= 8;
var x = ri * we_double[idx];
if ri < ke_double[idx] {
// the point fell in the core of one of our rectangular slices, so we're guaranteed
// it falls under the pdf curve. We can return it as a sample from our distribution.
// We will return here 98.9% of the time on the 1st try
return x;
}

// The fall back algorithm for calculating if the sample point lies under the pdf.
// Either in the tip of one of the rectangles or in the tail of the distribution.
// See https://blogs.mathworks.com/cleve/2015/05/18/the-ziggurat-random-normal-generator/
// the tip calculation is based on standardExponentialUnlikely from numpy defined here:
// https://github.com/numpy/numpy/blob/main/numpy/random/src/distributions/distributions.c

// candidate point did not fall in the core of any rectangular slices. Either it lies in the
// first slice (which doesn't have a core), in the tail of the distribution, or in the tip of one of slices.
if idx == 0 {
// first rectangular slice; let x = x1 − ln(U1)
return ziggurat_exp_r - log1p(-realRng.next());
}
else if (fe_double[idx-1] - fe_double[idx]) * realRng.next() + fe_double[idx] < exp(-x) {
// tip calculation
return x;
}
// reject sample and retry
}
return -1.0; // we failed 100 times in a row which should practically never happen
}

proc standardExponentialMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab): MsgTuple throws {
const pn = Reflection.getRoutineName(),
name = msgArgs.getValueOf("name"), // generator name
size = msgArgs.get("size").getIntValue(), // population size
method = msgArgs.getValueOf("method"), // method to use to generate exponential samples
hasSeed = msgArgs.get("has_seed").getBoolValue(), // boolean indicating if the generator has a seed
state = msgArgs.get("state").getIntValue(), // rng state
rname = st.nextName();

randLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),
"name: %? size %i method %s state %i".doFormat(name, size, method, state));

st.checkTable(name);

var generatorEntry: borrowed GeneratorSymEntry(real) = toGeneratorSymEntry(st.lookup(name), real);
ref rng = generatorEntry.generator;
if state != 1 {
// you have to skip to one before where you want to be
rng.skipTo(state-1);
}

select method {
when "ZIG" {
var exponentialArr = makeDistArray(size, real);
uniformStreamPerElem(exponentialArr, rng, GenerationFunction.ExponentialGenerator, hasSeed);
st.addEntry(rname, createSymEntry(exponentialArr));
}
when "INV" {
var u1 = makeDistArray(size, real);
rng.fill(u1);

// calculate the exponential by doing the inverse of the cdf
var exponentialArr = -log1p(-u1);
st.addEntry(rname, createSymEntry(exponentialArr));
}
otherwise {
var errorMsg = "Only ZIG and INV are supported for method. Recieved: %s".doFormat(method);
randLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg);
return new MsgTuple(notImplementedError(pn, errorMsg), MsgType.ERROR);
}
}

var repMsg = "created " + st.attrib(rname);
randLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg);
return new MsgTuple(repMsg, MsgType.NORMAL);
}

proc segmentedSampleMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab): MsgTuple throws {
const pn = Reflection.getRoutineName(),
genName = msgArgs.getValueOf("genName"), // generator name
Expand Down Expand Up @@ -588,7 +695,7 @@ module RandMsg
var poissonArr = makeDistArray(size, int);
const lam = new scalarOrArray(lamStr, !isSingleLam, st);

uniformStreamPerElem(poissonArr, GenerationFunction.PoissonGenerator, hasSeed, lam, rng);
uniformStreamPerElem(poissonArr, rng, GenerationFunction.PoissonGenerator, hasSeed, lam);
st.addEntry(rname, createSymEntry(poissonArr));

const repMsg = "created " + st.attrib(rname);
Expand Down Expand Up @@ -652,6 +759,7 @@ module RandMsg
registerFunction("createGenerator", createGeneratorMsg, getModuleName());
registerFunction("uniformGenerator", uniformGeneratorMsg, getModuleName());
registerFunction("standardNormalGenerator", standardNormalGeneratorMsg, getModuleName());
registerFunction("standardExponential", standardExponentialMsg, getModuleName());
registerFunction("segmentedSample", segmentedSampleMsg, getModuleName());
registerFunction("choice", choiceMsg, getModuleName());
registerFunction("permutation", permutationMsg, getModuleName());
Expand Down
Loading

0 comments on commit e0cc053

Please sign in to comment.