Closes Bears-R-Us#3183, Bears-R-Us#3364: Add exponential distributi…

…on and aggregation to random generator loop (Bears-R-Us#3310) * Closes Bears-R-Us#3183, Bears-R-Us#3364: Add `exponential` distribution and aggregation to random generation loop This PR (closes Bears-R-Us#3183) adds `exponential` and `standard_exponential` to random number generators implementing both the inverse cdf and ziggurat methods Added copy aggregation to `uniformStreamPerElem`. When a chunk a random stream is assigned to overflows onto the next locale, we have the current locale "steal" those indices. This results in some PUTs to write those results to the correct locale after they are generated. This seems to be the only communication based on my local testing. At first, I created a copy aggregator every time I created a randomStream (every `elemsPerStream` number of elements). I then compared these communication numbers to creating a copy aggregator for only the last chunk on a locale and they didn't change. So I left that implementation, this will hopefully reduce start up costs since we only create `numLocales` aggregators instead of `total_size / elemsPerStream` aggregators I'd also definitely like feedback from someone who understands aggregation a bit better than me to sanity check that I'm not doing anything dumb since I normally only use them with `forall` loops. I based this off radixSortLSD since it also uses aggregations with coforalls. But that creates an aggregator per task so idk how this would change things * make a few consts params instead --------- Co-authored-by: Tess Hayes <stress-tess@users.noreply.github.com>
ajpotts · Jun 26, 2024 · e0cc053 · e0cc053
1 parent 4e5d4c6
commit e0cc053
Show file tree

Hide file tree

Showing 7 changed files with 1,039 additions and 18 deletions.
diff --git a/PROTO_tests/tests/random_test.py b/PROTO_tests/tests/random_test.py
@@ -217,6 +217,22 @@ def test_poissson(self):
         assert rng.poisson(lam=scal_lam, size=num_samples).to_list() == scal_sample
         assert rng.poisson(lam=arr_lam, size=num_samples).to_list() == arr_sample
 
+    def test_exponential(self):
+        rng = ak.random.default_rng(17)
+        num_samples = 5
+        # scalar scale
+        scal_scale = 2
+        scal_sample = rng.exponential(scale=scal_scale, size=num_samples).to_list()
+
+        # array scale
+        arr_scale = ak.arange(5)
+        arr_sample = rng.exponential(scale=arr_scale, size=num_samples).to_list()
+
+        # reset rng with same seed and ensure we get same results
+        rng = ak.random.default_rng(17)
+        assert rng.exponential(scale=scal_scale, size=num_samples).to_list() == scal_sample
+        assert rng.exponential(scale=arr_scale, size=num_samples).to_list() == arr_sample
+
     def test_choice_hypothesis_testing(self):
         # perform a weighted sample and use chisquare to test
         # if the observed frequency matches the expected frequency
@@ -304,6 +320,25 @@ def test_poisson_hypothesis_testing(self):
         _, pval = sp_stats.chisquare(f_obs=obs_counts, f_exp=exp_counts)
         assert pval > 0.05
 
+    def test_exponential_hypothesis_testing(self):
+        # I tested this many times without a set seed, but with no seed
+        # it's expected to fail one out of every ~20 runs given a pval limit of 0.05.
+        rng = ak.random.default_rng(43)
+        num_samples = 10**4
+
+        scale = rng.uniform(0, 10)
+        for method in "zig", "inv":
+            sample = rng.exponential(scale=scale, size=num_samples, method=method)
+            sample_list = sample.to_list()
+
+            # do the Kolmogorov-Smirnov test for goodness of fit
+            ks_res = sp_stats.kstest(
+                rvs=sample_list,
+                cdf=sp_stats.expon.cdf,
+                args=(0, scale),
+            )
+            assert ks_res.pvalue > 0.05
+
     def test_legacy_randint(self):
         testArray = ak.random.randint(0, 10, 5)
         assert isinstance(testArray, ak.pdarray)

diff --git a/arkouda/random/_generator.py b/arkouda/random/_generator.py
@@ -138,6 +138,83 @@ def choice(self, a, size=None, replace=True, p=None):
         pda = create_pdarray(rep_msg)
         return pda if not ret_scalar else pda[0]
 
+    def exponential(self, scale=1.0, size=None, method="zig"):
+        r"""
+        Draw samples from an exponential distribution.
+
+        Its probability density function is
+
+        .. math::
+            f(x; \frac{1}{\beta}) = \frac{1}{\beta} \exp(-\frac{x}{\beta}),
+
+        for ``x > 0`` and 0 elsewhere. :math:`\beta` is the scale parameter,
+        which is the inverse of the rate parameter :math:`\lambda = 1/\beta`.
+        The rate parameter is an alternative, widely used parameterization
+        of the exponential distribution.
+
+        Parameters
+        ----------
+        scale: float or pdarray
+            The scale parameter, :math:`\beta = 1/\lambda`. Must be
+            non-negative. An array must have the same size as the size argument.
+        size: numeric_scalars, optional
+            Output shape. Default is None, in which case a single value is returned.
+        method : str, optional
+            Either 'inv' or 'zig'. 'inv' uses the default inverse CDF method.
+            'zig' uses the Ziggurat method.
+
+        Returns
+        -------
+        pdarray
+            Drawn samples from the parameterized exponential distribution.
+        """
+        if isinstance(scale, pdarray):
+            if (scale < 0).any():
+                raise ValueError("scale cannot be less then 0")
+        elif _val_isinstance_of_union(scale, numeric_scalars):
+            if scale < 0:
+                raise ValueError("scale cannot be less then 0")
+        else:
+            raise TypeError("scale must be either a float scalar or pdarray")
+        return scale * self.standard_exponential(size, method=method)
+
+    def standard_exponential(self, size=None, method="zig"):
+        """
+        Draw samples from the standard exponential distribution.
+
+        `standard_exponential` is identical to the exponential distribution
+        with a scale parameter of 1.
+
+        Parameters
+        ----------
+        size: numeric_scalars, optional
+            Output shape. Default is None, in which case a single value is returned.
+        method : str, optional
+            Either 'inv' or 'zig'. 'inv' uses the default inverse CDF method.
+            'zig' uses the Ziggurat method.
+
+        Returns
+        -------
+        pdarray
+            Drawn samples from the standard exponential distribution.
+        """
+        if size is None:
+            # delegate to numpy when return size is 1
+            return self._np_generator.standard_exponential(method=method)
+
+        rep_msg = generic_msg(
+            cmd="standardExponential",
+            args={
+                "name": self._name_dict[akfloat64],
+                "size": size,
+                "method": method.upper(),
+                "has_seed": self._seed is not None,
+                "state": self._state,
+            },
+        )
+        self._state += size if method.upper() == "INV" else 1
+        return create_pdarray(rep_msg)
+
     def integers(self, low, high=None, size=None, dtype=akint64, endpoint=False):
         """
         Return random integers from low (inclusive) to high (exclusive),

diff --git a/pydoc/usage/random.rst b/pydoc/usage/random.rst
@@ -21,6 +21,10 @@ choice
 ---------
 .. autofunction:: arkouda.random.Generator.choice
 
+exponential
+---------
+.. autofunction:: arkouda.random.Generator.exponential
+
 integers
 ---------
 .. autofunction:: arkouda.random.Generator.integers
@@ -45,6 +49,10 @@ random
 ---------
 .. autofunction:: arkouda.random.Generator.random
 
+standard_exponential
+---------
+.. autofunction:: arkouda.random.Generator.standard_exponential
+
 standard_normal
 ---------
 .. autofunction:: arkouda.random.Generator.standard_normal

diff --git a/src/RandMsg.chpl b/src/RandMsg.chpl
@@ -12,6 +12,7 @@ module RandMsg
     use RandArray;
     use RandUtil;
     use CommAggregation;
+    use ZigguratConstants;
 
     use MultiTypeSymbolTable;
     use MultiTypeSymEntry;
@@ -333,6 +334,112 @@ module RandMsg
         return new MsgTuple(repMsg, MsgType.NORMAL);
     }
 
+    /*
+        Use the ziggurat method (https://en.wikipedia.org/wiki/Ziggurat_algorithm#Theory_of_operation)
+        to generate exponentially distributed numbers using n (num of rectangles) = 256.
+        The number of rectangles only impacts how quick the algorithm is, not it's accuracy.
+        This is relatively fast because the common case is not computationally expensive
+
+        In this algorithm we choose uniformly and then decide whether to accept the candidate or not
+        depending on whether it falls under the pdf of our distribution
+
+        A good explaination of the ziggurat algorithm is:
+        https://blogs.mathworks.com/cleve/2015/05/18/the-ziggurat-random-normal-generator/
+
+        This implementation based on numpy's:
+        https://github.com/numpy/numpy/blob/main/numpy/random/src/distributions/distributions.c
+
+        Uses constants from lists of size 256 (number of rectangles) found in ZigguratConstants.chpl
+    */
+    inline proc standardExponentialZig(ref realRng, ref uintRng): real {
+        // modified from numpy to use while loop instead of recusrsion so we can inline the proc
+        var count = 0;
+        // to guarantee completion this should be while true, but limiting to 100 tries will make the chance
+        // of failure near zero while avoiding the possibility of an infinite loop
+        // the odds of failing 100 times in a row is (1 - .989)**100 = 1.3781e-196
+        while count <= 100 {
+            var ri = uintRng.next();
+            ri >>= 3;
+
+            // AND with 0xFF (255) to get our index into our 256 long const arrays
+            var idx = ri & 0xFF;
+            ri >>= 8;
+            var x = ri * we_double[idx];
+            if ri < ke_double[idx] {
+                // the point fell in the core of one of our rectangular slices, so we're guaranteed
+                // it falls under the pdf curve. We can return it as a sample from our distribution.
+                // We will return here 98.9% of the time on the 1st try
+                return x;
+            }
+
+            // The fall back algorithm for calculating if the sample point lies under the pdf.
+            // Either in the tip of one of the rectangles or in the tail of the distribution.
+            // See https://blogs.mathworks.com/cleve/2015/05/18/the-ziggurat-random-normal-generator/
+            // the tip calculation is based on standardExponentialUnlikely from numpy defined here:
+            // https://github.com/numpy/numpy/blob/main/numpy/random/src/distributions/distributions.c
+
+            // candidate point did not fall in the core of any rectangular slices. Either it lies in the
+            // first slice (which doesn't have a core), in the tail of the distribution, or in the tip of one of slices.
+            if idx == 0 {
+                // first rectangular slice; let x = x1 − ln(U1)
+                return ziggurat_exp_r - log1p(-realRng.next());
+            }
+            else if (fe_double[idx-1] - fe_double[idx]) * realRng.next() + fe_double[idx] < exp(-x) {
+                // tip calculation
+                return x;
+            }
+            // reject sample and retry
+        }
+        return -1.0;  // we failed 100 times in a row which should practically never happen
+    }
+
+    proc standardExponentialMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab): MsgTuple throws {
+        const pn = Reflection.getRoutineName(),
+              name = msgArgs.getValueOf("name"),                // generator name
+              size = msgArgs.get("size").getIntValue(),         // population size
+              method = msgArgs.getValueOf("method"),            // method to use to generate exponential samples
+              hasSeed = msgArgs.get("has_seed").getBoolValue(), // boolean indicating if the generator has a seed
+              state = msgArgs.get("state").getIntValue(),       // rng state
+              rname = st.nextName();
+
+        randLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),
+                                "name: %? size %i method %s state %i".doFormat(name, size, method, state));
+
+        st.checkTable(name);
+
+        var generatorEntry: borrowed GeneratorSymEntry(real) = toGeneratorSymEntry(st.lookup(name), real);
+        ref rng = generatorEntry.generator;
+        if state != 1 {
+            // you have to skip to one before where you want to be
+            rng.skipTo(state-1);
+        }
+
+        select method {
+            when "ZIG" {
+                var exponentialArr = makeDistArray(size, real);
+                uniformStreamPerElem(exponentialArr, rng, GenerationFunction.ExponentialGenerator, hasSeed);
+                st.addEntry(rname, createSymEntry(exponentialArr));
+            }
+            when "INV" {
+                var u1 = makeDistArray(size, real);
+                rng.fill(u1);
+
+                // calculate the exponential by doing the inverse of the cdf
+                var exponentialArr = -log1p(-u1);
+                st.addEntry(rname, createSymEntry(exponentialArr));
+            }
+            otherwise {
+                var errorMsg = "Only ZIG and INV are supported for method. Recieved: %s".doFormat(method);
+                randLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg);
+                return new MsgTuple(notImplementedError(pn, errorMsg), MsgType.ERROR);
+            }
+        }
+
+        var repMsg = "created " + st.attrib(rname);
+        randLogger.debug(getModuleName(),getRoutineName(),getLineNumber(),repMsg);
+        return new MsgTuple(repMsg, MsgType.NORMAL);
+    }
+
     proc segmentedSampleMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab): MsgTuple throws {
         const pn = Reflection.getRoutineName(),
               genName = msgArgs.getValueOf("genName"),                          // generator name
@@ -588,7 +695,7 @@ module RandMsg
         var poissonArr = makeDistArray(size, int);
         const lam = new scalarOrArray(lamStr, !isSingleLam, st);
 
-        uniformStreamPerElem(poissonArr, GenerationFunction.PoissonGenerator, hasSeed, lam, rng);
+        uniformStreamPerElem(poissonArr, rng, GenerationFunction.PoissonGenerator, hasSeed, lam);
         st.addEntry(rname, createSymEntry(poissonArr));
 
         const repMsg = "created " + st.attrib(rname);
@@ -652,6 +759,7 @@ module RandMsg
     registerFunction("createGenerator", createGeneratorMsg, getModuleName());
     registerFunction("uniformGenerator", uniformGeneratorMsg, getModuleName());
     registerFunction("standardNormalGenerator", standardNormalGeneratorMsg, getModuleName());
+    registerFunction("standardExponential", standardExponentialMsg, getModuleName());
     registerFunction("segmentedSample", segmentedSampleMsg, getModuleName());
     registerFunction("choice", choiceMsg, getModuleName());
     registerFunction("permutation", permutationMsg, getModuleName());