diff --git a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java index e32b73577..a50d92046 100644 --- a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java @@ -172,11 +172,6 @@ private enum Flags { IS_BIG_ENDIAN, IS_IN_SAMPLING_MODE, IS_EMPTY, HAS_ENTRIES, count = mem.getInt(offset); offset += Integer.BYTES; } - // if (version == serialVersionWithSummaryFactoryUID) { - // final DeserializeResult> factoryResult = - // SerializerDeserializer.deserializeFromMemory(mem, offset); - // offset += factoryResult.getSize(); - // } final int currentCapacity = 1 << lgCurrentCapacity_; keys_ = new long[currentCapacity]; for (int i = 0; i < count; i++) { diff --git a/src/main/java/org/apache/datasketches/tuple/Union.java b/src/main/java/org/apache/datasketches/tuple/Union.java index 6b024c1dd..7800b55fa 100644 --- a/src/main/java/org/apache/datasketches/tuple/Union.java +++ b/src/main/java/org/apache/datasketches/tuple/Union.java @@ -19,8 +19,13 @@ package org.apache.datasketches.tuple; +import static java.lang.Math.min; import static org.apache.datasketches.Util.DEFAULT_NOMINAL_ENTRIES; +import java.lang.reflect.Array; + +import org.apache.datasketches.QuickSelect; + /** * Compute a union of two or more tuple sketches. * A new instance represents an empty set. @@ -29,10 +34,10 @@ * @param Type of Summary */ public class Union { - private final int nomEntries_; private final SummarySetOperations summarySetOps_; private QuickSelectSketch sketch_; private long theta_; // need to maintain outside of the sketch + private boolean isEmpty_; /** * Creates new instance with default nominal entries @@ -49,10 +54,10 @@ public Union(final SummarySetOperations summarySetOps) { * @param summarySetOps instance of SummarySetOperations */ public Union(final int nomEntries, final SummarySetOperations summarySetOps) { - nomEntries_ = nomEntries; summarySetOps_ = summarySetOps; - sketch_ = new QuickSelectSketch(nomEntries, null); + sketch_ = new QuickSelectSketch<>(nomEntries, null); theta_ = sketch_.getThetaLong(); + isEmpty_ = true; } /** @@ -60,31 +65,72 @@ public Union(final int nomEntries, final SummarySetOperations summarySetOps) * @param sketchIn input sketch to add to the internal set */ public void update(final Sketch sketchIn) { - if (sketchIn == null || sketchIn.isEmpty()) { return; } + if ((sketchIn == null) || sketchIn.isEmpty()) { return; } + isEmpty_ = false; if (sketchIn.theta_ < theta_) { theta_ = sketchIn.theta_; } final SketchIterator it = sketchIn.iterator(); while (it.next()) { sketch_.merge(it.getKey(), it.getSummary(), summarySetOps_); } + if (sketch_.theta_ < theta_) { + theta_ = sketch_.theta_; + } } /** * Gets the internal set as a CompactSketch * @return result of the unions so far */ + @SuppressWarnings("unchecked") public CompactSketch getResult() { - sketch_.trim(); - if (theta_ < sketch_.theta_) { - sketch_.setThetaLong(theta_); - sketch_.rebuild(); + if (isEmpty_) { + return sketch_.compact(); + } + if ((theta_ >= sketch_.theta_) && (sketch_.getRetainedEntries() <= sketch_.getNominalEntries())) { + return sketch_.compact(); + } + long theta = min(theta_, sketch_.theta_); + + int num = 0; + { + final SketchIterator it = sketch_.iterator(); + while (it.next()) { + if (it.getKey() < theta) { num++; } + } } - return sketch_.compact(); + if (num == 0) { + return new CompactSketch<>(null, null, theta, isEmpty_); + } + if (num > sketch_.getNominalEntries()) { + final long[] keys = new long[num]; // temporary since the order will be destroyed by quick select + final SketchIterator it = sketch_.iterator(); + int i = 0; + while (it.next()) { + if (it.getKey() < theta) { keys[i++] = it.getKey(); } + } + theta = QuickSelect.select(keys, 0, num - 1, sketch_.getNominalEntries()); + num = sketch_.getNominalEntries(); + } + final long[] keys = new long[num]; + final S[] summaries = (S[]) Array.newInstance(sketch_.summaries_.getClass().getComponentType(), num); + final SketchIterator it = sketch_.iterator(); + int i = 0; + while (it.next()) { + if (it.getKey() < theta) { + keys[i] = it.getKey(); + summaries[i] = (S) it.getSummary().copy(); + i++; + } + } + return new CompactSketch<>(keys, summaries, theta, isEmpty_); } /** * Resets the internal set to the initial state, which represents an empty set */ public void reset() { - sketch_ = new QuickSelectSketch(nomEntries_, null); + sketch_.reset(); + theta_ = sketch_.getThetaLong(); + isEmpty_ = true; } } diff --git a/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java b/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java index dd07d428e..174e28232 100644 --- a/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java @@ -55,8 +55,8 @@ public class UpdatableSketch> extends QuickSele * See Sampling Probability * @param summaryFactory An instance of a SummaryFactory. */ - public UpdatableSketch(final int nomEntries, final int lgResizeFactor, final float samplingProbability, - final SummaryFactory summaryFactory) { + public UpdatableSketch(final int nomEntries, final int lgResizeFactor, + final float samplingProbability, final SummaryFactory summaryFactory) { super(nomEntries, lgResizeFactor, samplingProbability, summaryFactory); } diff --git a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java index 57cc8e6d8..92e2cbe98 100644 --- a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java @@ -35,7 +35,26 @@ public class DoubleSketch extends UpdatableSketch { * @param mode The DoubleSummary mode to be used */ public DoubleSketch(final int lgK, final DoubleSummary.Mode mode) { - super(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, new DoubleSummaryFactory(mode)); + this(lgK, ResizeFactor.X8.ordinal(), 1.0F, mode); + } + + /** + * Creates this sketch with the following parameters: + * @param lgK Log_base2 of Nominal Entries. + * @param lgResizeFactor log2(resizeFactor) - value from 0 to 3: + *
+   * 0 - no resizing (max size allocated),
+   * 1 - double internal hash table each time it reaches a threshold
+   * 2 - grow four times
+   * 3 - grow eight times (default)
+   * 
+ * @param samplingProbability + * See Sampling Probability + * @param mode The DoubleSummary mode to be used + */ + public DoubleSketch(final int lgK, final int lgResizeFactor, final float samplingProbability, + final DoubleSummary.Mode mode) { + super(1 << lgK, lgResizeFactor, samplingProbability, new DoubleSummaryFactory(mode)); } /** diff --git a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java index 9d759125b..03ca7d093 100644 --- a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java @@ -35,7 +35,26 @@ public class IntegerSketch extends UpdatableSketch { * @param mode The IntegerSummary mode to be used */ public IntegerSketch(final int lgK, final IntegerSummary.Mode mode) { - super(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, new IntegerSummaryFactory(mode)); + this(lgK, ResizeFactor.X8.ordinal(), 1.0F, mode); + } + + /** + * Creates this sketch with the following parameters: + * @param lgK Log_base2 of Nominal Entries. + * @param lgResizeFactor log2(resizeFactor) - value from 0 to 3: + *
+   * 0 - no resizing (max size allocated),
+   * 1 - double internal hash table each time it reaches a threshold
+   * 2 - grow four times
+   * 3 - grow eight times (default)
+   * 
+ * @param samplingProbability + * See Sampling Probability + * @param mode The IntegerSummary mode to be used + */ + public IntegerSketch(final int lgK, final int lgResizeFactor, final float samplingProbability, + final IntegerSummary.Mode mode) { + super(1 << lgK, lgResizeFactor, samplingProbability, new IntegerSummaryFactory(mode)); } /** diff --git a/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java b/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java index 8e1aefac7..cfc4dcf71 100644 --- a/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java +++ b/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java @@ -230,13 +230,6 @@ public void updatesOfAllKeyTypes() { Assert.assertEquals(sketch.getEstimate(), 6.0); } -// @Test -// public void updateDoubleSummary() { -// DoubleSummary ds = new DoubleSummary(); -// ds.update(1.0); -// Assert.assertEquals(ds.getValue(), 1.0); -// } - @Test public void doubleSummaryDefaultSumMode() { UpdatableSketch sketch = @@ -402,6 +395,22 @@ public void serializeDeserializeSampling() throws Exception { Assert.assertEquals(sketch1.getTheta(), sketch2.getTheta()); } + @Test + public void unionEmptySampling() { + UpdatableSketch sketch = + new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).setSamplingProbability(0.01f).build(); + sketch.update(1, 1.0); + Assert.assertEquals(sketch.getRetainedEntries(), 0); // not retained due to low sampling probability + + Union union = new Union<>(new DoubleSummarySetOperations(mode)); + union.update(sketch); + CompactSketch result = union.getResult(); + Assert.assertEquals(result.getRetainedEntries(), 0); + Assert.assertFalse(result.isEmpty()); + Assert.assertTrue(result.isEstimationMode()); + Assert.assertEquals(result.getEstimate(), 0.0); + } + @Test public void unionExactMode() { UpdatableSketch sketch1 =