From 8be970c09331ec5f52072e1d179556a39069c956 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 16 Jan 2020 16:36:18 -0800 Subject: [PATCH 1/6] fixed getResult() --- .../org/apache/datasketches/tuple/Union.java | 54 ++++++++++++++++--- .../UpdatableSketchWithDoubleSummaryTest.java | 23 +++++--- 2 files changed, 64 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/apache/datasketches/tuple/Union.java b/src/main/java/org/apache/datasketches/tuple/Union.java index 6b024c1dd..f37155ce8 100644 --- a/src/main/java/org/apache/datasketches/tuple/Union.java +++ b/src/main/java/org/apache/datasketches/tuple/Union.java @@ -19,8 +19,13 @@ package org.apache.datasketches.tuple; +import static java.lang.Math.min; import static org.apache.datasketches.Util.DEFAULT_NOMINAL_ENTRIES; +import java.lang.reflect.Array; + +import org.apache.datasketches.QuickSelect; + /** * Compute a union of two or more tuple sketches. * A new instance represents an empty set. @@ -33,6 +38,7 @@ public class Union { private final SummarySetOperations summarySetOps_; private QuickSelectSketch sketch_; private long theta_; // need to maintain outside of the sketch + private boolean isEmpty_; /** * Creates new instance with default nominal entries @@ -53,6 +59,7 @@ public Union(final int nomEntries, final SummarySetOperations summarySetOps) summarySetOps_ = summarySetOps; sketch_ = new QuickSelectSketch(nomEntries, null); theta_ = sketch_.getThetaLong(); + isEmpty_ = true; } /** @@ -61,30 +68,65 @@ public Union(final int nomEntries, final SummarySetOperations summarySetOps) */ public void update(final Sketch sketchIn) { if (sketchIn == null || sketchIn.isEmpty()) { return; } + isEmpty_ = false; if (sketchIn.theta_ < theta_) { theta_ = sketchIn.theta_; } final SketchIterator it = sketchIn.iterator(); while (it.next()) { sketch_.merge(it.getKey(), it.getSummary(), summarySetOps_); } + if (sketch_.theta_ < theta_) theta_ = sketch_.theta_; } /** * Gets the internal set as a CompactSketch 
* @return result of the unions so far */ + @SuppressWarnings("unchecked") public CompactSketch getResult() { - sketch_.trim(); - if (theta_ < sketch_.theta_) { - sketch_.setThetaLong(theta_); - sketch_.rebuild(); + if (isEmpty_) return sketch_.compact(); + if (theta_ >= sketch_.theta_ && sketch_.getRetainedEntries() <= sketch_.getNominalEntries()) { + return sketch_.compact(); + } + long theta = min(theta_, sketch_.theta_); + + int num = 0; + { + final SketchIterator it = sketch_.iterator(); + while (it.next()) { + if (it.getKey() < theta) { num++; } + } } - return sketch_.compact(); + if (num == 0) return new CompactSketch<>(null, null, theta, isEmpty_); + if (num > sketch_.getNominalEntries()) { + final long[] keys = new long[num]; // temporary since the order will be destroyed by quick select + final SketchIterator it = sketch_.iterator(); + int i = 0; + while (it.next()) { + if (it.getKey() < theta) { keys[i++] = it.getKey(); } + } + theta = QuickSelect.select(keys, 0, num - 1, sketch_.getNominalEntries()); + num = sketch_.getNominalEntries(); + } + final long[] keys = new long[num]; + final S[] summaries = (S[]) Array.newInstance(sketch_.summaries_.getClass().getComponentType(), num); + final SketchIterator it = sketch_.iterator(); + int i = 0; + while (it.next()) { + if (it.getKey() < theta) { + keys[i] = it.getKey(); + summaries[i] = (S) it.getSummary().copy(); + i++; + } + } + return new CompactSketch<>(keys, summaries, theta, isEmpty_); } /** * Resets the internal set to the initial state, which represents an empty set */ public void reset() { - sketch_ = new QuickSelectSketch(nomEntries_, null); + sketch_.reset(); + theta_ = sketch_.getThetaLong(); + isEmpty_ = true; } } diff --git a/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java b/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java index 8e1aefac7..cfc4dcf71 100644 --- 
a/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java +++ b/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java @@ -230,13 +230,6 @@ public void updatesOfAllKeyTypes() { Assert.assertEquals(sketch.getEstimate(), 6.0); } -// @Test -// public void updateDoubleSummary() { -// DoubleSummary ds = new DoubleSummary(); -// ds.update(1.0); -// Assert.assertEquals(ds.getValue(), 1.0); -// } - @Test public void doubleSummaryDefaultSumMode() { UpdatableSketch sketch = @@ -402,6 +395,22 @@ public void serializeDeserializeSampling() throws Exception { Assert.assertEquals(sketch1.getTheta(), sketch2.getTheta()); } + @Test + public void unionEmptySampling() { + UpdatableSketch sketch = + new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).setSamplingProbability(0.01f).build(); + sketch.update(1, 1.0); + Assert.assertEquals(sketch.getRetainedEntries(), 0); // not retained due to low sampling probability + + Union union = new Union<>(new DoubleSummarySetOperations(mode)); + union.update(sketch); + CompactSketch result = union.getResult(); + Assert.assertEquals(result.getRetainedEntries(), 0); + Assert.assertFalse(result.isEmpty()); + Assert.assertTrue(result.isEstimationMode()); + Assert.assertEquals(result.getEstimate(), 0.0); + } + @Test public void unionExactMode() { UpdatableSketch sketch1 = From dcaa2dd303a438a0f6a1f7f98ae52a918f3a198a Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 16 Jan 2020 16:36:42 -0800 Subject: [PATCH 2/6] removed obsolete section --- .../org/apache/datasketches/tuple/QuickSelectSketch.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java index e32b73577..a50d92046 100644 --- a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java +++ 
b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java @@ -172,11 +172,6 @@ private enum Flags { IS_BIG_ENDIAN, IS_IN_SAMPLING_MODE, IS_EMPTY, HAS_ENTRIES, count = mem.getInt(offset); offset += Integer.BYTES; } - // if (version == serialVersionWithSummaryFactoryUID) { - // final DeserializeResult> factoryResult = - // SerializerDeserializer.deserializeFromMemory(mem, offset); - // offset += factoryResult.getSize(); - // } final int currentCapacity = 1 << lgCurrentCapacity_; keys_ = new long[currentCapacity]; for (int i = 0; i < count; i++) { From 8b43f12eaa3f849fab9f544c55378bf7bdc45903 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 16 Jan 2020 16:41:43 -0800 Subject: [PATCH 3/6] Add additional ctr to IntegerSketch and DoubleSketch --- .../org/apache/datasketches/tuple/Union.java | 6 +++--- .../datasketches/tuple/UpdatableSketch.java | 4 ++-- .../tuple/adouble/DoubleSketch.java | 19 +++++++++++++++++++ .../tuple/aninteger/IntegerSketch.java | 19 +++++++++++++++++++ 4 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/apache/datasketches/tuple/Union.java b/src/main/java/org/apache/datasketches/tuple/Union.java index 6b024c1dd..b71ad3b43 100644 --- a/src/main/java/org/apache/datasketches/tuple/Union.java +++ b/src/main/java/org/apache/datasketches/tuple/Union.java @@ -51,7 +51,7 @@ public Union(final SummarySetOperations summarySetOps) { public Union(final int nomEntries, final SummarySetOperations summarySetOps) { nomEntries_ = nomEntries; summarySetOps_ = summarySetOps; - sketch_ = new QuickSelectSketch(nomEntries, null); + sketch_ = new QuickSelectSketch<>(nomEntries, null); theta_ = sketch_.getThetaLong(); } @@ -60,7 +60,7 @@ public Union(final int nomEntries, final SummarySetOperations summarySetOps) * @param sketchIn input sketch to add to the internal set */ public void update(final Sketch sketchIn) { - if (sketchIn == null || sketchIn.isEmpty()) { return; } + if ((sketchIn == null) || sketchIn.isEmpty()) { 
return; } if (sketchIn.theta_ < theta_) { theta_ = sketchIn.theta_; } final SketchIterator it = sketchIn.iterator(); while (it.next()) { @@ -85,6 +85,6 @@ public CompactSketch getResult() { * Resets the internal set to the initial state, which represents an empty set */ public void reset() { - sketch_ = new QuickSelectSketch(nomEntries_, null); + sketch_ = new QuickSelectSketch<>(nomEntries_, null); } } diff --git a/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java b/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java index dd07d428e..174e28232 100644 --- a/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/UpdatableSketch.java @@ -55,8 +55,8 @@ public class UpdatableSketch> extends QuickSele * See Sampling Probability * @param summaryFactory An instance of a SummaryFactory. */ - public UpdatableSketch(final int nomEntries, final int lgResizeFactor, final float samplingProbability, - final SummaryFactory summaryFactory) { + public UpdatableSketch(final int nomEntries, final int lgResizeFactor, + final float samplingProbability, final SummaryFactory summaryFactory) { super(nomEntries, lgResizeFactor, samplingProbability, summaryFactory); } diff --git a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java index 57cc8e6d8..c4538350c 100644 --- a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java @@ -38,6 +38,25 @@ public DoubleSketch(final int lgK, final DoubleSummary.Mode mode) { super(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, new DoubleSummaryFactory(mode)); } + /** + * Creates this sketch with the following parameters: + * @param lgK Log_base2 of Nominal Entries. + * @param lgResizeFactor log2(resizeFactor) - value from 0 to 3: + *
+   * 0 - no resizing (maximum size allocated up front)
+   * 1 - double the internal hash table each time it reaches a threshold
+   * 2 - grow by a factor of four
+   * 3 - grow by a factor of eight (default)
+   * 
+ * @param samplingProbability + * See Sampling Probability + * @param mode The DoubleSummary mode to be used + */ + public DoubleSketch(final int lgK, final int lgResizeFactor, final float samplingProbability, + final DoubleSummary.Mode mode) { + super(1 << lgK, lgResizeFactor, samplingProbability, new DoubleSummaryFactory(mode)); + } + /** * Constructs this sketch from a Memory image, which must be from an DoubleSketch, and * usually with data. diff --git a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java index 9d759125b..7e0a8f3ca 100644 --- a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java @@ -38,6 +38,25 @@ public IntegerSketch(final int lgK, final IntegerSummary.Mode mode) { super(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, new IntegerSummaryFactory(mode)); } + /** + * Creates this sketch with the following parameters: + * @param lgK Log_base2 of Nominal Entries. + * @param lgResizeFactor log2(resizeFactor) - value from 0 to 3: + *
+   * 0 - no resizing (maximum size allocated up front)
+   * 1 - double the internal hash table each time it reaches a threshold
+   * 2 - grow by a factor of four
+   * 3 - grow by a factor of eight (default)
+   * 
+ * @param samplingProbability + * See Sampling Probability + * @param mode The IntegerSummary mode to be used + */ + public IntegerSketch(final int lgK, final int lgResizeFactor, final float samplingProbability, + final IntegerSummary.Mode mode) { + super(1 << lgK, lgResizeFactor, samplingProbability, new IntegerSummaryFactory(mode)); + } + /** * Constructs this sketch from a Memory image, which must be from an IntegerSketch, and * usually with data. From e9b3ec3eae527ac94edfab16457a3121d5ac666f Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 16 Jan 2020 16:51:11 -0800 Subject: [PATCH 4/6] update DoubleSketch and IntegerSketch constructors to chain. --- .../org/apache/datasketches/tuple/adouble/DoubleSketch.java | 2 +- .../org/apache/datasketches/tuple/aninteger/IntegerSketch.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java index c4538350c..8159886d9 100644 --- a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java @@ -35,7 +35,7 @@ public class DoubleSketch extends UpdatableSketch { * @param mode The DoubleSummary mode to be used */ public DoubleSketch(final int lgK, final DoubleSummary.Mode mode) { - super(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, new DoubleSummaryFactory(mode)); + this(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, mode); } /** diff --git a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java index 7e0a8f3ca..394cc341d 100644 --- a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java @@ -35,7 +35,7 @@ public class IntegerSketch extends UpdatableSketch { * @param mode The IntegerSummary mode to be 
used */ public IntegerSketch(final int lgK, final IntegerSummary.Mode mode) { - super(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, new IntegerSummaryFactory(mode)); + this(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, mode); } /** From 4a89ae502e425dbf4beab750383808da8cf0914b Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 16 Jan 2020 17:09:30 -0800 Subject: [PATCH 5/6] removed unused variable --- src/main/java/org/apache/datasketches/tuple/Union.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/apache/datasketches/tuple/Union.java b/src/main/java/org/apache/datasketches/tuple/Union.java index f37155ce8..fd6cc0c98 100644 --- a/src/main/java/org/apache/datasketches/tuple/Union.java +++ b/src/main/java/org/apache/datasketches/tuple/Union.java @@ -34,7 +34,6 @@ * @param Type of Summary */ public class Union { - private final int nomEntries_; private final SummarySetOperations summarySetOps_; private QuickSelectSketch sketch_; private long theta_; // need to maintain outside of the sketch @@ -55,7 +54,6 @@ public Union(final SummarySetOperations summarySetOps) { * @param summarySetOps instance of SummarySetOperations */ public Union(final int nomEntries, final SummarySetOperations summarySetOps) { - nomEntries_ = nomEntries; summarySetOps_ = summarySetOps; sketch_ = new QuickSelectSketch(nomEntries, null); theta_ = sketch_.getThetaLong(); From 64358ff38d8a5a8753fdd8cc4ae6438856f5104c Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 16 Jan 2020 17:28:54 -0800 Subject: [PATCH 6/6] Fix IntegerSketch & DoubleSketch Ctrs. 
--- .../org/apache/datasketches/tuple/adouble/DoubleSketch.java | 2 +- .../org/apache/datasketches/tuple/aninteger/IntegerSketch.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java index 8159886d9..92e2cbe98 100644 --- a/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/adouble/DoubleSketch.java @@ -35,7 +35,7 @@ public class DoubleSketch extends UpdatableSketch { * @param mode The DoubleSummary mode to be used */ public DoubleSketch(final int lgK, final DoubleSummary.Mode mode) { - this(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, mode); + this(lgK, ResizeFactor.X8.ordinal(), 1.0F, mode); } /** diff --git a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java index 394cc341d..03ca7d093 100644 --- a/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/aninteger/IntegerSketch.java @@ -35,7 +35,7 @@ public class IntegerSketch extends UpdatableSketch { * @param mode The IntegerSummary mode to be used */ public IntegerSketch(final int lgK, final IntegerSummary.Mode mode) { - this(1 << lgK, ResizeFactor.X8.ordinal(), 1.0F, mode); + this(lgK, ResizeFactor.X8.ordinal(), 1.0F, mode); } /**