twitter
diff --git a/‎algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/ReservoirSamplingBenchmark.scala
Lines changed: 42 additions & 0 deletions b/‎algebird-benchmark/src/main/scala/com/twitter/algebird/benchmark/ReservoirSamplingBenchmark.scala
Lines changed: 42 additions & 0 deletions
diff --git a/‎algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala
Lines changed: 5 additions & 6 deletions b/‎algebird-core/src/main/scala/com/twitter/algebird/Aggregator.scala
Lines changed: 5 additions & 6 deletions
diff --git a/‎algebird-core/src/main/scala/com/twitter/algebird/mutable/ReservoirSampling.scala
Lines changed: 213 additions & 0 deletions b/‎algebird-core/src/main/scala/com/twitter/algebird/mutable/ReservoirSampling.scala
Lines changed: 213 additions & 0 deletions
diff --git a/‎algebird-test/src/main/scala/com/twitter/algebird/RandomSamplingLaws.scala
Lines changed: 77 additions & 0 deletions b/‎algebird-test/src/main/scala/com/twitter/algebird/RandomSamplingLaws.scala
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,42 @@
+package com.twitter.algebird.benchmark
+
+import com.twitter.algebird.mutable.ReservoirSamplingToListAggregator
+import com.twitter.algebird.{Aggregator, Preparer}
+import org.openjdk.jmh.annotations.{Benchmark, Param, Scope, State}
+import org.openjdk.jmh.infra.Blackhole
+
+import scala.util.Random
+
+object ReservoirSamplingBenchmark {
+
+  /** JMH state holding the benchmark parameters: the input size and the fraction of it to sample. */
+  @State(Scope.Benchmark)
+  class BenchmarkState {
+    @Param(Array("100", "10000", "1000000"))
+    var collectionSize: Int = 0
+
+    @Param(Array("0.001", "0.01", "0.1"))
+    var sampleRate: Double = 0.0
+
+    /** The number of items to sample: `sampleRate * collectionSize`, rounded up. */
+    def samples: Int = math.ceil(sampleRate * collectionSize).toInt
+  }
+
+  // The supplier hands out this one generator instance on every call.
+  val rng: Random = new Random()
+  implicit val randomSupplier: () => Random = () => rng
+}
+
+class ReservoirSamplingBenchmark {
+  import ReservoirSamplingBenchmark._
+
+  // Baseline sampler: pairs every item with a random score, aggregates the pairs with
+  // `Aggregator.sortByTake(count)` keyed on that score, and finally strips the scores again.
+  private def prioQueueSampler[T](count: Int) =
+    Preparer[T]
+      .map(item => rng.nextDouble() -> item)
+      .monoidAggregate(Aggregator.sortByTake(count)(_._1))
+      .andThenPresent(_.map(_._2))
+
+  /** Samples `state.samples` items out of `0 until state.collectionSize` with the reservoir aggregator. */
+  @Benchmark
+  def timeAlgorithmL(state: BenchmarkState, bh: Blackhole): Unit = {
+    val sampler = new ReservoirSamplingToListAggregator[Int](state.samples)
+    bh.consume(sampler.apply(0 until state.collectionSize))
+  }
+
+  /** Same workload as [[timeAlgorithmL]], run through the score-and-sort baseline sampler. */
+  @Benchmark
+  def timePriorityQeueue(state: BenchmarkState, bh: Blackhole): Unit = {
+    val sampler = prioQueueSampler(state.samples)
+    bh.consume(sampler.apply(0 until state.collectionSize))
+  }
+}
@@ -1,5 +1,7 @@
 package com.twitter.algebird
 
+import com.twitter.algebird.mutable.{Reservoir, ReservoirSamplingToListAggregator}
+
 import java.util.PriorityQueue
 import scala.collection.compat._
 import scala.collection.generic.CanBuildFrom
@@ -286,12 +288,9 @@ object Aggregator extends java.io.Serializable {
   def reservoirSample[T](
       count: Int,
       seed: Int = DefaultSeed
-  ): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = {
-    val rng = new java.util.Random(seed)
-    Preparer[T]
-      .map(rng.nextDouble() -> _)
-      .monoidAggregate(sortByTake(count)(_._1))
-      .andThenPresent(_.map(_._2))
+  ): MonoidAggregator[T, Reservoir[T], Seq[T]] = {
+    // Every call of the supplier hands back this same generator, which is seeded with `seed`.
+    val seededRng = new scala.util.Random(seed)
+    new ReservoirSamplingToListAggregator[T](count)(() => seededRng)
   }
 
   /**
 
@@ -0,0 +1,213 @@
+package com.twitter.algebird.mutable
+
+import com.twitter.algebird.{Monoid, MonoidAggregator}
+
+import scala.collection.mutable
+import scala.util.Random
+
+/**
+ * The mutable state of a reservoir sample: the items currently held, plus the acceptance threshold
+ * <pre>w</pre>.
+ *
+ * @param capacity
+ *   the maximum number of items the reservoir holds; must be positive
+ * @tparam T
+ *   the element type
+ */
+sealed class Reservoir[T](val capacity: Int) {
+  require(capacity > 0, "reservoir size must be positive")
+
+  // The sampled items, in no meaningful order.
+  var reservoir: mutable.Buffer[T] = mutable.Buffer()
+
+  // Acceptance threshold. While the reservoir is not full, w stays at 1. Once it is full, the
+  // invariant is: the largest score among the held elements equals w, and the scores of the other
+  // elements are distributed as U[0, w]. Individual scores are never stored; only their
+  // distribution is tracked and sampled from.
+  var w: Double = 1
+
+  // 1 / capacity, the exponent used whenever the threshold is lowered.
+  private val kInv: Double = 1d / capacity
+
+  def size: Int = reservoir.size
+  def isEmpty: Boolean = reservoir.isEmpty
+  def isFull: Boolean = size == capacity
+
+  /**
+   * Unconditionally puts one element into the reservoir. A full reservoir has a uniformly chosen slot
+   * overwritten; otherwise the element is appended. Whenever the reservoir is full afterwards, the
+   * threshold <pre>w</pre> is lowered.
+   *
+   * For more than one element prefer [[append]], which skips ahead with exponential jumps.
+   *
+   * @param x
+   *   the element to add
+   * @param rng
+   *   the random source
+   */
+  def accept(x: T, rng: Random): Unit = {
+    if (isFull) {
+      reservoir(rng.nextInt(capacity)) = x
+    } else {
+      reservoir += x
+    }
+    // This also runs right after the append that fills the reservoir for the first time.
+    if (isFull) {
+      w *= Math.pow(rng.nextDouble, kInv)
+    }
+  }
+
+  /**
+   * Feeds a sequence of elements into the reservoir.
+   *
+   * @param xs
+   *   the elements to add
+   * @param rng
+   *   the random source
+   * @param prior
+   *   the threshold of the incoming elements, i.e. each incoming element's score is distributed as
+   *   <pre>U[0, prior]</pre>
+   * @return
+   *   this reservoir
+   */
+  def append(xs: TraversableOnce[T], rng: Random, prior: Double = 1): Reservoir[T] = {
+    // How many items to pass over before the next acceptance: geometrically distributed with
+    // success probability w / prior. For a plain stream prior is 1; when merging, prior is the
+    // threshold of the reservoir the items come from, and w < prior is required in that case.
+    // NOTE(review): nextExponential is called on the underlying java.util.Random; it appears to
+    // have been added in JDK 17 - confirm the minimum supported JDK.
+    def drawSkip = (-rng.self.nextExponential / Math.log1p(-w / prior)).toInt
+
+    var remaining = if (isFull) drawSkip else 0
+    xs.foreach { x =>
+      if (isFull && remaining > 0) {
+        // still inside the current jump
+        remaining -= 1
+      } else {
+        // either the reservoir is still filling up, or the jump has ended
+        accept(x, rng)
+        if (isFull) {
+          remaining = drawSkip
+        }
+      }
+    }
+    this
+  }
+
+  override def toString: String = s"Reservoir($capacity, $w, ${reservoir.toList})"
+}
+
+object Reservoir {
+
+  /** The merge monoid for reservoirs, built on the implicitly available supplier of random sources. */
+  implicit def monoid[T](implicit randomSupplier: () => Random): Monoid[Reservoir[T]] =
+    new ReservoirMonoid[T]()(randomSupplier)
+}
+
+/**
+ * A monoid over [[Reservoir]]s based on the "Algorithm L" reservoir sampling algorithm [1], adapted so that
+ * two reservoirs can be merged.
+ *
+ * [1] Kim-Hung Li, "Reservoir-Sampling Algorithms of Time Complexity O(n(1+log(N/n)))", 1994
+ *
+ * @param randomSupplier
+ *   called to obtain the random source for every operation that needs one
+ * @tparam T
+ *   the item type
+ */
+class ReservoirMonoid[T](implicit val randomSupplier: () => Random) extends Monoid[Reservoir[T]] {
+
+  /**
+   * Creates a reservoir that holds exactly one item.
+   *
+   * @param k
+   *   the reservoir capacity
+   * @param x
+   *   the item to add
+   * @return
+   *   a new reservoir of capacity `k` containing only `x`
+   */
+  def build(k: Int, x: T): Reservoir[T] = {
+    val single = new Reservoir[T](k)
+    single.accept(x, randomSupplier())
+    single
+  }
+
+  /** The monoid identity: an empty reservoir (of capacity 1). */
+  override def zero: Reservoir[T] = new Reservoir[T](1)
+
+  /** An empty reservoir of capacity `k`. */
+  def zero(k: Int): Reservoir[T] = new Reservoir[T](k)
+
+  override def isNonZero(r: Reservoir[T]): Boolean = !r.isEmpty
+
+  /**
+   * Merge two reservoirs. NOTE: one or both arguments are mutated by this call. Do not use them
+   * afterwards, other than passing the returned reservoir on to further aggregation.
+   */
+  override def plus(left: Reservoir[T], right: Reservoir[T]): Reservoir[T] =
+    if (left.isEmpty) {
+      right
+    } else if (left.size + right.size <= left.capacity) {
+      // everything fits into left, so the items of right are simply added to it
+      left.append(right.reservoir, randomSupplier())
+    } else {
+      mergeOverflowing(left, right)
+    }
+
+  // Merges two reservoirs whose combined size exceeds the capacity: the reservoir with the smaller
+  // threshold is kept as the target and the items of the other one are fed into it.
+  private def mergeOverflowing(left: Reservoir[T], right: Reservoir[T]): Reservoir[T] = {
+    val (target, source) = if (left.w < right.w) (left, right) else (right, left)
+    val rng = randomSupplier()
+    if (!source.isFull) {
+      target.append(source.reservoir, rng)
+    } else {
+      // The highest score in source is source.w and the remaining scores are U[0, source.w].
+      // As target.w < source.w, the one element carrying that highest score has to go
+      // unconditionally; which element that is gets sampled here. Overwriting it with the head and
+      // then dropping the head removes exactly that element. The others enter the target with
+      // probability target.w / source.w.
+      val evicted = rng.nextInt(source.size)
+      source.reservoir(evicted) = source.reservoir.head
+      target.append(source.reservoir.drop(1), rng, source.w)
+    }
+  }
+}
+
+/**
+ * Aggregator that samples k elements from a stream of items by reservoir sampling. The reservoir it
+ * aggregates into is mutable, so `present` implementations are advised to copy the result into an
+ * immutable collection, as [[ReservoirSamplingToListAggregator]] does.
+ *
+ * @param k
+ *   the number of elements to sample
+ * @param randomSupplier
+ *   the random generator
+ * @tparam T
+ *   the item type
+ * @tparam C
+ *   the result type
+ */
+abstract class ReservoirSamplingAggregator[T, +C](k: Int)(implicit val randomSupplier: () => Random)
+    extends MonoidAggregator[T, Reservoir[T], C] {
+  override val monoid: ReservoirMonoid[T] = new ReservoirMonoid[T]
+
+  // A single item becomes a reservoir of capacity k holding just that item.
+  override def prepare(x: T): Reservoir[T] = monoid.build(k, x)
+
+  override def apply(xs: TraversableOnce[T]): C = present(sampleAll(xs))
+
+  override def applyOption(inputs: TraversableOnce[T]): Option[C] =
+    if (!inputs.isEmpty) Some(apply(inputs)) else None
+
+  override def append(r: Reservoir[T], t: T): Reservoir[T] =
+    r.append(Seq(t), randomSupplier())
+
+  override def appendAll(r: Reservoir[T], xs: TraversableOnce[T]): Reservoir[T] =
+    r.append(xs, randomSupplier())
+
+  override def appendAll(xs: TraversableOnce[T]): Reservoir[T] = sampleAll(xs)
+
+  // Feeds all items into a fresh, empty reservoir of capacity k.
+  private def sampleAll(xs: TraversableOnce[T]): Reservoir[T] =
+    appendAll(monoid.zero(k), xs)
+}
+
+/**
+ * A [[ReservoirSamplingAggregator]] that presents the sample as an immutable `List`, in shuffled order.
+ */
+class ReservoirSamplingToListAggregator[T](k: Int)(implicit randomSupplier: () => Random)
+    extends ReservoirSamplingAggregator[T, List[T]](k)(randomSupplier) {
+
+  // Shuffle first, then copy into a List so the result is detached from the mutable buffer.
+  override def present(r: Reservoir[T]): List[T] = {
+    val rng = randomSupplier()
+    rng.shuffle(r.reservoir).toList
+  }
+
+  override def andThenPresent[D](f: List[T] => D): MonoidAggregator[T, Reservoir[T], D] =
+    new AndThenPresent(this, f)
+}
+
+/**
+ * Aggregator that applies `f` to whatever `agg` presents and forwards every other operation to `agg`, so
+ * that the wrapped aggregator's own (optimized) implementations stay in use after `andThenPresent`.
+ */
+protected class AndThenPresent[-A, B, C, +D](val agg: MonoidAggregator[A, B, C], f: C => D)
+    extends MonoidAggregator[A, B, D] {
+  override val monoid: Monoid[B] = agg.monoid
+
+  // Result-producing operations: run the wrapped aggregator, then map its result through f.
+  override def present(b: B): D = f(agg.present(b))
+  override def apply(xs: TraversableOnce[A]): D = f(agg.apply(xs))
+  override def applyOption(xs: TraversableOnce[A]): Option[D] = agg.applyOption(xs).map(f)
+
+  // Everything else is forwarded unchanged.
+  override def prepare(a: A): B = agg.prepare(a)
+  override def append(b: B, a: A): B = agg.append(b, a)
+  override def appendAll(b: B, as: TraversableOnce[A]): B = agg.appendAll(b, as)
+  override def appendAll(as: TraversableOnce[A]): B = agg.appendAll(as)
+}
@@ -0,0 +1,77 @@
+package com.twitter.algebird
+
+import com.twitter.algebird.scalacheck.Distribution._
+import org.scalacheck.{Gen, Prop}
+
+/**
+ * Distribution checks for samplers that pick `k` items out of a range of integers. Each law builds a
+ * sampler through `newSampler(k)`, runs it over `0 until n`, and hands the observed value to
+ * `forAllSampled` together with the distribution it is compared against.
+ */
+object RandomSamplingLaws {
+
+  // Maps every ordered pair (i, j) of distinct values in [0, n) to 1.0.
+  // NOTE(review): presumably forAllSampled treats this map as equal weights - confirm.
+  private def distinctPairs(n: Int): Map[(Int, Int), Double] = {
+    val pairs = for {
+      i <- 0 until n
+      j <- 0 until n
+      if i != j
+    } yield (i, j)
+    pairs.map(_ -> 1d).toMap
+  }
+
+  /** For reservoir sizes 1 to 20, the first presented item is compared against `uniform(100)`. */
+  def sampleOneUniformly[T](newSampler: Int => Aggregator[Int, T, Seq[Int]]): Prop = {
+    val n = 100
+
+    "sampleOne" |: forAllSampled(10000, Gen.choose(1, 20))(_ => uniform(n)) { k =>
+      newSampler(k).andThenPresent(_.head).apply(0 until n)
+    }
+  }
+
+  /** With a reservoir of size one, the single sampled item is compared against `uniform(100)`. */
+  def reservoirSizeOne[T](newSampler: Int => Aggregator[Int, T, Seq[Int]]): Prop = {
+    val n = 100
+
+    "reservoirSizeOne" |: forAllSampled(10000)(uniform(n)) {
+      newSampler(1).andThenPresent(_.head).apply(0 until n)
+    }
+  }
+
+  /** With a reservoir of size two, the sampled pair is compared against all ordered distinct pairs. */
+  def reservoirSizeTwo[T](newSampler: Int => Aggregator[Int, T, Seq[Int]]): Prop = {
+    val n = 10
+
+    "reservoirSizeTwo" |: forAllSampled(10000)(distinctPairs(n)) {
+      newSampler(2).andThenPresent(xs => (xs(0), xs(1))).apply(0 until n)
+    }
+  }
+
+  /** For sizes 1 to 10, the item at a generated position `i < k` is compared against `uniform(100)`. */
+  def sampleSpecificItem[T](newSampler: Int => Aggregator[Int, T, Seq[Int]]): Prop = {
+    val sizeAndIndex: Gen[(Int, Int)] = for {
+      k <- Gen.choose(1, 10)
+      i <- Gen.choose(0, k - 1)
+    } yield (k, i)
+
+    val n = 100
+
+    "sampleAnyItem" |: forAllSampled(10000, sizeAndIndex)(_ => uniform(n)) { case (k, i) =>
+      newSampler(k).andThenPresent(_(i)).apply(0 until n)
+    }
+  }
+
+  /**
+   * For sizes 2 to 10, the items at two distinct generated positions are compared, as a pair, against all
+   * ordered distinct pairs.
+   */
+  def sampleTwoItems[T](newSampler: Int => Aggregator[Int, T, Seq[Int]]): Prop = {
+    // k starts at 2: a reservoir of size 1 has no two distinct positions, so with k = 1 the
+    // `i != j` filter below could never be satisfied and every such draw would be thrown away.
+    val sizeAndIndexes: Gen[(Int, Int, Int)] = for {
+      k <- Gen.choose(2, 10)
+      i <- Gen.choose(0, k - 1)
+      j <- Gen.choose(0, k - 1)
+      if i != j
+    } yield (k, i, j)
+
+    val n = 20
+
+    "sampleTwoItems" |: forAllSampled(10000, sizeAndIndexes)(_ => distinctPairs(n)) { case (k, i, j) =>
+      newSampler(k).andThenPresent(xs => (xs(i), xs(j))).apply(0 until n)
+    }
+  }
+
+  /** The conjunction of all laws in this object. */
+  def randomSamplingDistributions[T](newSampler: Int => MonoidAggregator[Int, T, Seq[Int]]): Prop =
+    sampleOneUniformly(newSampler) &&
+      reservoirSizeOne(newSampler) &&
+      reservoirSizeTwo(newSampler) &&
+      sampleSpecificItem(newSampler) &&
+      sampleTwoItems(newSampler)
+}