Add sum and count to Summary

salesforce · tovbinm · Sep 4, 2018 · Aug 28, 2018 · Aug 28, 2018 · Aug 28, 2018
commit b958887e0159f13e43f6e34bec45da1cc125ed9c
@@ -166,7 +166,7 @@ private[op] object FeatureDistribution {
  ): FeatureDistribution = {
  val (nullCount, (summaryInfo, distribution)): (Int, (Array[Double], Array[Double])) =
  value.map(seq => 0 -> histValues(seq, summary, bins))
- .getOrElse(1 -> (Array(summary.min, summary.max) -> Array.fill(bins)(0.0)))
+ .getOrElse(1 -> (Array(summary.min, summary.max, summary.sum, summary.count) -> Array.fill(bins)(0.0)))
 
  FeatureDistribution(
  name = featureKey._1,
@@ -194,12 +194,12 @@ private[op] object FeatureDistribution {
  case Left(seq) => {
  val minBins = bins
  val maxBins = MaxBins
- val numBins = math.min(math.max(bins, sum.max / AvgBinValue), maxBins).floor
+ val numBins = math.min(math.max(bins, sum.max / AvgBinValue), maxBins).intValue()
 
  val hasher: HashingTF = new HashingTF(numFeatures = numBins)
  .setBinary(false)
  .setHashAlgorithm(HashAlgorithm.MurMur3.toString.toLowerCase)
- Array(sum.min, sum.max) -> hasher.transform(seq).toArray
+ Array(sum.min, sum.max, sum.sum, sum.count) -> hasher.transform(seq).toArray
  }
  case Right(seq) => // TODO use kernel fit instead of histogram
  if (sum == Summary.empty) {
@@ -218,7 +218,7 @@ private[op] object FeatureDistribution {
  } else {
  val same = seq.map(v => if (v == sum.max) 1.0 else 0.0).sum
  val other = seq.map(v => if (v != sum.max) 1.0 else 0.0).sum
- Array(sum.min, sum.max) -> Array(same, other)
+ Array(sum.min, sum.max, sum.sum, sum.count) -> Array(same, other)
  }
  }
  }

@@ -35,18 +35,21 @@ import com.twitter.algebird.Monoid
 /**
  * Class used to get summaries of prepared features to determine distribution binning strategy
  *
- * @param min minimum value seen
- * @param max maximum value seen
+ * @param min minimum value seen (double features), or fewest tokens in a single text (text features)
+ * @param max maximum value seen (double features), or most tokens in a single text (text features)
+ * @param sum sum of all values (double features), or total number of tokens across texts (text features)
+ * @param count number of values (double features), or number of texts (text features)
  */
-private[op] case class Summary(min: Double, max: Double)
+private[op] case class Summary(min: Double, max: Double, sum: Double, count: Double)
 
 private[op] case object Summary {
 
- val empty: Summary = Summary(Double.PositiveInfinity, Double.NegativeInfinity)
+ val empty: Summary = Summary(Double.PositiveInfinity, Double.NegativeInfinity, 0.0, 0.0)
 
  implicit val monoid: Monoid[Summary] = new Monoid[Summary] {
  override def zero = empty
- override def plus(l: Summary, r: Summary) = Summary(math.min(l.min, r.min), math.max(l.max, r.max))
+ override def plus(l: Summary, r: Summary) = Summary(math.min(l.min, r.min), math.max(l.max, r.max),
+ l.sum + r.sum, l.count + r.count)
  }
 
  /**
@@ -55,8 +58,8 @@ private[op] case object Summary {
  */
  def apply(preppedFeature: ProcessedSeq): Summary = {
  preppedFeature match {
- case Left(v) => Summary(v.size, v.size)
- case Right(v) => monoid.sum(v.map(d => Summary(d, d)))
+ case Left(v) => Summary(v.size, v.size, v.size, 1.0)
+ case Right(v) => monoid.sum(v.map(d => Summary(d, d, d, 1.0)))
  }
  }
 }
@@ -47,7 +47,8 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi
  (true, Left(Seq.empty[String])), (false, Right(Seq(1.0, 3.0, 5.0)))
  )
  val summary =
- Array(Summary(0.0, 1.0), Summary(-1.6, 10.6), Summary(0.0, 3.0), Summary(0.0, 0.0), Summary(1.0, 5.0))
+ Array(Summary(0.0, 1.0, 6.0, 10), Summary(-1.6, 10.6, 3.0, 10),
+ Summary(0.0, 3.0, 7.0, 10), Summary(0.0, 0.0, 5.0, 10), Summary(1.0, 5.0, 10.0, 10))
  val bins = 10
 
  val featureKeys: Array[FeatureKey] = features.map(f => (f.name, None))
@@ -66,7 +67,7 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi
  distribs(1).nulls shouldBe 1
  distribs(1).distribution.sum shouldBe 0
  distribs(2).distribution.sum shouldBe 2
- distribs(2).summaryInfo should contain theSameElementsAs Array(0.0, 3.0)
+ distribs(2).summaryInfo should contain theSameElementsAs Array(0.0, 3.0, 7.0, 10.0)
  distribs(3).distribution.sum shouldBe 0
  distribs(4).distribution.sum shouldBe 3
  distribs(4).summaryInfo.length shouldBe bins
@@ -75,10 +76,9 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi
  it should "be correctly created for text features" in {
  val features = Array(description, gender)
  val values: Array[(Boolean, ProcessedSeq)] = Array(
- (false, Left(RandomText.strings(1, 10).take(10000).toSeq.map(_.value.get))),
- (false, Left(RandomText.strings(1, 10).take(1000000).toSeq.map(_.value.get)))
+ (false, Left(RandomText.strings(1, 10).take(10000).toSeq.map(_.value.get)))
  )
- val summary = Array(Summary(10000.0, 10000.0), Summary(1000000, 1000000))
+ val summary = Array(Summary(1000.0, 50000.0, 70000.0, 10))
  val bins = 100
  val featureKeys: Array[FeatureKey] = features.map(f => (f.name, None))
  val processedSeqs: Array[Option[ProcessedSeq]] = values.map { case (isEmpty, processed) =>
@@ -91,8 +91,6 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi
  distribs(0).distribution.length shouldBe 100
  distribs(0).distribution.sum shouldBe 10000
 
- distribs(1).distribution.length shouldBe 200
- distribs(1).distribution.sum shouldBe 1000000
  }
 
  it should "be correctly created for map features" in {
@@ -102,9 +100,9 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi
  Map("A" -> Right(Seq(1.0)), "B" -> Right(Seq(1.0))),
  Map("B" -> Right(Seq(0.0))))
  val summary = Array(
- Map("A" -> Summary(0.0, 1.0), "B" -> Summary(0.0, 5.0)),
- Map("A" -> Summary(-1.6, 10.6), "B" -> Summary(0.0, 3.0)),
- Map("B" -> Summary(0.0, 0.0)))
+ Map("A" -> Summary(0.0, 2.0, 100.0, 10), "B" -> Summary(0.0, 5.0, 10.0, 10)),
+ Map("A" -> Summary(-1.6, 10.6, 30.0, 10), "B" -> Summary(0.0, 3.0, 11.0, 10)),
+ Map("B" -> Summary(0.0, 0.0, 0.0, 10)))
  val bins = 10
  val distribs = features.map(_.name).zip(summary).zip(values).flatMap { case ((name, summaryMaps), valueMaps) =>
  summaryMaps.map { case (key, summary) =>
@@ -121,15 +119,15 @@ class FeatureDistributionTest extends FlatSpec with PassengerSparkFixtureTest wi
  else d.distribution.length shouldBe 2
  }
  distribs(0).nulls shouldBe 0
- distribs(0).summaryInfo should contain theSameElementsAs Array(0.0, 1.0)
+ distribs(0).summaryInfo should contain theSameElementsAs Array(0.0, 2.0, 100.0, 10.0)
  distribs(1).nulls shouldBe 1
  distribs(0).distribution.sum shouldBe 2
  distribs(1).distribution.sum shouldBe 0
  distribs(2).summaryInfo.length shouldBe bins
  distribs(2).distribution.sum shouldBe 1
  distribs(4).distribution(0) shouldBe 1
  distribs(4).distribution(1) shouldBe 0
- distribs(4).summaryInfo.length shouldBe 2
+ distribs(4).summaryInfo.length shouldBe 4
  }
 
  it should "correctly compare fill rates" in {

@@ -78,21 +78,23 @@ class PreparedFeaturesTest extends FlatSpec with TestSparkContext {
  val (responseSummaries3, predictorSummaries3) = preparedFeatures3.summaries
 
  responseSummaries1 should contain theSameElementsAs
- Seq(responseKey1 -> Summary(1.0, 1.0), responseKey2 -> Summary(0.5, 0.5))
+ Seq(responseKey1 -> Summary(1.0, 1.0, 1.0, 1), responseKey2 -> Summary(0.5, 0.5, 0.5, 1))
  predictorSummaries1 should contain theSameElementsAs
- Seq(predictorKey1 -> Summary(0.0, 0.0), predictorKey2A -> Summary(2.0, 2.0), predictorKey2B -> Summary(1.0, 1.0))
+ Seq(predictorKey1 -> Summary(0.0, 0.0, 0.0, 2), predictorKey2A -> Summary(2.0, 2.0, 2.0, 1),
+ predictorKey2B -> Summary(1.0, 1.0, 1.0, 1))
  responseSummaries2 should contain theSameElementsAs
- Seq(responseKey1 -> Summary(0.0, 0.0))
+ Seq(responseKey1 -> Summary(0.0, 0.0, 0.0, 1))
  predictorSummaries2 should contain theSameElementsAs
- Seq(predictorKey1 -> Summary(0.4, 0.5))
+ Seq(predictorKey1 -> Summary(0.4, 0.5, 0.9, 2))
  responseSummaries3 should contain theSameElementsAs
- Seq(responseKey2 -> Summary(-0.5, -0.5))
+ Seq(responseKey2 -> Summary(-0.5, -0.5, -0.5, 1))
  predictorSummaries3 should contain theSameElementsAs
- Seq(predictorKey2A -> Summary(1.0, 1.0))
+ Seq(predictorKey2A -> Summary(1.0, 1.0, 1.0, 1))
  allResponseSummaries should contain theSameElementsAs
- Seq(responseKey1 -> Summary(0.0, 1.0), responseKey2 -> Summary(-0.5, 0.5))
+ Seq(responseKey1 -> Summary(0.0, 1.0, 1.0, 2), responseKey2 -> Summary(-0.5, 0.5, 0.0, 2))
  allPredictorSummaries should contain theSameElementsAs
- Seq(predictorKey1 -> Summary(0.0, 0.5), predictorKey2A -> Summary(1.0, 2.0), predictorKey2B -> Summary(1.0, 1.0))
+ Seq(predictorKey1 -> Summary(0.0, 0.5, 0.9, 4), predictorKey2A -> Summary(1.0, 2.0, 3.0, 2),
+ predictorKey2B -> Summary(1.0, 1.0, 1.0, 1))
  }
 
  it should "produce correct null-label leakage vector with single response" in {

@@ -52,7 +52,7 @@ class RawFeatureFilterTest extends FlatSpec with PassengerSparkFixtureTest with
  val allFeatureInfo = filter.computeFeatureStats(passengersDataSet, features)
 
  allFeatureInfo.responseSummaries.size shouldBe 1
- allFeatureInfo.responseSummaries.headOption.map(_._2) shouldEqual Option(Summary(0, 1))
+ allFeatureInfo.responseSummaries.headOption.map(_._2) shouldEqual Option(Summary(0, 1, 1, 2))
  allFeatureInfo.responseDistributions.size shouldBe 1
  allFeatureInfo.predictorSummaries.size shouldBe 12
  allFeatureInfo.predictorDistributions.size shouldBe 12

@@ -44,7 +44,11 @@ class SummaryTest extends FlatSpec with TestCommon {
  val f2s = Summary(f2)
  f1s.min shouldBe 3
  f1s.max shouldBe 3
+ f1s.sum shouldBe 3
+ f1s.count shouldBe 1
  f2s.min shouldBe 0.5
  f2s.max shouldBe 1.0
+ f2s.sum shouldBe 1.5
+ f2s.count shouldBe 2
  }
 }