Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect and remove IDs disguised in text features #415

Closed
wants to merge 44 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
681d98a
starter code
TuanNguyen27 Sep 25, 2019
ef7cdfb
fix weird compilation error
TuanNguyen27 Sep 25, 2019
6a57693
fix some tests
TuanNguyen27 Sep 25, 2019
bb858c7
fix more errors resulting from removing moments calculation
TuanNguyen27 Sep 25, 2019
cae7fe4
Update ModelInsightsTest.scala
TuanNguyen27 Sep 25, 2019
aa55539
Update FeatureDistributionTest.scala
TuanNguyen27 Sep 25, 2019
7950924
add new rules to remove raw feature based on topK & starter code on t…
TuanNguyen27 Sep 27, 2019
8f3befe
fix scala style
TuanNguyen27 Sep 27, 2019
9ed2a02
more code
TuanNguyen27 Sep 27, 2019
90350d0
fix more style error
TuanNguyen27 Sep 28, 2019
8640d6d
adding isID as an exclusion criteria
TuanNguyen27 Sep 30, 2019
91285b6
fix scala style
TuanNguyen27 Oct 1, 2019
61d26b1
bunch of broken tests
TuanNguyen27 Oct 1, 2019
a7a0781
move IdDetect app to hw
TuanNguyen27 Oct 1, 2019
6c887f3
try modify titanic instead
TuanNguyen27 Oct 1, 2019
7829dfd
add app
TuanNguyen27 Oct 1, 2019
b611289
switch to a different metric
TuanNguyen27 Oct 7, 2019
a30eca6
remove extra calculations
TuanNguyen27 Oct 7, 2019
c0ceaa6
remove more stuff
TuanNguyen27 Oct 7, 2019
b3930dc
fix naming issue
TuanNguyen27 Oct 8, 2019
33afe00
Update IdDetectTest.scala
TuanNguyen27 Oct 8, 2019
b7f050b
Update FeatureDistributionTest.scala
TuanNguyen27 Oct 8, 2019
d79456e
finishing up RFF
TuanNguyen27 Oct 8, 2019
b99b395
update default so that tests will pass
TuanNguyen27 Oct 8, 2019
ac8757e
Update OpWorkflow.scala
TuanNguyen27 Oct 8, 2019
2a4ccbc
Update OpWorkflow.scala
TuanNguyen27 Oct 8, 2019
febdc13
Update OpTitanicSimple.scala
TuanNguyen27 Oct 8, 2019
0c016b8
Update RawFeatureFilter.scala
TuanNguyen27 Oct 8, 2019
93267ab
Merge branch 'master' into ID_detect
TuanNguyen27 Oct 8, 2019
d73f3c5
new transformer wip
TuanNguyen27 Oct 10, 2019
7c1f262
Merge branch 'ID_detect' of https://github.com/salesforce/Transmogrif…
TuanNguyen27 Oct 10, 2019
88b5867
Update FeatureDistributionTest.scala
TuanNguyen27 Oct 10, 2019
a133392
added transformer for map
TuanNguyen27 Oct 10, 2019
095a180
Update FeatureDistribution.scala
TuanNguyen27 Oct 10, 2019
7031d16
Merge branch 'master' into ID_detect
TuanNguyen27 Oct 10, 2019
1e767c1
more updates
TuanNguyen27 Oct 10, 2019
72ce224
more
TuanNguyen27 Oct 10, 2019
df5562e
fix unnecessary changes
TuanNguyen27 Oct 10, 2019
c3cc3b0
more updates
TuanNguyen27 Oct 10, 2019
b022921
Delete IdDetectTest.scala
TuanNguyen27 Oct 10, 2019
9bad875
more fix
TuanNguyen27 Oct 10, 2019
9f7bc99
Update FeatureDistribution.scala
TuanNguyen27 Oct 10, 2019
126ddcd
fix unit tests
TuanNguyen27 Oct 10, 2019
7753280
Update SmartTextVectorizerTest.scala
TuanNguyen27 Oct 11, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.filters._
import com.salesforce.op.stages._
import com.salesforce.op.stages.impl.feature.{CombinationStrategy, TextStats, TransmogrifierDefaults}
import com.salesforce.op.stages.impl.feature.{CombinationStrategy, TransmogrifierDefaults}
import com.salesforce.op.stages.impl.preparators._
import com.salesforce.op.stages.impl.selector._
import com.salesforce.op.stages.impl.tuning.{DataBalancerSummary, DataCutterSummary, DataSplitterSummary}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ package com.salesforce.op.filters

import java.util.Objects

import com.salesforce.op.features.{FeatureDistributionLike, FeatureDistributionType}
import com.salesforce.op.stages.impl.feature.{HashAlgorithm, Inclusion, NumericBucketizer, TextStats}
import com.salesforce.op.features.{FeatureDistributionLike, FeatureDistributionType, TextStats}
import com.salesforce.op.stages.impl.feature.{HashAlgorithm, Inclusion, NumericBucketizer}
import com.salesforce.op.utils.json.EnumEntrySerializer
import com.twitter.algebird.Monoid._
import com.twitter.algebird._
Expand Down Expand Up @@ -63,7 +63,6 @@ case class FeatureDistribution
nulls: Long,
distribution: Array[Double],
summaryInfo: Array[Double],
moments: Option[Moments] = None,
cardEstimate: Option[TextStats] = None,
`type`: FeatureDistributionType = FeatureDistributionType.Training
) extends FeatureDistributionLike {
Expand Down Expand Up @@ -109,11 +108,23 @@ case class FeatureDistribution
// summary info can be empty or min max if hist is empty but should otherwise match so take the longest info
val combinedSummaryInfo = if (summaryInfo.length > fd.summaryInfo.length) summaryInfo else fd.summaryInfo

val combinedMoments = moments + fd.moments
val combinedCard = cardEstimate + fd.cardEstimate

FeatureDistribution(name, key, count + fd.count, nulls + fd.nulls, combinedDist,
combinedSummaryInfo, combinedMoments, combinedCard, `type`)
combinedSummaryInfo, combinedCard, `type`)
}

/**
 * Cardinality of the length of tokenized text, or numerical value for numbers (at most 500)
 *
 * @return number of distinct values tracked in [[cardEstimate]], if an estimate exists
 */
def cardSize(): Option[Int] = cardEstimate.map(_.valueCounts.size)

/**
Expand Down Expand Up @@ -166,22 +177,20 @@ case class FeatureDistribution
"count" -> count.toString,
"nulls" -> nulls.toString,
"distribution" -> distribution.mkString("[", ",", "]"),
"summaryInfo" -> summaryInfo.mkString("[", ",", "]"),
"moments" -> moments.map(_.toString).getOrElse("")
"summaryInfo" -> summaryInfo.mkString("[", ",", "]")
).map { case (n, v) => s"$n = $v" }.mkString(", ")

s"${getClass.getSimpleName}($valStr)"
}

override def equals(that: Any): Boolean = that match {
case FeatureDistribution(`name`, `key`, `count`, `nulls`, d, s, m, c, `type`) =>
distribution.deep == d.deep && summaryInfo.deep == s.deep &&
moments == m && cardEstimate == c
case FeatureDistribution(`name`, `key`, `count`, `nulls`, d, s, c, `type`) =>
distribution.deep == d.deep && summaryInfo.deep == s.deep && cardEstimate == c
case _ => false
}

override def hashCode(): Int = Objects.hashCode(name, key, count, nulls, distribution,
summaryInfo, moments, cardEstimate, `type`)
summaryInfo, cardEstimate, `type`)
}

object FeatureDistribution {
Expand Down Expand Up @@ -239,7 +248,6 @@ object FeatureDistribution {
value.map(seq => 0L -> histValues(seq, summary, bins, textBinsFormula))
.getOrElse(1L -> (Array(summary.min, summary.max, summary.sum, summary.count) -> new Array[Double](bins)))

val moments = value.map(momentsValues)
val cardEstimate = value.map(cardinalityValues)

FeatureDistribution(
Expand All @@ -249,26 +257,11 @@ object FeatureDistribution {
nulls = nullCount,
summaryInfo = summaryInfo,
distribution = distribution,
moments = moments,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about backwards compatibility?

cardEstimate = cardEstimate,
`type` = `type`
)
}

/**
 * Computes the first five central moments over a processed feature: for text features the
 * population is the per-token string length, for numeric features the raw values themselves.
 *
 * @param values processed values to summarize
 * @return algebird [[Moments]] accumulated over the whole population
 */
private def momentsValues(values: ProcessedSeq): Moments = {
  // Text (Left) contributes token lengths; numeric (Right) contributes values as-is.
  val population = values.fold(
    textTokens => textTokens.map(_.length.toDouble),
    numericValues => numericValues
  )
  MomentsGroup.sum(population.map(Moments(_)))
}

/**
* Function to track frequency of the first $(MaxCardinality) unique values
* (number for numeric features, token for text features)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ class RawFeatureFilter[T]
val featureSize: Int = trainingDistribs.length

val trainingFillRates: Seq[Double] = trainingDistribs.map(_.fillRate())

val trainingCardSizes: Seq[Option[Int]] = trainingDistribs.map(_.cardSize)
val trainingNullLabelAbsoluteCorrs: Seq[Option[Double]] =
if (correlationInfo.isEmpty) Seq.fill(featureSize)(None)
else {
Expand All @@ -232,6 +232,7 @@ class RawFeatureFilter[T]
traininingDistribs: Seq[FeatureDistribution],
trainingFillRates: Seq[Double],
trainingNullLabelAbsoluteCorrs: Seq[Option[Double]],
trainingCardSizes: Seq[Option[Int]],
scoringFillRates: Seq[Option[Double]],
jsDivergences: Seq[Option[Double]],
fillRateDiffs: Seq[Option[Double]],
Expand All @@ -245,12 +246,15 @@ class RawFeatureFilter[T]
.zip(jsDivergences)
.zip(fillRateDiffs)
.zip(fillRatioDiffs)
.zip(trainingCardSizes)
.map {
case (((((((name, key), trainingFillRate), trainingNullLabelAbsoluteCorr),
scoringFillRate), jsDivergence), fillRateDiff), fillRatioDiff) =>
case ((((((((name, key), trainingFillRate), trainingNullLabelAbsoluteCorr),
scoringFillRate), jsDivergence), fillRateDiff), fillRatioDiff),
trainingCardSize) =>
RawFeatureFilterMetrics(
name, key, trainingFillRate, trainingNullLabelAbsoluteCorr,
scoringFillRate, jsDivergence, fillRateDiff, fillRatioDiff)
scoringFillRate, jsDivergence, fillRateDiff, fillRatioDiff,
trainingCardSize)
}
}

Expand All @@ -263,7 +267,7 @@ class RawFeatureFilter[T]

val rawFeatureFilterMetrics = combineRawFeatureFilterMetrics(
trainingDistribs, trainingFillRates, trainingNullLabelAbsoluteCorrs,
scoringFillRates, jsDivergences, fillRateDiffs, fillRatioDiffs
trainingCardSizes, scoringFillRates, jsDivergences, fillRateDiffs, fillRatioDiffs
)
rawFeatureFilterMetrics

Expand All @@ -280,7 +284,8 @@ class RawFeatureFilter[T]

val rawFeatureFilterMetrics = combineRawFeatureFilterMetrics(
trainingDistribs, trainingFillRates, trainingNullLabelAbsoluteCorrs,
scoringFillRates, jsDivergences, fillRateDiffs, fillRatioDiffs
trainingCardSizes, scoringFillRates, jsDivergences,
fillRateDiffs, fillRatioDiffs
)

log.info(combined.zip(rawFeatureFilterMetrics).map {
Expand Down Expand Up @@ -354,7 +359,8 @@ class RawFeatureFilter[T]
.zip(fillRatioDiffMismatches)
.map {
case (((((((name, key), trainingUnfilledState), trainingNullLabelLeaker),
scoringUnfilledState), jsDivergenceMismatch), fillRateDiffMismatch), fillRatioDiffMismatch) =>
scoringUnfilledState), jsDivergenceMismatch),
fillRateDiffMismatch), fillRatioDiffMismatch) =>
ExclusionReasons(
name,
key,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,8 @@ case class RawFeatureFilterMetrics
scoringFillRate: Option[Double],
jsDivergence: Option[Double],
fillRateDiff: Option[Double],
fillRatioDiff: Option[Double]
fillRatioDiff: Option[Double],
trainingCardSize: Option[Int]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's name it fully, i.e trainingCardinalitySize

)

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package com.salesforce.op.stages.impl.feature
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

License header is missing


import com.salesforce.op.UID
import com.salesforce.op.features.types.{TextMap}
import com.salesforce.op.stages.base.unary.UnaryTransformer

/**
 * Unary transformer that blanks out [[TextMap]] entries whose backing text feature looks like an ID.
 *
 * A key is flagged when the cardinality estimate attached to its feature distribution has fewer
 * than `minUniqueTokLen` distinct values — presumably a low token-length cardinality signals an
 * ID-like field (TODO confirm against the RawFeatureFilter cardinality semantics). Flagged keys
 * keep their entry but have the value replaced by the empty string; all other entries pass through.
 *
 * @param minUniqueTokLen minimum number of distinct values in the cardinality estimate for a key
 *                        to be kept (strictly fewer means the value is blanked out)
 * @param uid             uid for the stage
 */
class IdMapRemover(
  minUniqueTokLen: Int,
  uid: String = UID[IdMapRemover]
) extends UnaryTransformer[TextMap, TextMap](operationName = "IdMapRemover", uid = uid) {

  // Per-key drop decision; populated from the input feature's distributions when input is set.
  private var dropMap: Map[String, Boolean] = Map.empty

  override protected def onSetInput(): Unit = {
    super.onSetInput()
    val dist = in1.asFeatureLike.distributions
    // Pair each key with the cardinality estimate from the SAME distribution. Zipping two
    // independently filtered sequences (flatMap(_.key) vs flatMap(_.cardEstimate)) could
    // misalign keys and flags whenever a distribution is missing one of the two fields.
    dropMap = dist.flatMap { d =>
      for {
        k <- d.key
        card <- d.cardEstimate
      } yield k -> (card.valueCounts.size < minUniqueTokLen)
    }.toMap
  }

  override def transformFn: TextMap => TextMap = a => {
    val filtered = a.value.map { case (k, v) =>
      // Keys never seen in the training distributions default to "keep".
      if (dropMap.getOrElse(k, false)) k -> "" else k -> v
    }
    TextMap(filtered)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types.Text
import com.salesforce.op.stages.base.unary.UnaryTransformer

/**
 * Unary transformer that blanks out a [[Text]] feature detected as an ID.
 *
 * The feature is flagged when the cardinality estimate of its first training distribution has
 * fewer than `minUniqueTokLen` distinct values — presumably a low token-length cardinality
 * signals an ID-like field (TODO confirm against the RawFeatureFilter cardinality semantics).
 * When flagged, every value is replaced by [[Text.empty]]; otherwise values pass through.
 *
 * @param minUniqueTokLen minimum number of distinct values in the cardinality estimate for the
 *                        feature to be kept (strictly fewer means all values are blanked out)
 * @param uid             uid for the stage
 * @param operationName   operation name (kept as a ctor arg for backward compatibility; note the
 *                        default "IDremover" differs in casing from the class name)
 */
class IdRemover(
  minUniqueTokLen: Int,
  uid: String = UID[IdRemover],
  operationName: String = "IDremover"
) extends UnaryTransformer[Text, Text](operationName = operationName, uid = uid) {

  // Whether the whole feature should be dropped; decided once when the input is set.
  private var drop: Boolean = false

  override protected def onSetInput(): Unit = {
    super.onSetInput()
    val dist = in1.asFeatureLike.distributions
    // One flag per distribution that carries a cardinality estimate; only the first is used.
    val flags = dist.flatMap(_.cardEstimate).map(_.valueCounts.size < minUniqueTokLen)
    drop = flags.headOption.getOrElse(false)
  }

  /** Replace the value with empty text when the feature was flagged as an ID. */
  override def transformFn: Text => Text = t => if (drop) Text.empty else t
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.TextStats
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.sequence.{SequenceEstimator, SequenceModel}
import com.salesforce.op.stages.impl.feature.VectorizerUtils._
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,13 @@
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.TransientFeature
import com.salesforce.op.features.types.{OPVector, Text, TextList, VectorConversions, SeqDoubleConversions}
import com.salesforce.op.features.{TextStats, TransientFeature}
import com.salesforce.op.features.types.{OPVector, SeqDoubleConversions, Text, TextList, VectorConversions}
import com.salesforce.op.stages.base.sequence.{SequenceEstimator, SequenceModel}
import com.salesforce.op.stages.impl.feature.VectorizerUtils._
import com.salesforce.op.utils.json.JsonLike
import com.salesforce.op.utils.spark.RichDataset._
import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
import com.twitter.algebird.Monoid
import com.twitter.algebird.Monoid._
import com.twitter.algebird.Operators._
import com.twitter.algebird.Semigroup
Expand Down Expand Up @@ -163,27 +162,6 @@ object SmartTextVectorizer {
}
}

/**
 * Summary statistics of a text feature
 *
 * @param valueCounts counts of feature values
 */
private[op] case class TextStats(valueCounts: Map[String, Int]) extends JsonLike

private[op] object TextStats {

  /**
   * Monoid that merges two [[TextStats]]. Once either side already exceeds the cardinality
   * cap it is returned unchanged, short-circuiting further accumulation.
   *
   * @param maxCardinality cap on the number of distinct values tracked
   */
  def monoid(maxCardinality: Int): Monoid[TextStats] = new Monoid[TextStats] {
    override def plus(l: TextStats, r: TextStats): TextStats = (l, r) match {
      case _ if l.valueCounts.size > maxCardinality => l
      case _ if r.valueCounts.size > maxCardinality => r
      case _ => TextStats(l.valueCounts + r.valueCounts)
    }

    override def zero: TextStats = TextStats.empty
  }

  /** Stats with no values observed. */
  def empty: TextStats = TextStats(Map.empty)
}

/**
* Arguments for [[SmartTextVectorizerModel]]
*
Expand Down
24 changes: 7 additions & 17 deletions core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@ package com.salesforce.op

import com.salesforce.op.evaluators._
import com.salesforce.op.features.types._
import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike}
import com.salesforce.op.features.{Feature, FeatureDistributionType, FeatureLike, TextStats}
import com.salesforce.op.filters._
import com.salesforce.op.stages.impl.classification._
import com.salesforce.op.stages.impl.preparators._
import com.salesforce.op.stages.impl.regression.{OpLinearRegression, OpXGBoostRegressor, RegressionModelSelector}
import com.salesforce.op.stages.impl.selector.ModelSelectorNames.EstimatorType
import com.salesforce.op.stages.impl.selector.{SelectedModelCombiner, SelectedCombinerModel, SelectedModel}
import com.salesforce.op.stages.impl.selector.{SelectedCombinerModel, SelectedModel, SelectedModelCombiner}
import com.salesforce.op.stages.impl.selector.ValidationType._
import com.salesforce.op.stages.impl.tuning.{DataCutter, DataSplitter}
import com.salesforce.op.test.{PassengerSparkFixtureTest, TestFeatureBuilder}
Expand All @@ -48,8 +48,7 @@ import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.junit.runner.RunWith
import com.salesforce.op.features.types.Real
import com.salesforce.op.stages.impl.feature.{CombinationStrategy, TextStats}
import com.twitter.algebird.Moments
import com.salesforce.op.stages.impl.feature.CombinationStrategy
import org.apache.spark.sql.{DataFrame, Dataset}
import org.scalactic.Equality
import org.scalatest.FlatSpec
Expand Down Expand Up @@ -167,15 +166,14 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
}

def getFeatureMomentsAndCard(inputModel: FeatureLike[Prediction],
DF: DataFrame): (Map[String, Moments], Map[String, TextStats]) = {
DF: DataFrame): Map[String, TextStats] = {
lazy val workFlow = new OpWorkflow().setResultFeatures(inputModel).setInputDataset(DF)
lazy val dummyReader = workFlow.getReader()
lazy val workFlowRFF = workFlow.withRawFeatureFilter(Some(dummyReader), None)
lazy val model = workFlowRFF.train()
val insights = model.modelInsights(inputModel)
val featureMoments = insights.features.map(f => f.featureName -> f.distributions.head.moments.get).toMap
val featureCardinality = insights.features.map(f => f.featureName -> f.distributions.head.cardEstimate.get).toMap
return (featureMoments, featureCardinality)
return featureCardinality
}

val params = new OpParams()
Expand Down Expand Up @@ -777,23 +775,15 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
absError2 should be < tol * smallCoeffSum / 2
}

it should "correctly return moments calculation and cardinality calculation for numeric features" in {
it should "correctly return cardinality calculation for numeric features" in {

import spark.implicits._
val df = linRegDF._3
val meanTol = 0.01
val varTol = 0.01
val (moments, cardinality) = getFeatureMomentsAndCard(standardizedLinpred, linRegDF._3)
val cardinality = getFeatureMomentsAndCard(standardizedLinpred, linRegDF._3)

// Go through each feature and check that the mean, variance, and unique counts match the data
moments.foreach { case (featureName, value) => {
value.count shouldBe 1000
val (expectedMean, expectedVariance) =
df.select(avg(featureName), variance(featureName)).as[(Double, Double)].collect().head
math.abs((value.mean - expectedMean) / expectedMean) < meanTol shouldBe true
math.abs((value.variance - expectedVariance) / expectedVariance) < varTol shouldBe true
}
}

cardinality.foreach { case (featureName, value) => {
val actualUniques = df.select(featureName).as[Double].collect().toSet
Expand Down
Loading