From 3ad8ce1d6a618a5804a470e4f96031961ae4b46a Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Fri, 10 Jan 2020 14:04:29 -0800 Subject: [PATCH 01/13] Got previous tests working --- .../OPCollectionHashingVectorizer.scala | 16 ++- .../impl/feature/SmartTextMapVectorizer.scala | 131 ++++++++++++++---- .../impl/feature/SmartTextVectorizer.scala | 2 - .../impl/feature/AttributeAsserts.scala | 2 + .../feature/SmartTextMapVectorizerTest.scala | 70 +++++++++- .../salesforce/op/features/types/Lists.scala | 9 ++ 6 files changed, 196 insertions(+), 34 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala index c646db6d06..2e1fa78710 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala @@ -314,32 +314,38 @@ private[op] trait MapHashingFun extends HashingFun { protected def makeVectorColumnMetadata ( + hashFeatures: Array[TransientFeature], + ignoreFeatures: Array[TransientFeature], features: Array[TransientFeature], params: HashingFunctionParams, - allKeys: Seq[Seq[String]], + hashKeys: Seq[Seq[String]], + ignoreKeys: Seq[Seq[String]], shouldTrackNulls: Boolean, shouldTrackLen: Boolean ): Array[OpVectorColumnMetadata] = { val numHashes = params.numFeatures - val numFeatures = allKeys.map(_.length).sum + val numFeatures = hashKeys.map(_.length).sum val hashColumns = if (isSharedHashSpace(params, Some(numFeatures))) { (0 until numHashes).map { i => OpVectorColumnMetadata( - parentFeatureName = features.map(_.name), - parentFeatureType = features.map(_.typeName), + parentFeatureName = hashFeatures.map(_.name), + parentFeatureType = hashFeatures.map(_.typeName), grouping = None, indicatorValue = None ) }.toArray } else { for { - (keys, f) <- allKeys.toArray.zip(features) + (keys, f) <- hashKeys.toArray.zip(hashFeatures) key <- keys i <- 0 until numHashes } yield f.toColumnMetaData().copy(grouping = Option(key)) } + // All columns get null tracking or text length tracking, whether their contents are hashed or ignored + val allKeys = hashKeys.zip(ignoreKeys).map{ case(h, i) => h ++ i } + // val allFeatures = val nullColumns = if (shouldTrackNulls) { for { (keys, f) <- allKeys.toArray.zip(features) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala index f149f5abba..11e84e3528 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala @@ -39,7 +39,7 @@ import com.salesforce.op.utils.spark.RichDataset._ import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import com.twitter.algebird.Monoid._ import com.twitter.algebird.Operators._ -import com.twitter.algebird.Monoid +import com.twitter.algebird.{Monoid, Semigroup} import com.twitter.algebird.macros.caseclass import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.{Dataset, Encoder} @@ -63,7 +63,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]] with PivotParams with CleanTextFun with SaveOthersParams with TrackNullsParam with MinSupportParam with TextTokenizerParams with TrackTextLenParam with HashingVectorizerParams with MapHashingFun with OneHotFun with MapStringPivotHelper - with MapVectorizerFuns[String, OPMap[String]] with MaxCardinalityParams { + with MapVectorizerFuns[String, OPMap[String]] with MaxCardinalityParams with MinLengthStdDevParams { private implicit val textMapStatsSeqEnc: Encoder[Array[TextMapStats]] = ExpressionEncoder[Array[TextMapStats]]() @@ -73,7 +73,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]] ): TextMapStats = { val keyValueCounts = textMap.map{ case (k, v) => cleanTextFn(k, shouldCleanKeys) -> - TextStats(Map(cleanTextFn(v, shouldCleanValues) -> 1), Map(cleanTextFn(v, shouldCleanValues).length -> 1)) + TextStats(Map(cleanTextFn(v, shouldCleanValues) -> 1L), Map(cleanTextFn(v, shouldCleanValues).length -> 1L)) } TextMapStats(keyValueCounts) } @@ -104,9 +104,24 @@ class SmartTextMapVectorizer[T <: OPMap[String]] ) } else Array.empty[OpVectorColumnMetadata] - val textColumns = if (args.textFeatureInfo.flatten.nonEmpty) { + + /* + val hashedTextColumns = if (args.hashFeatureInfo.flatten.nonEmpty) { + val (mapFeatures, mapFeatureInfo) = + inN.toSeq.zip(args.hashFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip + val allKeys = mapFeatureInfo.map(_.map(_.key)) + makeVectorColumnMetadata( + features = mapFeatures.toArray, + params = makeHashingParams(), + allKeys = allKeys, + shouldTrackNulls = args.shouldTrackNulls, + shouldTrackLen = $(trackTextLen) + ) + } else Array.empty[OpVectorColumnMetadata] + + val ignoredTextColumns = if (args.ignoreFeatureInfo.flatten.nonEmpty) { val (mapFeatures, mapFeatureInfo) = - inN.toSeq.zip(args.textFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip + inN.toSeq.zip(args.ignoreFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip val allKeys = mapFeatureInfo.map(_.map(_.key)) makeVectorColumnMetadata( features = mapFeatures.toArray, @@ -116,13 +131,46 @@ class SmartTextMapVectorizer[T <: OPMap[String]] shouldTrackLen = $(trackTextLen) ) } else Array.empty[OpVectorColumnMetadata] + */ + + val allTextFeatureInfo = args.hashFeatureInfo.zip(args.ignoreFeatureInfo).map{ case (h, i) => h ++ i } + val allTextColumns = if (allTextFeatureInfo.flatten.nonEmpty) { + val (mapFeatures, mapFeatureInfo) = + inN.toSeq.zip(allTextFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip + val allKeys = mapFeatureInfo.map(_.map(_.key)) + + val hashKeys = args.hashFeatureInfo.map( + _.filter(_.vectorizationMethod == TextVectorizationMethod.Hash).map(_.key) + ) + val ignoreKeys = args.ignoreFeatureInfo.map( + _.filter(_.vectorizationMethod == TextVectorizationMethod.Ignore).map(_.key) + ) + val hashFeatures = inN.toSeq.zip(args.hashFeatureInfo).filter{ + case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty + }.unzip._1 + val ignoreFeatures = inN.toSeq.zip(args.ignoreFeatureInfo).filter{ + case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty + }.unzip._1 + + makeVectorColumnMetadata( + hashFeatures = hashFeatures.toArray, + ignoreFeatures = ignoreFeatures.toArray, + features = mapFeatures.toArray, + params = makeHashingParams(), + hashKeys = hashKeys, + ignoreKeys = ignoreKeys, + shouldTrackNulls = args.shouldTrackNulls, + shouldTrackLen = $(trackTextLen) + ) + } else Array.empty[OpVectorColumnMetadata] - val columns = categoricalColumns ++ textColumns + val columns = categoricalColumns ++ allTextColumns OpVectorMetadata(getOutputFeatureName, columns, Transmogrifier.inputFeaturesToHistory(inN, stageName)) } def makeSmartTextMapVectorizerModelArgs(aggregatedStats: Array[TextMapStats]): SmartTextMapVectorizerModelArgs = { val maxCard = $(maxCardinality) + val minLenStdDev = $(minLengthStdDev) val minSup = $(minSupport) val shouldCleanKeys = $(cleanKeys) val shouldCleanValues = $(cleanText) @@ -130,14 +178,18 @@ class SmartTextMapVectorizer[T <: OPMap[String]] val allFeatureInfo = aggregatedStats.toSeq.map { textMapStats => textMapStats.keyValueCounts.toSeq.map { case (k, textStats) => - val isCat = textStats.valueCounts.size <= maxCard - val topVals = if (isCat) { + val vecMethod: TextVectorizationMethod = textStats match { + case _ if textStats.valueCounts.size <= maxCard => TextVectorizationMethod.Pivot + case _ if textStats.lengthStdDev <= minLenStdDev => TextVectorizationMethod.Ignore + case _ => TextVectorizationMethod.Hash + } + val topVals = if (vecMethod == TextVectorizationMethod.Pivot) { textStats.valueCounts .filter { case (_, count) => count >= minSup } .toSeq.sortBy(v => -v._2 -> v._1) .take($(topK)).map(_._1).toArray } else Array.empty[String] - SmartTextFeatureInfo(key = k, isCategorical = isCat, topValues = topVals) + SmartTextFeatureInfo(key = k, vectorizationMethod = vecMethod, topValues = topVals) } } @@ -197,11 +249,15 @@ private[op] object TextMapStats { /** * Info about each feature within a text map * - * @param key name of a feature - * @param isCategorical indicate whether a feature is categorical or not - * @param topValues most common values of a feature (only for categoricals) + * @param key name of a feature + * @param vectorizationMethod method to use for text vectorization (either pivot, hashing, or ignoring) + * @param topValues most common values of a feature (only for categoricals) */ -case class SmartTextFeatureInfo(key: String, isCategorical: Boolean, topValues: Array[String]) extends JsonLike +case class SmartTextFeatureInfo( + key: String, + vectorizationMethod: TextVectorizationMethod, + topValues: Array[String] +) extends JsonLike /** @@ -221,11 +277,20 @@ case class SmartTextMapVectorizerModelArgs shouldTrackNulls: Boolean, hashingParams: HashingFunctionParams ) extends JsonLike { - val (categoricalFeatureInfo, textFeatureInfo) = allFeatureInfo.map{ featureInfoSeq => - featureInfoSeq.partition{_ .isCategorical } - }.unzip - val categoricalKeys = categoricalFeatureInfo.map(featureInfoSeq => featureInfoSeq.map(_.key)) - val textKeys = textFeatureInfo.map(featureInfoSeq => featureInfoSeq.map(_.key)) + val (categoricalFeatureInfo, hashFeatureInfo, ignoreFeatureInfo) = allFeatureInfo.map{ featureInfoSeq => + val groups = featureInfoSeq.groupBy(_.vectorizationMethod) + val catGroup = groups.getOrElse(TextVectorizationMethod.Pivot, Seq.empty) + val hashGroup = groups.getOrElse(TextVectorizationMethod.Hash, Seq.empty) + val ignoreGroup = groups.getOrElse(TextVectorizationMethod.Ignore, Seq.empty) + + (catGroup, hashGroup, ignoreGroup) + }.unzip3 + + val categoricalKeys = categoricalFeatureInfo.map(_.map(_.key)) + val hashKeys = hashFeatureInfo.map(_.map(_.key)) + val ignoreKeys = ignoreFeatureInfo.map(_.map(_.key)) + + val textKeys = hashKeys.zip(ignoreKeys).map{ case (hk, ik) => hk ++ ik } } @@ -248,32 +313,46 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op] ) private def partitionRow(row: Seq[OPMap[String]]): - (Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]]) = { + (Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]]) = { val (rowCategorical, keysCategorical) = row.view.zip(args.categoricalKeys).collect { case (elements, keys) if keys.nonEmpty => val filtered = elements.value.filter { case (k, v) => keys.contains(k) } (TextMap(filtered), keys) }.unzip - val (rowText, keysText) = - row.view.zip(args.textKeys).collect { case (elements, keys) if keys.nonEmpty => + val (rowHashedText, keysHashedText) = + row.view.zip(args.hashKeys).collect { case (elements, keys) if keys.nonEmpty => val filtered = elements.value.filter { case (k, v) => keys.contains(k) } (TextMap(filtered), keys) }.unzip - (rowCategorical.toList, keysCategorical.toList, rowText.toList, keysText.toList) + val (rowIgnoredText, keysIgnoredText) = + row.view.zip(args.ignoreKeys).collect { case (elements, keys) if keys.nonEmpty => + val filtered = elements.value.filter { case (k, v) => keys.contains(k) } + (TextMap(filtered), keys) + }.unzip + + (rowCategorical.toList, keysCategorical.toList, rowHashedText.toList, keysHashedText.toList, + rowIgnoredText.toList, keysIgnoredText.toList) } def transformFn: Seq[T] => OPVector = row => { - val (rowCategorical, keysCategorical, rowText, keysText) = partitionRow(row) + implicit val textListMonoid: Monoid[TextList] = TextList.monoid + + val (rowCategorical, keysCategorical, rowHash, keysHash, rowIgnore, keysIgnore) = partitionRow(row) + val keysText = keysHash + keysIgnore // Go algebird! val categoricalVector = categoricalPivotFn(rowCategorical) - val rowTextTokenized = rowText.map(_.value.map { case (k, v) => k -> tokenize(v.toText).tokens }) - val textVector = hash(rowTextTokenized, keysText, args.hashingParams) + + val rowHashTokenized = rowHash.map(_.value.map { case (k, v) => k -> tokenize(v.toText).tokens }) + val rowIgnoreTokenized = rowIgnore.map(_.value.map { case (k, v) => k -> tokenize(v.toText).tokens }) + val rowTextTokenized = rowHashTokenized + rowIgnoreTokenized // Go go algebird! + val hashVector = hash(rowHashTokenized, keysHash, args.hashingParams) + val textNullIndicatorsVector = if (args.shouldTrackNulls) getNullIndicatorsVector(keysText, rowTextTokenized) else OPVector.empty val textLenVector = if ($(trackTextLen)) getLenVector(keysText, rowTextTokenized) else OPVector.empty - categoricalVector.combine(textVector, textLenVector, textNullIndicatorsVector) + categoricalVector.combine(hashVector, textLenVector, textNullIndicatorsVector) } private def getNullIndicatorsVector(keysSeq: Seq[Seq[String]], inputs: Seq[Map[String, TextList]]): OPVector = { diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala index e95939d249..a2eab274ff 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala @@ -225,8 +225,6 @@ private[op] object TextStats { * Arguments for [[SmartTextVectorizerModel]] * * @param vectorizationMethods method to use for text vectorization (either pivot, hashing, or ignoring) - * @param isCategorical is feature a categorical or not - * @param isIgnorable is a text feature that we think is ignorable? high cardinality + low length variance * @param topValues top values to each feature * @param shouldCleanText should clean text value * @param shouldTrackNulls should track nulls diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala index 0cb29643ab..8d812ae449 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala @@ -49,6 +49,8 @@ trait AttributeAsserts { for { (x, i) <- output.zipWithIndex _ = withClue(s"Output vector $i and expectedNominal arrays are not of the same length:") { + println(s"x.value.size: ${x.value.size}") + println(s"expectedNominal.length: ${expectedNominal.length}") x.value.size shouldBe expectedNominal.length } (value, nominal) <- x.value.toArray.zip(expectedNominal) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index 5584ea1565..a39d8a0eff 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -32,13 +32,14 @@ package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.stages.base.sequence.SequenceModel -import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestSparkContext} +import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.linalg.Vectors import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import com.salesforce.op.features.types._ +import com.salesforce.op.testkit.RandomText @RunWith(classOf[JUnitRunner]) class SmartTextMapVectorizerTest @@ -73,6 +74,37 @@ class SmartTextMapVectorizerTest ) ) + /* + Generate some more complicated input data to check things a little closer. There are four text fields with + different token distributions: + + country: Uniformly distributed from a larger list of ~few hundred countries, should be hashed + categoricalText: Uniformly distributed from a small list of choices, should be pivoted (also has fixed lengths, + so serves as a test that the categorical check happens before the token length variance check) + textId: Uniformly distributed high cardinality Ids with fixed lengths, should be ignored + text: Uniformly distributed unicode strings with lengths ranging from 0-100, should be hashed + */ + val countryData: Seq[Text] = RandomText.countries.withProbabilityOfEmpty(0.2).limit(1000) + val categoricalTextData: Seq[Text] = RandomText.textFromDomain(domain = List("A", "B", "C", "D", "E", "F")) + .withProbabilityOfEmpty(0.2).limit(1000) + // Generate List containing elements like 040231, 040232, ... + val textIdData: Seq[Text] = RandomText.textFromDomain( + domain = (1 to 1000).map(x => "%06d".format(40230 + x)).toList + ).withProbabilityOfEmpty(0.2).limit(1000) + val textData: Seq[Text] = RandomText.strings(minLen = 0, maxLen = 100).withProbabilityOfEmpty(0.2).limit(1000) + val generatedData: Seq[(Text, Text, Text, Text)] = + countryData.zip(categoricalTextData).zip(textIdData).zip(textData).map { + case (((co, ca), id), te) => (co, ca, id, te) + } + + def mapifyText(textSeq: Seq[Text]): TextMap = { + textSeq.zipWithIndex.flatMap { + case (t, i) => t.value.map(tv => "f" + i.toString -> tv) + }.toMap.toTextMap + } + val mapData = generatedData.map { case (t1, t2, t3, t4) => mapifyText(Seq(t1, t2, t3, t4)) } + val (rawDF, rawTextMap) = TestFeatureBuilder("textMap", mapData) + /** * Estimator instance to be tested */ @@ -398,4 +430,40 @@ class SmartTextMapVectorizerTest result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } + it should "detect and ignore fields that looks like machine-generated IDs by having a low token length variance" in { + val topKCategorial = 3 + val hashSize = 5 + + val smartVectorized = new SmartTextMapVectorizer() + .setMaxCardinality(10).setNumFeatures(hashSize).setMinSupport(10).setTopK(topKCategorial) + // .setMinLengthStdDev(1.0) + .setAutoDetectLanguage(false).setMinTokenLength(1).setToLowercase(false) + .setTrackNulls(true).setTrackTextLen(true) + .setInput(rawTextMap).getOutput() + + val transformed = new OpWorkflow().setResultFeatures(smartVectorized).transform(rawDF) + val result = transformed.collect(smartVectorized) + + /* + Feature vector should have 16 components, corresponding to two hashed text fields, one categorical field, and + one ignored text field. + + Hashed text: (5 hash buckets + 1 length + 1 null indicator) = 7 elements + Categorical: (3 topK + 1 other + 1 null indicator) = 5 elements + Ignored text: (1 length + 1 null indicator) = 2 elements + */ + val featureVectorSize = 2 * (hashSize + 2) + (topKCategorial + 2) + 2 + val firstRes = result.head + // firstRes.v.size shouldBe featureVectorSize + + val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) + meta.columns.foreach(println) + meta.columns.length shouldBe featureVectorSize + meta.columns.slice(0, 5).forall(_.grouping.contains("categorical")) + meta.columns.slice(5, 10).forall(_.grouping.contains("country")) + meta.columns.slice(10, 15).forall(_.grouping.contains("text")) + meta.columns.slice(15, 18).forall(_.descriptorValue.contains(OpVectorColumnMetadata.TextLenString)) + meta.columns.slice(18, 21).forall(_.indicatorValue.contains(OpVectorColumnMetadata.NullString)) + } + } diff --git a/features/src/main/scala/com/salesforce/op/features/types/Lists.scala b/features/src/main/scala/com/salesforce/op/features/types/Lists.scala index eebf1f7a47..67147227c4 100644 --- a/features/src/main/scala/com/salesforce/op/features/types/Lists.scala +++ b/features/src/main/scala/com/salesforce/op/features/types/Lists.scala @@ -30,6 +30,8 @@ package com.salesforce.op.features.types +import com.twitter.algebird.{Monoid, SeqMonoid} + /** * A list of text values * @@ -41,6 +43,13 @@ class TextList(val value: Seq[String]) extends OPList[String] { object TextList { def apply(value: Seq[String]): TextList = new TextList(value) def empty: TextList = FeatureTypeDefaults.TextList + + def monoid: Monoid[TextList] = new Monoid[TextList] { + override def zero = TextList.empty + override def plus(left: TextList, right: TextList): TextList = { + TextList(left.value ++ right.value) + } + } } /** From b00775bbe354dcdd1aa41a47d58a27eb9b73f59c Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Fri, 10 Jan 2020 14:07:12 -0800 Subject: [PATCH 02/13] New test also working --- .../op/stages/impl/feature/SmartTextMapVectorizerTest.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index a39d8a0eff..3b65a085f4 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -436,7 +436,7 @@ class SmartTextMapVectorizerTest val smartVectorized = new SmartTextMapVectorizer() .setMaxCardinality(10).setNumFeatures(hashSize).setMinSupport(10).setTopK(topKCategorial) - // .setMinLengthStdDev(1.0) + .setMinLengthStdDev(1.0) .setAutoDetectLanguage(false).setMinTokenLength(1).setToLowercase(false) .setTrackNulls(true).setTrackTextLen(true) .setInput(rawTextMap).getOutput() @@ -454,10 +454,9 @@ class SmartTextMapVectorizerTest */ val featureVectorSize = 2 * (hashSize + 2) + (topKCategorial + 2) + 2 val firstRes = result.head - // firstRes.v.size shouldBe featureVectorSize + firstRes.v.size shouldBe featureVectorSize val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) - meta.columns.foreach(println) meta.columns.length shouldBe featureVectorSize meta.columns.slice(0, 5).forall(_.grouping.contains("categorical")) meta.columns.slice(5, 10).forall(_.grouping.contains("country")) From 03ae4d00874a136d9255f7e07a80a82c1e8b449a Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Fri, 10 Jan 2020 14:09:11 -0800 Subject: [PATCH 03/13] Remove debug output --- .../salesforce/op/stages/impl/feature/AttributeAsserts.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala index 8d812ae449..0cb29643ab 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala @@ -49,8 +49,6 @@ trait AttributeAsserts { for { (x, i) <- output.zipWithIndex _ = withClue(s"Output vector $i and expectedNominal arrays are not of the same length:") { - println(s"x.value.size: ${x.value.size}") - println(s"expectedNominal.length: ${expectedNominal.length}") x.value.size shouldBe expectedNominal.length } (value, nominal) <- x.value.toArray.zip(expectedNominal) From 9759747524b71275e6cab8d2e0d0a9c275543410 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Fri, 10 Jan 2020 14:20:10 -0800 Subject: [PATCH 04/13] Added test for TextList monoid --- .../op/features/types/FeatureTypeValueTest.scala | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala index 345b98e6b4..c2576b1a49 100644 --- a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala +++ b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala @@ -32,6 +32,8 @@ package com.salesforce.op.features.types import com.salesforce.op.test.TestCommon import com.salesforce.op.utils.reflection.ReflectionUtils +import com.twitter.algebird.Monoid +import com.twitter.algebird.Operators._ import org.apache.lucene.geo.GeoUtils import org.apache.spark.ml.linalg.DenseVector import org.junit.runner.RunWith @@ -125,8 +127,14 @@ class FeatureTypeValueTest extends PropSpec with PropertyChecks with TestCommon } property("OPList types should correctly wrap their corresponding types") { + implicit val textListMonoid: Monoid[TextList] = TextList.monoid + forAll(geoGen) { x => checkVals(Geolocation(x), x) } - forAll(textListGen) { x => checkVals(TextList(x), x) } + forAll(textListGen) { x => + checkVals(TextList(x), x) + val tl = TextList(x) + tl + tl shouldBe TextList(x ++ x) // Test the monoid too + } forAll(longListGen) { x => checkVals(DateList(x), x) checkVals(DateTimeList(x), x) From 10644d87b7effc8cd1c62387016bf20bfeca28d1 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Mon, 13 Jan 2020 15:48:35 -0800 Subject: [PATCH 05/13] Added another test and fixed sneaky metadata issue --- .../OPCollectionHashingVectorizer.scala | 15 +++--- .../impl/feature/SmartTextMapVectorizer.scala | 39 +++------------- .../feature/SmartTextMapVectorizerTest.scala | 46 ++++++++++++++++++- 3 files changed, 59 insertions(+), 41 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala index 2e1fa78710..c3c4588c1d 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala @@ -316,7 +316,6 @@ private[op] trait MapHashingFun extends HashingFun { ( hashFeatures: Array[TransientFeature], ignoreFeatures: Array[TransientFeature], - features: Array[TransientFeature], params: HashingFunctionParams, hashKeys: Seq[Seq[String]], ignoreKeys: Seq[Seq[String]], @@ -337,25 +336,27 @@ private[op] trait MapHashingFun extends HashingFun { }.toArray } else { for { - (keys, f) <- hashKeys.toArray.zip(hashFeatures) + // Need to filter out empty key sequences since the hashFeatures only contain a map feature if one of their + // keys is to be hashed, but hashKeys contains a sequence per map (whether it's empty or not) + (keys, f) <- hashKeys.filter(_.nonEmpty).zip(hashFeatures) key <- keys i <- 0 until numHashes } yield f.toColumnMetaData().copy(grouping = Option(key)) - } + }.toArray // All columns get null tracking or text length tracking, whether their contents are hashed or ignored - val allKeys = hashKeys.zip(ignoreKeys).map{ case(h, i) => h ++ i } - // val allFeatures = + val allTextKeys = hashKeys.zip(ignoreKeys).map{ case(h, i) => h ++ i } + val allTextFeatures = hashFeatures ++ ignoreFeatures val nullColumns = if (shouldTrackNulls) { for { - (keys, f) <- allKeys.toArray.zip(features) + (keys, f) <- allTextKeys.toArray.zip(allTextFeatures) key <- keys } yield f.toColumnMetaData(isNull = true).copy(grouping = Option(key)) } else Array.empty[OpVectorColumnMetadata] val lenColumns = if (shouldTrackLen) { for { - (keys, f) <- allKeys.toArray.zip(features) + (keys, f) <- allTextKeys.toArray.zip(allTextFeatures) key <- keys } yield f.toColumnMetaData(descriptorValue = OpVectorColumnMetadata.TextLenString).copy(grouping = Option(key)) } else Array.empty[OpVectorColumnMetadata] diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala index 11e84e3528..014407d9e6 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala @@ -104,58 +104,30 @@ class SmartTextMapVectorizer[T <: OPMap[String]] ) } else Array.empty[OpVectorColumnMetadata] - - /* - val hashedTextColumns = if (args.hashFeatureInfo.flatten.nonEmpty) { - val (mapFeatures, mapFeatureInfo) = - inN.toSeq.zip(args.hashFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip - val allKeys = mapFeatureInfo.map(_.map(_.key)) - makeVectorColumnMetadata( - features = mapFeatures.toArray, - params = makeHashingParams(), - allKeys = allKeys, - shouldTrackNulls = args.shouldTrackNulls, - shouldTrackLen = $(trackTextLen) - ) - } else Array.empty[OpVectorColumnMetadata] - - val ignoredTextColumns = if (args.ignoreFeatureInfo.flatten.nonEmpty) { - val (mapFeatures, mapFeatureInfo) = - inN.toSeq.zip(args.ignoreFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip - val allKeys = mapFeatureInfo.map(_.map(_.key)) - makeVectorColumnMetadata( - features = mapFeatures.toArray, - params = makeHashingParams(), - allKeys = allKeys, - shouldTrackNulls = args.shouldTrackNulls, - shouldTrackLen = $(trackTextLen) - ) - } else Array.empty[OpVectorColumnMetadata] - */ - val allTextFeatureInfo = args.hashFeatureInfo.zip(args.ignoreFeatureInfo).map{ case (h, i) => h ++ i } val allTextColumns = if (allTextFeatureInfo.flatten.nonEmpty) { val (mapFeatures, mapFeatureInfo) = inN.toSeq.zip(allTextFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip val allKeys = mapFeatureInfo.map(_.map(_.key)) + // Careful when zipping sequences like hashKeys (length and hashFeatures val hashKeys = args.hashFeatureInfo.map( _.filter(_.vectorizationMethod == TextVectorizationMethod.Hash).map(_.key) ) val ignoreKeys = args.ignoreFeatureInfo.map( _.filter(_.vectorizationMethod == TextVectorizationMethod.Ignore).map(_.key) ) - val hashFeatures = inN.toSeq.zip(args.hashFeatureInfo).filter{ + + val hashFeatures = inN.toSeq.zip(args.hashFeatureInfo).filter { case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty - }.unzip._1 + }.map(_._1) val ignoreFeatures = inN.toSeq.zip(args.ignoreFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty - }.unzip._1 + }.map(_._1) makeVectorColumnMetadata( hashFeatures = hashFeatures.toArray, ignoreFeatures = ignoreFeatures.toArray, - features = mapFeatures.toArray, params = makeHashingParams(), hashKeys = hashKeys, ignoreKeys = ignoreKeys, @@ -348,6 +320,7 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op] val rowTextTokenized = rowHashTokenized + rowIgnoreTokenized // Go go algebird! val hashVector = hash(rowHashTokenized, keysHash, args.hashingParams) + // All columns get null tracking or text length tracking, whether their contents are hashed or ignored val textNullIndicatorsVector = if (args.shouldTrackNulls) getNullIndicatorsVector(keysText, rowTextTokenized) else OPVector.empty val textLenVector = if ($(trackTextLen)) getLenVector(keysText, rowTextTokenized) else OPVector.empty diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index 3b65a085f4..a5af0c824c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -105,6 +105,13 @@ class SmartTextMapVectorizerTest val mapData = generatedData.map { case (t1, t2, t3, t4) => mapifyText(Seq(t1, t2, t3, t4)) } val (rawDF, rawTextMap) = TestFeatureBuilder("textMap", mapData) + // Do the same thing with the data spread across many maps to test that they get combined correctly as well + val mapDataSeparate = generatedData.map { + case (t1, t2, t3, t4) => (mapifyText(Seq(t1, t2)), mapifyText(Seq(t3)), mapifyText(Seq(t4))) + } + val (rawDFSeparateMaps, rawTextMap1, rawTextMap2, rawTextMap3) = + TestFeatureBuilder("textMap1", "textMap2", "textMap3", mapDataSeparate) + /** * Estimator instance to be tested */ @@ -430,7 +437,8 @@ class SmartTextMapVectorizerTest result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 } } - it should "detect and ignore fields that looks like machine-generated IDs by having a low token length variance" in { + it should "detect and ignore fields that looks like machine-generated IDs by having a low token length variance " + + "when data is in a single TextMap" in { val topKCategorial = 3 val hashSize = 5 @@ -465,4 +473,40 @@ class SmartTextMapVectorizerTest meta.columns.slice(18, 21).forall(_.indicatorValue.contains(OpVectorColumnMetadata.NullString)) } + it should "detect and ignore fields that looks like machine-generated IDs by having a low token length variance " + + "when data is in many TextMaps" in { + val topKCategorial = 3 + val hashSize = 5 + + val smartVectorized = new SmartTextMapVectorizer() + .setMaxCardinality(10).setNumFeatures(hashSize).setMinSupport(10).setTopK(topKCategorial) + .setMinLengthStdDev(1.0) + .setAutoDetectLanguage(false).setMinTokenLength(1).setToLowercase(false) + .setTrackNulls(true).setTrackTextLen(true) + .setInput(rawTextMap1, rawTextMap2, rawTextMap3).getOutput() + + val transformed = new OpWorkflow().setResultFeatures(smartVectorized).transform(rawDFSeparateMaps) + val result = transformed.collect(smartVectorized) + + /* + Feature vector should have 16 components, corresponding to two hashed text fields, one categorical field, and + one ignored text field. + + Hashed text: (5 hash buckets + 1 length + 1 null indicator) = 7 elements + Categorical: (3 topK + 1 other + 1 null indicator) = 5 elements + Ignored text: (1 length + 1 null indicator) = 2 elements + */ + val featureVectorSize = 2 * (hashSize + 2) + (topKCategorial + 2) + 2 + val firstRes = result.head + firstRes.v.size shouldBe featureVectorSize + + val meta = OpVectorMetadata(transformed.schema(smartVectorized.name)) + meta.columns.length shouldBe featureVectorSize + meta.columns.slice(0, 5).forall(_.grouping.contains("categorical")) + meta.columns.slice(5, 10).forall(_.grouping.contains("country")) + meta.columns.slice(10, 15).forall(_.grouping.contains("text")) + meta.columns.slice(15, 18).forall(_.descriptorValue.contains(OpVectorColumnMetadata.TextLenString)) + meta.columns.slice(18, 21).forall(_.indicatorValue.contains(OpVectorColumnMetadata.NullString)) + } + } From a107db1572bea004fcdc4fbfd6fd7ba6a40f28d5 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 14 Jan 2020 11:25:17 -0800 Subject: [PATCH 06/13] Pulled out SensitiveFeatureInformation metadata changes into its own branch --- .../com/salesforce/op/ModelInsights.scala | 42 ++++- .../stages/impl/feature/Transmogrifier.scala | 4 +- .../com/salesforce/op/ModelInsightsTest.scala | 77 +++++++- .../op/utils/spark/OpVectorMetadata.scala | 51 ++++-- .../op/utils/spark/OPVectorMetadataTest.scala | 103 ++++++++--- .../op/SensitiveFeatureInformation.scala | 165 ++++++++++++++++++ .../op/SensitiveFeatureInformationTest.scala | 134 ++++++++++++++ 7 files changed, 531 insertions(+), 45 deletions(-) create mode 100644 utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala create mode 100644 utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 529217491d..08645260b4 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -333,7 +333,7 @@ case class Discrete(domain: Seq[String], prob: Seq[Double]) extends LabelInfo * @param metrics sequence containing metrics computed in RawFeatureFilter * @param distributions distribution information for the raw feature (if calculated in RawFeatureFilter) * @param exclusionReasons exclusion reasons for the raw feature (if calculated in RawFeatureFilter) - * + * @param sensitiveInformation derived information about sensitive field checks (if performed) */ case class FeatureInsights ( @@ -342,7 +342,8 @@ case class FeatureInsights derivedFeatures: Seq[Insights], metrics: Seq[RawFeatureFilterMetrics] = Seq.empty, distributions: Seq[FeatureDistribution] = Seq.empty, - exclusionReasons: Seq[ExclusionReasons] = Seq.empty + exclusionReasons: Seq[ExclusionReasons] = Seq.empty, + sensitiveInformation: Seq[SensitiveFeatureInformation] = Seq.empty ) /** @@ -697,8 +698,41 @@ case object ModelInsights { val metrics = rawFeatureFilterResults.rawFeatureFilterMetrics.filter(_.name == fname) val distributions = rawFeatureFilterResults.rawFeatureDistributions.filter(_.name == fname) val exclusionReasons = rawFeatureFilterResults.exclusionReasons.filter(_.name == fname) - FeatureInsights(featureName = fname, featureType = ftype, derivedFeatures = seq.map(_._2), - metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons) + val sensitiveFeatureInformation = vectorInfo.flatMap(_.sensitive.get(fname)) match { + case Some(info) => info + case _ => Seq.empty + } + FeatureInsights( + featureName = fname, featureType = ftype, derivedFeatures = seq.map(_._2), + metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons, + sensitiveInformation = sensitiveFeatureInformation + ) + }.toSeq ++ { + /* + Add FeatureInsights for removed sensitive fields that do not have a column in OpVectorMetadata. + With current TMOG settings, this will not happen unless null tracking is turned off since + null indicators are created for all text features, even ignored ones. + */ + vectorInfo match { + case Some(v) => + // Find features where `actionTaken` is true for all of the sensitive feature informations + v.sensitive.collect { + case (fname, sensitiveFeatureInformation) + if sensitiveFeatureInformation.forall(_.actionTaken) => + val ftype = allFeatures.find(_.name == fname) + .map(_.typeName) + .getOrElse("") + val metrics = rawFeatureFilterResults.rawFeatureFilterMetrics.filter(_.name == fname) + val distributions = rawFeatureFilterResults.rawFeatureDistributions.filter(_.name == fname) + val exclusionReasons = rawFeatureFilterResults.exclusionReasons.filter(_.name == fname) + FeatureInsights( + featureName = fname, featureType = ftype, derivedFeatures = Seq.empty, + metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons, + sensitiveInformation = sensitiveFeatureInformation + ) + } + case None => Seq.empty[FeatureInsights] + } }.toSeq } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala index 8ce42cb8ac..f50f8f1726 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala @@ -380,7 +380,7 @@ trait VectorizerDefaults extends OpPipelineStageBase { val cols = if (withNullTracking) tf.flatMap { f => Seq(f.toColumnMetaData(), f.toColumnMetaData(isNull = true)) } else tf.map { f => f.toColumnMetaData() } - OpVectorMetadata(vectorOutputName, cols, Transmogrifier.inputFeaturesToHistory(tf, stageName)) + OpVectorMetadata.apply(vectorOutputName, cols, Transmogrifier.inputFeaturesToHistory(tf, stageName)) } /** @@ -697,6 +697,6 @@ trait MapStringPivotHelper extends SaveOthersParams { ): OpVectorMetadata = { val otherValueString = $(unseenName) val cols = makeVectorColumnMetadata(topValues, inputFeatures, otherValueString, trackNulls) - OpVectorMetadata(outputName, cols, Transmogrifier.inputFeaturesToHistory(inputFeatures, stageName)) + OpVectorMetadata.apply(outputName, cols, Transmogrifier.inputFeaturesToHistory(inputFeatures, stageName)) } } diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index d1fa503188..fc790dc81a 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -574,7 +574,10 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou indicatorValue = Option(name) ) }, - Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap + Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap, + Map( + "f0" -> Seq(SensitiveFeatureInformation.Name(0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false)) + ) ) it should "correctly extract the LabelSummary from the label and sanity checker info" in { @@ -623,6 +626,18 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou f0In.featureName shouldBe "f0" f0In.featureType shouldBe classOf[PickList].getName f0In.derivedFeatures.size shouldBe 2 + f0In.sensitiveInformation match { + case Seq(SensitiveFeatureInformation.Name( + probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken + )) => + actionTaken shouldBe false + probName shouldBe 0.0 + genderDetectResults shouldBe Seq.empty[String] + probMale shouldBe 0.0 + probFemale shouldBe 0.0 + probOther shouldBe 1.0 + case _ => fail("SensitiveFeatureInformation was not found.") + } val f0InDer2 = f0In.derivedFeatures.head f0InDer2.derivedFeatureName shouldBe "f0_f0_f2_1" @@ -690,6 +705,62 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou insights.features.foreach(f => f.distributions shouldBe empty) } + it should + """include sensitive feature information + |even for sensitive features that are removed from output vector and output vector metadata""".stripMargin in { + // Copy metadata from above but add new feature that was removed in vectorizing to sensitive info + val f_notInMeta = Feature[Text]("f_notInMeta", false, null, Seq(), "test") + val newMeta = OpVectorMetadata( + "fv", + OpVectorColumnMetadata( + parentFeatureName = Seq("f1"), + parentFeatureType = Seq(classOf[Real].getName), + grouping = None, + indicatorValue = None + ) +: Array("f2", "f3").map { name => + OpVectorColumnMetadata( + parentFeatureName = Seq("f0"), + parentFeatureType = Seq(classOf[PickList].getName), + grouping = Option("f0"), + indicatorValue = Option(name) + ) + }, + Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap, + Map( + "f0" -> Seq(SensitiveFeatureInformation.Name( + 0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false + )), + "f_notInMeta" -> Seq(SensitiveFeatureInformation.Name( + 1.0, Seq.empty[String], 0.0, 0.0, 1.0, "f_notInMeta", None, true + )) + ) + ) + + val labelSum = ModelInsights.getLabelSummary(Option(lbl), Option(summary)) + + val featureInsights = ModelInsights.getFeatureInsights( + Option(newMeta), Option(summary), None, Array(f1, f0, f_notInMeta), Array.empty, Map.empty[String, Set[String]], + RawFeatureFilterResults(), labelSum + ) + featureInsights.size shouldBe 3 + val f_notInMeta_butInInsights = featureInsights.find(_.featureName == "f_notInMeta").get + f_notInMeta_butInInsights.featureName shouldBe "f_notInMeta" + f_notInMeta_butInInsights.featureType shouldBe classOf[Text].getName + f_notInMeta_butInInsights.derivedFeatures.size shouldBe 0 + f_notInMeta_butInInsights.sensitiveInformation match { + case Seq(SensitiveFeatureInformation.Name( + probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken + )) => + actionTaken shouldBe true + probName shouldBe 1.0 + genderDetectResults shouldBe Seq.empty[String] + probMale shouldBe 0.0 + probFemale shouldBe 0.0 + probOther shouldBe 1.0 + case _ => fail("SensitiveFeatureInformation was not found.") + } + } + it should "return model insights for xgboost classification" in { noException should be thrownBy xgbWorkflowModel.modelInsights(xgbClassifierPred) val insights = xgbWorkflowModel.modelInsights(xgbClassifierPred) @@ -794,8 +865,8 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou } cardinality.foreach { case (featureName, value) => - val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet - actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble) + val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet + actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble) } } diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala index 71eb7b7254..ca059d791d 100644 --- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala +++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala @@ -30,7 +30,7 @@ package com.salesforce.op.utils.spark -import com.salesforce.op.FeatureHistory +import com.salesforce.op.{FeatureHistory, SensitiveFeatureInformation} import com.salesforce.op.features.types.{FeatureType, _} import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NumericAttribute} import org.apache.spark.ml.linalg.SQLDataTypes._ @@ -43,14 +43,17 @@ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField} * * @param name name of the feature vector * @param col information about each element in the vector - * @param history history of parent features used to create the vector map is from + * @param history history of parent features used to create the vector; map is from * OpVectorColumnMetadata.parentFeatureName (String) to FeatureHistory + * @param sensitive parent features that were detected as sensitive in the creation of the vector; + * map is from OpVectorColumnMetadata.parentFeatureName (String) to SensitiveFeatureInformation */ class OpVectorMetadata private ( val name: String, col: Array[OpVectorColumnMetadata], - val history: Map[String, FeatureHistory] // TODO fix map -> causes problems when multiple vectorizers used on feature + val history: Map[String, FeatureHistory], // TODO fix map -> causes problems when multiple vectorizers used on feature + val sensitive: Map[String, Seq[SensitiveFeatureInformation]] = Map.empty[String, Seq[SensitiveFeatureInformation]] ) { /** @@ -92,6 +95,7 @@ class OpVectorMetadata private val meta = new MetadataBuilder() .putMetadataArray(OpVectorMetadata.ColumnsKey, colMeta.toArray) .putMetadata(OpVectorMetadata.HistoryKey, FeatureHistory.toMetadata(history)) + .putMetadata(OpVectorMetadata.SensitiveKey, SensitiveFeatureInformation.toMetadata(sensitive)) .build() val attributes = columns.map { case c if (c.indicatorValue.isDefined || binaryTypes.exists(c.parentFeatureType.contains)) && @@ -161,7 +165,10 @@ class OpVectorMetadata private override def equals(obj: Any): Boolean = obj match { case o: OpVectorMetadata - if o.name == name && o.columns.toSeq == columns.toSeq && history == o.history => true + if o.name == name && + o.columns.toSeq == columns.toSeq && + history == o.history && + sensitive == o.sensitive => true case _ => false } @@ -169,7 +176,7 @@ class OpVectorMetadata private override def hashCode(): Int = 37 * columns.toSeq.hashCode() override def toString: String = - s"${this.getClass.getSimpleName}($name,${columns.mkString("Array(", ",", ")")},$history)" + s"${this.getClass.getSimpleName}($name,${columns.mkString("Array(", ",", ")")},$history,$sensitive)" } @@ -179,6 +186,7 @@ object OpVectorMetadata { val ColumnsKey = "vector_columns" val HistoryKey = "vector_history" + val SensitiveKey = "vector_detected_sensitive" /** * Construct an [[OpVectorMetadata]] from a [[StructField]], assuming that [[ColumnsKey]] is present and conforms @@ -197,9 +205,14 @@ object OpVectorMetadata { if (wrapped.underlyingMap(HistoryKey).asInstanceOf[Metadata].isEmpty) Map.empty[String, FeatureHistory] else FeatureHistory.fromMetadataMap(field.metadata.getMetadata(HistoryKey)) - new OpVectorMetadata(field.name, columns, history) - } + val sensitive = + if (wrapped.underlyingMap(SensitiveKey).asInstanceOf[Metadata].isEmpty) { + Map.empty[String, Seq[SensitiveFeatureInformation]] + } + else SensitiveFeatureInformation.fromMetadataMap(field.metadata.getMetadata(SensitiveKey)) + new OpVectorMetadata(field.name, columns, history, sensitive) + } /** * Construct an [[OpVectorMetadata]] from a string representing its name, and an array of [[OpVectorColumnMetadata]] @@ -214,9 +227,24 @@ object OpVectorMetadata { name: String, columns: Array[OpVectorColumnMetadata], history: Map[String, FeatureHistory] - ): OpVectorMetadata = { - new OpVectorMetadata(name, columns, history) - } + ): OpVectorMetadata = new OpVectorMetadata(name, columns, history) + + /** + * Construct an [[OpVectorMetadata]] from a string representing its name, and an array of [[OpVectorColumnMetadata]] + * representing its columns. + * + * @param name The name of the column the metadata represents + * @param columns The columns within the vectors + * @param history The history of the parent features + * @param sensitive Which columns have been marked as sensitive and related information + * @return The constructed vector metadata + */ + def apply( + name: String, + columns: Array[OpVectorColumnMetadata], + history: Map[String, FeatureHistory], + sensitive: Map[String, Seq[SensitiveFeatureInformation]] + ): OpVectorMetadata = new OpVectorMetadata(name, columns, history, sensitive) /** * Construct an [[OpVectorMetadata]] from its name and a [[Metadata]], assuming that [[ColumnsKey]] and @@ -242,7 +270,8 @@ object OpVectorMetadata { def flatten(outputName: String, vectors: Seq[OpVectorMetadata]): OpVectorMetadata = { val allColumns = vectors.flatMap(_.columns).toArray val allHist = vectors.flatMap(_.history).toMap - new OpVectorMetadata(outputName, allColumns, allHist) + val allSensitive = vectors.flatMap(_.sensitive).toMap + new OpVectorMetadata(outputName, allColumns, allHist, allSensitive) } } diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala index 438119ee57..0eff4dc9e1 100644 --- a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala +++ b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala @@ -30,7 +30,7 @@ package com.salesforce.op.utils.spark -import com.salesforce.op.FeatureHistory +import com.salesforce.op.{FeatureHistory, SensitiveFeatureInformation} import com.salesforce.op.features.types.{DateTime, Email, FeatureType, OPMap, PickList, Prediction, Real, RealMap, TextAreaMap} import com.salesforce.op.test.TestCommon import org.apache.spark.sql.types.Metadata @@ -47,7 +47,11 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks type OpVectorColumnTuple = (Seq[String], Seq[String], Option[String], Option[String], Option[String], Int) type FeatureHistoryTuple = (Seq[String], Seq[String]) - type OpVectorTuple = (String, Array[OpVectorColumnTuple], FeatureHistoryTuple) + + type SensitiveTuple = (SensitiveNameTuple, String, Option[String], Boolean) + type SensitiveNameTuple = (Double, Seq[String], Double, Double, Double) + + type OpVectorTuple = (String, Array[OpVectorColumnTuple], FeatureHistoryTuple, Seq[SensitiveTuple]) // AttributeGroup and Attribute require non-empty names val genName: Gen[String] = Gen.nonEmptyListOf(alphaNumChar).map(_.mkString) @@ -72,24 +76,51 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks ) val arrVecColTupleGen: Gen[Array[OpVectorColumnTuple]] = Gen.containerOf[Array, OpVectorColumnTuple](vecColTupleGen) + val sensitiveGen: Gen[SensitiveTuple] = for { + featureName <- genName + mapKey <- Gen.option(genName) + actionTaken <- Gen.oneOf[Boolean](Seq(false, true)) + probName <- Gen.choose(0.0, 1.0) + genderDetectResults <- Gen.containerOf[Seq, String](genName) + probMale <- Gen.choose(0.0, 1.0) + probFemale <- Gen.choose(0.0, 1.0 - probMale) + probOther <- Gen.choose(0.0, 1.0 - probMale - probFemale) + } yield { + ((probName, genderDetectResults, probMale, probFemale, probOther), featureName, mapKey, actionTaken) + } + val vecGen: Gen[OpVectorTuple] = for { name <- genName arr <- arrVecColTupleGen histories <- featHistTupleGen + sensitiveCols <- Gen.containerOf[Seq, SensitiveTuple](sensitiveGen) } yield { - (name, arr, histories) + (name, arr, histories, sensitiveCols) } val seqVecGen: Gen[Seq[OpVectorTuple]] = Gen.containerOf[Seq, OpVectorTuple](vecGen) - private def generateHistory(columnsMeta: Array[OpVectorColumnMetadata], hist: (Seq[String], Seq[String])) = + private def generateHistory( + columnsMeta: Array[OpVectorColumnMetadata], hist: (Seq[String], Seq[String]) + ): Map[String, FeatureHistory] = columnsMeta.flatMap(v => v.parentFeatureName.map(p => p -> FeatureHistory(hist._1, hist._2))).toMap - private def checkTuples(tup: OpVectorColumnTuple) = tup._1.nonEmpty && tup._2.nonEmpty + private def generateSensitiveFeatureInfo( + columnsMeta: Array[OpVectorColumnMetadata], sensitiveInfoSeqRaw: Seq[SensitiveTuple] + ): Map[String, Seq[SensitiveFeatureInformation]] = { + val sensitiveInfoSeq = sensitiveInfoSeqRaw map { + case ((probName, genderDetectResults, probMale, probFemale, probOther), featureName, mapKey, actionTaken) => + SensitiveFeatureInformation.Name( + probName, genderDetectResults, probMale, probFemale, probOther, featureName, mapKey, actionTaken + ) + } + columnsMeta.flatMap(v => v.parentFeatureName.map(p => p -> sensitiveInfoSeq)).toMap + } + private def checkTuples(tup: OpVectorColumnTuple): Boolean = tup._1.nonEmpty && tup._2.nonEmpty property("column metadata stays the same when serialized to spark metadata") { - forAll(vecColTupleGen) { (vct: OpVectorColumnTuple) => + forAll(vecColTupleGen) { vct: OpVectorColumnTuple => if (checkTuples(vct)) { val columnMeta = OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5) columnMeta shouldEqual OpVectorColumnMetadata.fromMetadata(columnMeta.toMetadata()).head @@ -98,7 +129,7 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks } property("column metadata cannot be created with empty parents or feature types") { - forAll(vecColTupleGen) { (vct: OpVectorColumnTuple) => + forAll(vecColTupleGen) { vct: OpVectorColumnTuple => if (!checkTuples(vct)) { assertThrows[IllegalArgumentException] { OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5) } } @@ -106,22 +137,34 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks } property("vector metadata stays the same when serialized to spark metadata") { - forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) => - val cols = columns.filter(checkTuples) - val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) - val history = generateHistory(columnsMeta, hist) - val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history) - val field = vectorMeta.toStructField() - vectorMeta shouldEqual OpVectorMetadata(field) + forAll(vecGen) { + case (outputName: String, + columns: Array[OpVectorColumnTuple], + hist: FeatureHistoryTuple, + sens: Seq[SensitiveTuple] + ) if outputName.nonEmpty => + val cols = columns.filter(checkTuples) + val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) + val history = generateHistory(columnsMeta, hist) + val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens) + val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive) + val field = vectorMeta.toStructField() + vectorMeta shouldEqual OpVectorMetadata(field) + case _ => true shouldEqual true } } property("vector metadata properly finds indices of its columns") { - forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) => + forAll(vecGen) { + case (outputName: String, + columns: Array[OpVectorColumnTuple], + hist: FeatureHistoryTuple, + sens: Seq[SensitiveTuple]) => val cols = columns.filter(checkTuples) val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) val history = generateHistory(columnsMeta, hist) - val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history) + val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens) + val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive) for {(col, i) <- vectorMeta.columns.zipWithIndex} { vectorMeta.index(col) shouldEqual i } @@ -139,13 +182,14 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks } property("vector metadata flattens correctly") { - forAll(seqVecGen) { (vectors: Seq[OpVectorTuple]) => + forAll(seqVecGen) { vectors: Seq[OpVectorTuple] => val vecs = vectors.map { - case (outputName, columns, hist) => + case (outputName, columns, hist, sens) => val cols = columns.filter(checkTuples) val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) val history = generateHistory(columnsMeta, hist) - OpVectorMetadata(outputName, columnsMeta, history) + val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens) + OpVectorMetadata(outputName, columnsMeta, history, sensitive) } val flattened = OpVectorMetadata.flatten("out", vecs) flattened.size shouldEqual vecs.map(_.size).sum @@ -155,12 +199,16 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks } property("vector metadata should properly serialize to and from spark metadata") { - forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) => + forAll(vecGen) { + case (outputName: String, + columns: Array[OpVectorColumnTuple], + hist: FeatureHistoryTuple, + sens: Seq[SensitiveTuple]) => val cols = columns.filter(checkTuples) val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) val history = generateHistory(columnsMeta, hist) - - val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history) + val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens) + val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive) val vectorMetaFromSerialized = OpVectorMetadata(vectorMeta.name, vectorMeta.toMetadata) vectorMeta.name shouldEqual vectorMetaFromSerialized.name @@ -171,13 +219,18 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks property("vector metadata should generate feature history correctly") { - forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) => + forAll(vecGen) { case ( + outputName: String, + columns: Array[OpVectorColumnTuple], + hist: FeatureHistoryTuple, + sens: Seq[SensitiveTuple]) => val cols = columns.filter(checkTuples) val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) val history = generateHistory(columnsMeta, hist) + val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens) + val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive) - val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history) - if (history.isEmpty && columnsMeta.nonEmpty ) { + if (history.isEmpty && columnsMeta.nonEmpty) { assertThrows[RuntimeException](vectorMeta.getColumnHistory()) } else { val colHist = vectorMeta.getColumnHistory() diff --git a/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala new file mode 100644 index 0000000000..918701d645 --- /dev/null +++ b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op + +import com.salesforce.op.utils.json.JsonLike +import com.salesforce.op.utils.spark.RichMetadata._ +import org.apache.spark.sql.types.{Metadata, MetadataBuilder} +import enumeratum._ + +/** + * A base class for different SensitiveFeatureInformation (implemented as an enum) + * Currently, only Name types are supported but there are placeholders for other possibilities + * The following three params are required for every kind of SensitiveFeatureInformation + * + * @param name the name of the raw feature + * @param key optionally, the name of the key (if the raw feature is a Map type) + * @param actionTaken whether the handling of the raw feature changed b/c it was detected as sensitive + */ +sealed class SensitiveFeatureInformation +( + val name: String, + val key: Option[String] = None, + val actionTaken: Boolean = false +) extends EnumEntry with JsonLike { + + /** + * Convert to Spark metadata + * + * @return metadata representation + */ + def toMetadata: Metadata = { + this match { + case SensitiveFeatureInformation.Name( + probName, genderDetectResults, probMale, probFemale, probOther, name, key, actionTaken + ) => + new MetadataBuilder() + .putString(SensitiveFeatureInformation.NameKey, name) + .putString(SensitiveFeatureInformation.MapKeyKey, key.getOrElse("")) + .putBoolean(SensitiveFeatureInformation.ActionTakenKey, actionTaken) + .putString(SensitiveFeatureInformation.TypeKey, this.entryName) + .putDouble(SensitiveFeatureInformation.Name.ProbNameKey, probName) + .putStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey, genderDetectResults.toArray) + .putDouble(SensitiveFeatureInformation.Name.ProbMaleKey, probMale) + .putDouble(SensitiveFeatureInformation.Name.ProbFemaleKey, probFemale) + .putDouble(SensitiveFeatureInformation.Name.ProbOtherKey, probOther) + .build() + case _ => throw new RuntimeException( + "Metadata for sensitive features other than names have not been implemented.") + } + } +} +case object SensitiveFeatureInformation extends Enum[SensitiveFeatureInformation] { + val NameKey = "FeatureName" + val MapKeyKey = "MapKey" + val ActionTakenKey = "ActionTaken" + val TypeKey = "DetectedSensitiveFeatureKind" + val values: Seq[SensitiveFeatureInformation] = findValues + + // Utilized by SmartTextVectorizer's name detection + case class Name + ( + probName: Double, + genderDetectResults: Seq[String], + probMale: Double, + probFemale: Double, + probOther: Double, + override val name: String, + override val key: Option[String] = None, + override val actionTaken: Boolean = false + ) extends SensitiveFeatureInformation(name, key, actionTaken) { + override val entryName: String = SensitiveFeatureInformation.Name.EntryName + } + case object Name { + val EntryName = "Name" + val ProbNameKey = "ProbName" + val GenderDetectStratsKey = "GenderDetectStrats" + val ProbMaleKey = "ProbMale" + val ProbFemaleKey = "ProbFemale" + val ProbOtherKey = "ProbOther" + } + + // Not yet implemented + case object Salutation extends SensitiveFeatureInformation("None", None, false) + case object BirthDate extends SensitiveFeatureInformation("None", None, false) + case object PostalCode extends SensitiveFeatureInformation("None", None, false) + case object Other extends SensitiveFeatureInformation("None", None, false) + + /** + * Build metadata from Map of [[SensitiveFeatureInformation]] instances + * + * @param map Map from feature name to Seq of [[SensitiveFeatureInformation]] about that feature + * @return metadata representation + */ + def toMetadata(map: Map[String, Seq[SensitiveFeatureInformation]]): Metadata = { + val builder = new MetadataBuilder() + map.foreach { case (k, values) => builder.putMetadataArray(k, values map { _.toMetadata } toArray) } + builder.build() + } + + /** + * Build Map of [[SensitiveFeatureInformation]] instances from metadata + * + * @param meta metadata containing a mapping from feature name to [[SensitiveFeatureInformation]] + * @return map of that information + */ + def fromMetadataMap(meta: Metadata): Map[String, Seq[SensitiveFeatureInformation]] = { + val infoMap = meta.wrapped.underlyingMap + infoMap.map { case (k, values) => k -> values.asInstanceOf[Array[Metadata]].map(fromMetadata).toSeq } + } + + /** + * Build [[SensitiveFeatureInformation]] from metadata + * + * @param meta Metadata representing [[SensitiveFeatureInformation]] + * @return new instance of [[SensitiveFeatureInformation]] + */ + def fromMetadata(meta: Metadata): SensitiveFeatureInformation = { + meta.getString(SensitiveFeatureInformation.TypeKey) match { + case SensitiveFeatureInformation.Name.EntryName => + SensitiveFeatureInformation.Name( + meta.getDouble(SensitiveFeatureInformation.Name.ProbNameKey), + meta.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey), + meta.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey), + meta.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey), + meta.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey), + meta.getString(SensitiveFeatureInformation.NameKey), + { + val mapKey = meta.getString(SensitiveFeatureInformation.MapKeyKey) + if (mapKey.isEmpty) None else Some(mapKey) + }, + meta.getBoolean(SensitiveFeatureInformation.ActionTakenKey) + ) + case _ => throw new RuntimeException( + "Metadata for sensitive features other than names have not been implemented.") + } + } +} diff --git a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala new file mode 100644 index 0000000000..7c26d04f23 --- /dev/null +++ b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package com.salesforce.op + +import com.salesforce.op.test.TestCommon +import org.apache.spark.sql.types.MetadataBuilder +import org.junit.runner.RunWith +import org.scalatest.FlatSpec +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { + + val probName = 1.0 + val genderDetectResults: Seq[String] = Seq("ByIndex", "AnotherStrategy") + val probMale = 0.25 + val probFemale = 0.50 + val probOther = 0.25 + val name = "feature" + val mapKey: Option[String] = None + val actionTaken = true + + val sensitiveFeatureInfo: SensitiveFeatureInformation.Name = SensitiveFeatureInformation.Name( + probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken + ) + + Spec[SensitiveFeatureInformation] should "convert sensitive feature information to metadata" in { + val metadata = sensitiveFeatureInfo.toMetadata + + metadata.contains(SensitiveFeatureInformation.NameKey) shouldBe true + metadata.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true + metadata.contains(SensitiveFeatureInformation.ActionTakenKey) shouldBe true + metadata.contains(SensitiveFeatureInformation.TypeKey) shouldBe true + metadata.contains(SensitiveFeatureInformation.Name.ProbNameKey) shouldBe true + metadata.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true + metadata.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true + metadata.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true + metadata.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true + + metadata.getString(SensitiveFeatureInformation.NameKey) shouldBe name + metadata.getString(SensitiveFeatureInformation.MapKeyKey) shouldBe "" + metadata.getBoolean(SensitiveFeatureInformation.ActionTakenKey) shouldBe actionTaken + metadata.getString(SensitiveFeatureInformation.TypeKey) shouldBe SensitiveFeatureInformation.Name.EntryName + metadata.getDouble(SensitiveFeatureInformation.Name.ProbNameKey) shouldBe probName + metadata.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe genderDetectResults + metadata.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe probMale + metadata.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe probFemale + metadata.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe probOther + } + + it should "create metadata from a map" in { + val info1 = sensitiveFeatureInfo + val info2 = SensitiveFeatureInformation.Name(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true) + val map = Map("1" -> Seq(info1), "2" -> Seq(info2)) + val metadata = SensitiveFeatureInformation.toMetadata(map) + + metadata.contains("1") shouldBe true + metadata.contains("2") shouldBe true + + val f1 = metadata.getMetadata("1") + f1.contains(SensitiveFeatureInformation.NameKey) shouldBe true + f1.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true + f1.contains(SensitiveFeatureInformation.TypeKey) shouldBe true + f1.contains(SensitiveFeatureInformation.TypeKey) shouldBe true + f1.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true + f1.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true + f1.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true + f1.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true + f1.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe genderDetectResults + f1.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe probMale + f1.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe probFemale + f1.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe probOther + + val f2 = metadata.getMetadata("2") + f2.contains(SensitiveFeatureInformation.NameKey) shouldBe true + f2.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true + f2.contains(SensitiveFeatureInformation.TypeKey) shouldBe true + f2.contains(SensitiveFeatureInformation.TypeKey) shouldBe true + f2.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true + f2.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true + f2.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true + f2.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true + f2.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe Seq("") + f2.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe 0.0 + f2.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe 0.0 + f2.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe 0.0 + } + + it should "create a map from metadata" in { + val info1 = sensitiveFeatureInfo + val info2 = SensitiveFeatureInformation.Name(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true) + + val mapMetadata = new MetadataBuilder() + .putMetadata("1", info1.toMetadata) + .putMetadata("2", info2.toMetadata) + .build() + + val map = SensitiveFeatureInformation.fromMetadataMap(mapMetadata) + + map.contains("1") shouldBe true + map("1") shouldBe info1 + map.contains("2") shouldBe true + map("2") shouldBe info2 + } +} + From acb873e0df2845335bfda8bf06f7982d2e0d94fc Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 14 Jan 2020 13:18:19 -0800 Subject: [PATCH 07/13] Fixed SensitiveFeatureInformation tests failing due to not changing the to/fromMetadata tests --- .../op/SensitiveFeatureInformationTest.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala index 7c26d04f23..8dfbb23f63 100644 --- a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala +++ b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala @@ -85,7 +85,7 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { metadata.contains("1") shouldBe true metadata.contains("2") shouldBe true - val f1 = metadata.getMetadata("1") + val f1 = metadata.getMetadataArray("1").head f1.contains(SensitiveFeatureInformation.NameKey) shouldBe true f1.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true f1.contains(SensitiveFeatureInformation.TypeKey) shouldBe true @@ -99,7 +99,7 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { f1.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe probFemale f1.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe probOther - val f2 = metadata.getMetadata("2") + val f2 = metadata.getMetadataArray("2").head f2.contains(SensitiveFeatureInformation.NameKey) shouldBe true f2.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true f2.contains(SensitiveFeatureInformation.TypeKey) shouldBe true @@ -119,16 +119,16 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { val info2 = SensitiveFeatureInformation.Name(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true) val mapMetadata = new MetadataBuilder() - .putMetadata("1", info1.toMetadata) - .putMetadata("2", info2.toMetadata) + .putMetadataArray("1", Array(info1.toMetadata)) + .putMetadataArray("2", Array(info2.toMetadata)) .build() val map = SensitiveFeatureInformation.fromMetadataMap(mapMetadata) map.contains("1") shouldBe true - map("1") shouldBe info1 + map("1") shouldBe Array(info1) map.contains("2") shouldBe true - map("2") shouldBe info2 + map("2") shouldBe Array(info2) } } From 3ae735c4c4cc62c02ef0911684b1e363fb0a95ca Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Tue, 14 Jan 2020 14:06:32 -0800 Subject: [PATCH 08/13] Addressing comments --- .../impl/feature/SmartTextMapVectorizer.scala | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala index 014407d9e6..466e2a7e2e 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala @@ -110,7 +110,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]] inN.toSeq.zip(allTextFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip val allKeys = mapFeatureInfo.map(_.map(_.key)) - // Careful when zipping sequences like hashKeys (length and hashFeatures + // Careful when zipping sequences like hashKeys (lengtht and hashFeatures val hashKeys = args.hashFeatureInfo.map( _.filter(_.vectorizationMethod == TextVectorizationMethod.Hash).map(_.key) ) @@ -249,19 +249,21 @@ case class SmartTextMapVectorizerModelArgs shouldTrackNulls: Boolean, hashingParams: HashingFunctionParams ) extends JsonLike { + // Partition allFeatureInfo into separate SmartTextFeatureInfo sequences corresponding to each vectorization type val (categoricalFeatureInfo, hashFeatureInfo, ignoreFeatureInfo) = allFeatureInfo.map{ featureInfoSeq => val groups = featureInfoSeq.groupBy(_.vectorizationMethod) val catGroup = groups.getOrElse(TextVectorizationMethod.Pivot, Seq.empty) val hashGroup = groups.getOrElse(TextVectorizationMethod.Hash, Seq.empty) val ignoreGroup = groups.getOrElse(TextVectorizationMethod.Ignore, Seq.empty) - (catGroup, hashGroup, ignoreGroup) }.unzip3 + // Seq[Seq[String]] corresponding to the keys in each map that are treated with each vectorization type val categoricalKeys = categoricalFeatureInfo.map(_.map(_.key)) val hashKeys = hashFeatureInfo.map(_.map(_.key)) val ignoreKeys = ignoreFeatureInfo.map(_.map(_.key)) + // Combined keys for hashed and ignored features (everything that's not pivoted) val textKeys = hashKeys.zip(ignoreKeys).map{ case (hk, ik) => hk ++ ik } } @@ -277,6 +279,25 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op] with MapHashingFun with TextMapPivotVectorizerModelFun[OPMap[String]] { + /** + * Storage for results of row partitioning + * + * @param categoricalMaps Sequence of maps that have at least one key that should be treated as a categorical + * @param categoricalKeys Sequence containing keys for each map that correspond to categorical features + * @param hashMaps Sequence of maps that have at least one key that should be hashed + * @param hashKeys Sequence containing keys for each map that correspond to hashed features + * @param ignoreMaps Sequence of maps that have at least one key that should be ignored + * @param ignoreKeys Sequence containing keys for each map that correspond to ignored features + */ + case class PartitionResult( + categoricalMaps: Seq[OPMap[String]], + categoricalKeys: Seq[Seq[String]], + hashMaps: Seq[OPMap[String]], + hashKeys: Seq[Seq[String]], + ignoreMaps: Seq[OPMap[String]], + ignoreKeys: Seq[Seq[String]] + ) + private val categoricalPivotFn = pivotFn( topValues = args.categoricalFeatureInfo.filter(_.nonEmpty).map(_.map(info => info.key -> info.topValues)), shouldCleanKeys = args.shouldCleanKeys, @@ -284,8 +305,7 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op] shouldTrackNulls = args.shouldTrackNulls ) - private def partitionRow(row: Seq[OPMap[String]]): - (Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]]) = { + private def partitionRow(row: Seq[OPMap[String]]): PartitionResult = { val (rowCategorical, keysCategorical) = row.view.zip(args.categoricalKeys).collect { case (elements, keys) if keys.nonEmpty => val filtered = elements.value.filter { case (k, v) => keys.contains(k) } @@ -304,14 +324,14 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op] (TextMap(filtered), keys) }.unzip - (rowCategorical.toList, keysCategorical.toList, rowHashedText.toList, keysHashedText.toList, + PartitionResult(rowCategorical.toList, keysCategorical.toList, rowHashedText.toList, keysHashedText.toList, rowIgnoredText.toList, keysIgnoredText.toList) } def transformFn: Seq[T] => OPVector = row => { implicit val textListMonoid: Monoid[TextList] = TextList.monoid - val (rowCategorical, keysCategorical, rowHash, keysHash, rowIgnore, keysIgnore) = partitionRow(row) + val PartitionResult(rowCategorical, keysCategorical, rowHash, keysHash, rowIgnore, keysIgnore) = partitionRow(row) val keysText = keysHash + keysIgnore // Go algebird! val categoricalVector = categoricalPivotFn(rowCategorical) From bef2b22441b3e55a0dd5f95827fe89345041b844 Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Tue, 14 Jan 2020 16:14:51 -0800 Subject: [PATCH 09/13] Spelling --- .../op/stages/impl/feature/SmartTextMapVectorizer.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala index 466e2a7e2e..7f62dff48e 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala @@ -110,7 +110,8 @@ class SmartTextMapVectorizer[T <: OPMap[String]] inN.toSeq.zip(allTextFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip val allKeys = mapFeatureInfo.map(_.map(_.key)) - // Careful when zipping sequences like hashKeys (lengtht and hashFeatures + // Careful when zipping sequences like hashKeys (length = number of maps, always) and + // hashFeatures (length <= number of maps, depending on which ones contain keys to hash) val hashKeys = args.hashFeatureInfo.map( _.filter(_.vectorizationMethod == TextVectorizationMethod.Hash).map(_.key) ) From 069031c8f298b46a68c914e110f9296420ca233c Mon Sep 17 00:00:00 2001 From: Kevin Moore Date: Tue, 14 Jan 2020 16:53:23 -0800 Subject: [PATCH 10/13] Fixed failing test by making default behavior of SmartTextVectorizer and SmartTextMapVectorizer the same as previous behavior - don't ignore any features --- .../op/stages/impl/feature/SmartTextMapVectorizer.scala | 2 +- .../salesforce/op/stages/impl/feature/SmartTextVectorizer.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala index 7f62dff48e..03f4b47d99 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala @@ -153,7 +153,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]] textMapStats.keyValueCounts.toSeq.map { case (k, textStats) => val vecMethod: TextVectorizationMethod = textStats match { case _ if textStats.valueCounts.size <= maxCard => TextVectorizationMethod.Pivot - case _ if textStats.lengthStdDev <= minLenStdDev => TextVectorizationMethod.Ignore + case _ if textStats.lengthStdDev < minLenStdDev => TextVectorizationMethod.Ignore case _ => TextVectorizationMethod.Hash } val topVals = if (vecMethod == TextVectorizationMethod.Pivot) { diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala index a2eab274ff..d75e42bf36 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala @@ -91,7 +91,7 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( val (vectorizationMethods, topValues) = aggregatedStats.map { stats => val vecMethod: TextVectorizationMethod = stats match { case _ if stats.valueCounts.size <= maxCard => TextVectorizationMethod.Pivot - case _ if stats.lengthStdDev <= minLenStdDev => TextVectorizationMethod.Ignore + case _ if stats.lengthStdDev < minLenStdDev => TextVectorizationMethod.Ignore case _ => TextVectorizationMethod.Hash } val topValues = stats.valueCounts From a8504da0fefff9cbc09afc0203faafea08b7ee6a Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 23 Jan 2020 19:56:25 -0800 Subject: [PATCH 11/13] Removed enum from SensitiveFeatureInformation per PR comments --- .../com/salesforce/op/ModelInsightsTest.scala | 10 +- .../op/utils/spark/OPVectorMetadataTest.scala | 2 +- .../op/SensitiveFeatureInformation.scala | 122 ++++++++---------- .../op/SensitiveFeatureInformationTest.scala | 60 ++++----- 4 files changed, 89 insertions(+), 105 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index fc790dc81a..3ec6a43306 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -576,7 +576,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou }, Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap, Map( - "f0" -> Seq(SensitiveFeatureInformation.Name(0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false)) + "f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false)) ) ) @@ -627,7 +627,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou f0In.featureType shouldBe classOf[PickList].getName f0In.derivedFeatures.size shouldBe 2 f0In.sensitiveInformation match { - case Seq(SensitiveFeatureInformation.Name( + case Seq(SensitiveNameInformation( probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken )) => actionTaken shouldBe false @@ -727,10 +727,10 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou }, Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap, Map( - "f0" -> Seq(SensitiveFeatureInformation.Name( + "f0" -> Seq(SensitiveNameInformation( 0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false )), - "f_notInMeta" -> Seq(SensitiveFeatureInformation.Name( + "f_notInMeta" -> Seq(SensitiveNameInformation( 1.0, Seq.empty[String], 0.0, 0.0, 1.0, "f_notInMeta", None, true )) ) @@ -748,7 +748,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou f_notInMeta_butInInsights.featureType shouldBe classOf[Text].getName f_notInMeta_butInInsights.derivedFeatures.size shouldBe 0 f_notInMeta_butInInsights.sensitiveInformation match { - case Seq(SensitiveFeatureInformation.Name( + case Seq(SensitiveNameInformation( probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken )) => actionTaken shouldBe true diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala index 0eff4dc9e1..a3c5963f4d 100644 --- a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala +++ b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala @@ -110,7 +110,7 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks ): Map[String, Seq[SensitiveFeatureInformation]] = { val sensitiveInfoSeq = sensitiveInfoSeqRaw map { case ((probName, genderDetectResults, probMale, probFemale, probOther), featureName, mapKey, actionTaken) => - SensitiveFeatureInformation.Name( + SensitiveNameInformation( probName, genderDetectResults, probMale, probFemale, probOther, featureName, mapKey, actionTaken ) } diff --git a/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala index 918701d645..817914ae9e 100644 --- a/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala +++ b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala @@ -33,85 +33,30 @@ package com.salesforce.op import com.salesforce.op.utils.json.JsonLike import com.salesforce.op.utils.spark.RichMetadata._ import org.apache.spark.sql.types.{Metadata, MetadataBuilder} -import enumeratum._ /** - * A base class for different SensitiveFeatureInformation (implemented as an enum) - * Currently, only Name types are supported but there are placeholders for other possibilities + * A base class for different SensitiveFeatureInformation * The following three params are required for every kind of SensitiveFeatureInformation * * @param name the name of the raw feature * @param key optionally, the name of the key (if the raw feature is a Map type) * @param actionTaken whether the handling of the raw feature changed b/c it was detected as sensitive */ -sealed class SensitiveFeatureInformation +sealed abstract class SensitiveFeatureInformation ( val name: String, val key: Option[String] = None, val actionTaken: Boolean = false -) extends EnumEntry with JsonLike { - - /** - * Convert to Spark metadata - * - * @return metadata representation - */ - def toMetadata: Metadata = { - this match { - case SensitiveFeatureInformation.Name( - probName, genderDetectResults, probMale, probFemale, probOther, name, key, actionTaken - ) => - new MetadataBuilder() - .putString(SensitiveFeatureInformation.NameKey, name) - .putString(SensitiveFeatureInformation.MapKeyKey, key.getOrElse("")) - .putBoolean(SensitiveFeatureInformation.ActionTakenKey, actionTaken) - .putString(SensitiveFeatureInformation.TypeKey, this.entryName) - .putDouble(SensitiveFeatureInformation.Name.ProbNameKey, probName) - .putStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey, genderDetectResults.toArray) - .putDouble(SensitiveFeatureInformation.Name.ProbMaleKey, probMale) - .putDouble(SensitiveFeatureInformation.Name.ProbFemaleKey, probFemale) - .putDouble(SensitiveFeatureInformation.Name.ProbOtherKey, probOther) - .build() - case _ => throw new RuntimeException( - "Metadata for sensitive features other than names have not been implemented.") - } - } +) extends JsonLike { + val EntryName: String + def toMetadata: Metadata } -case object SensitiveFeatureInformation extends Enum[SensitiveFeatureInformation] { + +object SensitiveFeatureInformation { val NameKey = "FeatureName" val MapKeyKey = "MapKey" val ActionTakenKey = "ActionTaken" val TypeKey = "DetectedSensitiveFeatureKind" - val values: Seq[SensitiveFeatureInformation] = findValues - - // Utilized by SmartTextVectorizer's name detection - case class Name - ( - probName: Double, - genderDetectResults: Seq[String], - probMale: Double, - probFemale: Double, - probOther: Double, - override val name: String, - override val key: Option[String] = None, - override val actionTaken: Boolean = false - ) extends SensitiveFeatureInformation(name, key, actionTaken) { - override val entryName: String = SensitiveFeatureInformation.Name.EntryName - } - case object Name { - val EntryName = "Name" - val ProbNameKey = "ProbName" - val GenderDetectStratsKey = "GenderDetectStrats" - val ProbMaleKey = "ProbMale" - val ProbFemaleKey = "ProbFemale" - val ProbOtherKey = "ProbOther" - } - - // Not yet implemented - case object Salutation extends SensitiveFeatureInformation("None", None, false) - case object BirthDate extends SensitiveFeatureInformation("None", None, false) - case object PostalCode extends SensitiveFeatureInformation("None", None, false) - case object Other extends SensitiveFeatureInformation("None", None, false) /** * Build metadata from Map of [[SensitiveFeatureInformation]] instances @@ -144,13 +89,13 @@ case object SensitiveFeatureInformation extends Enum[SensitiveFeatureInformation */ def fromMetadata(meta: Metadata): SensitiveFeatureInformation = { meta.getString(SensitiveFeatureInformation.TypeKey) match { - case SensitiveFeatureInformation.Name.EntryName => - SensitiveFeatureInformation.Name( - meta.getDouble(SensitiveFeatureInformation.Name.ProbNameKey), - meta.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey), - meta.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey), - meta.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey), - meta.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey), + case SensitiveNameInformation.EntryName => + SensitiveNameInformation( + meta.getDouble(SensitiveNameInformation.ProbNameKey), + meta.getStringArray(SensitiveNameInformation.GenderDetectStratsKey), + meta.getDouble(SensitiveNameInformation.ProbMaleKey), + meta.getDouble(SensitiveNameInformation.ProbFemaleKey), + meta.getDouble(SensitiveNameInformation.ProbOtherKey), meta.getString(SensitiveFeatureInformation.NameKey), { val mapKey = meta.getString(SensitiveFeatureInformation.MapKeyKey) @@ -163,3 +108,42 @@ case object SensitiveFeatureInformation extends Enum[SensitiveFeatureInformation } } } + +case class SensitiveNameInformation +( + probName: Double, + genderDetectResults: Seq[String], + probMale: Double, + probFemale: Double, + probOther: Double, + override val name: String, + override val key: Option[String] = None, + override val actionTaken: Boolean = false +) extends SensitiveFeatureInformation(name, key, actionTaken) { + override val EntryName: String = SensitiveNameInformation.EntryName + override def toMetadata: Metadata = { + new MetadataBuilder() + .putString(SensitiveFeatureInformation.NameKey, name) + .putString(SensitiveFeatureInformation.MapKeyKey, key.getOrElse("")) + .putBoolean(SensitiveFeatureInformation.ActionTakenKey, actionTaken) + .putString(SensitiveFeatureInformation.TypeKey, this.EntryName) + .putDouble(SensitiveNameInformation.ProbNameKey, probName) + .putStringArray(SensitiveNameInformation.GenderDetectStratsKey, genderDetectResults.toArray) + .putDouble(SensitiveNameInformation.ProbMaleKey, probMale) + .putDouble(SensitiveNameInformation.ProbFemaleKey, probFemale) + .putDouble(SensitiveNameInformation.ProbOtherKey, probOther) + .build() + } +} + +case object SensitiveNameInformation { + val EntryName = "SensitiveNameInformation" + val ProbNameKey = "ProbName" + val GenderDetectStratsKey = "GenderDetectStrats" + val ProbMaleKey = "ProbMale" + val ProbFemaleKey = "ProbFemale" + val ProbOtherKey = "ProbOther" +} + +// TODO: Use this everywhere +case class GenderDetectionResults(strategyString: String, pctUnidentified: Double) extends JsonLike diff --git a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala index 8dfbb23f63..7ec239fce1 100644 --- a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala +++ b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala @@ -48,7 +48,7 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { val mapKey: Option[String] = None val actionTaken = true - val sensitiveFeatureInfo: SensitiveFeatureInformation.Name = SensitiveFeatureInformation.Name( + val sensitiveFeatureInfo: SensitiveNameInformation = SensitiveNameInformation( probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken ) @@ -59,26 +59,26 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { metadata.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true metadata.contains(SensitiveFeatureInformation.ActionTakenKey) shouldBe true metadata.contains(SensitiveFeatureInformation.TypeKey) shouldBe true - metadata.contains(SensitiveFeatureInformation.Name.ProbNameKey) shouldBe true - metadata.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true - metadata.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true - metadata.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true - metadata.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true + metadata.contains(SensitiveNameInformation.ProbNameKey) shouldBe true + metadata.contains(SensitiveNameInformation.GenderDetectStratsKey) shouldBe true + metadata.contains(SensitiveNameInformation.ProbMaleKey) shouldBe true + metadata.contains(SensitiveNameInformation.ProbFemaleKey) shouldBe true + metadata.contains(SensitiveNameInformation.ProbOtherKey) shouldBe true metadata.getString(SensitiveFeatureInformation.NameKey) shouldBe name metadata.getString(SensitiveFeatureInformation.MapKeyKey) shouldBe "" metadata.getBoolean(SensitiveFeatureInformation.ActionTakenKey) shouldBe actionTaken - metadata.getString(SensitiveFeatureInformation.TypeKey) shouldBe SensitiveFeatureInformation.Name.EntryName - metadata.getDouble(SensitiveFeatureInformation.Name.ProbNameKey) shouldBe probName - metadata.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe genderDetectResults - metadata.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe probMale - metadata.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe probFemale - metadata.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe probOther + metadata.getString(SensitiveFeatureInformation.TypeKey) shouldBe SensitiveNameInformation.EntryName + metadata.getDouble(SensitiveNameInformation.ProbNameKey) shouldBe probName + metadata.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe genderDetectResults + metadata.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe probMale + metadata.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe probFemale + metadata.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe probOther } it should "create metadata from a map" in { val info1 = sensitiveFeatureInfo - val info2 = SensitiveFeatureInformation.Name(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true) + val info2 = SensitiveNameInformation(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true) val map = Map("1" -> Seq(info1), "2" -> Seq(info2)) val metadata = SensitiveFeatureInformation.toMetadata(map) @@ -90,33 +90,33 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { f1.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true f1.contains(SensitiveFeatureInformation.TypeKey) shouldBe true f1.contains(SensitiveFeatureInformation.TypeKey) shouldBe true - f1.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true - f1.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true - f1.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true - f1.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true - f1.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe genderDetectResults - f1.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe probMale - f1.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe probFemale - f1.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe probOther + f1.contains(SensitiveNameInformation.GenderDetectStratsKey) shouldBe true + f1.contains(SensitiveNameInformation.ProbMaleKey) shouldBe true + f1.contains(SensitiveNameInformation.ProbFemaleKey) shouldBe true + f1.contains(SensitiveNameInformation.ProbOtherKey) shouldBe true + f1.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe genderDetectResults + f1.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe probMale + f1.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe probFemale + f1.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe probOther val f2 = metadata.getMetadataArray("2").head f2.contains(SensitiveFeatureInformation.NameKey) shouldBe true f2.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true f2.contains(SensitiveFeatureInformation.TypeKey) shouldBe true f2.contains(SensitiveFeatureInformation.TypeKey) shouldBe true - f2.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true - f2.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true - f2.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true - f2.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true - f2.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe Seq("") - f2.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe 0.0 - f2.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe 0.0 - f2.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe 0.0 + f2.contains(SensitiveNameInformation.GenderDetectStratsKey) shouldBe true + f2.contains(SensitiveNameInformation.ProbMaleKey) shouldBe true + f2.contains(SensitiveNameInformation.ProbFemaleKey) shouldBe true + f2.contains(SensitiveNameInformation.ProbOtherKey) shouldBe true + f2.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe Seq("") + f2.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe 0.0 + f2.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe 0.0 + f2.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe 0.0 } it should "create a map from metadata" in { val info1 = sensitiveFeatureInfo - val info2 = SensitiveFeatureInformation.Name(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true) + val info2 = SensitiveNameInformation(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true) val mapMetadata = new MetadataBuilder() .putMetadataArray("1", Array(info1.toMetadata)) From eac0e058a47ad7157738c820b2596a9148027702 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 23 Jan 2020 20:25:07 -0800 Subject: [PATCH 12/13] Using case class for GenderDetectionStrategy information --- .../com/salesforce/op/ModelInsightsTest.scala | 6 ++--- .../op/utils/spark/OPVectorMetadataTest.scala | 16 ++++++++---- .../op/SensitiveFeatureInformation.scala | 25 +++++++++++++++---- .../op/SensitiveFeatureInformationTest.scala | 23 ++++++++++++----- 4 files changed, 51 insertions(+), 19 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 3ec6a43306..2427076b67 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -576,7 +576,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou }, Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap, Map( - "f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false)) + "f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None, false)) ) ) @@ -728,10 +728,10 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap, Map( "f0" -> Seq(SensitiveNameInformation( - 0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false + 0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None, false )), "f_notInMeta" -> Seq(SensitiveNameInformation( - 1.0, Seq.empty[String], 0.0, 0.0, 1.0, "f_notInMeta", None, true + 1.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f_notInMeta", None, true )) ) ) diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala index a3c5963f4d..b1532ef5dc 100644 --- a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala +++ b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala @@ -30,7 +30,7 @@ package com.salesforce.op.utils.spark -import com.salesforce.op.{FeatureHistory, SensitiveFeatureInformation} +import com.salesforce.op.{FeatureHistory, GenderDetectionResults, SensitiveFeatureInformation, SensitiveNameInformation} import com.salesforce.op.features.types.{DateTime, Email, FeatureType, OPMap, PickList, Prediction, Real, RealMap, TextAreaMap} import com.salesforce.op.test.TestCommon import org.apache.spark.sql.types.Metadata @@ -49,7 +49,7 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks type FeatureHistoryTuple = (Seq[String], Seq[String]) type SensitiveTuple = (SensitiveNameTuple, String, Option[String], Boolean) - type SensitiveNameTuple = (Double, Seq[String], Double, Double, Double) + type SensitiveNameTuple = (Double, Seq[String], Seq[Double], Double, Double, Double) type OpVectorTuple = (String, Array[OpVectorColumnTuple], FeatureHistoryTuple, Seq[SensitiveTuple]) @@ -81,12 +81,13 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks mapKey <- Gen.option(genName) actionTaken <- Gen.oneOf[Boolean](Seq(false, true)) probName <- Gen.choose(0.0, 1.0) - genderDetectResults <- Gen.containerOf[Seq, String](genName) + genderDetectNames <- Gen.containerOf[Seq, String](genName) + genderDetectNums <- Gen.containerOf[Seq, Double](Gen.choose(0.0, 1.0)) probMale <- Gen.choose(0.0, 1.0) probFemale <- Gen.choose(0.0, 1.0 - probMale) probOther <- Gen.choose(0.0, 1.0 - probMale - probFemale) } yield { - ((probName, genderDetectResults, probMale, probFemale, probOther), featureName, mapKey, actionTaken) + ((probName, genderDetectNames, genderDetectNums, probMale, probFemale, probOther), featureName, mapKey, actionTaken) } val vecGen: Gen[OpVectorTuple] = for { @@ -109,7 +110,12 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks columnsMeta: Array[OpVectorColumnMetadata], sensitiveInfoSeqRaw: Seq[SensitiveTuple] ): Map[String, Seq[SensitiveFeatureInformation]] = { val sensitiveInfoSeq = sensitiveInfoSeqRaw map { - case ((probName, genderDetectResults, probMale, probFemale, probOther), featureName, mapKey, actionTaken) => + case ( + (probName, genderDetectNames, genderDetectNums, probMale, probFemale, probOther), + featureName, mapKey, actionTaken) => + val genderDetectResults = genderDetectNames.zip(genderDetectNums).map { + case (name, pct) => GenderDetectionResults(name, pct) + } SensitiveNameInformation( probName, genderDetectResults, probMale, probFemale, probOther, featureName, mapKey, actionTaken ) diff --git a/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala index 817914ae9e..2cbdb5d36b 100644 --- a/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala +++ b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala @@ -92,7 +92,9 @@ object SensitiveFeatureInformation { case SensitiveNameInformation.EntryName => SensitiveNameInformation( meta.getDouble(SensitiveNameInformation.ProbNameKey), - meta.getStringArray(SensitiveNameInformation.GenderDetectStratsKey), + meta.getMetadataArray( + SensitiveNameInformation.GenderDetectStratsKey + ).map(GenderDetectionResults.fromMetadata), meta.getDouble(SensitiveNameInformation.ProbMaleKey), meta.getDouble(SensitiveNameInformation.ProbFemaleKey), meta.getDouble(SensitiveNameInformation.ProbOtherKey), @@ -112,7 +114,7 @@ object SensitiveFeatureInformation { case class SensitiveNameInformation ( probName: Double, - genderDetectResults: Seq[String], + genderDetectResults: Seq[GenderDetectionResults], probMale: Double, probFemale: Double, probOther: Double, @@ -128,7 +130,7 @@ case class SensitiveNameInformation .putBoolean(SensitiveFeatureInformation.ActionTakenKey, actionTaken) .putString(SensitiveFeatureInformation.TypeKey, this.EntryName) .putDouble(SensitiveNameInformation.ProbNameKey, probName) - .putStringArray(SensitiveNameInformation.GenderDetectStratsKey, genderDetectResults.toArray) + .putMetadataArray(SensitiveNameInformation.GenderDetectStratsKey, genderDetectResults.toArray.map(_.toMetadata)) .putDouble(SensitiveNameInformation.ProbMaleKey, probMale) .putDouble(SensitiveNameInformation.ProbFemaleKey, probFemale) .putDouble(SensitiveNameInformation.ProbOtherKey, probOther) @@ -145,5 +147,18 @@ case object SensitiveNameInformation { val ProbOtherKey = "ProbOther" } -// TODO: Use this everywhere -case class GenderDetectionResults(strategyString: String, pctUnidentified: Double) extends JsonLike +case class GenderDetectionResults(strategyString: String, pctUnidentified: Double) extends JsonLike { + def toMetadata: Metadata = { + new MetadataBuilder() + .putString(GenderDetectionResults.StrategyStringKey, strategyString) + .putDouble(GenderDetectionResults.PctUnidentifiedKey, pctUnidentified) + .build() + } +} +case object GenderDetectionResults { + val StrategyStringKey = "strategyString" + val PctUnidentifiedKey = "pctUnidentified" + def fromMetadata(meta: Metadata): GenderDetectionResults = { + GenderDetectionResults(meta.getString(StrategyStringKey), meta.getDouble(PctUnidentifiedKey)) + } +} diff --git a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala index 7ec239fce1..74ff254e90 100644 --- a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala +++ b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala @@ -40,7 +40,10 @@ import org.scalatest.junit.JUnitRunner class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { val probName = 1.0 - val genderDetectResults: Seq[String] = Seq("ByIndex", "AnotherStrategy") + val genderDetectResults: Seq[GenderDetectionResults] = Seq( + GenderDetectionResults("ByIndex", 0.1), + GenderDetectionResults("AnotherStrategy", 0.99) + ) val probMale = 0.25 val probFemale = 0.50 val probOther = 0.25 @@ -70,7 +73,9 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { metadata.getBoolean(SensitiveFeatureInformation.ActionTakenKey) shouldBe actionTaken metadata.getString(SensitiveFeatureInformation.TypeKey) shouldBe SensitiveNameInformation.EntryName metadata.getDouble(SensitiveNameInformation.ProbNameKey) shouldBe probName - metadata.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe genderDetectResults + metadata.getMetadataArray( + SensitiveNameInformation.GenderDetectStratsKey + ).map(GenderDetectionResults.fromMetadata) shouldBe genderDetectResults metadata.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe probMale metadata.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe probFemale metadata.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe probOther @@ -78,7 +83,8 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { it should "create metadata from a map" in { val info1 = sensitiveFeatureInfo - val info2 = SensitiveNameInformation(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true) + val info2 = SensitiveNameInformation(0.0, + Seq(GenderDetectionResults("", 0)), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true) val map = Map("1" -> Seq(info1), "2" -> Seq(info2)) val metadata = SensitiveFeatureInformation.toMetadata(map) @@ -94,7 +100,9 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { f1.contains(SensitiveNameInformation.ProbMaleKey) shouldBe true f1.contains(SensitiveNameInformation.ProbFemaleKey) shouldBe true f1.contains(SensitiveNameInformation.ProbOtherKey) shouldBe true - f1.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe genderDetectResults + f1.getMetadataArray( + SensitiveNameInformation.GenderDetectStratsKey + ).map(GenderDetectionResults.fromMetadata) shouldBe genderDetectResults f1.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe probMale f1.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe probFemale f1.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe probOther @@ -108,7 +116,9 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { f2.contains(SensitiveNameInformation.ProbMaleKey) shouldBe true f2.contains(SensitiveNameInformation.ProbFemaleKey) shouldBe true f2.contains(SensitiveNameInformation.ProbOtherKey) shouldBe true - f2.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe Seq("") + f2.getMetadataArray( + SensitiveNameInformation.GenderDetectStratsKey + ).map(GenderDetectionResults.fromMetadata) shouldBe Seq(GenderDetectionResults("", 0)) f2.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe 0.0 f2.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe 0.0 f2.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe 0.0 @@ -116,7 +126,8 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon { it should "create a map from metadata" in { val info1 = sensitiveFeatureInfo - val info2 = SensitiveNameInformation(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true) + val info2 = SensitiveNameInformation(0.0, + Seq(GenderDetectionResults("", 0)), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true) val mapMetadata = new MetadataBuilder() .putMetadataArray("1", Array(info1.toMetadata)) From 26f30fb00a9b24b78e88891766b0f4094d4a973b Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 23 Jan 2020 20:34:06 -0800 Subject: [PATCH 13/13] Cleaning up tests per PR comments --- .../com/salesforce/op/ModelInsightsTest.scala | 51 ++++++++++--------- .../op/utils/spark/OPVectorMetadataTest.scala | 4 +- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index 2427076b67..50f5f3b392 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -576,7 +576,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou }, Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap, Map( - "f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None, false)) + "f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None)) ) ) @@ -709,32 +709,33 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou """include sensitive feature information |even for sensitive features that are removed from output vector and output vector metadata""".stripMargin in { // Copy metadata from above but add new feature that was removed in vectorizing to sensitive info - val f_notInMeta = Feature[Text]("f_notInMeta", false, null, Seq(), "test") - val newMeta = OpVectorMetadata( - "fv", + val f_notInMeta = Feature[Text]("f_notInMeta", isResponse = false, null, Seq(), "test") + val newFeatureName = "fv" + val newColumnMeta = OpVectorColumnMetadata( + parentFeatureName = Seq("f1"), + parentFeatureType = Seq(classOf[Real].getName), + grouping = None, + indicatorValue = None + ) +: Array("f2", "f3").map { name => OpVectorColumnMetadata( - parentFeatureName = Seq("f1"), - parentFeatureType = Seq(classOf[Real].getName), - grouping = None, - indicatorValue = None - ) +: Array("f2", "f3").map { name => - OpVectorColumnMetadata( - parentFeatureName = Seq("f0"), - parentFeatureType = Seq(classOf[PickList].getName), - grouping = Option("f0"), - indicatorValue = Option(name) - ) - }, - Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap, - Map( - "f0" -> Seq(SensitiveNameInformation( - 0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None, false - )), - "f_notInMeta" -> Seq(SensitiveNameInformation( - 1.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f_notInMeta", None, true - )) + parentFeatureName = Seq("f0"), + parentFeatureType = Seq(classOf[PickList].getName), + grouping = Option("f0"), + indicatorValue = Option(name) ) + } + val newFeatureHistory = Seq("f1", "f0").map( + name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq()) + ).toMap + val newSensitiveInfo = Map( + "f0" -> Seq(SensitiveNameInformation( + 0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None + )), + "f_notInMeta" -> Seq(SensitiveNameInformation( + 1.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f_notInMeta", None, actionTaken = true + )) ) + val newMeta = OpVectorMetadata(newFeatureName, newColumnMeta, newFeatureHistory, newSensitiveInfo) val labelSum = ModelInsights.getLabelSummary(Option(lbl), Option(summary)) @@ -749,7 +750,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou f_notInMeta_butInInsights.derivedFeatures.size shouldBe 0 f_notInMeta_butInInsights.sensitiveInformation match { case Seq(SensitiveNameInformation( - probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken + probName, genderDetectResults, probMale, probFemale, probOther, _, _, actionTaken )) => actionTaken shouldBe true probName shouldBe 1.0 diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala index b1532ef5dc..b2a51f28e4 100644 --- a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala +++ b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala @@ -76,7 +76,7 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks ) val arrVecColTupleGen: Gen[Array[OpVectorColumnTuple]] = Gen.containerOf[Array, OpVectorColumnTuple](vecColTupleGen) - val sensitiveGen: Gen[SensitiveTuple] = for { + val sensitiveNameGen: Gen[SensitiveTuple] = for { featureName <- genName mapKey <- Gen.option(genName) actionTaken <- Gen.oneOf[Boolean](Seq(false, true)) @@ -94,7 +94,7 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks name <- genName arr <- arrVecColTupleGen histories <- featHistTupleGen - sensitiveCols <- Gen.containerOf[Seq, SensitiveTuple](sensitiveGen) + sensitiveCols <- Gen.containerOf[Seq, SensitiveTuple](sensitiveNameGen) } yield { (name, arr, histories, sensitiveCols) }