diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala index 529217491d..08645260b4 100644 --- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala +++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala @@ -333,7 +333,7 @@ case class Discrete(domain: Seq[String], prob: Seq[Double]) extends LabelInfo * @param metrics sequence containing metrics computed in RawFeatureFilter * @param distributions distribution information for the raw feature (if calculated in RawFeatureFilter) * @param exclusionReasons exclusion reasons for the raw feature (if calculated in RawFeatureFilter) - * + * @param sensitiveInformation derived information about sensitive field checks (if performed) */ case class FeatureInsights ( @@ -342,7 +342,8 @@ case class FeatureInsights derivedFeatures: Seq[Insights], metrics: Seq[RawFeatureFilterMetrics] = Seq.empty, distributions: Seq[FeatureDistribution] = Seq.empty, - exclusionReasons: Seq[ExclusionReasons] = Seq.empty + exclusionReasons: Seq[ExclusionReasons] = Seq.empty, + sensitiveInformation: Seq[SensitiveFeatureInformation] = Seq.empty ) /** @@ -697,8 +698,41 @@ case object ModelInsights { val metrics = rawFeatureFilterResults.rawFeatureFilterMetrics.filter(_.name == fname) val distributions = rawFeatureFilterResults.rawFeatureDistributions.filter(_.name == fname) val exclusionReasons = rawFeatureFilterResults.exclusionReasons.filter(_.name == fname) - FeatureInsights(featureName = fname, featureType = ftype, derivedFeatures = seq.map(_._2), - metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons) + val sensitiveFeatureInformation = vectorInfo.flatMap(_.sensitive.get(fname)) match { + case Some(info) => info + case _ => Seq.empty + } + FeatureInsights( + featureName = fname, featureType = ftype, derivedFeatures = seq.map(_._2), + metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons, + 
sensitiveInformation = sensitiveFeatureInformation + ) + }.toSeq ++ { + /* + Add FeatureInsights for removed sensitive fields that do not have a column in OpVectorMetadata. + With current TMOG settings, this will not happen unless null tracking is turned off since + null indicators are created for all text features, even ignored ones. + */ + vectorInfo match { + case Some(v) => + // Find features where `actionTaken` is true for all of the sensitive feature informations + v.sensitive.collect { + case (fname, sensitiveFeatureInformation) + if sensitiveFeatureInformation.forall(_.actionTaken) => + val ftype = allFeatures.find(_.name == fname) + .map(_.typeName) + .getOrElse("") + val metrics = rawFeatureFilterResults.rawFeatureFilterMetrics.filter(_.name == fname) + val distributions = rawFeatureFilterResults.rawFeatureDistributions.filter(_.name == fname) + val exclusionReasons = rawFeatureFilterResults.exclusionReasons.filter(_.name == fname) + FeatureInsights( + featureName = fname, featureType = ftype, derivedFeatures = Seq.empty, + metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons, + sensitiveInformation = sensitiveFeatureInformation + ) + } + case None => Seq.empty[FeatureInsights] + } }.toSeq } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala index 8ce42cb8ac..f50f8f1726 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala @@ -380,7 +380,7 @@ trait VectorizerDefaults extends OpPipelineStageBase { val cols = if (withNullTracking) tf.flatMap { f => Seq(f.toColumnMetaData(), f.toColumnMetaData(isNull = true)) } else tf.map { f => f.toColumnMetaData() } - OpVectorMetadata(vectorOutputName, cols, Transmogrifier.inputFeaturesToHistory(tf, stageName)) + 
OpVectorMetadata.apply(vectorOutputName, cols, Transmogrifier.inputFeaturesToHistory(tf, stageName)) } /** @@ -697,6 +697,6 @@ trait MapStringPivotHelper extends SaveOthersParams { ): OpVectorMetadata = { val otherValueString = $(unseenName) val cols = makeVectorColumnMetadata(topValues, inputFeatures, otherValueString, trackNulls) - OpVectorMetadata(outputName, cols, Transmogrifier.inputFeaturesToHistory(inputFeatures, stageName)) + OpVectorMetadata.apply(outputName, cols, Transmogrifier.inputFeaturesToHistory(inputFeatures, stageName)) } } diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala index d1fa503188..50f5f3b392 100644 --- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala +++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala @@ -574,7 +574,10 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou indicatorValue = Option(name) ) }, - Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap + Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap, + Map( + "f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None)) + ) ) it should "correctly extract the LabelSummary from the label and sanity checker info" in { @@ -623,6 +626,18 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou f0In.featureName shouldBe "f0" f0In.featureType shouldBe classOf[PickList].getName f0In.derivedFeatures.size shouldBe 2 + f0In.sensitiveInformation match { + case Seq(SensitiveNameInformation( + probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken + )) => + actionTaken shouldBe false + probName shouldBe 0.0 + genderDetectResults shouldBe Seq.empty[String] + probMale shouldBe 0.0 + probFemale shouldBe 0.0 + probOther shouldBe 1.0 + case _ => 
fail("SensitiveFeatureInformation was not found.") + } val f0InDer2 = f0In.derivedFeatures.head f0InDer2.derivedFeatureName shouldBe "f0_f0_f2_1" @@ -690,6 +705,63 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou insights.features.foreach(f => f.distributions shouldBe empty) } + it should + """include sensitive feature information + |even for sensitive features that are removed from output vector and output vector metadata""".stripMargin in { + // Copy metadata from above but add new feature that was removed in vectorizing to sensitive info + val f_notInMeta = Feature[Text]("f_notInMeta", isResponse = false, null, Seq(), "test") + val newFeatureName = "fv" + val newColumnMeta = OpVectorColumnMetadata( + parentFeatureName = Seq("f1"), + parentFeatureType = Seq(classOf[Real].getName), + grouping = None, + indicatorValue = None + ) +: Array("f2", "f3").map { name => + OpVectorColumnMetadata( + parentFeatureName = Seq("f0"), + parentFeatureType = Seq(classOf[PickList].getName), + grouping = Option("f0"), + indicatorValue = Option(name) + ) + } + val newFeatureHistory = Seq("f1", "f0").map( + name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq()) + ).toMap + val newSensitiveInfo = Map( + "f0" -> Seq(SensitiveNameInformation( + 0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None + )), + "f_notInMeta" -> Seq(SensitiveNameInformation( + 1.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f_notInMeta", None, actionTaken = true + )) + ) + val newMeta = OpVectorMetadata(newFeatureName, newColumnMeta, newFeatureHistory, newSensitiveInfo) + + val labelSum = ModelInsights.getLabelSummary(Option(lbl), Option(summary)) + + val featureInsights = ModelInsights.getFeatureInsights( + Option(newMeta), Option(summary), None, Array(f1, f0, f_notInMeta), Array.empty, Map.empty[String, Set[String]], + RawFeatureFilterResults(), labelSum + ) + featureInsights.size shouldBe 3 + val f_notInMeta_butInInsights = 
featureInsights.find(_.featureName == "f_notInMeta").get + f_notInMeta_butInInsights.featureName shouldBe "f_notInMeta" + f_notInMeta_butInInsights.featureType shouldBe classOf[Text].getName + f_notInMeta_butInInsights.derivedFeatures.size shouldBe 0 + f_notInMeta_butInInsights.sensitiveInformation match { + case Seq(SensitiveNameInformation( + probName, genderDetectResults, probMale, probFemale, probOther, _, _, actionTaken + )) => + actionTaken shouldBe true + probName shouldBe 1.0 + genderDetectResults shouldBe Seq.empty[String] + probMale shouldBe 0.0 + probFemale shouldBe 0.0 + probOther shouldBe 1.0 + case _ => fail("SensitiveFeatureInformation was not found.") + } + } + it should "return model insights for xgboost classification" in { noException should be thrownBy xgbWorkflowModel.modelInsights(xgbClassifierPred) val insights = xgbWorkflowModel.modelInsights(xgbClassifierPred) @@ -794,8 +866,8 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou } cardinality.foreach { case (featureName, value) => - val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet - actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble) + val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet + actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble) } } diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala index 71eb7b7254..ca059d791d 100644 --- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala +++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala @@ -30,7 +30,7 @@ package com.salesforce.op.utils.spark -import com.salesforce.op.FeatureHistory +import com.salesforce.op.{FeatureHistory, SensitiveFeatureInformation} import com.salesforce.op.features.types.{FeatureType, _} import 
org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NumericAttribute} import org.apache.spark.ml.linalg.SQLDataTypes._ @@ -43,14 +43,17 @@ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField} * * @param name name of the feature vector * @param col information about each element in the vector - * @param history history of parent features used to create the vector map is from + * @param history history of parent features used to create the vector; map is from * OpVectorColumnMetadata.parentFeatureName (String) to FeatureHistory + * @param sensitive parent features that were detected as sensitive in the creation of the vector; + * map is from OpVectorColumnMetadata.parentFeatureName (String) to SensitiveFeatureInformation */ class OpVectorMetadata private ( val name: String, col: Array[OpVectorColumnMetadata], - val history: Map[String, FeatureHistory] // TODO fix map -> causes problems when multiple vectorizers used on feature + val history: Map[String, FeatureHistory], // TODO fix map -> causes problems when multiple vectorizers used on feature + val sensitive: Map[String, Seq[SensitiveFeatureInformation]] = Map.empty[String, Seq[SensitiveFeatureInformation]] ) { /** @@ -92,6 +95,7 @@ class OpVectorMetadata private val meta = new MetadataBuilder() .putMetadataArray(OpVectorMetadata.ColumnsKey, colMeta.toArray) .putMetadata(OpVectorMetadata.HistoryKey, FeatureHistory.toMetadata(history)) + .putMetadata(OpVectorMetadata.SensitiveKey, SensitiveFeatureInformation.toMetadata(sensitive)) .build() val attributes = columns.map { case c if (c.indicatorValue.isDefined || binaryTypes.exists(c.parentFeatureType.contains)) && @@ -161,7 +165,10 @@ class OpVectorMetadata private override def equals(obj: Any): Boolean = obj match { case o: OpVectorMetadata - if o.name == name && o.columns.toSeq == columns.toSeq && history == o.history => true + if o.name == name && + o.columns.toSeq == columns.toSeq && + history == o.history && + sensitive == 
o.sensitive => true case _ => false } @@ -169,7 +176,7 @@ class OpVectorMetadata private override def hashCode(): Int = 37 * columns.toSeq.hashCode() override def toString: String = - s"${this.getClass.getSimpleName}($name,${columns.mkString("Array(", ",", ")")},$history)" + s"${this.getClass.getSimpleName}($name,${columns.mkString("Array(", ",", ")")},$history,$sensitive)" } @@ -179,6 +186,7 @@ object OpVectorMetadata { val ColumnsKey = "vector_columns" val HistoryKey = "vector_history" + val SensitiveKey = "vector_detected_sensitive" /** * Construct an [[OpVectorMetadata]] from a [[StructField]], assuming that [[ColumnsKey]] is present and conforms @@ -197,9 +205,14 @@ object OpVectorMetadata { if (wrapped.underlyingMap(HistoryKey).asInstanceOf[Metadata].isEmpty) Map.empty[String, FeatureHistory] else FeatureHistory.fromMetadataMap(field.metadata.getMetadata(HistoryKey)) - new OpVectorMetadata(field.name, columns, history) - } + val sensitive = + if (wrapped.underlyingMap(SensitiveKey).asInstanceOf[Metadata].isEmpty) { + Map.empty[String, Seq[SensitiveFeatureInformation]] + } + else SensitiveFeatureInformation.fromMetadataMap(field.metadata.getMetadata(SensitiveKey)) + new OpVectorMetadata(field.name, columns, history, sensitive) + } /** * Construct an [[OpVectorMetadata]] from a string representing its name, and an array of [[OpVectorColumnMetadata]] @@ -214,9 +227,24 @@ object OpVectorMetadata { name: String, columns: Array[OpVectorColumnMetadata], history: Map[String, FeatureHistory] - ): OpVectorMetadata = { - new OpVectorMetadata(name, columns, history) - } + ): OpVectorMetadata = new OpVectorMetadata(name, columns, history) + + /** + * Construct an [[OpVectorMetadata]] from a string representing its name, and an array of [[OpVectorColumnMetadata]] + * representing its columns. 
+ * + * @param name The name of the column the metadata represents + * @param columns The columns within the vectors + * @param history The history of the parent features + * @param sensitive Which columns have been marked as sensitive and related information + * @return The constructed vector metadata + */ + def apply( + name: String, + columns: Array[OpVectorColumnMetadata], + history: Map[String, FeatureHistory], + sensitive: Map[String, Seq[SensitiveFeatureInformation]] + ): OpVectorMetadata = new OpVectorMetadata(name, columns, history, sensitive) /** * Construct an [[OpVectorMetadata]] from its name and a [[Metadata]], assuming that [[ColumnsKey]] and @@ -242,7 +270,8 @@ object OpVectorMetadata { def flatten(outputName: String, vectors: Seq[OpVectorMetadata]): OpVectorMetadata = { val allColumns = vectors.flatMap(_.columns).toArray val allHist = vectors.flatMap(_.history).toMap - new OpVectorMetadata(outputName, allColumns, allHist) + val allSensitive = vectors.flatMap(_.sensitive).toMap + new OpVectorMetadata(outputName, allColumns, allHist, allSensitive) } } diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala index 438119ee57..b2a51f28e4 100644 --- a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala +++ b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala @@ -30,7 +30,7 @@ package com.salesforce.op.utils.spark -import com.salesforce.op.FeatureHistory +import com.salesforce.op.{FeatureHistory, GenderDetectionResults, SensitiveFeatureInformation, SensitiveNameInformation} import com.salesforce.op.features.types.{DateTime, Email, FeatureType, OPMap, PickList, Prediction, Real, RealMap, TextAreaMap} import com.salesforce.op.test.TestCommon import org.apache.spark.sql.types.Metadata @@ -47,7 +47,11 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks 
type OpVectorColumnTuple = (Seq[String], Seq[String], Option[String], Option[String], Option[String], Int) type FeatureHistoryTuple = (Seq[String], Seq[String]) - type OpVectorTuple = (String, Array[OpVectorColumnTuple], FeatureHistoryTuple) + + type SensitiveTuple = (SensitiveNameTuple, String, Option[String], Boolean) + type SensitiveNameTuple = (Double, Seq[String], Seq[Double], Double, Double, Double) + + type OpVectorTuple = (String, Array[OpVectorColumnTuple], FeatureHistoryTuple, Seq[SensitiveTuple]) // AttributeGroup and Attribute require non-empty names val genName: Gen[String] = Gen.nonEmptyListOf(alphaNumChar).map(_.mkString) @@ -72,24 +76,57 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks ) val arrVecColTupleGen: Gen[Array[OpVectorColumnTuple]] = Gen.containerOf[Array, OpVectorColumnTuple](vecColTupleGen) + val sensitiveNameGen: Gen[SensitiveTuple] = for { + featureName <- genName + mapKey <- Gen.option(genName) + actionTaken <- Gen.oneOf[Boolean](Seq(false, true)) + probName <- Gen.choose(0.0, 1.0) + genderDetectNames <- Gen.containerOf[Seq, String](genName) + genderDetectNums <- Gen.containerOf[Seq, Double](Gen.choose(0.0, 1.0)) + probMale <- Gen.choose(0.0, 1.0) + probFemale <- Gen.choose(0.0, 1.0 - probMale) + probOther <- Gen.choose(0.0, 1.0 - probMale - probFemale) + } yield { + ((probName, genderDetectNames, genderDetectNums, probMale, probFemale, probOther), featureName, mapKey, actionTaken) + } + val vecGen: Gen[OpVectorTuple] = for { name <- genName arr <- arrVecColTupleGen histories <- featHistTupleGen + sensitiveCols <- Gen.containerOf[Seq, SensitiveTuple](sensitiveNameGen) } yield { - (name, arr, histories) + (name, arr, histories, sensitiveCols) } val seqVecGen: Gen[Seq[OpVectorTuple]] = Gen.containerOf[Seq, OpVectorTuple](vecGen) - private def generateHistory(columnsMeta: Array[OpVectorColumnMetadata], hist: (Seq[String], Seq[String])) = + private def generateHistory( + columnsMeta: 
Array[OpVectorColumnMetadata], hist: (Seq[String], Seq[String]) + ): Map[String, FeatureHistory] = columnsMeta.flatMap(v => v.parentFeatureName.map(p => p -> FeatureHistory(hist._1, hist._2))).toMap - private def checkTuples(tup: OpVectorColumnTuple) = tup._1.nonEmpty && tup._2.nonEmpty + private def generateSensitiveFeatureInfo( + columnsMeta: Array[OpVectorColumnMetadata], sensitiveInfoSeqRaw: Seq[SensitiveTuple] + ): Map[String, Seq[SensitiveFeatureInformation]] = { + val sensitiveInfoSeq = sensitiveInfoSeqRaw map { + case ( + (probName, genderDetectNames, genderDetectNums, probMale, probFemale, probOther), + featureName, mapKey, actionTaken) => + val genderDetectResults = genderDetectNames.zip(genderDetectNums).map { + case (name, pct) => GenderDetectionResults(name, pct) + } + SensitiveNameInformation( + probName, genderDetectResults, probMale, probFemale, probOther, featureName, mapKey, actionTaken + ) + } + columnsMeta.flatMap(v => v.parentFeatureName.map(p => p -> sensitiveInfoSeq)).toMap + } + private def checkTuples(tup: OpVectorColumnTuple): Boolean = tup._1.nonEmpty && tup._2.nonEmpty property("column metadata stays the same when serialized to spark metadata") { - forAll(vecColTupleGen) { (vct: OpVectorColumnTuple) => + forAll(vecColTupleGen) { vct: OpVectorColumnTuple => if (checkTuples(vct)) { val columnMeta = OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5) columnMeta shouldEqual OpVectorColumnMetadata.fromMetadata(columnMeta.toMetadata()).head @@ -98,7 +135,7 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks } property("column metadata cannot be created with empty parents or feature types") { - forAll(vecColTupleGen) { (vct: OpVectorColumnTuple) => + forAll(vecColTupleGen) { vct: OpVectorColumnTuple => if (!checkTuples(vct)) { assertThrows[IllegalArgumentException] { OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5) } } @@ -106,22 +143,34 @@ class OPVectorMetadataTest extends PropSpec with 
TestCommon with PropertyChecks } property("vector metadata stays the same when serialized to spark metadata") { - forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) => - val cols = columns.filter(checkTuples) - val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) - val history = generateHistory(columnsMeta, hist) - val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history) - val field = vectorMeta.toStructField() - vectorMeta shouldEqual OpVectorMetadata(field) + forAll(vecGen) { + case (outputName: String, + columns: Array[OpVectorColumnTuple], + hist: FeatureHistoryTuple, + sens: Seq[SensitiveTuple] + ) if outputName.nonEmpty => + val cols = columns.filter(checkTuples) + val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) + val history = generateHistory(columnsMeta, hist) + val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens) + val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive) + val field = vectorMeta.toStructField() + vectorMeta shouldEqual OpVectorMetadata(field) + case _ => true shouldEqual true } } property("vector metadata properly finds indices of its columns") { - forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) => + forAll(vecGen) { + case (outputName: String, + columns: Array[OpVectorColumnTuple], + hist: FeatureHistoryTuple, + sens: Seq[SensitiveTuple]) => val cols = columns.filter(checkTuples) val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) val history = generateHistory(columnsMeta, hist) - val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history) + val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens) + val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive) for {(col, i) <- vectorMeta.columns.zipWithIndex} { 
vectorMeta.index(col) shouldEqual i } @@ -139,13 +188,14 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks } property("vector metadata flattens correctly") { - forAll(seqVecGen) { (vectors: Seq[OpVectorTuple]) => + forAll(seqVecGen) { vectors: Seq[OpVectorTuple] => val vecs = vectors.map { - case (outputName, columns, hist) => + case (outputName, columns, hist, sens) => val cols = columns.filter(checkTuples) val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) val history = generateHistory(columnsMeta, hist) - OpVectorMetadata(outputName, columnsMeta, history) + val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens) + OpVectorMetadata(outputName, columnsMeta, history, sensitive) } val flattened = OpVectorMetadata.flatten("out", vecs) flattened.size shouldEqual vecs.map(_.size).sum @@ -155,12 +205,16 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks } property("vector metadata should properly serialize to and from spark metadata") { - forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) => + forAll(vecGen) { + case (outputName: String, + columns: Array[OpVectorColumnTuple], + hist: FeatureHistoryTuple, + sens: Seq[SensitiveTuple]) => val cols = columns.filter(checkTuples) val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) val history = generateHistory(columnsMeta, hist) - - val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history) + val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens) + val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive) val vectorMetaFromSerialized = OpVectorMetadata(vectorMeta.name, vectorMeta.toMetadata) vectorMeta.name shouldEqual vectorMetaFromSerialized.name @@ -171,13 +225,18 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks property("vector 
metadata should generate feature history correctly") { - forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) => + forAll(vecGen) { case ( + outputName: String, + columns: Array[OpVectorColumnTuple], + hist: FeatureHistoryTuple, + sens: Seq[SensitiveTuple]) => val cols = columns.filter(checkTuples) val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)) val history = generateHistory(columnsMeta, hist) + val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens) + val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive) - val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history) - if (history.isEmpty && columnsMeta.nonEmpty ) { + if (history.isEmpty && columnsMeta.nonEmpty) { assertThrows[RuntimeException](vectorMeta.getColumnHistory()) } else { val colHist = vectorMeta.getColumnHistory() diff --git a/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala new file mode 100644 index 0000000000..2cbdb5d36b --- /dev/null +++ b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2017, Salesforce.com, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+package com.salesforce.op
+
+import com.salesforce.op.utils.json.JsonLike
+import com.salesforce.op.utils.spark.RichMetadata._
+import org.apache.spark.sql.types.{Metadata, MetadataBuilder}
+
+/**
+ * A base class for different SensitiveFeatureInformation
+ * The following three params are required for every kind of SensitiveFeatureInformation
+ *
+ * @param name        the name of the raw feature
+ * @param key         optionally, the name of the key (if the raw feature is a Map type)
+ * @param actionTaken whether the handling of the raw feature changed b/c it was detected as sensitive
+ */
+sealed abstract class SensitiveFeatureInformation
+(
+  val name: String,
+  val key: Option[String] = None,
+  val actionTaken: Boolean = false
+) extends JsonLike {
+  /** Kind discriminator; subclasses write it under [[SensitiveFeatureInformation.TypeKey]] */
+  val EntryName: String
+
+  /** Metadata representation of this instance, readable by [[SensitiveFeatureInformation.fromMetadata]] */
+  def toMetadata: Metadata
+}
+
+object SensitiveFeatureInformation {
+  val NameKey = "FeatureName"
+  val MapKeyKey = "MapKey"
+  val ActionTakenKey = "ActionTaken"
+  val TypeKey = "DetectedSensitiveFeatureKind"
+
+  /**
+   * Build metadata from Map of [[SensitiveFeatureInformation]] instances
+   *
+   * @param map Map from feature name to Seq of [[SensitiveFeatureInformation]] about that feature
+   * @return metadata representation: one metadata array per feature name
+   */
+  def toMetadata(map: Map[String, Seq[SensitiveFeatureInformation]]): Metadata = {
+    val builder = new MetadataBuilder()
+    map.foreach { case (featureName, infos) =>
+      builder.putMetadataArray(featureName, infos.map(_.toMetadata).toArray)
+    }
+    builder.build()
+  }
+
+  /**
+   * Build Map of [[SensitiveFeatureInformation]] instances from metadata
+   *
+   * @param meta metadata containing a mapping from feature name to [[SensitiveFeatureInformation]]
+   * @return map of that information; a feature whose stored array is empty maps to an empty Seq
+   */
+  def fromMetadataMap(meta: Metadata): Map[String, Seq[SensitiveFeatureInformation]] = {
+    meta.wrapped.underlyingMap.map { case (featureName, rawValues) =>
+      featureName -> asMetadataArray(rawValues).map(fromMetadata).toSeq
+    }
+  }
+
+  /**
+   * Build [[SensitiveFeatureInformation]] from metadata
+   *
+   * @param meta Metadata representing [[SensitiveFeatureInformation]]
+   * @return new instance of [[SensitiveFeatureInformation]]
+   * @throws RuntimeException if the value under [[TypeKey]] is not a supported kind
+   */
+  def fromMetadata(meta: Metadata): SensitiveFeatureInformation = {
+    meta.getString(SensitiveFeatureInformation.TypeKey) match {
+      case SensitiveNameInformation.EntryName =>
+        // Named arguments: five adjacent Doubles are too easy to transpose positionally
+        SensitiveNameInformation(
+          probName = meta.getDouble(SensitiveNameInformation.ProbNameKey),
+          genderDetectResults = asMetadataArray(
+            meta.wrapped.underlyingMap(SensitiveNameInformation.GenderDetectStratsKey)
+          ).map(GenderDetectionResults.fromMetadata).toSeq,
+          probMale = meta.getDouble(SensitiveNameInformation.ProbMaleKey),
+          probFemale = meta.getDouble(SensitiveNameInformation.ProbFemaleKey),
+          probOther = meta.getDouble(SensitiveNameInformation.ProbOtherKey),
+          name = meta.getString(SensitiveFeatureInformation.NameKey),
+          // toMetadata writes an absent key as "", so an empty string decodes back to None
+          key = Option(meta.getString(SensitiveFeatureInformation.MapKeyKey)).filter(_.nonEmpty),
+          actionTaken = meta.getBoolean(SensitiveFeatureInformation.ActionTakenKey)
+        )
+      case _ => throw new RuntimeException(
+        "Metadata for sensitive features other than names have not been implemented.")
+    }
+  }
+
+  /**
+   * View a raw metadata value as an array of [[Metadata]].
+   *
+   * An empty array of any element type is accepted and read as "no entries": Spark cannot infer the
+   * element type of an empty JSON array and restores it as an empty long array, so metadata that has
+   * been serialized and parsed again would otherwise fail the cast below.
+   *
+   * @param value raw value taken from the underlying metadata map
+   * @return the value as metadata array (empty if the stored array was empty)
+   */
+  private def asMetadataArray(value: Any): Array[Metadata] = value match {
+    case arr: Array[_] if arr.length == 0 => Array.empty[Metadata]
+    case other => other.asInstanceOf[Array[Metadata]]
+  }
+}
+
+/**
+ * Information recorded for a raw feature that was checked for containing names
+ *
+ * NOTE(review): the meaning of the numeric fields is inferred from their names only - confirm against
+ * the stage producing them.
+ *
+ * @param probName            value stored under [[SensitiveNameInformation.ProbNameKey]]
+ *                            (presumably the estimated probability that the feature holds names)
+ * @param genderDetectResults one [[GenderDetectionResults]] per gender detection strategy reported
+ * @param probMale            value stored under [[SensitiveNameInformation.ProbMaleKey]]
+ * @param probFemale          value stored under [[SensitiveNameInformation.ProbFemaleKey]]
+ * @param probOther           value stored under [[SensitiveNameInformation.ProbOtherKey]]
+ * @param name                the name of the raw feature
+ * @param key                 optionally, the name of the key (if the raw feature is a Map type)
+ * @param actionTaken         whether the handling of the raw feature changed b/c it was detected as sensitive
+ */
+case class SensitiveNameInformation
+(
+  probName: Double,
+  genderDetectResults: Seq[GenderDetectionResults],
+  probMale: Double,
+  probFemale: Double,
+  probOther: Double,
+  override val name: String,
+  override val key: Option[String] = None,
+  override val actionTaken: Boolean = false
+) extends SensitiveFeatureInformation(name, key, actionTaken) {
+  override val EntryName: String = SensitiveNameInformation.EntryName
+
+  /**
+   * Metadata representation of this instance
+   *
+   * An absent `key` is written as an empty string, so `Some("")` reads back as `None`.
+   */
+  override def toMetadata: Metadata = {
+    new MetadataBuilder()
+      .putString(SensitiveFeatureInformation.NameKey, name)
+      .putString(SensitiveFeatureInformation.MapKeyKey, key.getOrElse(""))
+      .putBoolean(SensitiveFeatureInformation.ActionTakenKey, actionTaken)
+      .putString(SensitiveFeatureInformation.TypeKey, this.EntryName)
+      .putDouble(SensitiveNameInformation.ProbNameKey, probName)
+      .putMetadataArray(SensitiveNameInformation.GenderDetectStratsKey, genderDetectResults.map(_.toMetadata).toArray)
+      .putDouble(SensitiveNameInformation.ProbMaleKey, probMale)
+      .putDouble(SensitiveNameInformation.ProbFemaleKey, probFemale)
+      .putDouble(SensitiveNameInformation.ProbOtherKey, probOther)
+      .build()
+  }
+}
+
+case object SensitiveNameInformation {
+  val EntryName = "SensitiveNameInformation"
+  val ProbNameKey = "ProbName"
+  val GenderDetectStratsKey = "GenderDetectStrats"
+  val ProbMaleKey = "ProbMale"
+  val ProbFemaleKey = "ProbFemale"
+  val ProbOtherKey = "ProbOther"
+}
+
+/**
+ * Result reported for a single gender detection strategy
+ *
+ * @param strategyString  string identifying the strategy
+ * @param pctUnidentified value stored under [[GenderDetectionResults.PctUnidentifiedKey]]
+ *                        (presumably the share of values the strategy could not identify - TODO confirm)
+ */
+case class GenderDetectionResults(strategyString: String, pctUnidentified: Double) extends JsonLike {
+  /** Metadata representation, readable by [[GenderDetectionResults.fromMetadata]] */
+  def toMetadata: Metadata = {
+    new MetadataBuilder()
+      .putString(GenderDetectionResults.StrategyStringKey, strategyString)
+      .putDouble(GenderDetectionResults.PctUnidentifiedKey, pctUnidentified)
+      .build()
+  }
+}
+case object GenderDetectionResults {
+  val StrategyStringKey = "strategyString"
+  val PctUnidentifiedKey = "pctUnidentified"
+
+  /**
+   * Build [[GenderDetectionResults]] from metadata
+   *
+   * @param meta metadata written by [[GenderDetectionResults.toMetadata]]
+   * @return new instance of [[GenderDetectionResults]]
+   */
+  def fromMetadata(meta: Metadata): GenderDetectionResults = {
+    GenderDetectionResults(meta.getString(StrategyStringKey), meta.getDouble(PctUnidentifiedKey))
+  }
+}
diff --git a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
new file mode 100644
index 0000000000..74ff254e90
--- /dev/null
+++ b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+package com.salesforce.op
+
+import com.salesforce.op.test.TestCommon
+import org.apache.spark.sql.types.{Metadata, MetadataBuilder}
+import org.junit.runner.RunWith
+import org.scalatest.FlatSpec
+import org.scalatest.junit.JUnitRunner
+
+/**
+ * Checks the metadata encoding and decoding of [[SensitiveFeatureInformation]]
+ */
+@RunWith(classOf[JUnitRunner])
+class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
+
+  val probName = 1.0
+  val genderDetectResults: Seq[GenderDetectionResults] = Seq(
+    GenderDetectionResults("ByIndex", 0.1),
+    GenderDetectionResults("AnotherStrategy", 0.99)
+  )
+  val probMale = 0.25
+  val probFemale = 0.50
+  val probOther = 0.25
+  val name = "feature"
+  val mapKey: Option[String] = None
+  val actionTaken = true
+
+  // Fixture without a map key
+  val sensitiveFeatureInfo: SensitiveNameInformation = SensitiveNameInformation(
+    probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken
+  )
+
+  // Fixture with a map key, so both encodings of `key` are exercised
+  private val keyedFeatureInfo: SensitiveNameInformation = SensitiveNameInformation(
+    0.0, Seq(GenderDetectionResults("", 0)), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true
+  )
+
+  // Every key a SensitiveNameInformation is expected to write
+  private val expectedKeys: Seq[String] = Seq(
+    SensitiveFeatureInformation.NameKey,
+    SensitiveFeatureInformation.MapKeyKey,
+    SensitiveFeatureInformation.ActionTakenKey,
+    SensitiveFeatureInformation.TypeKey,
+    SensitiveNameInformation.ProbNameKey,
+    SensitiveNameInformation.GenderDetectStratsKey,
+    SensitiveNameInformation.ProbMaleKey,
+    SensitiveNameInformation.ProbFemaleKey,
+    SensitiveNameInformation.ProbOtherKey
+  )
+
+  /**
+   * Asserts that `metadata` holds every expected key and that each stored value matches `expected`.
+   * Shared by all encoding tests so that no key is checked twice while another is skipped.
+   */
+  private def assertEncodes(metadata: Metadata, expected: SensitiveNameInformation): Unit = {
+    expectedKeys.foreach { k =>
+      withClue(s"metadata key '$k': ") { metadata.contains(k) shouldBe true }
+    }
+    metadata.getString(SensitiveFeatureInformation.NameKey) shouldBe expected.name
+    // an absent map key is stored as an empty string
+    metadata.getString(SensitiveFeatureInformation.MapKeyKey) shouldBe expected.key.getOrElse("")
+    metadata.getBoolean(SensitiveFeatureInformation.ActionTakenKey) shouldBe expected.actionTaken
+    metadata.getString(SensitiveFeatureInformation.TypeKey) shouldBe SensitiveNameInformation.EntryName
+    metadata.getDouble(SensitiveNameInformation.ProbNameKey) shouldBe expected.probName
+    metadata.getMetadataArray(SensitiveNameInformation.GenderDetectStratsKey)
+      .map(GenderDetectionResults.fromMetadata).toSeq shouldBe expected.genderDetectResults
+    metadata.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe expected.probMale
+    metadata.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe expected.probFemale
+    metadata.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe expected.probOther
+  }
+
+  Spec[SensitiveFeatureInformation] should "convert sensitive feature information to metadata" in {
+    assertEncodes(sensitiveFeatureInfo.toMetadata, sensitiveFeatureInfo)
+  }
+
+  it should "create metadata from a map" in {
+    val map = Map("1" -> Seq(sensitiveFeatureInfo), "2" -> Seq(keyedFeatureInfo))
+    val metadata = SensitiveFeatureInformation.toMetadata(map)
+
+    metadata.contains("1") shouldBe true
+    metadata.contains("2") shouldBe true
+
+    val f1 = metadata.getMetadataArray("1")
+    f1 should have length 1
+    assertEncodes(f1.head, sensitiveFeatureInfo)
+
+    val f2 = metadata.getMetadataArray("2")
+    f2 should have length 1
+    assertEncodes(f2.head, keyedFeatureInfo)
+  }
+
+  it should "create a map from metadata" in {
+    val mapMetadata = new MetadataBuilder()
+      .putMetadataArray("1", Array(sensitiveFeatureInfo.toMetadata))
+      .putMetadataArray("2", Array(keyedFeatureInfo.toMetadata))
+      .build()
+
+    val map = SensitiveFeatureInformation.fromMetadataMap(mapMetadata)
+
+    map.keySet shouldBe Set("1", "2")
+    // compare against Seq: that is the declared value type of the returned map
+    map("1") shouldBe Seq(sensitiveFeatureInfo)
+    map("2") shouldBe Seq(keyedFeatureInfo)
+  }
+
+  it should "fail to read metadata of an unsupported kind" in {
+    val unknownKind = new MetadataBuilder()
+      .putString(SensitiveFeatureInformation.TypeKey, "NotASensitiveFeatureKind")
+      .build()
+
+    intercept[RuntimeException](SensitiveFeatureInformation.fromMetadata(unknownKind))
+  }
+}
+