Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metadata changes for sensitive feature information #457

Merged
merged 18 commits into from
Jan 29, 2020
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 38 additions & 4 deletions core/src/main/scala/com/salesforce/op/ModelInsights.scala
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ case class Discrete(domain: Seq[String], prob: Seq[Double]) extends LabelInfo
* @param metrics sequence containing metrics computed in RawFeatureFilter
* @param distributions distribution information for the raw feature (if calculated in RawFeatureFilter)
* @param exclusionReasons exclusion reasons for the raw feature (if calculated in RawFeatureFilter)
*
* @param sensitiveInformation derived information about sensitive field checks (if performed)
*/
case class FeatureInsights
(
Expand All @@ -342,7 +342,8 @@ case class FeatureInsights
derivedFeatures: Seq[Insights],
metrics: Seq[RawFeatureFilterMetrics] = Seq.empty,
distributions: Seq[FeatureDistribution] = Seq.empty,
exclusionReasons: Seq[ExclusionReasons] = Seq.empty
exclusionReasons: Seq[ExclusionReasons] = Seq.empty,
sensitiveInformation: Seq[SensitiveFeatureInformation] = Seq.empty
)

/**
Expand Down Expand Up @@ -697,8 +698,41 @@ case object ModelInsights {
val metrics = rawFeatureFilterResults.rawFeatureFilterMetrics.filter(_.name == fname)
val distributions = rawFeatureFilterResults.rawFeatureDistributions.filter(_.name == fname)
val exclusionReasons = rawFeatureFilterResults.exclusionReasons.filter(_.name == fname)
FeatureInsights(featureName = fname, featureType = ftype, derivedFeatures = seq.map(_._2),
metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons)
val sensitiveFeatureInformation = vectorInfo.flatMap(_.sensitive.get(fname)) match {
case Some(info) => info
case _ => Seq.empty
}
FeatureInsights(
featureName = fname, featureType = ftype, derivedFeatures = seq.map(_._2),
metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons,
sensitiveInformation = sensitiveFeatureInformation
)
}.toSeq ++ {
/*
Add FeatureInsights for removed sensitive fields that do not have a column in OpVectorMetadata.
With current TMOG settings, this will not happen unless null tracking is turned off since
null indicators are created for all text features, even ignored ones.
*/
vectorInfo match {
tovbinm marked this conversation as resolved.
Show resolved Hide resolved
case Some(v) =>
// Find features where `actionTaken` is true for every one of their sensitive feature information entries
v.sensitive.collect {
case (fname, sensitiveFeatureInformation)
if sensitiveFeatureInformation.forall(_.actionTaken) =>
val ftype = allFeatures.find(_.name == fname)
.map(_.typeName)
.getOrElse("")
val metrics = rawFeatureFilterResults.rawFeatureFilterMetrics.filter(_.name == fname)
val distributions = rawFeatureFilterResults.rawFeatureDistributions.filter(_.name == fname)
val exclusionReasons = rawFeatureFilterResults.exclusionReasons.filter(_.name == fname)
FeatureInsights(
featureName = fname, featureType = ftype, derivedFeatures = Seq.empty,
metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons,
sensitiveInformation = sensitiveFeatureInformation
)
}
case None => Seq.empty[FeatureInsights]
}
}.toSeq
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ trait VectorizerDefaults extends OpPipelineStageBase {
val cols =
if (withNullTracking) tf.flatMap { f => Seq(f.toColumnMetaData(), f.toColumnMetaData(isNull = true)) }
else tf.map { f => f.toColumnMetaData() }
OpVectorMetadata(vectorOutputName, cols, Transmogrifier.inputFeaturesToHistory(tf, stageName))
OpVectorMetadata.apply(vectorOutputName, cols, Transmogrifier.inputFeaturesToHistory(tf, stageName))
}

/**
Expand Down Expand Up @@ -697,6 +697,6 @@ trait MapStringPivotHelper extends SaveOthersParams {
): OpVectorMetadata = {
val otherValueString = $(unseenName)
val cols = makeVectorColumnMetadata(topValues, inputFeatures, otherValueString, trackNulls)
OpVectorMetadata(outputName, cols, Transmogrifier.inputFeaturesToHistory(inputFeatures, stageName))
OpVectorMetadata.apply(outputName, cols, Transmogrifier.inputFeaturesToHistory(inputFeatures, stageName))
}
}
77 changes: 74 additions & 3 deletions core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,10 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
indicatorValue = Option(name)
)
},
Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap
Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap,
Map(
"f0" -> Seq(SensitiveFeatureInformation.Name(0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false))
)
)

it should "correctly extract the LabelSummary from the label and sanity checker info" in {
Expand Down Expand Up @@ -623,6 +626,18 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
f0In.featureName shouldBe "f0"
f0In.featureType shouldBe classOf[PickList].getName
f0In.derivedFeatures.size shouldBe 2
f0In.sensitiveInformation match {
case Seq(SensitiveFeatureInformation.Name(
probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken
)) =>
actionTaken shouldBe false
probName shouldBe 0.0
genderDetectResults shouldBe Seq.empty[String]
probMale shouldBe 0.0
probFemale shouldBe 0.0
probOther shouldBe 1.0
case _ => fail("SensitiveFeatureInformation was not found.")
}

val f0InDer2 = f0In.derivedFeatures.head
f0InDer2.derivedFeatureName shouldBe "f0_f0_f2_1"
Expand Down Expand Up @@ -690,6 +705,62 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
insights.features.foreach(f => f.distributions shouldBe empty)
}

it should
"""include sensitive feature information
|even for sensitive features that are removed from output vector and output vector metadata""".stripMargin in {
// Copy metadata from above but add new feature that was removed in vectorizing to sensitive info
val f_notInMeta = Feature[Text]("f_notInMeta", false, null, Seq(), "test")
val newMeta = OpVectorMetadata(
MWYang marked this conversation as resolved.
Show resolved Hide resolved
"fv",
OpVectorColumnMetadata(
parentFeatureName = Seq("f1"),
parentFeatureType = Seq(classOf[Real].getName),
grouping = None,
indicatorValue = None
) +: Array("f2", "f3").map { name =>
OpVectorColumnMetadata(
parentFeatureName = Seq("f0"),
parentFeatureType = Seq(classOf[PickList].getName),
grouping = Option("f0"),
indicatorValue = Option(name)
)
},
Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap,
Map(
"f0" -> Seq(SensitiveFeatureInformation.Name(
0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false
)),
"f_notInMeta" -> Seq(SensitiveFeatureInformation.Name(
1.0, Seq.empty[String], 0.0, 0.0, 1.0, "f_notInMeta", None, true
))
)
)

val labelSum = ModelInsights.getLabelSummary(Option(lbl), Option(summary))

val featureInsights = ModelInsights.getFeatureInsights(
Option(newMeta), Option(summary), None, Array(f1, f0, f_notInMeta), Array.empty, Map.empty[String, Set[String]],
RawFeatureFilterResults(), labelSum
)
featureInsights.size shouldBe 3
val f_notInMeta_butInInsights = featureInsights.find(_.featureName == "f_notInMeta").get
f_notInMeta_butInInsights.featureName shouldBe "f_notInMeta"
f_notInMeta_butInInsights.featureType shouldBe classOf[Text].getName
f_notInMeta_butInInsights.derivedFeatures.size shouldBe 0
f_notInMeta_butInInsights.sensitiveInformation match {
case Seq(SensitiveFeatureInformation.Name(
probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken
)) =>
actionTaken shouldBe true
probName shouldBe 1.0
genderDetectResults shouldBe Seq.empty[String]
probMale shouldBe 0.0
probFemale shouldBe 0.0
probOther shouldBe 1.0
case _ => fail("SensitiveFeatureInformation was not found.")
}
}

it should "return model insights for xgboost classification" in {
noException should be thrownBy xgbWorkflowModel.modelInsights(xgbClassifierPred)
val insights = xgbWorkflowModel.modelInsights(xgbClassifierPred)
Expand Down Expand Up @@ -794,8 +865,8 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
}

cardinality.foreach { case (featureName, value) =>
val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet
actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble)
val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet
actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

package com.salesforce.op.utils.spark

import com.salesforce.op.FeatureHistory
import com.salesforce.op.{FeatureHistory, SensitiveFeatureInformation}
import com.salesforce.op.features.types.{FeatureType, _}
import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NumericAttribute}
import org.apache.spark.ml.linalg.SQLDataTypes._
Expand All @@ -43,14 +43,17 @@ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}
*
* @param name name of the feature vector
* @param col information about each element in the vector
* @param history history of parent features used to create the vector map is from
* @param history history of parent features used to create the vector; map is from
* OpVectorColumnMetadata.parentFeatureName (String) to FeatureHistory
* @param sensitive parent features that were detected as sensitive in the creation of the vector;
* map is from OpVectorColumnMetadata.parentFeatureName (String) to a Seq of SensitiveFeatureInformation
*/
class OpVectorMetadata private
(
val name: String,
col: Array[OpVectorColumnMetadata],
val history: Map[String, FeatureHistory] // TODO fix map -> causes problems when multiple vectorizers used on feature
val history: Map[String, FeatureHistory], // TODO fix map -> causes problems when multiple vectorizers used on feature
val sensitive: Map[String, Seq[SensitiveFeatureInformation]] = Map.empty[String, Seq[SensitiveFeatureInformation]]
) {

/**
Expand Down Expand Up @@ -92,6 +95,7 @@ class OpVectorMetadata private
val meta = new MetadataBuilder()
.putMetadataArray(OpVectorMetadata.ColumnsKey, colMeta.toArray)
.putMetadata(OpVectorMetadata.HistoryKey, FeatureHistory.toMetadata(history))
.putMetadata(OpVectorMetadata.SensitiveKey, SensitiveFeatureInformation.toMetadata(sensitive))
.build()
val attributes = columns.map {
case c if (c.indicatorValue.isDefined || binaryTypes.exists(c.parentFeatureType.contains)) &&
Expand Down Expand Up @@ -161,15 +165,18 @@ class OpVectorMetadata private
override def equals(obj: Any): Boolean =
obj match {
case o: OpVectorMetadata
if o.name == name && o.columns.toSeq == columns.toSeq && history == o.history => true
if o.name == name &&
o.columns.toSeq == columns.toSeq &&
history == o.history &&
sensitive == o.sensitive => true
case _ => false
}

// have to override to support overridden .equals
override def hashCode(): Int = 37 * columns.toSeq.hashCode()

override def toString: String =
s"${this.getClass.getSimpleName}($name,${columns.mkString("Array(", ",", ")")},$history)"
s"${this.getClass.getSimpleName}($name,${columns.mkString("Array(", ",", ")")},$history,$sensitive)"

}

Expand All @@ -179,6 +186,7 @@ object OpVectorMetadata {

val ColumnsKey = "vector_columns"
val HistoryKey = "vector_history"
val SensitiveKey = "vector_detected_sensitive"

/**
* Construct an [[OpVectorMetadata]] from a [[StructField]], assuming that [[ColumnsKey]] is present and conforms
Expand All @@ -197,9 +205,14 @@ object OpVectorMetadata {
if (wrapped.underlyingMap(HistoryKey).asInstanceOf[Metadata].isEmpty) Map.empty[String, FeatureHistory]
else FeatureHistory.fromMetadataMap(field.metadata.getMetadata(HistoryKey))

new OpVectorMetadata(field.name, columns, history)
}
val sensitive =
if (wrapped.underlyingMap(SensitiveKey).asInstanceOf[Metadata].isEmpty) {
Map.empty[String, Seq[SensitiveFeatureInformation]]
}
else SensitiveFeatureInformation.fromMetadataMap(field.metadata.getMetadata(SensitiveKey))

new OpVectorMetadata(field.name, columns, history, sensitive)
}

/**
* Construct an [[OpVectorMetadata]] from a string representing its name, and an array of [[OpVectorColumnMetadata]]
Expand All @@ -214,9 +227,24 @@ object OpVectorMetadata {
name: String,
columns: Array[OpVectorColumnMetadata],
history: Map[String, FeatureHistory]
): OpVectorMetadata = {
new OpVectorMetadata(name, columns, history)
}
): OpVectorMetadata = new OpVectorMetadata(name, columns, history)

/**
* Construct an [[OpVectorMetadata]] from a string representing its name, an array of [[OpVectorColumnMetadata]]
* representing its columns, the history of its parent features, and the sensitive feature information
* detected for its parent features.
*
* @param name The name of the column the metadata represents
* @param columns The columns within the vectors
* @param history The history of the parent features
* @param sensitive Sensitive feature information for the parent features, keyed by parent feature name
* @return The constructed vector metadata
*/
def apply(
name: String,
columns: Array[OpVectorColumnMetadata],
history: Map[String, FeatureHistory],
sensitive: Map[String, Seq[SensitiveFeatureInformation]]
): OpVectorMetadata = new OpVectorMetadata(name, columns, history, sensitive)

/**
* Construct an [[OpVectorMetadata]] from its name and a [[Metadata]], assuming that [[ColumnsKey]] and
Expand All @@ -242,7 +270,8 @@ object OpVectorMetadata {
def flatten(outputName: String, vectors: Seq[OpVectorMetadata]): OpVectorMetadata = {
val allColumns = vectors.flatMap(_.columns).toArray
val allHist = vectors.flatMap(_.history).toMap
new OpVectorMetadata(outputName, allColumns, allHist)
val allSensitive = vectors.flatMap(_.sensitive).toMap
new OpVectorMetadata(outputName, allColumns, allHist, allSensitive)
}

}
Loading