From 3ad8ce1d6a618a5804a470e4f96031961ae4b46a Mon Sep 17 00:00:00 2001
From: Kevin Moore <kevinmoore@salesforce.com>
Date: Fri, 10 Jan 2020 14:04:29 -0800
Subject: [PATCH 01/13] Got previous tests working

---
 .../OPCollectionHashingVectorizer.scala       |  16 ++-
 .../impl/feature/SmartTextMapVectorizer.scala | 131 ++++++++++++++----
 .../impl/feature/SmartTextVectorizer.scala    |   2 -
 .../impl/feature/AttributeAsserts.scala       |   2 +
 .../feature/SmartTextMapVectorizerTest.scala  |  70 +++++++++-
 .../salesforce/op/features/types/Lists.scala  |   9 ++
 6 files changed, 196 insertions(+), 34 deletions(-)

diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala
index c646db6d06..2e1fa78710 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala
@@ -314,32 +314,38 @@ private[op] trait MapHashingFun extends HashingFun {
 
   protected def makeVectorColumnMetadata
   (
+    hashFeatures: Array[TransientFeature],
+    ignoreFeatures: Array[TransientFeature],
     features: Array[TransientFeature],
     params: HashingFunctionParams,
-    allKeys: Seq[Seq[String]],
+    hashKeys: Seq[Seq[String]],
+    ignoreKeys: Seq[Seq[String]],
     shouldTrackNulls: Boolean,
     shouldTrackLen: Boolean
   ): Array[OpVectorColumnMetadata] = {
     val numHashes = params.numFeatures
-    val numFeatures = allKeys.map(_.length).sum
+    val numFeatures = hashKeys.map(_.length).sum
     val hashColumns =
       if (isSharedHashSpace(params, Some(numFeatures))) {
         (0 until numHashes).map { i =>
           OpVectorColumnMetadata(
-            parentFeatureName = features.map(_.name),
-            parentFeatureType = features.map(_.typeName),
+            parentFeatureName = hashFeatures.map(_.name),
+            parentFeatureType = hashFeatures.map(_.typeName),
             grouping = None,
             indicatorValue = None
           )
         }.toArray
       } else {
         for {
-          (keys, f) <- allKeys.toArray.zip(features)
+          (keys, f) <- hashKeys.toArray.zip(hashFeatures)
           key <- keys
           i <- 0 until numHashes
         } yield f.toColumnMetaData().copy(grouping = Option(key))
       }
 
+    // All columns get null tracking or text length tracking, whether their contents are hashed or ignored
+    val allKeys = hashKeys.zip(ignoreKeys).map{ case(h, i) => h ++ i }
+    // val allFeatures =
     val nullColumns = if (shouldTrackNulls) {
       for {
         (keys, f) <- allKeys.toArray.zip(features)
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
index f149f5abba..11e84e3528 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
@@ -39,7 +39,7 @@ import com.salesforce.op.utils.spark.RichDataset._
 import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
 import com.twitter.algebird.Monoid._
 import com.twitter.algebird.Operators._
-import com.twitter.algebird.Monoid
+import com.twitter.algebird.{Monoid, Semigroup}
 import com.twitter.algebird.macros.caseclass
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.{Dataset, Encoder}
@@ -63,7 +63,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
     with PivotParams with CleanTextFun with SaveOthersParams
     with TrackNullsParam with MinSupportParam with TextTokenizerParams with TrackTextLenParam
     with HashingVectorizerParams with MapHashingFun with OneHotFun with MapStringPivotHelper
-    with MapVectorizerFuns[String, OPMap[String]] with MaxCardinalityParams {
+    with MapVectorizerFuns[String, OPMap[String]] with MaxCardinalityParams with MinLengthStdDevParams {
 
   private implicit val textMapStatsSeqEnc: Encoder[Array[TextMapStats]] = ExpressionEncoder[Array[TextMapStats]]()
 
@@ -73,7 +73,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
   ): TextMapStats = {
     val keyValueCounts = textMap.map{ case (k, v) =>
       cleanTextFn(k, shouldCleanKeys) ->
-        TextStats(Map(cleanTextFn(v, shouldCleanValues) -> 1), Map(cleanTextFn(v, shouldCleanValues).length -> 1))
+        TextStats(Map(cleanTextFn(v, shouldCleanValues) -> 1L), Map(cleanTextFn(v, shouldCleanValues).length -> 1L))
     }
     TextMapStats(keyValueCounts)
   }
@@ -104,9 +104,24 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
       )
     } else Array.empty[OpVectorColumnMetadata]
 
-    val textColumns = if (args.textFeatureInfo.flatten.nonEmpty) {
+
+    /*
+    val hashedTextColumns = if (args.hashFeatureInfo.flatten.nonEmpty) {
+      val (mapFeatures, mapFeatureInfo) =
+        inN.toSeq.zip(args.hashFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip
+      val allKeys = mapFeatureInfo.map(_.map(_.key))
+      makeVectorColumnMetadata(
+        features = mapFeatures.toArray,
+        params = makeHashingParams(),
+        allKeys = allKeys,
+        shouldTrackNulls = args.shouldTrackNulls,
+        shouldTrackLen = $(trackTextLen)
+      )
+    } else Array.empty[OpVectorColumnMetadata]
+
+    val ignoredTextColumns = if (args.ignoreFeatureInfo.flatten.nonEmpty) {
       val (mapFeatures, mapFeatureInfo) =
-        inN.toSeq.zip(args.textFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip
+        inN.toSeq.zip(args.ignoreFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip
       val allKeys = mapFeatureInfo.map(_.map(_.key))
       makeVectorColumnMetadata(
         features = mapFeatures.toArray,
@@ -116,13 +131,46 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
         shouldTrackLen = $(trackTextLen)
       )
     } else Array.empty[OpVectorColumnMetadata]
+      */
+
+    val allTextFeatureInfo = args.hashFeatureInfo.zip(args.ignoreFeatureInfo).map{ case (h, i) => h ++ i }
+    val allTextColumns = if (allTextFeatureInfo.flatten.nonEmpty) {
+      val (mapFeatures, mapFeatureInfo) =
+        inN.toSeq.zip(allTextFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip
+      val allKeys = mapFeatureInfo.map(_.map(_.key))
+
+      val hashKeys = args.hashFeatureInfo.map(
+        _.filter(_.vectorizationMethod == TextVectorizationMethod.Hash).map(_.key)
+      )
+      val ignoreKeys = args.ignoreFeatureInfo.map(
+        _.filter(_.vectorizationMethod == TextVectorizationMethod.Ignore).map(_.key)
+      )
+      val hashFeatures = inN.toSeq.zip(args.hashFeatureInfo).filter{
+        case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty
+      }.unzip._1
+      val ignoreFeatures = inN.toSeq.zip(args.ignoreFeatureInfo).filter{
+        case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty
+      }.unzip._1
+
+      makeVectorColumnMetadata(
+        hashFeatures = hashFeatures.toArray,
+        ignoreFeatures = ignoreFeatures.toArray,
+        features = mapFeatures.toArray,
+        params = makeHashingParams(),
+        hashKeys = hashKeys,
+        ignoreKeys = ignoreKeys,
+        shouldTrackNulls = args.shouldTrackNulls,
+        shouldTrackLen = $(trackTextLen)
+      )
+    } else Array.empty[OpVectorColumnMetadata]
 
-    val columns = categoricalColumns ++ textColumns
+    val columns = categoricalColumns ++ allTextColumns
     OpVectorMetadata(getOutputFeatureName, columns, Transmogrifier.inputFeaturesToHistory(inN, stageName))
   }
 
   def makeSmartTextMapVectorizerModelArgs(aggregatedStats: Array[TextMapStats]): SmartTextMapVectorizerModelArgs = {
     val maxCard = $(maxCardinality)
+    val minLenStdDev = $(minLengthStdDev)
     val minSup = $(minSupport)
     val shouldCleanKeys = $(cleanKeys)
     val shouldCleanValues = $(cleanText)
@@ -130,14 +178,18 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
 
     val allFeatureInfo = aggregatedStats.toSeq.map { textMapStats =>
       textMapStats.keyValueCounts.toSeq.map { case (k, textStats) =>
-        val isCat = textStats.valueCounts.size <= maxCard
-        val topVals = if (isCat) {
+        val vecMethod: TextVectorizationMethod = textStats match {
+          case _ if textStats.valueCounts.size <= maxCard => TextVectorizationMethod.Pivot
+          case _ if textStats.lengthStdDev <= minLenStdDev => TextVectorizationMethod.Ignore
+          case _ => TextVectorizationMethod.Hash
+        }
+        val topVals = if (vecMethod == TextVectorizationMethod.Pivot) {
           textStats.valueCounts
             .filter { case (_, count) => count >= minSup }
             .toSeq.sortBy(v => -v._2 -> v._1)
             .take($(topK)).map(_._1).toArray
         } else Array.empty[String]
-        SmartTextFeatureInfo(key = k, isCategorical = isCat, topValues = topVals)
+        SmartTextFeatureInfo(key = k, vectorizationMethod = vecMethod, topValues = topVals)
       }
     }
 
@@ -197,11 +249,15 @@ private[op] object TextMapStats {
 /**
  * Info about each feature within a text map
  *
- * @param key           name of a feature
- * @param isCategorical indicate whether a feature is categorical or not
- * @param topValues     most common values of a feature (only for categoricals)
+ * @param key                   name of a feature
+ * @param vectorizationMethod   method to use for text vectorization (either pivot, hashing, or ignoring)
+ * @param topValues             most common values of a feature (only for categoricals)
  */
-case class SmartTextFeatureInfo(key: String, isCategorical: Boolean, topValues: Array[String]) extends JsonLike
+case class SmartTextFeatureInfo(
+  key: String,
+  vectorizationMethod: TextVectorizationMethod,
+  topValues: Array[String]
+) extends JsonLike
 
 
 /**
@@ -221,11 +277,20 @@ case class SmartTextMapVectorizerModelArgs
   shouldTrackNulls: Boolean,
   hashingParams: HashingFunctionParams
 ) extends JsonLike {
-  val (categoricalFeatureInfo, textFeatureInfo) = allFeatureInfo.map{ featureInfoSeq =>
-    featureInfoSeq.partition{_ .isCategorical }
-  }.unzip
-  val categoricalKeys = categoricalFeatureInfo.map(featureInfoSeq => featureInfoSeq.map(_.key))
-  val textKeys = textFeatureInfo.map(featureInfoSeq => featureInfoSeq.map(_.key))
+  val (categoricalFeatureInfo, hashFeatureInfo, ignoreFeatureInfo) = allFeatureInfo.map{ featureInfoSeq =>
+    val groups = featureInfoSeq.groupBy(_.vectorizationMethod)
+    val catGroup = groups.getOrElse(TextVectorizationMethod.Pivot, Seq.empty)
+    val hashGroup = groups.getOrElse(TextVectorizationMethod.Hash, Seq.empty)
+    val ignoreGroup = groups.getOrElse(TextVectorizationMethod.Ignore, Seq.empty)
+
+    (catGroup, hashGroup, ignoreGroup)
+  }.unzip3
+
+  val categoricalKeys = categoricalFeatureInfo.map(_.map(_.key))
+  val hashKeys = hashFeatureInfo.map(_.map(_.key))
+  val ignoreKeys = ignoreFeatureInfo.map(_.map(_.key))
+
+  val textKeys = hashKeys.zip(ignoreKeys).map{ case (hk, ik) => hk ++ ik }
 }
 
 
@@ -248,32 +313,46 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op]
   )
 
   private def partitionRow(row: Seq[OPMap[String]]):
-  (Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]]) = {
+  (Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]]) = {
     val (rowCategorical, keysCategorical) =
       row.view.zip(args.categoricalKeys).collect { case (elements, keys) if keys.nonEmpty =>
         val filtered = elements.value.filter { case (k, v) => keys.contains(k) }
         (TextMap(filtered), keys)
       }.unzip
 
-    val (rowText, keysText) =
-      row.view.zip(args.textKeys).collect { case (elements, keys) if keys.nonEmpty =>
+    val (rowHashedText, keysHashedText) =
+      row.view.zip(args.hashKeys).collect { case (elements, keys) if keys.nonEmpty =>
         val filtered = elements.value.filter { case (k, v) => keys.contains(k) }
         (TextMap(filtered), keys)
       }.unzip
 
-    (rowCategorical.toList, keysCategorical.toList, rowText.toList, keysText.toList)
+    val (rowIgnoredText, keysIgnoredText) =
+      row.view.zip(args.ignoreKeys).collect { case (elements, keys) if keys.nonEmpty =>
+        val filtered = elements.value.filter { case (k, v) => keys.contains(k) }
+        (TextMap(filtered), keys)
+      }.unzip
+
+    (rowCategorical.toList, keysCategorical.toList, rowHashedText.toList, keysHashedText.toList,
+      rowIgnoredText.toList, keysIgnoredText.toList)
   }
 
   def transformFn: Seq[T] => OPVector = row => {
-    val (rowCategorical, keysCategorical, rowText, keysText) = partitionRow(row)
+    implicit val textListMonoid: Monoid[TextList] = TextList.monoid
+
+    val (rowCategorical, keysCategorical, rowHash, keysHash, rowIgnore, keysIgnore) = partitionRow(row)
+    val keysText = keysHash + keysIgnore // Go algebird!
     val categoricalVector = categoricalPivotFn(rowCategorical)
-    val rowTextTokenized = rowText.map(_.value.map { case (k, v) => k -> tokenize(v.toText).tokens })
-    val textVector = hash(rowTextTokenized, keysText, args.hashingParams)
+
+    val rowHashTokenized = rowHash.map(_.value.map { case (k, v) => k -> tokenize(v.toText).tokens })
+    val rowIgnoreTokenized = rowIgnore.map(_.value.map { case (k, v) => k -> tokenize(v.toText).tokens })
+    val rowTextTokenized = rowHashTokenized + rowIgnoreTokenized // Go go algebird!
+    val hashVector = hash(rowHashTokenized, keysHash, args.hashingParams)
+
     val textNullIndicatorsVector =
       if (args.shouldTrackNulls) getNullIndicatorsVector(keysText, rowTextTokenized) else OPVector.empty
     val textLenVector = if ($(trackTextLen)) getLenVector(keysText, rowTextTokenized) else OPVector.empty
 
-    categoricalVector.combine(textVector, textLenVector, textNullIndicatorsVector)
+    categoricalVector.combine(hashVector, textLenVector, textNullIndicatorsVector)
   }
 
   private def getNullIndicatorsVector(keysSeq: Seq[Seq[String]], inputs: Seq[Map[String, TextList]]): OPVector = {
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
index e95939d249..a2eab274ff 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
@@ -225,8 +225,6 @@ private[op] object TextStats {
  * Arguments for [[SmartTextVectorizerModel]]
  *
  * @param vectorizationMethods method to use for text vectorization (either pivot, hashing, or ignoring)
- * @param isCategorical        is feature a categorical or not
- * @param isIgnorable          is a text feature that we think is ignorable? high cardinality + low length variance
  * @param topValues            top values to each feature
  * @param shouldCleanText      should clean text value
  * @param shouldTrackNulls     should track nulls
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala
index 0cb29643ab..8d812ae449 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala
@@ -49,6 +49,8 @@ trait AttributeAsserts {
     for {
       (x, i) <- output.zipWithIndex
       _ = withClue(s"Output vector $i and expectedNominal arrays are not of the same length:") {
+        println(s"x.value.size: ${x.value.size}")
+        println(s"expectedNominal.length: ${expectedNominal.length}")
         x.value.size shouldBe expectedNominal.length
       }
       (value, nominal) <- x.value.toArray.zip(expectedNominal)
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala
index 5584ea1565..a39d8a0eff 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala
@@ -32,13 +32,14 @@ package com.salesforce.op.stages.impl.feature
 
 import com.salesforce.op._
 import com.salesforce.op.stages.base.sequence.SequenceModel
-import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestSparkContext}
+import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder}
 import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
 import com.salesforce.op.utils.spark.RichDataset._
 import org.apache.spark.ml.linalg.Vectors
 import org.junit.runner.RunWith
 import org.scalatest.junit.JUnitRunner
 import com.salesforce.op.features.types._
+import com.salesforce.op.testkit.RandomText
 
 @RunWith(classOf[JUnitRunner])
 class SmartTextMapVectorizerTest
@@ -73,6 +74,37 @@ class SmartTextMapVectorizerTest
     )
   )
 
+  /*
+    Generate some more complicated input data to check things a little closer. There are four text fields with
+    different token distributions:
+
+    country: Uniformly distributed from a larger list of ~few hundred countries, should be hashed
+    categoricalText: Uniformly distributed from a small list of choices, should be pivoted (also has fixed lengths,
+      so serves as a test that the categorical check happens before the token length variance check)
+    textId: Uniformly distributed high cardinality Ids with fixed lengths, should be ignored
+    text: Uniformly distributed unicode strings with lengths ranging from 0-100, should be hashed
+   */
+  val countryData: Seq[Text] = RandomText.countries.withProbabilityOfEmpty(0.2).limit(1000)
+  val categoricalTextData: Seq[Text] = RandomText.textFromDomain(domain = List("A", "B", "C", "D", "E", "F"))
+    .withProbabilityOfEmpty(0.2).limit(1000)
+  // Generate List containing elements like 040231, 040232, ...
+  val textIdData: Seq[Text] = RandomText.textFromDomain(
+    domain = (1 to 1000).map(x => "%06d".format(40230 + x)).toList
+  ).withProbabilityOfEmpty(0.2).limit(1000)
+  val textData: Seq[Text] = RandomText.strings(minLen = 0, maxLen = 100).withProbabilityOfEmpty(0.2).limit(1000)
+  val generatedData: Seq[(Text, Text, Text, Text)] =
+    countryData.zip(categoricalTextData).zip(textIdData).zip(textData).map {
+      case (((co, ca), id), te) => (co, ca, id, te)
+    }
+
+  def mapifyText(textSeq: Seq[Text]): TextMap = {
+    textSeq.zipWithIndex.flatMap {
+      case (t, i) => t.value.map(tv => "f" + i.toString -> tv)
+    }.toMap.toTextMap
+  }
+  val mapData = generatedData.map { case (t1, t2, t3, t4) => mapifyText(Seq(t1, t2, t3, t4)) }
+  val (rawDF, rawTextMap) = TestFeatureBuilder("textMap", mapData)
+
   /**
    * Estimator instance to be tested
    */
@@ -398,4 +430,40 @@ class SmartTextMapVectorizerTest
     result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 }
   }
 
+  it should "detect and ignore fields that looks like machine-generated IDs by having a low token length variance" in {
+    val topKCategorial = 3
+    val hashSize = 5
+
+    val smartVectorized = new SmartTextMapVectorizer()
+      .setMaxCardinality(10).setNumFeatures(hashSize).setMinSupport(10).setTopK(topKCategorial)
+      // .setMinLengthStdDev(1.0)
+      .setAutoDetectLanguage(false).setMinTokenLength(1).setToLowercase(false)
+      .setTrackNulls(true).setTrackTextLen(true)
+      .setInput(rawTextMap).getOutput()
+
+    val transformed = new OpWorkflow().setResultFeatures(smartVectorized).transform(rawDF)
+    val result = transformed.collect(smartVectorized)
+
+    /*
+      Feature vector should have 16 components, corresponding to two hashed text fields, one categorical field, and
+      one ignored text field.
+
+      Hashed text: (5 hash buckets + 1 length + 1 null indicator) = 7 elements
+      Categorical: (3 topK + 1 other + 1 null indicator) = 5 elements
+      Ignored text: (1 length + 1 null indicator) = 2 elements
+     */
+    val featureVectorSize = 2 * (hashSize + 2) + (topKCategorial + 2) + 2
+    val firstRes = result.head
+    // firstRes.v.size shouldBe featureVectorSize
+
+    val meta = OpVectorMetadata(transformed.schema(smartVectorized.name))
+    meta.columns.foreach(println)
+    meta.columns.length shouldBe featureVectorSize
+    meta.columns.slice(0, 5).forall(_.grouping.contains("categorical"))
+    meta.columns.slice(5, 10).forall(_.grouping.contains("country"))
+    meta.columns.slice(10, 15).forall(_.grouping.contains("text"))
+    meta.columns.slice(15, 18).forall(_.descriptorValue.contains(OpVectorColumnMetadata.TextLenString))
+    meta.columns.slice(18, 21).forall(_.indicatorValue.contains(OpVectorColumnMetadata.NullString))
+  }
+
 }
diff --git a/features/src/main/scala/com/salesforce/op/features/types/Lists.scala b/features/src/main/scala/com/salesforce/op/features/types/Lists.scala
index eebf1f7a47..67147227c4 100644
--- a/features/src/main/scala/com/salesforce/op/features/types/Lists.scala
+++ b/features/src/main/scala/com/salesforce/op/features/types/Lists.scala
@@ -30,6 +30,8 @@
 
 package com.salesforce.op.features.types
 
+import com.twitter.algebird.{Monoid, SeqMonoid}
+
 /**
  * A list of text values
  *
@@ -41,6 +43,13 @@ class TextList(val value: Seq[String]) extends OPList[String] {
 object TextList {
   def apply(value: Seq[String]): TextList = new TextList(value)
   def empty: TextList = FeatureTypeDefaults.TextList
+
+  def monoid: Monoid[TextList] = new Monoid[TextList] {
+    override def zero = TextList.empty
+    override def plus(left: TextList, right: TextList): TextList = {
+      TextList(left.value ++ right.value)
+    }
+  }
 }
 
 /**

From b00775bbe354dcdd1aa41a47d58a27eb9b73f59c Mon Sep 17 00:00:00 2001
From: Kevin Moore <kevinmoore@salesforce.com>
Date: Fri, 10 Jan 2020 14:07:12 -0800
Subject: [PATCH 02/13] New test also working

---
 .../op/stages/impl/feature/SmartTextMapVectorizerTest.scala  | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala
index a39d8a0eff..3b65a085f4 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala
@@ -436,7 +436,7 @@ class SmartTextMapVectorizerTest
 
     val smartVectorized = new SmartTextMapVectorizer()
       .setMaxCardinality(10).setNumFeatures(hashSize).setMinSupport(10).setTopK(topKCategorial)
-      // .setMinLengthStdDev(1.0)
+      .setMinLengthStdDev(1.0)
       .setAutoDetectLanguage(false).setMinTokenLength(1).setToLowercase(false)
       .setTrackNulls(true).setTrackTextLen(true)
       .setInput(rawTextMap).getOutput()
@@ -454,10 +454,9 @@ class SmartTextMapVectorizerTest
      */
     val featureVectorSize = 2 * (hashSize + 2) + (topKCategorial + 2) + 2
     val firstRes = result.head
-    // firstRes.v.size shouldBe featureVectorSize
+    firstRes.v.size shouldBe featureVectorSize
 
     val meta = OpVectorMetadata(transformed.schema(smartVectorized.name))
-    meta.columns.foreach(println)
     meta.columns.length shouldBe featureVectorSize
     meta.columns.slice(0, 5).forall(_.grouping.contains("categorical"))
     meta.columns.slice(5, 10).forall(_.grouping.contains("country"))

From 03ae4d00874a136d9255f7e07a80a82c1e8b449a Mon Sep 17 00:00:00 2001
From: Kevin Moore <kevinmoore@salesforce.com>
Date: Fri, 10 Jan 2020 14:09:11 -0800
Subject: [PATCH 03/13] Remove debug output

---
 .../salesforce/op/stages/impl/feature/AttributeAsserts.scala    | 2 --
 1 file changed, 2 deletions(-)

diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala
index 8d812ae449..0cb29643ab 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/AttributeAsserts.scala
@@ -49,8 +49,6 @@ trait AttributeAsserts {
     for {
       (x, i) <- output.zipWithIndex
       _ = withClue(s"Output vector $i and expectedNominal arrays are not of the same length:") {
-        println(s"x.value.size: ${x.value.size}")
-        println(s"expectedNominal.length: ${expectedNominal.length}")
         x.value.size shouldBe expectedNominal.length
       }
       (value, nominal) <- x.value.toArray.zip(expectedNominal)

From 9759747524b71275e6cab8d2e0d0a9c275543410 Mon Sep 17 00:00:00 2001
From: Kevin Moore <kevinmoore@salesforce.com>
Date: Fri, 10 Jan 2020 14:20:10 -0800
Subject: [PATCH 04/13] Added test for TextList monoid

---
 .../op/features/types/FeatureTypeValueTest.scala       | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala
index 345b98e6b4..c2576b1a49 100644
--- a/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala
+++ b/features/src/test/scala/com/salesforce/op/features/types/FeatureTypeValueTest.scala
@@ -32,6 +32,8 @@ package com.salesforce.op.features.types
 
 import com.salesforce.op.test.TestCommon
 import com.salesforce.op.utils.reflection.ReflectionUtils
+import com.twitter.algebird.Monoid
+import com.twitter.algebird.Operators._
 import org.apache.lucene.geo.GeoUtils
 import org.apache.spark.ml.linalg.DenseVector
 import org.junit.runner.RunWith
@@ -125,8 +127,14 @@ class FeatureTypeValueTest extends PropSpec with PropertyChecks with TestCommon
   }
 
   property("OPList types should correctly wrap their corresponding types") {
+    implicit val textListMonoid: Monoid[TextList] = TextList.monoid
+
     forAll(geoGen) { x => checkVals(Geolocation(x), x) }
-    forAll(textListGen) { x => checkVals(TextList(x), x) }
+    forAll(textListGen) { x =>
+      checkVals(TextList(x), x)
+      val tl = TextList(x)
+      tl + tl shouldBe TextList(x ++ x) // Test the monoid too
+    }
     forAll(longListGen) { x =>
       checkVals(DateList(x), x)
       checkVals(DateTimeList(x), x)

From 10644d87b7effc8cd1c62387016bf20bfeca28d1 Mon Sep 17 00:00:00 2001
From: Kevin Moore <kevinmoore@salesforce.com>
Date: Mon, 13 Jan 2020 15:48:35 -0800
Subject: [PATCH 05/13] Added another test and fixed sneaky metadata issue

---
 .../OPCollectionHashingVectorizer.scala       | 15 +++---
 .../impl/feature/SmartTextMapVectorizer.scala | 39 +++-------------
 .../feature/SmartTextMapVectorizerTest.scala  | 46 ++++++++++++++++++-
 3 files changed, 59 insertions(+), 41 deletions(-)

diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala
index 2e1fa78710..c3c4588c1d 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/OPCollectionHashingVectorizer.scala
@@ -316,7 +316,6 @@ private[op] trait MapHashingFun extends HashingFun {
   (
     hashFeatures: Array[TransientFeature],
     ignoreFeatures: Array[TransientFeature],
-    features: Array[TransientFeature],
     params: HashingFunctionParams,
     hashKeys: Seq[Seq[String]],
     ignoreKeys: Seq[Seq[String]],
@@ -337,25 +336,27 @@ private[op] trait MapHashingFun extends HashingFun {
         }.toArray
       } else {
         for {
-          (keys, f) <- hashKeys.toArray.zip(hashFeatures)
+          // Need to filter out empty key sequences since the hashFeatures only contain a map feature if one of their
+          // keys is to be hashed, but hashKeys contains a sequence per map (whether it's empty or not)
+          (keys, f) <- hashKeys.filter(_.nonEmpty).zip(hashFeatures)
           key <- keys
           i <- 0 until numHashes
         } yield f.toColumnMetaData().copy(grouping = Option(key))
-      }
+      }.toArray
 
     // All columns get null tracking or text length tracking, whether their contents are hashed or ignored
-    val allKeys = hashKeys.zip(ignoreKeys).map{ case(h, i) => h ++ i }
-    // val allFeatures =
+    val allTextKeys = hashKeys.zip(ignoreKeys).map{ case(h, i) => h ++ i }
+    val allTextFeatures = hashFeatures ++ ignoreFeatures
     val nullColumns = if (shouldTrackNulls) {
       for {
-        (keys, f) <- allKeys.toArray.zip(features)
+        (keys, f) <- allTextKeys.toArray.zip(allTextFeatures)
         key <- keys
       } yield f.toColumnMetaData(isNull = true).copy(grouping = Option(key))
     } else Array.empty[OpVectorColumnMetadata]
 
     val lenColumns = if (shouldTrackLen) {
       for {
-        (keys, f) <- allKeys.toArray.zip(features)
+        (keys, f) <- allTextKeys.toArray.zip(allTextFeatures)
         key <- keys
       } yield f.toColumnMetaData(descriptorValue = OpVectorColumnMetadata.TextLenString).copy(grouping = Option(key))
     } else Array.empty[OpVectorColumnMetadata]
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
index 11e84e3528..014407d9e6 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
@@ -104,58 +104,30 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
       )
     } else Array.empty[OpVectorColumnMetadata]
 
-
-    /*
-    val hashedTextColumns = if (args.hashFeatureInfo.flatten.nonEmpty) {
-      val (mapFeatures, mapFeatureInfo) =
-        inN.toSeq.zip(args.hashFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip
-      val allKeys = mapFeatureInfo.map(_.map(_.key))
-      makeVectorColumnMetadata(
-        features = mapFeatures.toArray,
-        params = makeHashingParams(),
-        allKeys = allKeys,
-        shouldTrackNulls = args.shouldTrackNulls,
-        shouldTrackLen = $(trackTextLen)
-      )
-    } else Array.empty[OpVectorColumnMetadata]
-
-    val ignoredTextColumns = if (args.ignoreFeatureInfo.flatten.nonEmpty) {
-      val (mapFeatures, mapFeatureInfo) =
-        inN.toSeq.zip(args.ignoreFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip
-      val allKeys = mapFeatureInfo.map(_.map(_.key))
-      makeVectorColumnMetadata(
-        features = mapFeatures.toArray,
-        params = makeHashingParams(),
-        allKeys = allKeys,
-        shouldTrackNulls = args.shouldTrackNulls,
-        shouldTrackLen = $(trackTextLen)
-      )
-    } else Array.empty[OpVectorColumnMetadata]
-      */
-
     val allTextFeatureInfo = args.hashFeatureInfo.zip(args.ignoreFeatureInfo).map{ case (h, i) => h ++ i }
     val allTextColumns = if (allTextFeatureInfo.flatten.nonEmpty) {
       val (mapFeatures, mapFeatureInfo) =
         inN.toSeq.zip(allTextFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip
       val allKeys = mapFeatureInfo.map(_.map(_.key))
 
+      // Careful when zipping sequences like hashKeys (length and hashFeatures
       val hashKeys = args.hashFeatureInfo.map(
         _.filter(_.vectorizationMethod == TextVectorizationMethod.Hash).map(_.key)
       )
       val ignoreKeys = args.ignoreFeatureInfo.map(
         _.filter(_.vectorizationMethod == TextVectorizationMethod.Ignore).map(_.key)
       )
-      val hashFeatures = inN.toSeq.zip(args.hashFeatureInfo).filter{
+
+      val hashFeatures = inN.toSeq.zip(args.hashFeatureInfo).filter {
         case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty
-      }.unzip._1
+      }.map(_._1)
       val ignoreFeatures = inN.toSeq.zip(args.ignoreFeatureInfo).filter{
         case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty
-      }.unzip._1
+      }.map(_._1)
 
       makeVectorColumnMetadata(
         hashFeatures = hashFeatures.toArray,
         ignoreFeatures = ignoreFeatures.toArray,
-        features = mapFeatures.toArray,
         params = makeHashingParams(),
         hashKeys = hashKeys,
         ignoreKeys = ignoreKeys,
@@ -348,6 +320,7 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op]
     val rowTextTokenized = rowHashTokenized + rowIgnoreTokenized // Go go algebird!
     val hashVector = hash(rowHashTokenized, keysHash, args.hashingParams)
 
+    // All columns get null tracking or text length tracking, whether their contents are hashed or ignored
     val textNullIndicatorsVector =
       if (args.shouldTrackNulls) getNullIndicatorsVector(keysText, rowTextTokenized) else OPVector.empty
     val textLenVector = if ($(trackTextLen)) getLenVector(keysText, rowTextTokenized) else OPVector.empty
diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala
index 3b65a085f4..a5af0c824c 100644
--- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala
+++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala
@@ -105,6 +105,13 @@ class SmartTextMapVectorizerTest
   val mapData = generatedData.map { case (t1, t2, t3, t4) => mapifyText(Seq(t1, t2, t3, t4)) }
   val (rawDF, rawTextMap) = TestFeatureBuilder("textMap", mapData)
 
+  // Do the same thing with the data spread across many maps to test that they get combined correctly as well
+  val mapDataSeparate = generatedData.map {
+    case (t1, t2, t3, t4) => (mapifyText(Seq(t1, t2)), mapifyText(Seq(t3)), mapifyText(Seq(t4)))
+  }
+  val (rawDFSeparateMaps, rawTextMap1, rawTextMap2, rawTextMap3) =
+    TestFeatureBuilder("textMap1", "textMap2", "textMap3", mapDataSeparate)
+
   /**
    * Estimator instance to be tested
    */
@@ -430,7 +437,8 @@ class SmartTextMapVectorizerTest
     result.foreach { case (vec1, vec2) => vec1 shouldBe vec2 }
   }
 
-  it should "detect and ignore fields that looks like machine-generated IDs by having a low token length variance" in {
+  it should "detect and ignore fields that looks like machine-generated IDs by having a low token length variance " +
+    "when data is in a single TextMap" in {
     val topKCategorial = 3
     val hashSize = 5
 
@@ -465,4 +473,40 @@ class SmartTextMapVectorizerTest
     meta.columns.slice(18, 21).forall(_.indicatorValue.contains(OpVectorColumnMetadata.NullString))
   }
 
+  it should "detect and ignore fields that looks like machine-generated IDs by having a low token length variance " +
+    "when data is in many TextMaps" in {
+    val topKCategorial = 3
+    val hashSize = 5
+
+    val smartVectorized = new SmartTextMapVectorizer()
+      .setMaxCardinality(10).setNumFeatures(hashSize).setMinSupport(10).setTopK(topKCategorial)
+      .setMinLengthStdDev(1.0)
+      .setAutoDetectLanguage(false).setMinTokenLength(1).setToLowercase(false)
+      .setTrackNulls(true).setTrackTextLen(true)
+      .setInput(rawTextMap1, rawTextMap2, rawTextMap3).getOutput()
+
+    val transformed = new OpWorkflow().setResultFeatures(smartVectorized).transform(rawDFSeparateMaps)
+    val result = transformed.collect(smartVectorized)
+
+    /*
+      Feature vector should have 16 components, corresponding to two hashed text fields, one categorical field, and
+      one ignored text field.
+
+      Hashed text: (5 hash buckets + 1 length + 1 null indicator) = 7 elements
+      Categorical: (3 topK + 1 other + 1 null indicator) = 5 elements
+      Ignored text: (1 length + 1 null indicator) = 2 elements
+     */
+    val featureVectorSize = 2 * (hashSize + 2) + (topKCategorial + 2) + 2
+    val firstRes = result.head
+    firstRes.v.size shouldBe featureVectorSize
+
+    val meta = OpVectorMetadata(transformed.schema(smartVectorized.name))
+    meta.columns.length shouldBe featureVectorSize
+    meta.columns.slice(0, 5).forall(_.grouping.contains("categorical"))
+    meta.columns.slice(5, 10).forall(_.grouping.contains("country"))
+    meta.columns.slice(10, 15).forall(_.grouping.contains("text"))
+    meta.columns.slice(15, 18).forall(_.descriptorValue.contains(OpVectorColumnMetadata.TextLenString))
+    meta.columns.slice(18, 21).forall(_.indicatorValue.contains(OpVectorColumnMetadata.NullString))
+  }
+
 }

From a107db1572bea004fcdc4fbfd6fd7ba6a40f28d5 Mon Sep 17 00:00:00 2001
From: Michael Yang <michael.yang@salesforce.com>
Date: Tue, 14 Jan 2020 11:25:17 -0800
Subject: [PATCH 06/13] Pulled out SensitiveFeatureInformation metadata changes
 into its own branch

---
 .../com/salesforce/op/ModelInsights.scala     |  42 ++++-
 .../stages/impl/feature/Transmogrifier.scala  |   4 +-
 .../com/salesforce/op/ModelInsightsTest.scala |  77 +++++++-
 .../op/utils/spark/OpVectorMetadata.scala     |  51 ++++--
 .../op/utils/spark/OPVectorMetadataTest.scala | 103 ++++++++---
 .../op/SensitiveFeatureInformation.scala      | 165 ++++++++++++++++++
 .../op/SensitiveFeatureInformationTest.scala  | 134 ++++++++++++++
 7 files changed, 531 insertions(+), 45 deletions(-)
 create mode 100644 utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala
 create mode 100644 utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala

diff --git a/core/src/main/scala/com/salesforce/op/ModelInsights.scala b/core/src/main/scala/com/salesforce/op/ModelInsights.scala
index 529217491d..08645260b4 100644
--- a/core/src/main/scala/com/salesforce/op/ModelInsights.scala
+++ b/core/src/main/scala/com/salesforce/op/ModelInsights.scala
@@ -333,7 +333,7 @@ case class Discrete(domain: Seq[String], prob: Seq[Double]) extends LabelInfo
  * @param metrics          sequence containing metrics computed in RawFeatureFilter
  * @param distributions    distribution information for the raw feature (if calculated in RawFeatureFilter)
  * @param exclusionReasons exclusion reasons for the raw feature (if calculated in RawFeatureFilter)
- *
+ * @param sensitiveInformation derived information about sensitive field checks (if performed)
  */
 case class FeatureInsights
 (
@@ -342,7 +342,8 @@ case class FeatureInsights
   derivedFeatures: Seq[Insights],
   metrics: Seq[RawFeatureFilterMetrics] = Seq.empty,
   distributions: Seq[FeatureDistribution] = Seq.empty,
-  exclusionReasons: Seq[ExclusionReasons] = Seq.empty
+  exclusionReasons: Seq[ExclusionReasons] = Seq.empty,
+  sensitiveInformation: Seq[SensitiveFeatureInformation] = Seq.empty
 )
 
 /**
@@ -697,8 +698,41 @@ case object ModelInsights {
           val metrics = rawFeatureFilterResults.rawFeatureFilterMetrics.filter(_.name == fname)
           val distributions = rawFeatureFilterResults.rawFeatureDistributions.filter(_.name == fname)
           val exclusionReasons = rawFeatureFilterResults.exclusionReasons.filter(_.name == fname)
-          FeatureInsights(featureName = fname, featureType = ftype, derivedFeatures = seq.map(_._2),
-            metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons)
+          val sensitiveFeatureInformation = vectorInfo.flatMap(_.sensitive.get(fname)) match {
+            case Some(info) => info
+            case _ => Seq.empty
+          }
+          FeatureInsights(
+            featureName = fname, featureType = ftype, derivedFeatures = seq.map(_._2),
+            metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons,
+            sensitiveInformation = sensitiveFeatureInformation
+          )
+      }.toSeq ++ {
+        /*
+          Add FeatureInsights for removed sensitive fields that do not have a column in OpVectorMetadata.
+          With current TMOG settings, this will not happen unless null tracking is turned off since
+          null indicators are created for all text features, even ignored ones.
+        */
+        vectorInfo match {
+          case Some(v) =>
+            // Find features where `actionTaken` is true for all of the sensitive feature informations
+            v.sensitive.collect {
+              case (fname, sensitiveFeatureInformation)
+                if sensitiveFeatureInformation.forall(_.actionTaken) =>
+              val ftype = allFeatures.find(_.name == fname)
+                .map(_.typeName)
+                .getOrElse("")
+              val metrics = rawFeatureFilterResults.rawFeatureFilterMetrics.filter(_.name == fname)
+              val distributions = rawFeatureFilterResults.rawFeatureDistributions.filter(_.name == fname)
+              val exclusionReasons = rawFeatureFilterResults.exclusionReasons.filter(_.name == fname)
+              FeatureInsights(
+                featureName = fname, featureType = ftype, derivedFeatures = Seq.empty,
+                metrics = metrics, distributions = distributions, exclusionReasons = exclusionReasons,
+                sensitiveInformation = sensitiveFeatureInformation
+              )
+            }
+          case None => Seq.empty[FeatureInsights]
+        }
       }.toSeq
   }
 
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala
index 8ce42cb8ac..f50f8f1726 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala
@@ -380,7 +380,7 @@ trait VectorizerDefaults extends OpPipelineStageBase {
     val cols =
       if (withNullTracking) tf.flatMap { f => Seq(f.toColumnMetaData(), f.toColumnMetaData(isNull = true)) }
       else tf.map { f => f.toColumnMetaData() }
-    OpVectorMetadata(vectorOutputName, cols, Transmogrifier.inputFeaturesToHistory(tf, stageName))
+    OpVectorMetadata.apply(vectorOutputName, cols, Transmogrifier.inputFeaturesToHistory(tf, stageName))
   }
 
   /**
@@ -697,6 +697,6 @@ trait MapStringPivotHelper extends SaveOthersParams {
   ): OpVectorMetadata = {
     val otherValueString = $(unseenName)
     val cols = makeVectorColumnMetadata(topValues, inputFeatures, otherValueString, trackNulls)
-    OpVectorMetadata(outputName, cols, Transmogrifier.inputFeaturesToHistory(inputFeatures, stageName))
+    OpVectorMetadata.apply(outputName, cols, Transmogrifier.inputFeaturesToHistory(inputFeatures, stageName))
   }
 }
diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
index d1fa503188..fc790dc81a 100644
--- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
+++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
@@ -574,7 +574,10 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
         indicatorValue = Option(name)
       )
     },
-    Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap
+    Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap,
+    Map(
+      "f0" -> Seq(SensitiveFeatureInformation.Name(0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false))
+    )
   )
 
   it should "correctly extract the LabelSummary from the label and sanity checker info" in {
@@ -623,6 +626,18 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
     f0In.featureName shouldBe "f0"
     f0In.featureType shouldBe classOf[PickList].getName
     f0In.derivedFeatures.size shouldBe 2
+    f0In.sensitiveInformation match {
+      case Seq(SensitiveFeatureInformation.Name(
+        probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken
+      )) =>
+        actionTaken shouldBe false
+        probName shouldBe 0.0
+        genderDetectResults shouldBe Seq.empty[String]
+        probMale shouldBe 0.0
+        probFemale shouldBe 0.0
+        probOther shouldBe 1.0
+      case _ => fail("SensitiveFeatureInformation was not found.")
+    }
 
     val f0InDer2 = f0In.derivedFeatures.head
     f0InDer2.derivedFeatureName shouldBe "f0_f0_f2_1"
@@ -690,6 +705,62 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
     insights.features.foreach(f => f.distributions shouldBe empty)
   }
 
+  it should
+    """include sensitive feature information
+      |even for sensitive features that are removed from output vector and output vector metadata""".stripMargin in {
+    // Copy metadata from above but add new feature that was removed in vectorizing to sensitive info
+    val f_notInMeta = Feature[Text]("f_notInMeta", false, null, Seq(), "test")
+    val newMeta = OpVectorMetadata(
+      "fv",
+      OpVectorColumnMetadata(
+        parentFeatureName = Seq("f1"),
+        parentFeatureType = Seq(classOf[Real].getName),
+        grouping = None,
+        indicatorValue = None
+      ) +: Array("f2", "f3").map { name =>
+        OpVectorColumnMetadata(
+          parentFeatureName = Seq("f0"),
+          parentFeatureType = Seq(classOf[PickList].getName),
+          grouping = Option("f0"),
+          indicatorValue = Option(name)
+        )
+      },
+      Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap,
+      Map(
+        "f0" -> Seq(SensitiveFeatureInformation.Name(
+          0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false
+        )),
+        "f_notInMeta" -> Seq(SensitiveFeatureInformation.Name(
+          1.0, Seq.empty[String], 0.0, 0.0, 1.0, "f_notInMeta", None, true
+        ))
+      )
+    )
+
+    val labelSum = ModelInsights.getLabelSummary(Option(lbl), Option(summary))
+
+    val featureInsights = ModelInsights.getFeatureInsights(
+      Option(newMeta), Option(summary), None, Array(f1, f0, f_notInMeta), Array.empty, Map.empty[String, Set[String]],
+      RawFeatureFilterResults(), labelSum
+    )
+    featureInsights.size shouldBe 3
+    val f_notInMeta_butInInsights = featureInsights.find(_.featureName == "f_notInMeta").get
+    f_notInMeta_butInInsights.featureName shouldBe "f_notInMeta"
+    f_notInMeta_butInInsights.featureType shouldBe classOf[Text].getName
+    f_notInMeta_butInInsights.derivedFeatures.size shouldBe 0
+    f_notInMeta_butInInsights.sensitiveInformation match {
+      case Seq(SensitiveFeatureInformation.Name(
+        probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken
+      )) =>
+        actionTaken shouldBe true
+        probName shouldBe 1.0
+        genderDetectResults shouldBe Seq.empty[String]
+        probMale shouldBe 0.0
+        probFemale shouldBe 0.0
+        probOther shouldBe 1.0
+      case _ => fail("SensitiveFeatureInformation was not found.")
+    }
+  }
+
   it should "return model insights for xgboost classification" in {
     noException should be thrownBy xgbWorkflowModel.modelInsights(xgbClassifierPred)
     val insights = xgbWorkflowModel.modelInsights(xgbClassifierPred)
@@ -794,8 +865,8 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
     }
 
     cardinality.foreach { case (featureName, value) =>
-        val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet
-        actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble)
+      val actualUniques = df.select(featureName).as[Double].distinct.collect.toSet
+      actualUniques should contain allElementsOf value.valueCounts.keySet.map(_.toDouble)
     }
   }
 
diff --git a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala
index 71eb7b7254..ca059d791d 100644
--- a/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala
+++ b/features/src/main/scala/com/salesforce/op/utils/spark/OpVectorMetadata.scala
@@ -30,7 +30,7 @@
 
 package com.salesforce.op.utils.spark
 
-import com.salesforce.op.FeatureHistory
+import com.salesforce.op.{FeatureHistory, SensitiveFeatureInformation}
 import com.salesforce.op.features.types.{FeatureType, _}
 import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NumericAttribute}
 import org.apache.spark.ml.linalg.SQLDataTypes._
@@ -43,14 +43,17 @@ import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}
  *
  * @param name    name of the feature vector
  * @param col     information about each element in the vector
- * @param history history of parent features used to create the vector map is from
+ * @param history history of parent features used to create the vector; map is from
  *                OpVectorColumnMetadata.parentFeatureName (String) to FeatureHistory
+ * @param sensitive parent features that were detected as sensitive in the creation of the vector;
+ *                  map is from OpVectorColumnMetadata.parentFeatureName (String) to SensitiveFeatureInformation
  */
 class OpVectorMetadata private
 (
   val name: String,
   col: Array[OpVectorColumnMetadata],
-  val history: Map[String, FeatureHistory] // TODO fix map -> causes problems when multiple vectorizers used on feature
+  val history: Map[String, FeatureHistory], // TODO fix map -> causes problems when multiple vectorizers used on feature
+  val sensitive: Map[String, Seq[SensitiveFeatureInformation]] = Map.empty[String, Seq[SensitiveFeatureInformation]]
 ) {
 
   /**
@@ -92,6 +95,7 @@ class OpVectorMetadata private
     val meta = new MetadataBuilder()
       .putMetadataArray(OpVectorMetadata.ColumnsKey, colMeta.toArray)
       .putMetadata(OpVectorMetadata.HistoryKey, FeatureHistory.toMetadata(history))
+      .putMetadata(OpVectorMetadata.SensitiveKey, SensitiveFeatureInformation.toMetadata(sensitive))
       .build()
     val attributes = columns.map {
       case c if (c.indicatorValue.isDefined || binaryTypes.exists(c.parentFeatureType.contains)) &&
@@ -161,7 +165,10 @@ class OpVectorMetadata private
   override def equals(obj: Any): Boolean =
     obj match {
       case o: OpVectorMetadata
-        if o.name == name && o.columns.toSeq == columns.toSeq && history == o.history => true
+        if o.name == name &&
+          o.columns.toSeq == columns.toSeq &&
+          history == o.history &&
+          sensitive == o.sensitive => true
       case _ => false
     }
 
@@ -169,7 +176,7 @@ class OpVectorMetadata private
   override def hashCode(): Int = 37 * columns.toSeq.hashCode()
 
   override def toString: String =
-    s"${this.getClass.getSimpleName}($name,${columns.mkString("Array(", ",", ")")},$history)"
+    s"${this.getClass.getSimpleName}($name,${columns.mkString("Array(", ",", ")")},$history,$sensitive)"
 
 }
 
@@ -179,6 +186,7 @@ object OpVectorMetadata {
 
   val ColumnsKey = "vector_columns"
   val HistoryKey = "vector_history"
+  val SensitiveKey = "vector_detected_sensitive"
 
   /**
    * Construct an [[OpVectorMetadata]] from a [[StructField]], assuming that [[ColumnsKey]] is present and conforms
@@ -197,9 +205,14 @@ object OpVectorMetadata {
       if (wrapped.underlyingMap(HistoryKey).asInstanceOf[Metadata].isEmpty) Map.empty[String, FeatureHistory]
       else FeatureHistory.fromMetadataMap(field.metadata.getMetadata(HistoryKey))
 
-    new OpVectorMetadata(field.name, columns, history)
-  }
+    val sensitive =
+      if (wrapped.underlyingMap(SensitiveKey).asInstanceOf[Metadata].isEmpty) {
+        Map.empty[String, Seq[SensitiveFeatureInformation]]
+      }
+      else SensitiveFeatureInformation.fromMetadataMap(field.metadata.getMetadata(SensitiveKey))
 
+    new OpVectorMetadata(field.name, columns, history, sensitive)
+  }
 
   /**
    * Construct an [[OpVectorMetadata]] from a string representing its name, and an array of [[OpVectorColumnMetadata]]
@@ -214,9 +227,24 @@ object OpVectorMetadata {
     name: String,
     columns: Array[OpVectorColumnMetadata],
     history: Map[String, FeatureHistory]
-  ): OpVectorMetadata = {
-    new OpVectorMetadata(name, columns, history)
-  }
+  ): OpVectorMetadata = new OpVectorMetadata(name, columns, history)
+
+  /**
+   * Construct an [[OpVectorMetadata]] from a string representing its name, and an array of [[OpVectorColumnMetadata]]
+   * representing its columns.
+   *
+   * @param name    The name of the column the metadata represents
+   * @param columns The columns within the vectors
+   * @param history The history of the parent features
+   * @param sensitive Which columns have been marked as sensitive and related information
+   * @return The constructed vector metadata
+   */
+  def apply(
+    name: String,
+    columns: Array[OpVectorColumnMetadata],
+    history: Map[String, FeatureHistory],
+    sensitive: Map[String, Seq[SensitiveFeatureInformation]]
+  ): OpVectorMetadata = new OpVectorMetadata(name, columns, history, sensitive)
 
   /**
    * Construct an [[OpVectorMetadata]] from its name and a [[Metadata]], assuming that [[ColumnsKey]] and
@@ -242,7 +270,8 @@ object OpVectorMetadata {
   def flatten(outputName: String, vectors: Seq[OpVectorMetadata]): OpVectorMetadata = {
     val allColumns = vectors.flatMap(_.columns).toArray
     val allHist = vectors.flatMap(_.history).toMap
-    new OpVectorMetadata(outputName, allColumns, allHist)
+    val allSensitive = vectors.flatMap(_.sensitive).toMap
+    new OpVectorMetadata(outputName, allColumns, allHist, allSensitive)
   }
 
 }
diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
index 438119ee57..0eff4dc9e1 100644
--- a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
+++ b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
@@ -30,7 +30,7 @@
 
 package com.salesforce.op.utils.spark
 
-import com.salesforce.op.FeatureHistory
+import com.salesforce.op.{FeatureHistory, SensitiveFeatureInformation}
 import com.salesforce.op.features.types.{DateTime, Email, FeatureType, OPMap, PickList, Prediction, Real, RealMap, TextAreaMap}
 import com.salesforce.op.test.TestCommon
 import org.apache.spark.sql.types.Metadata
@@ -47,7 +47,11 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
 
   type OpVectorColumnTuple = (Seq[String], Seq[String], Option[String], Option[String], Option[String], Int)
   type FeatureHistoryTuple = (Seq[String], Seq[String])
-  type OpVectorTuple = (String, Array[OpVectorColumnTuple], FeatureHistoryTuple)
+
+  type SensitiveTuple = (SensitiveNameTuple, String, Option[String], Boolean)
+  type SensitiveNameTuple = (Double, Seq[String], Double, Double, Double)
+
+  type OpVectorTuple = (String, Array[OpVectorColumnTuple], FeatureHistoryTuple, Seq[SensitiveTuple])
 
   // AttributeGroup and Attribute require non-empty names
   val genName: Gen[String] = Gen.nonEmptyListOf(alphaNumChar).map(_.mkString)
@@ -72,24 +76,51 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
   )
   val arrVecColTupleGen: Gen[Array[OpVectorColumnTuple]] = Gen.containerOf[Array, OpVectorColumnTuple](vecColTupleGen)
 
+  val sensitiveGen: Gen[SensitiveTuple] = for {
+    featureName <- genName
+    mapKey <- Gen.option(genName)
+    actionTaken <- Gen.oneOf[Boolean](Seq(false, true))
+    probName <- Gen.choose(0.0, 1.0)
+    genderDetectResults <- Gen.containerOf[Seq, String](genName)
+    probMale <- Gen.choose(0.0, 1.0)
+    probFemale <- Gen.choose(0.0, 1.0 - probMale)
+    probOther <- Gen.choose(0.0, 1.0 - probMale - probFemale)
+  } yield {
+    ((probName, genderDetectResults, probMale, probFemale, probOther), featureName, mapKey, actionTaken)
+  }
+
   val vecGen: Gen[OpVectorTuple] = for {
     name <- genName
     arr <- arrVecColTupleGen
     histories <- featHistTupleGen
+    sensitiveCols <- Gen.containerOf[Seq, SensitiveTuple](sensitiveGen)
   } yield {
-    (name, arr, histories)
+    (name, arr, histories, sensitiveCols)
   }
 
   val seqVecGen: Gen[Seq[OpVectorTuple]] = Gen.containerOf[Seq, OpVectorTuple](vecGen)
 
-  private def generateHistory(columnsMeta: Array[OpVectorColumnMetadata], hist: (Seq[String], Seq[String])) =
+  private def generateHistory(
+    columnsMeta: Array[OpVectorColumnMetadata], hist: (Seq[String], Seq[String])
+  ): Map[String, FeatureHistory] =
     columnsMeta.flatMap(v => v.parentFeatureName.map(p => p -> FeatureHistory(hist._1, hist._2))).toMap
 
-  private def checkTuples(tup: OpVectorColumnTuple) = tup._1.nonEmpty && tup._2.nonEmpty
+  private def generateSensitiveFeatureInfo(
+    columnsMeta: Array[OpVectorColumnMetadata], sensitiveInfoSeqRaw: Seq[SensitiveTuple]
+  ): Map[String, Seq[SensitiveFeatureInformation]] = {
+    val sensitiveInfoSeq = sensitiveInfoSeqRaw map {
+      case ((probName, genderDetectResults, probMale, probFemale, probOther), featureName, mapKey, actionTaken) =>
+        SensitiveFeatureInformation.Name(
+          probName, genderDetectResults, probMale, probFemale, probOther, featureName, mapKey, actionTaken
+        )
+    }
+    columnsMeta.flatMap(v => v.parentFeatureName.map(p => p -> sensitiveInfoSeq)).toMap
+  }
 
+  private def checkTuples(tup: OpVectorColumnTuple): Boolean = tup._1.nonEmpty && tup._2.nonEmpty
 
   property("column metadata stays the same when serialized to spark metadata") {
-    forAll(vecColTupleGen) { (vct: OpVectorColumnTuple) =>
+    forAll(vecColTupleGen) { vct: OpVectorColumnTuple =>
       if (checkTuples(vct)) {
         val columnMeta = OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5)
         columnMeta shouldEqual OpVectorColumnMetadata.fromMetadata(columnMeta.toMetadata()).head
@@ -98,7 +129,7 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
   }
 
   property("column metadata cannot be created with empty parents or feature types") {
-    forAll(vecColTupleGen) { (vct: OpVectorColumnTuple) =>
+    forAll(vecColTupleGen) { vct: OpVectorColumnTuple =>
       if (!checkTuples(vct)) {
         assertThrows[IllegalArgumentException] { OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5) }
       }
@@ -106,22 +137,34 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
   }
 
   property("vector metadata stays the same when serialized to spark metadata") {
-    forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) =>
-      val cols = columns.filter(checkTuples)
-      val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5))
-      val history = generateHistory(columnsMeta, hist)
-      val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history)
-      val field = vectorMeta.toStructField()
-      vectorMeta shouldEqual OpVectorMetadata(field)
+    forAll(vecGen) {
+      case (outputName: String,
+        columns: Array[OpVectorColumnTuple],
+        hist: FeatureHistoryTuple,
+        sens: Seq[SensitiveTuple]
+      ) if outputName.nonEmpty =>
+        val cols = columns.filter(checkTuples)
+        val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5))
+        val history = generateHistory(columnsMeta, hist)
+        val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens)
+        val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive)
+        val field = vectorMeta.toStructField()
+        vectorMeta shouldEqual OpVectorMetadata(field)
+      case _ => true shouldEqual true
     }
   }
 
   property("vector metadata properly finds indices of its columns") {
-    forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) =>
+    forAll(vecGen) {
+      case (outputName: String,
+        columns: Array[OpVectorColumnTuple],
+        hist: FeatureHistoryTuple,
+        sens: Seq[SensitiveTuple]) =>
       val cols = columns.filter(checkTuples)
       val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5))
       val history = generateHistory(columnsMeta, hist)
-      val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history)
+      val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens)
+      val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive)
       for {(col, i) <- vectorMeta.columns.zipWithIndex} {
         vectorMeta.index(col) shouldEqual i
       }
@@ -139,13 +182,14 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
   }
 
   property("vector metadata flattens correctly") {
-    forAll(seqVecGen) { (vectors: Seq[OpVectorTuple]) =>
+    forAll(seqVecGen) { vectors: Seq[OpVectorTuple] =>
       val vecs = vectors.map {
-        case (outputName, columns, hist) =>
+        case (outputName, columns, hist, sens) =>
           val cols = columns.filter(checkTuples)
           val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5))
           val history = generateHistory(columnsMeta, hist)
-          OpVectorMetadata(outputName, columnsMeta, history)
+          val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens)
+          OpVectorMetadata(outputName, columnsMeta, history, sensitive)
       }
       val flattened = OpVectorMetadata.flatten("out", vecs)
       flattened.size shouldEqual vecs.map(_.size).sum
@@ -155,12 +199,16 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
   }
 
   property("vector metadata should properly serialize to and from spark metadata") {
-    forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) =>
+    forAll(vecGen) {
+      case (outputName: String,
+        columns: Array[OpVectorColumnTuple],
+        hist: FeatureHistoryTuple,
+        sens: Seq[SensitiveTuple]) =>
       val cols = columns.filter(checkTuples)
       val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5))
       val history = generateHistory(columnsMeta, hist)
-
-      val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history)
+      val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens)
+      val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive)
 
       val vectorMetaFromSerialized = OpVectorMetadata(vectorMeta.name, vectorMeta.toMetadata)
       vectorMeta.name shouldEqual vectorMetaFromSerialized.name
@@ -171,13 +219,18 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
 
 
   property("vector metadata should generate feature history correctly") {
-    forAll(vecGen) { case (outputName: String, columns: Array[OpVectorColumnTuple], hist: FeatureHistoryTuple) =>
+    forAll(vecGen) { case (
+      outputName: String,
+      columns: Array[OpVectorColumnTuple],
+      hist: FeatureHistoryTuple,
+      sens: Seq[SensitiveTuple]) =>
       val cols = columns.filter(checkTuples)
       val columnsMeta = cols.map(vct => OpVectorColumnMetadata(vct._1, vct._2, vct._3, vct._4, vct._5))
       val history = generateHistory(columnsMeta, hist)
+      val sensitive = generateSensitiveFeatureInfo(columnsMeta, sens)
+      val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history, sensitive)
 
-      val vectorMeta = OpVectorMetadata(outputName, columnsMeta, history)
-      if (history.isEmpty && columnsMeta.nonEmpty ) {
+      if (history.isEmpty && columnsMeta.nonEmpty) {
         assertThrows[RuntimeException](vectorMeta.getColumnHistory())
       } else {
         val colHist = vectorMeta.getColumnHistory()
diff --git a/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala
new file mode 100644
index 0000000000..918701d645
--- /dev/null
+++ b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op
+
+import com.salesforce.op.utils.json.JsonLike
+import com.salesforce.op.utils.spark.RichMetadata._
+import org.apache.spark.sql.types.{Metadata, MetadataBuilder}
+import enumeratum._
+
+/**
+ * A base class for different SensitiveFeatureInformation (implemented as an enum)
+ * Currently, only Name types are supported but there are placeholders for other possibilities
+ * The following three params are required for every kind of SensitiveFeatureInformation
+ *
+ * @param name        the name of the raw feature
+ * @param key         optionally, the name of the key (if the raw feature is a Map type)
+ * @param actionTaken whether the handling of the raw feature changed b/c it was detected as sensitive
+ */
+sealed class SensitiveFeatureInformation
+(
+  val name: String,
+  val key: Option[String] = None,
+  val actionTaken: Boolean = false
+) extends EnumEntry with JsonLike {
+
+  /**
+   * Convert to Spark metadata
+   *
+   * @return metadata representation
+   */
+  def toMetadata: Metadata = {
+    this match {
+      case SensitiveFeatureInformation.Name(
+        probName, genderDetectResults, probMale, probFemale, probOther, name, key, actionTaken
+      ) =>
+        new MetadataBuilder()
+          .putString(SensitiveFeatureInformation.NameKey, name)
+          .putString(SensitiveFeatureInformation.MapKeyKey, key.getOrElse(""))
+          .putBoolean(SensitiveFeatureInformation.ActionTakenKey, actionTaken)
+          .putString(SensitiveFeatureInformation.TypeKey, this.entryName)
+          .putDouble(SensitiveFeatureInformation.Name.ProbNameKey, probName)
+          .putStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey, genderDetectResults.toArray)
+          .putDouble(SensitiveFeatureInformation.Name.ProbMaleKey, probMale)
+          .putDouble(SensitiveFeatureInformation.Name.ProbFemaleKey, probFemale)
+          .putDouble(SensitiveFeatureInformation.Name.ProbOtherKey, probOther)
+          .build()
+      case _ => throw new RuntimeException(
+        "Metadata for sensitive features other than names have not been implemented.")
+    }
+  }
+}
+case object SensitiveFeatureInformation extends Enum[SensitiveFeatureInformation] {
+  val NameKey = "FeatureName"
+  val MapKeyKey = "MapKey"
+  val ActionTakenKey = "ActionTaken"
+  val TypeKey = "DetectedSensitiveFeatureKind"
+  val values: Seq[SensitiveFeatureInformation] = findValues
+
+  // Utilized by SmartTextVectorizer's name detection
+  case class Name
+  (
+    probName: Double,
+    genderDetectResults: Seq[String],
+    probMale: Double,
+    probFemale: Double,
+    probOther: Double,
+    override val name: String,
+    override val key: Option[String] = None,
+    override val actionTaken: Boolean = false
+  ) extends SensitiveFeatureInformation(name, key, actionTaken) {
+    override val entryName: String = SensitiveFeatureInformation.Name.EntryName
+  }
+  case object Name {
+    val EntryName = "Name"
+    val ProbNameKey = "ProbName"
+    val GenderDetectStratsKey = "GenderDetectStrats"
+    val ProbMaleKey = "ProbMale"
+    val ProbFemaleKey = "ProbFemale"
+    val ProbOtherKey = "ProbOther"
+  }
+
+  // Not yet implemented
+  case object Salutation extends SensitiveFeatureInformation("None", None, false)
+  case object BirthDate extends SensitiveFeatureInformation("None", None, false)
+  case object PostalCode extends SensitiveFeatureInformation("None", None, false)
+  case object Other extends SensitiveFeatureInformation("None", None, false)
+
+  /**
+   * Build metadata from Map of [[SensitiveFeatureInformation]] instances
+   *
+   * @param map Map from feature name to Seq of [[SensitiveFeatureInformation]] about that feature
+   * @return metadata representation
+   */
+  def toMetadata(map: Map[String, Seq[SensitiveFeatureInformation]]): Metadata = {
+    val builder = new MetadataBuilder()
+    map.foreach { case (k, values) => builder.putMetadataArray(k, values map { _.toMetadata } toArray) }
+    builder.build()
+  }
+
+  /**
+   * Build Map of [[SensitiveFeatureInformation]] instances from metadata
+   *
+   * @param meta metadata containing a mapping from feature name to [[SensitiveFeatureInformation]]
+   * @return map of that information
+   */
+  def fromMetadataMap(meta: Metadata): Map[String, Seq[SensitiveFeatureInformation]] = {
+    val infoMap = meta.wrapped.underlyingMap
+    infoMap.map { case (k, values) => k -> values.asInstanceOf[Array[Metadata]].map(fromMetadata).toSeq }
+  }
+
+  /**
+   * Build [[SensitiveFeatureInformation]] from metadata
+   *
+   * @param meta Metadata representing [[SensitiveFeatureInformation]]
+   * @return new instance of [[SensitiveFeatureInformation]]
+   */
+  def fromMetadata(meta: Metadata): SensitiveFeatureInformation = {
+    meta.getString(SensitiveFeatureInformation.TypeKey) match {
+      case SensitiveFeatureInformation.Name.EntryName =>
+        SensitiveFeatureInformation.Name(
+          meta.getDouble(SensitiveFeatureInformation.Name.ProbNameKey),
+          meta.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey),
+          meta.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey),
+          meta.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey),
+          meta.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey),
+          meta.getString(SensitiveFeatureInformation.NameKey),
+          {
+            val mapKey = meta.getString(SensitiveFeatureInformation.MapKeyKey)
+            if (mapKey.isEmpty) None else Some(mapKey)
+          },
+          meta.getBoolean(SensitiveFeatureInformation.ActionTakenKey)
+        )
+      case _ => throw new RuntimeException(
+        "Metadata for sensitive features other than names have not been implemented.")
+    }
+  }
+}
diff --git a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
new file mode 100644
index 0000000000..7c26d04f23
--- /dev/null
+++ b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2017, Salesforce.com, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package com.salesforce.op
+
+import com.salesforce.op.test.TestCommon
+import org.apache.spark.sql.types.MetadataBuilder
+import org.junit.runner.RunWith
+import org.scalatest.FlatSpec
+import org.scalatest.junit.JUnitRunner
+
+@RunWith(classOf[JUnitRunner])
+class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
+
+  val probName = 1.0
+  val genderDetectResults: Seq[String] = Seq("ByIndex", "AnotherStrategy")
+  val probMale = 0.25
+  val probFemale = 0.50
+  val probOther = 0.25
+  val name = "feature"
+  val mapKey: Option[String] = None
+  val actionTaken = true
+
+  val sensitiveFeatureInfo: SensitiveFeatureInformation.Name = SensitiveFeatureInformation.Name(
+    probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken
+  )
+
+  Spec[SensitiveFeatureInformation] should "convert sensitive feature information to metadata" in {
+    val metadata = sensitiveFeatureInfo.toMetadata
+
+    metadata.contains(SensitiveFeatureInformation.NameKey) shouldBe true
+    metadata.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true
+    metadata.contains(SensitiveFeatureInformation.ActionTakenKey) shouldBe true
+    metadata.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
+    metadata.contains(SensitiveFeatureInformation.Name.ProbNameKey) shouldBe true
+    metadata.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true
+    metadata.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true
+    metadata.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true
+    metadata.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true
+
+    metadata.getString(SensitiveFeatureInformation.NameKey) shouldBe name
+    metadata.getString(SensitiveFeatureInformation.MapKeyKey) shouldBe ""
+    metadata.getBoolean(SensitiveFeatureInformation.ActionTakenKey) shouldBe actionTaken
+    metadata.getString(SensitiveFeatureInformation.TypeKey) shouldBe SensitiveFeatureInformation.Name.EntryName
+    metadata.getDouble(SensitiveFeatureInformation.Name.ProbNameKey) shouldBe probName
+    metadata.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe genderDetectResults
+    metadata.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe probMale
+    metadata.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe probFemale
+    metadata.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe probOther
+  }
+
+  it should "create metadata from a map" in {
+    val info1 = sensitiveFeatureInfo
+    val info2 = SensitiveFeatureInformation.Name(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true)
+    val map = Map("1" -> Seq(info1), "2" -> Seq(info2))
+    val metadata = SensitiveFeatureInformation.toMetadata(map)
+
+    metadata.contains("1") shouldBe true
+    metadata.contains("2") shouldBe true
+
+    val f1 = metadata.getMetadata("1")
+    f1.contains(SensitiveFeatureInformation.NameKey) shouldBe true
+    f1.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true
+    f1.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
+    f1.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
+    f1.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true
+    f1.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true
+    f1.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true
+    f1.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true
+    f1.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe genderDetectResults
+    f1.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe probMale
+    f1.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe probFemale
+    f1.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe probOther
+
+    val f2 = metadata.getMetadata("2")
+    f2.contains(SensitiveFeatureInformation.NameKey) shouldBe true
+    f2.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true
+    f2.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
+    f2.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
+    f2.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true
+    f2.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true
+    f2.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true
+    f2.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true
+    f2.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe Seq("")
+    f2.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe 0.0
+    f2.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe 0.0
+    f2.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe 0.0
+  }
+
+  it should "create a map from metadata" in {
+    val info1 = sensitiveFeatureInfo
+    val info2 = SensitiveFeatureInformation.Name(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true)
+
+    val mapMetadata = new MetadataBuilder()
+      .putMetadata("1", info1.toMetadata)
+      .putMetadata("2", info2.toMetadata)
+      .build()
+
+    val map = SensitiveFeatureInformation.fromMetadataMap(mapMetadata)
+
+    map.contains("1") shouldBe true
+    map("1") shouldBe info1
+    map.contains("2") shouldBe true
+    map("2") shouldBe info2
+  }
+}
+

From acb873e0df2845335bfda8bf06f7982d2e0d94fc Mon Sep 17 00:00:00 2001
From: Michael Yang <michael.yang@salesforce.com>
Date: Tue, 14 Jan 2020 13:18:19 -0800
Subject: [PATCH 07/13] Fixed SensitiveFeatureInformation tests failing due to
 not changing the to/fromMetadata tests

---
 .../op/SensitiveFeatureInformationTest.scala         | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
index 7c26d04f23..8dfbb23f63 100644
--- a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
+++ b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
@@ -85,7 +85,7 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
     metadata.contains("1") shouldBe true
     metadata.contains("2") shouldBe true
 
-    val f1 = metadata.getMetadata("1")
+    val f1 = metadata.getMetadataArray("1").head
     f1.contains(SensitiveFeatureInformation.NameKey) shouldBe true
     f1.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true
     f1.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
@@ -99,7 +99,7 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
     f1.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe probFemale
     f1.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe probOther
 
-    val f2 = metadata.getMetadata("2")
+    val f2 = metadata.getMetadataArray("2").head
     f2.contains(SensitiveFeatureInformation.NameKey) shouldBe true
     f2.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true
     f2.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
@@ -119,16 +119,16 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
     val info2 = SensitiveFeatureInformation.Name(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true)
 
     val mapMetadata = new MetadataBuilder()
-      .putMetadata("1", info1.toMetadata)
-      .putMetadata("2", info2.toMetadata)
+      .putMetadataArray("1", Array(info1.toMetadata))
+      .putMetadataArray("2", Array(info2.toMetadata))
       .build()
 
     val map = SensitiveFeatureInformation.fromMetadataMap(mapMetadata)
 
     map.contains("1") shouldBe true
-    map("1") shouldBe info1
+    map("1") shouldBe Array(info1)
     map.contains("2") shouldBe true
-    map("2") shouldBe info2
+    map("2") shouldBe Array(info2)
   }
 }
 

From 3ae735c4c4cc62c02ef0911684b1e363fb0a95ca Mon Sep 17 00:00:00 2001
From: Kevin Moore <kevinmoore@salesforce.com>
Date: Tue, 14 Jan 2020 14:06:32 -0800
Subject: [PATCH 08/13] Addressing comments

---
 .../impl/feature/SmartTextMapVectorizer.scala | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
index 014407d9e6..466e2a7e2e 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
@@ -110,7 +110,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
         inN.toSeq.zip(allTextFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip
       val allKeys = mapFeatureInfo.map(_.map(_.key))
 
-      // Careful when zipping sequences like hashKeys (length and hashFeatures
+      // Careful when zipping sequences like hashKeys (lengtht and hashFeatures
       val hashKeys = args.hashFeatureInfo.map(
         _.filter(_.vectorizationMethod == TextVectorizationMethod.Hash).map(_.key)
       )
@@ -249,19 +249,21 @@ case class SmartTextMapVectorizerModelArgs
   shouldTrackNulls: Boolean,
   hashingParams: HashingFunctionParams
 ) extends JsonLike {
+  // Partition allFeatureInfo into separate SmartTextFeatureInfo sequences corresponding to each vectorization type
   val (categoricalFeatureInfo, hashFeatureInfo, ignoreFeatureInfo) = allFeatureInfo.map{ featureInfoSeq =>
     val groups = featureInfoSeq.groupBy(_.vectorizationMethod)
     val catGroup = groups.getOrElse(TextVectorizationMethod.Pivot, Seq.empty)
     val hashGroup = groups.getOrElse(TextVectorizationMethod.Hash, Seq.empty)
     val ignoreGroup = groups.getOrElse(TextVectorizationMethod.Ignore, Seq.empty)
-
     (catGroup, hashGroup, ignoreGroup)
   }.unzip3
 
+  // Seq[Seq[String]] corresponding to the keys in each map that are treated with each vectorization type
   val categoricalKeys = categoricalFeatureInfo.map(_.map(_.key))
   val hashKeys = hashFeatureInfo.map(_.map(_.key))
   val ignoreKeys = ignoreFeatureInfo.map(_.map(_.key))
 
+  // Combined keys for hashed and ignored features (everything that's not pivoted)
   val textKeys = hashKeys.zip(ignoreKeys).map{ case (hk, ik) => hk ++ ik }
 }
 
@@ -277,6 +279,25 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op]
   with MapHashingFun
   with TextMapPivotVectorizerModelFun[OPMap[String]] {
 
+  /**
+   * Storage for results of row partitioning
+   *
+   * @param categoricalMaps   Sequence of maps that have at least one key that should be treated as a categorical
+   * @param categoricalKeys   Sequence containing keys for each map that correspond to categorical features
+   * @param hashMaps          Sequence of maps that have at least one key that should be hashed
+   * @param hashKeys          Sequence containing keys for each map that correspond to hashed features
+   * @param ignoreMaps        Sequence of maps that have at least one key that should be ignored
+   * @param ignoreKeys        Sequence containing keys for each map that correspond to ignored features
+   */
+  case class PartitionResult(
+    categoricalMaps: Seq[OPMap[String]],
+    categoricalKeys: Seq[Seq[String]],
+    hashMaps: Seq[OPMap[String]],
+    hashKeys: Seq[Seq[String]],
+    ignoreMaps: Seq[OPMap[String]],
+    ignoreKeys: Seq[Seq[String]]
+  )
+
   private val categoricalPivotFn = pivotFn(
     topValues = args.categoricalFeatureInfo.filter(_.nonEmpty).map(_.map(info => info.key -> info.topValues)),
     shouldCleanKeys = args.shouldCleanKeys,
@@ -284,8 +305,7 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op]
     shouldTrackNulls = args.shouldTrackNulls
   )
 
-  private def partitionRow(row: Seq[OPMap[String]]):
-  (Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]], Seq[OPMap[String]], Seq[Seq[String]]) = {
+  private def partitionRow(row: Seq[OPMap[String]]): PartitionResult = {
     val (rowCategorical, keysCategorical) =
       row.view.zip(args.categoricalKeys).collect { case (elements, keys) if keys.nonEmpty =>
         val filtered = elements.value.filter { case (k, v) => keys.contains(k) }
@@ -304,14 +324,14 @@ final class SmartTextMapVectorizerModel[T <: OPMap[String]] private[op]
         (TextMap(filtered), keys)
       }.unzip
 
-    (rowCategorical.toList, keysCategorical.toList, rowHashedText.toList, keysHashedText.toList,
+    PartitionResult(rowCategorical.toList, keysCategorical.toList, rowHashedText.toList, keysHashedText.toList,
       rowIgnoredText.toList, keysIgnoredText.toList)
   }
 
   def transformFn: Seq[T] => OPVector = row => {
     implicit val textListMonoid: Monoid[TextList] = TextList.monoid
 
-    val (rowCategorical, keysCategorical, rowHash, keysHash, rowIgnore, keysIgnore) = partitionRow(row)
+    val PartitionResult(rowCategorical, keysCategorical, rowHash, keysHash, rowIgnore, keysIgnore) = partitionRow(row)
     val keysText = keysHash + keysIgnore // Go algebird!
     val categoricalVector = categoricalPivotFn(rowCategorical)
 

From bef2b22441b3e55a0dd5f95827fe89345041b844 Mon Sep 17 00:00:00 2001
From: Kevin Moore <kevinmoore@salesforce.com>
Date: Tue, 14 Jan 2020 16:14:51 -0800
Subject: [PATCH 09/13] Spelling

---
 .../op/stages/impl/feature/SmartTextMapVectorizer.scala        | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
index 466e2a7e2e..7f62dff48e 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
@@ -110,7 +110,8 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
         inN.toSeq.zip(allTextFeatureInfo).filter{ case (tf, featureInfoSeq) => featureInfoSeq.nonEmpty }.unzip
       val allKeys = mapFeatureInfo.map(_.map(_.key))
 
-      // Careful when zipping sequences like hashKeys (lengtht and hashFeatures
+      // Careful when zipping sequences like hashKeys (length = number of maps, always) and
+      // hashFeatures (length <= number of maps, depending on which ones contain keys to hash)
       val hashKeys = args.hashFeatureInfo.map(
         _.filter(_.vectorizationMethod == TextVectorizationMethod.Hash).map(_.key)
       )

From 069031c8f298b46a68c914e110f9296420ca233c Mon Sep 17 00:00:00 2001
From: Kevin Moore <kevinmoore@salesforce.com>
Date: Tue, 14 Jan 2020 16:53:23 -0800
Subject: [PATCH 10/13] Fixed failing test by making default behavior of
 SmartTextVectorizer and SmartTextMapVectorizer the same as previous behavior
 - don't ignore any features

---
 .../op/stages/impl/feature/SmartTextMapVectorizer.scala         | 2 +-
 .../salesforce/op/stages/impl/feature/SmartTextVectorizer.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
index 7f62dff48e..03f4b47d99 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
@@ -153,7 +153,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
       textMapStats.keyValueCounts.toSeq.map { case (k, textStats) =>
         val vecMethod: TextVectorizationMethod = textStats match {
           case _ if textStats.valueCounts.size <= maxCard => TextVectorizationMethod.Pivot
-          case _ if textStats.lengthStdDev <= minLenStdDev => TextVectorizationMethod.Ignore
+          case _ if textStats.lengthStdDev < minLenStdDev => TextVectorizationMethod.Ignore
           case _ => TextVectorizationMethod.Hash
         }
         val topVals = if (vecMethod == TextVectorizationMethod.Pivot) {
diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
index a2eab274ff..d75e42bf36 100644
--- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
+++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala
@@ -91,7 +91,7 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])(
     val (vectorizationMethods, topValues) = aggregatedStats.map { stats =>
       val vecMethod: TextVectorizationMethod = stats match {
         case _ if stats.valueCounts.size <= maxCard => TextVectorizationMethod.Pivot
-        case _ if stats.lengthStdDev <= minLenStdDev => TextVectorizationMethod.Ignore
+        case _ if stats.lengthStdDev < minLenStdDev => TextVectorizationMethod.Ignore
         case _ => TextVectorizationMethod.Hash
       }
       val topValues = stats.valueCounts

From a8504da0fefff9cbc09afc0203faafea08b7ee6a Mon Sep 17 00:00:00 2001
From: Michael Yang <michael.yang@salesforce.com>
Date: Thu, 23 Jan 2020 19:56:25 -0800
Subject: [PATCH 11/13] Removed enum from SensitiveFeatureInformation per PR
 comments

---
 .../com/salesforce/op/ModelInsightsTest.scala |  10 +-
 .../op/utils/spark/OPVectorMetadataTest.scala |   2 +-
 .../op/SensitiveFeatureInformation.scala      | 122 ++++++++----------
 .../op/SensitiveFeatureInformationTest.scala  |  60 ++++-----
 4 files changed, 89 insertions(+), 105 deletions(-)

diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
index fc790dc81a..3ec6a43306 100644
--- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
+++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
@@ -576,7 +576,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
     },
     Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap,
     Map(
-      "f0" -> Seq(SensitiveFeatureInformation.Name(0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false))
+      "f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false))
     )
   )
 
@@ -627,7 +627,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
     f0In.featureType shouldBe classOf[PickList].getName
     f0In.derivedFeatures.size shouldBe 2
     f0In.sensitiveInformation match {
-      case Seq(SensitiveFeatureInformation.Name(
+      case Seq(SensitiveNameInformation(
         probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken
       )) =>
         actionTaken shouldBe false
@@ -727,10 +727,10 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
       },
       Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap,
       Map(
-        "f0" -> Seq(SensitiveFeatureInformation.Name(
+        "f0" -> Seq(SensitiveNameInformation(
           0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false
         )),
-        "f_notInMeta" -> Seq(SensitiveFeatureInformation.Name(
+        "f_notInMeta" -> Seq(SensitiveNameInformation(
           1.0, Seq.empty[String], 0.0, 0.0, 1.0, "f_notInMeta", None, true
         ))
       )
@@ -748,7 +748,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
     f_notInMeta_butInInsights.featureType shouldBe classOf[Text].getName
     f_notInMeta_butInInsights.derivedFeatures.size shouldBe 0
     f_notInMeta_butInInsights.sensitiveInformation match {
-      case Seq(SensitiveFeatureInformation.Name(
+      case Seq(SensitiveNameInformation(
         probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken
       )) =>
         actionTaken shouldBe true
diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
index 0eff4dc9e1..a3c5963f4d 100644
--- a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
+++ b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
@@ -110,7 +110,7 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
   ): Map[String, Seq[SensitiveFeatureInformation]] = {
     val sensitiveInfoSeq = sensitiveInfoSeqRaw map {
       case ((probName, genderDetectResults, probMale, probFemale, probOther), featureName, mapKey, actionTaken) =>
-        SensitiveFeatureInformation.Name(
+        SensitiveNameInformation(
           probName, genderDetectResults, probMale, probFemale, probOther, featureName, mapKey, actionTaken
         )
     }
diff --git a/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala
index 918701d645..817914ae9e 100644
--- a/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala
+++ b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala
@@ -33,85 +33,30 @@ package com.salesforce.op
 import com.salesforce.op.utils.json.JsonLike
 import com.salesforce.op.utils.spark.RichMetadata._
 import org.apache.spark.sql.types.{Metadata, MetadataBuilder}
-import enumeratum._
 
 /**
- * A base class for different SensitiveFeatureInformation (implemented as an enum)
- * Currently, only Name types are supported but there are placeholders for other possibilities
+ * A base class for different SensitiveFeatureInformation
  * The following three params are required for every kind of SensitiveFeatureInformation
  *
  * @param name        the name of the raw feature
  * @param key         optionally, the name of the key (if the raw feature is a Map type)
  * @param actionTaken whether the handling of the raw feature changed b/c it was detected as sensitive
  */
-sealed class SensitiveFeatureInformation
+sealed abstract class SensitiveFeatureInformation
 (
   val name: String,
   val key: Option[String] = None,
   val actionTaken: Boolean = false
-) extends EnumEntry with JsonLike {
-
-  /**
-   * Convert to Spark metadata
-   *
-   * @return metadata representation
-   */
-  def toMetadata: Metadata = {
-    this match {
-      case SensitiveFeatureInformation.Name(
-        probName, genderDetectResults, probMale, probFemale, probOther, name, key, actionTaken
-      ) =>
-        new MetadataBuilder()
-          .putString(SensitiveFeatureInformation.NameKey, name)
-          .putString(SensitiveFeatureInformation.MapKeyKey, key.getOrElse(""))
-          .putBoolean(SensitiveFeatureInformation.ActionTakenKey, actionTaken)
-          .putString(SensitiveFeatureInformation.TypeKey, this.entryName)
-          .putDouble(SensitiveFeatureInformation.Name.ProbNameKey, probName)
-          .putStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey, genderDetectResults.toArray)
-          .putDouble(SensitiveFeatureInformation.Name.ProbMaleKey, probMale)
-          .putDouble(SensitiveFeatureInformation.Name.ProbFemaleKey, probFemale)
-          .putDouble(SensitiveFeatureInformation.Name.ProbOtherKey, probOther)
-          .build()
-      case _ => throw new RuntimeException(
-        "Metadata for sensitive features other than names have not been implemented.")
-    }
-  }
+) extends JsonLike {
+  val EntryName: String
+  def toMetadata: Metadata
 }
-case object SensitiveFeatureInformation extends Enum[SensitiveFeatureInformation] {
+
+object SensitiveFeatureInformation {
   val NameKey = "FeatureName"
   val MapKeyKey = "MapKey"
   val ActionTakenKey = "ActionTaken"
   val TypeKey = "DetectedSensitiveFeatureKind"
-  val values: Seq[SensitiveFeatureInformation] = findValues
-
-  // Utilized by SmartTextVectorizer's name detection
-  case class Name
-  (
-    probName: Double,
-    genderDetectResults: Seq[String],
-    probMale: Double,
-    probFemale: Double,
-    probOther: Double,
-    override val name: String,
-    override val key: Option[String] = None,
-    override val actionTaken: Boolean = false
-  ) extends SensitiveFeatureInformation(name, key, actionTaken) {
-    override val entryName: String = SensitiveFeatureInformation.Name.EntryName
-  }
-  case object Name {
-    val EntryName = "Name"
-    val ProbNameKey = "ProbName"
-    val GenderDetectStratsKey = "GenderDetectStrats"
-    val ProbMaleKey = "ProbMale"
-    val ProbFemaleKey = "ProbFemale"
-    val ProbOtherKey = "ProbOther"
-  }
-
-  // Not yet implemented
-  case object Salutation extends SensitiveFeatureInformation("None", None, false)
-  case object BirthDate extends SensitiveFeatureInformation("None", None, false)
-  case object PostalCode extends SensitiveFeatureInformation("None", None, false)
-  case object Other extends SensitiveFeatureInformation("None", None, false)
 
   /**
    * Build metadata from Map of [[SensitiveFeatureInformation]] instances
@@ -144,13 +89,13 @@ case object SensitiveFeatureInformation extends Enum[SensitiveFeatureInformation
    */
   def fromMetadata(meta: Metadata): SensitiveFeatureInformation = {
     meta.getString(SensitiveFeatureInformation.TypeKey) match {
-      case SensitiveFeatureInformation.Name.EntryName =>
-        SensitiveFeatureInformation.Name(
-          meta.getDouble(SensitiveFeatureInformation.Name.ProbNameKey),
-          meta.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey),
-          meta.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey),
-          meta.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey),
-          meta.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey),
+      case SensitiveNameInformation.EntryName =>
+        SensitiveNameInformation(
+          meta.getDouble(SensitiveNameInformation.ProbNameKey),
+          meta.getStringArray(SensitiveNameInformation.GenderDetectStratsKey),
+          meta.getDouble(SensitiveNameInformation.ProbMaleKey),
+          meta.getDouble(SensitiveNameInformation.ProbFemaleKey),
+          meta.getDouble(SensitiveNameInformation.ProbOtherKey),
           meta.getString(SensitiveFeatureInformation.NameKey),
           {
             val mapKey = meta.getString(SensitiveFeatureInformation.MapKeyKey)
@@ -163,3 +108,42 @@ case object SensitiveFeatureInformation extends Enum[SensitiveFeatureInformation
     }
   }
 }
+
+case class SensitiveNameInformation
+(
+  probName: Double,
+  genderDetectResults: Seq[String],
+  probMale: Double,
+  probFemale: Double,
+  probOther: Double,
+  override val name: String,
+  override val key: Option[String] = None,
+  override val actionTaken: Boolean = false
+) extends SensitiveFeatureInformation(name, key, actionTaken) {
+  override val EntryName: String = SensitiveNameInformation.EntryName
+  override def toMetadata: Metadata = {
+    new MetadataBuilder()
+      .putString(SensitiveFeatureInformation.NameKey, name)
+      .putString(SensitiveFeatureInformation.MapKeyKey, key.getOrElse(""))
+      .putBoolean(SensitiveFeatureInformation.ActionTakenKey, actionTaken)
+      .putString(SensitiveFeatureInformation.TypeKey, this.EntryName)
+      .putDouble(SensitiveNameInformation.ProbNameKey, probName)
+      .putStringArray(SensitiveNameInformation.GenderDetectStratsKey, genderDetectResults.toArray)
+      .putDouble(SensitiveNameInformation.ProbMaleKey, probMale)
+      .putDouble(SensitiveNameInformation.ProbFemaleKey, probFemale)
+      .putDouble(SensitiveNameInformation.ProbOtherKey, probOther)
+      .build()
+  }
+}
+
+case object SensitiveNameInformation {
+  val EntryName = "SensitiveNameInformation"
+  val ProbNameKey = "ProbName"
+  val GenderDetectStratsKey = "GenderDetectStrats"
+  val ProbMaleKey = "ProbMale"
+  val ProbFemaleKey = "ProbFemale"
+  val ProbOtherKey = "ProbOther"
+}
+
+// TODO: Use this everywhere
+case class GenderDetectionResults(strategyString: String, pctUnidentified: Double) extends JsonLike
diff --git a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
index 8dfbb23f63..7ec239fce1 100644
--- a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
+++ b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
@@ -48,7 +48,7 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
   val mapKey: Option[String] = None
   val actionTaken = true
 
-  val sensitiveFeatureInfo: SensitiveFeatureInformation.Name = SensitiveFeatureInformation.Name(
+  val sensitiveFeatureInfo: SensitiveNameInformation = SensitiveNameInformation(
     probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken
   )
 
@@ -59,26 +59,26 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
     metadata.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true
     metadata.contains(SensitiveFeatureInformation.ActionTakenKey) shouldBe true
     metadata.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
-    metadata.contains(SensitiveFeatureInformation.Name.ProbNameKey) shouldBe true
-    metadata.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true
-    metadata.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true
-    metadata.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true
-    metadata.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true
+    metadata.contains(SensitiveNameInformation.ProbNameKey) shouldBe true
+    metadata.contains(SensitiveNameInformation.GenderDetectStratsKey) shouldBe true
+    metadata.contains(SensitiveNameInformation.ProbMaleKey) shouldBe true
+    metadata.contains(SensitiveNameInformation.ProbFemaleKey) shouldBe true
+    metadata.contains(SensitiveNameInformation.ProbOtherKey) shouldBe true
 
     metadata.getString(SensitiveFeatureInformation.NameKey) shouldBe name
     metadata.getString(SensitiveFeatureInformation.MapKeyKey) shouldBe ""
     metadata.getBoolean(SensitiveFeatureInformation.ActionTakenKey) shouldBe actionTaken
-    metadata.getString(SensitiveFeatureInformation.TypeKey) shouldBe SensitiveFeatureInformation.Name.EntryName
-    metadata.getDouble(SensitiveFeatureInformation.Name.ProbNameKey) shouldBe probName
-    metadata.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe genderDetectResults
-    metadata.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe probMale
-    metadata.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe probFemale
-    metadata.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe probOther
+    metadata.getString(SensitiveFeatureInformation.TypeKey) shouldBe SensitiveNameInformation.EntryName
+    metadata.getDouble(SensitiveNameInformation.ProbNameKey) shouldBe probName
+    metadata.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe genderDetectResults
+    metadata.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe probMale
+    metadata.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe probFemale
+    metadata.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe probOther
   }
 
   it should "create metadata from a map" in {
     val info1 = sensitiveFeatureInfo
-    val info2 = SensitiveFeatureInformation.Name(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true)
+    val info2 = SensitiveNameInformation(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true)
     val map = Map("1" -> Seq(info1), "2" -> Seq(info2))
     val metadata = SensitiveFeatureInformation.toMetadata(map)
 
@@ -90,33 +90,33 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
     f1.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true
     f1.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
     f1.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
-    f1.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true
-    f1.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true
-    f1.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true
-    f1.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true
-    f1.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe genderDetectResults
-    f1.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe probMale
-    f1.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe probFemale
-    f1.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe probOther
+    f1.contains(SensitiveNameInformation.GenderDetectStratsKey) shouldBe true
+    f1.contains(SensitiveNameInformation.ProbMaleKey) shouldBe true
+    f1.contains(SensitiveNameInformation.ProbFemaleKey) shouldBe true
+    f1.contains(SensitiveNameInformation.ProbOtherKey) shouldBe true
+    f1.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe genderDetectResults
+    f1.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe probMale
+    f1.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe probFemale
+    f1.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe probOther
 
     val f2 = metadata.getMetadataArray("2").head
     f2.contains(SensitiveFeatureInformation.NameKey) shouldBe true
     f2.contains(SensitiveFeatureInformation.MapKeyKey) shouldBe true
     f2.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
     f2.contains(SensitiveFeatureInformation.TypeKey) shouldBe true
-    f2.contains(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe true
-    f2.contains(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe true
-    f2.contains(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe true
-    f2.contains(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe true
-    f2.getStringArray(SensitiveFeatureInformation.Name.GenderDetectStratsKey) shouldBe Seq("")
-    f2.getDouble(SensitiveFeatureInformation.Name.ProbMaleKey) shouldBe 0.0
-    f2.getDouble(SensitiveFeatureInformation.Name.ProbFemaleKey) shouldBe 0.0
-    f2.getDouble(SensitiveFeatureInformation.Name.ProbOtherKey) shouldBe 0.0
+    f2.contains(SensitiveNameInformation.GenderDetectStratsKey) shouldBe true
+    f2.contains(SensitiveNameInformation.ProbMaleKey) shouldBe true
+    f2.contains(SensitiveNameInformation.ProbFemaleKey) shouldBe true
+    f2.contains(SensitiveNameInformation.ProbOtherKey) shouldBe true
+    f2.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe Seq("")
+    f2.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe 0.0
+    f2.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe 0.0
+    f2.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe 0.0
   }
 
   it should "create a map from metadata" in {
     val info1 = sensitiveFeatureInfo
-    val info2 = SensitiveFeatureInformation.Name(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true)
+    val info2 = SensitiveNameInformation(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true)
 
     val mapMetadata = new MetadataBuilder()
       .putMetadataArray("1", Array(info1.toMetadata))

From eac0e058a47ad7157738c820b2596a9148027702 Mon Sep 17 00:00:00 2001
From: Michael Yang <michael.yang@salesforce.com>
Date: Thu, 23 Jan 2020 20:25:07 -0800
Subject: [PATCH 12/13] Using case class for GenderDetectionStrategy
 information

---
 .../com/salesforce/op/ModelInsightsTest.scala |  6 ++---
 .../op/utils/spark/OPVectorMetadataTest.scala | 16 ++++++++----
 .../op/SensitiveFeatureInformation.scala      | 25 +++++++++++++++----
 .../op/SensitiveFeatureInformationTest.scala  | 23 ++++++++++++-----
 4 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
index 3ec6a43306..2427076b67 100644
--- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
+++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
@@ -576,7 +576,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
     },
     Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap,
     Map(
-      "f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false))
+      "f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None, false))
     )
   )
 
@@ -728,10 +728,10 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
       Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap,
       Map(
         "f0" -> Seq(SensitiveNameInformation(
-          0.0, Seq.empty[String], 0.0, 0.0, 1.0, "f0", None, false
+          0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None, false
         )),
         "f_notInMeta" -> Seq(SensitiveNameInformation(
-          1.0, Seq.empty[String], 0.0, 0.0, 1.0, "f_notInMeta", None, true
+          1.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f_notInMeta", None, true
         ))
       )
     )
diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
index a3c5963f4d..b1532ef5dc 100644
--- a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
+++ b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
@@ -30,7 +30,7 @@
 
 package com.salesforce.op.utils.spark
 
-import com.salesforce.op.{FeatureHistory, SensitiveFeatureInformation}
+import com.salesforce.op.{FeatureHistory, GenderDetectionResults, SensitiveFeatureInformation, SensitiveNameInformation}
 import com.salesforce.op.features.types.{DateTime, Email, FeatureType, OPMap, PickList, Prediction, Real, RealMap, TextAreaMap}
 import com.salesforce.op.test.TestCommon
 import org.apache.spark.sql.types.Metadata
@@ -49,7 +49,7 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
   type FeatureHistoryTuple = (Seq[String], Seq[String])
 
   type SensitiveTuple = (SensitiveNameTuple, String, Option[String], Boolean)
-  type SensitiveNameTuple = (Double, Seq[String], Double, Double, Double)
+  type SensitiveNameTuple = (Double, Seq[String], Seq[Double], Double, Double, Double)
 
   type OpVectorTuple = (String, Array[OpVectorColumnTuple], FeatureHistoryTuple, Seq[SensitiveTuple])
 
@@ -81,12 +81,13 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
     mapKey <- Gen.option(genName)
     actionTaken <- Gen.oneOf[Boolean](Seq(false, true))
     probName <- Gen.choose(0.0, 1.0)
-    genderDetectResults <- Gen.containerOf[Seq, String](genName)
+    genderDetectNames <- Gen.containerOf[Seq, String](genName)
+    genderDetectNums <- Gen.containerOf[Seq, Double](Gen.choose(0.0, 1.0))
     probMale <- Gen.choose(0.0, 1.0)
     probFemale <- Gen.choose(0.0, 1.0 - probMale)
     probOther <- Gen.choose(0.0, 1.0 - probMale - probFemale)
   } yield {
-    ((probName, genderDetectResults, probMale, probFemale, probOther), featureName, mapKey, actionTaken)
+    ((probName, genderDetectNames, genderDetectNums, probMale, probFemale, probOther), featureName, mapKey, actionTaken)
   }
 
   val vecGen: Gen[OpVectorTuple] = for {
@@ -109,7 +110,12 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
     columnsMeta: Array[OpVectorColumnMetadata], sensitiveInfoSeqRaw: Seq[SensitiveTuple]
   ): Map[String, Seq[SensitiveFeatureInformation]] = {
     val sensitiveInfoSeq = sensitiveInfoSeqRaw map {
-      case ((probName, genderDetectResults, probMale, probFemale, probOther), featureName, mapKey, actionTaken) =>
+      case (
+        (probName, genderDetectNames, genderDetectNums, probMale, probFemale, probOther),
+        featureName, mapKey, actionTaken) =>
+        val genderDetectResults = genderDetectNames.zip(genderDetectNums).map {
+          case (name, pct) => GenderDetectionResults(name, pct)
+        }
         SensitiveNameInformation(
           probName, genderDetectResults, probMale, probFemale, probOther, featureName, mapKey, actionTaken
         )
diff --git a/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala
index 817914ae9e..2cbdb5d36b 100644
--- a/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala
+++ b/utils/src/main/scala/com/salesforce/op/SensitiveFeatureInformation.scala
@@ -92,7 +92,9 @@ object SensitiveFeatureInformation {
       case SensitiveNameInformation.EntryName =>
         SensitiveNameInformation(
           meta.getDouble(SensitiveNameInformation.ProbNameKey),
-          meta.getStringArray(SensitiveNameInformation.GenderDetectStratsKey),
+          meta.getMetadataArray(
+            SensitiveNameInformation.GenderDetectStratsKey
+          ).map(GenderDetectionResults.fromMetadata),
           meta.getDouble(SensitiveNameInformation.ProbMaleKey),
           meta.getDouble(SensitiveNameInformation.ProbFemaleKey),
           meta.getDouble(SensitiveNameInformation.ProbOtherKey),
@@ -112,7 +114,7 @@ object SensitiveFeatureInformation {
 case class SensitiveNameInformation
 (
   probName: Double,
-  genderDetectResults: Seq[String],
+  genderDetectResults: Seq[GenderDetectionResults],
   probMale: Double,
   probFemale: Double,
   probOther: Double,
@@ -128,7 +130,7 @@ case class SensitiveNameInformation
       .putBoolean(SensitiveFeatureInformation.ActionTakenKey, actionTaken)
       .putString(SensitiveFeatureInformation.TypeKey, this.EntryName)
       .putDouble(SensitiveNameInformation.ProbNameKey, probName)
-      .putStringArray(SensitiveNameInformation.GenderDetectStratsKey, genderDetectResults.toArray)
+      .putMetadataArray(SensitiveNameInformation.GenderDetectStratsKey, genderDetectResults.toArray.map(_.toMetadata))
       .putDouble(SensitiveNameInformation.ProbMaleKey, probMale)
       .putDouble(SensitiveNameInformation.ProbFemaleKey, probFemale)
       .putDouble(SensitiveNameInformation.ProbOtherKey, probOther)
@@ -145,5 +147,18 @@ case object SensitiveNameInformation {
   val ProbOtherKey = "ProbOther"
 }
 
-// TODO: Use this everywhere
-case class GenderDetectionResults(strategyString: String, pctUnidentified: Double) extends JsonLike
+case class GenderDetectionResults(strategyString: String, pctUnidentified: Double) extends JsonLike {
+  def toMetadata: Metadata = {
+    new MetadataBuilder()
+      .putString(GenderDetectionResults.StrategyStringKey, strategyString)
+      .putDouble(GenderDetectionResults.PctUnidentifiedKey, pctUnidentified)
+      .build()
+  }
+}
+case object GenderDetectionResults {
+  val StrategyStringKey = "strategyString"
+  val PctUnidentifiedKey = "pctUnidentified"
+  def fromMetadata(meta: Metadata): GenderDetectionResults = {
+    GenderDetectionResults(meta.getString(StrategyStringKey), meta.getDouble(PctUnidentifiedKey))
+  }
+}
diff --git a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
index 7ec239fce1..74ff254e90 100644
--- a/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
+++ b/utils/src/test/scala/com/salesforce/op/SensitiveFeatureInformationTest.scala
@@ -40,7 +40,10 @@ import org.scalatest.junit.JUnitRunner
 class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
 
   val probName = 1.0
-  val genderDetectResults: Seq[String] = Seq("ByIndex", "AnotherStrategy")
+  val genderDetectResults: Seq[GenderDetectionResults] = Seq(
+    GenderDetectionResults("ByIndex", 0.1),
+    GenderDetectionResults("AnotherStrategy", 0.99)
+  )
   val probMale = 0.25
   val probFemale = 0.50
   val probOther = 0.25
@@ -70,7 +73,9 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
     metadata.getBoolean(SensitiveFeatureInformation.ActionTakenKey) shouldBe actionTaken
     metadata.getString(SensitiveFeatureInformation.TypeKey) shouldBe SensitiveNameInformation.EntryName
     metadata.getDouble(SensitiveNameInformation.ProbNameKey) shouldBe probName
-    metadata.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe genderDetectResults
+    metadata.getMetadataArray(
+      SensitiveNameInformation.GenderDetectStratsKey
+    ).map(GenderDetectionResults.fromMetadata) shouldBe genderDetectResults
     metadata.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe probMale
     metadata.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe probFemale
     metadata.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe probOther
@@ -78,7 +83,8 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
 
   it should "create metadata from a map" in {
     val info1 = sensitiveFeatureInfo
-    val info2 = SensitiveNameInformation(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true)
+    val info2 = SensitiveNameInformation(0.0,
+      Seq(GenderDetectionResults("", 0)), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true)
     val map = Map("1" -> Seq(info1), "2" -> Seq(info2))
     val metadata = SensitiveFeatureInformation.toMetadata(map)
 
@@ -94,7 +100,9 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
     f1.contains(SensitiveNameInformation.ProbMaleKey) shouldBe true
     f1.contains(SensitiveNameInformation.ProbFemaleKey) shouldBe true
     f1.contains(SensitiveNameInformation.ProbOtherKey) shouldBe true
-    f1.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe genderDetectResults
+    f1.getMetadataArray(
+      SensitiveNameInformation.GenderDetectStratsKey
+    ).map(GenderDetectionResults.fromMetadata) shouldBe genderDetectResults
     f1.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe probMale
     f1.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe probFemale
     f1.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe probOther
@@ -108,7 +116,9 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
     f2.contains(SensitiveNameInformation.ProbMaleKey) shouldBe true
     f2.contains(SensitiveNameInformation.ProbFemaleKey) shouldBe true
     f2.contains(SensitiveNameInformation.ProbOtherKey) shouldBe true
-    f2.getStringArray(SensitiveNameInformation.GenderDetectStratsKey) shouldBe Seq("")
+    f2.getMetadataArray(
+      SensitiveNameInformation.GenderDetectStratsKey
+    ).map(GenderDetectionResults.fromMetadata) shouldBe Seq(GenderDetectionResults("", 0))
     f2.getDouble(SensitiveNameInformation.ProbMaleKey) shouldBe 0.0
     f2.getDouble(SensitiveNameInformation.ProbFemaleKey) shouldBe 0.0
     f2.getDouble(SensitiveNameInformation.ProbOtherKey) shouldBe 0.0
@@ -116,7 +126,8 @@ class SensitiveFeatureInformationTest extends FlatSpec with TestCommon {
 
   it should "create a map from metadata" in {
     val info1 = sensitiveFeatureInfo
-    val info2 = SensitiveNameInformation(0.0, Seq(""), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true)
+    val info2 = SensitiveNameInformation(0.0,
+      Seq(GenderDetectionResults("", 0)), 0.0, 0.0, 0.0, "f2", Some("key"), actionTaken = true)
 
     val mapMetadata = new MetadataBuilder()
       .putMetadataArray("1", Array(info1.toMetadata))

From 26f30fb00a9b24b78e88891766b0f4094d4a973b Mon Sep 17 00:00:00 2001
From: Michael Yang <michael.yang@salesforce.com>
Date: Thu, 23 Jan 2020 20:34:06 -0800
Subject: [PATCH 13/13] Cleaning up tests per PR comments

---
 .../com/salesforce/op/ModelInsightsTest.scala | 51 ++++++++++---------
 .../op/utils/spark/OPVectorMetadataTest.scala |  4 +-
 2 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
index 2427076b67..50f5f3b392 100644
--- a/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
+++ b/core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala
@@ -576,7 +576,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
     },
     Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap,
     Map(
-      "f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None, false))
+      "f0" -> Seq(SensitiveNameInformation(0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None))
     )
   )
 
@@ -709,32 +709,33 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
     """include sensitive feature information
       |even for sensitive features that are removed from output vector and output vector metadata""".stripMargin in {
     // Copy metadata from above but add new feature that was removed in vectorizing to sensitive info
-    val f_notInMeta = Feature[Text]("f_notInMeta", false, null, Seq(), "test")
-    val newMeta = OpVectorMetadata(
-      "fv",
+    val f_notInMeta = Feature[Text]("f_notInMeta", isResponse = false, null, Seq(), "test")
+    val newFeatureName = "fv"
+    val newColumnMeta = OpVectorColumnMetadata(
+      parentFeatureName = Seq("f1"),
+      parentFeatureType = Seq(classOf[Real].getName),
+      grouping = None,
+      indicatorValue = None
+    ) +: Array("f2", "f3").map { name =>
       OpVectorColumnMetadata(
-        parentFeatureName = Seq("f1"),
-        parentFeatureType = Seq(classOf[Real].getName),
-        grouping = None,
-        indicatorValue = None
-      ) +: Array("f2", "f3").map { name =>
-        OpVectorColumnMetadata(
-          parentFeatureName = Seq("f0"),
-          parentFeatureType = Seq(classOf[PickList].getName),
-          grouping = Option("f0"),
-          indicatorValue = Option(name)
-        )
-      },
-      Seq("f1", "f0").map(name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())).toMap,
-      Map(
-        "f0" -> Seq(SensitiveNameInformation(
-          0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None, false
-        )),
-        "f_notInMeta" -> Seq(SensitiveNameInformation(
-          1.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f_notInMeta", None, true
-        ))
+        parentFeatureName = Seq("f0"),
+        parentFeatureType = Seq(classOf[PickList].getName),
+        grouping = Option("f0"),
+        indicatorValue = Option(name)
       )
+    }
+    val newFeatureHistory = Seq("f1", "f0").map(
+      name => name -> FeatureHistory(originFeatures = Seq(name), stages = Seq())
+    ).toMap
+    val newSensitiveInfo = Map(
+      "f0" -> Seq(SensitiveNameInformation(
+        0.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f0", None
+      )),
+      "f_notInMeta" -> Seq(SensitiveNameInformation(
+        1.0, Seq.empty[GenderDetectionResults], 0.0, 0.0, 1.0, "f_notInMeta", None, actionTaken = true
+      ))
     )
+    val newMeta = OpVectorMetadata(newFeatureName, newColumnMeta, newFeatureHistory, newSensitiveInfo)
 
     val labelSum = ModelInsights.getLabelSummary(Option(lbl), Option(summary))
 
@@ -749,7 +750,7 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest with Dou
     f_notInMeta_butInInsights.derivedFeatures.size shouldBe 0
     f_notInMeta_butInInsights.sensitiveInformation match {
       case Seq(SensitiveNameInformation(
-        probName, genderDetectResults, probMale, probFemale, probOther, name, mapKey, actionTaken
+        probName, genderDetectResults, probMale, probFemale, probOther, _, _, actionTaken
       )) =>
         actionTaken shouldBe true
         probName shouldBe 1.0
diff --git a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
index b1532ef5dc..b2a51f28e4 100644
--- a/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
+++ b/features/src/test/scala/com/salesforce/op/utils/spark/OPVectorMetadataTest.scala
@@ -76,7 +76,7 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
   )
   val arrVecColTupleGen: Gen[Array[OpVectorColumnTuple]] = Gen.containerOf[Array, OpVectorColumnTuple](vecColTupleGen)
 
-  val sensitiveGen: Gen[SensitiveTuple] = for {
+  val sensitiveNameGen: Gen[SensitiveTuple] = for {
     featureName <- genName
     mapKey <- Gen.option(genName)
     actionTaken <- Gen.oneOf[Boolean](Seq(false, true))
@@ -94,7 +94,7 @@ class OPVectorMetadataTest extends PropSpec with TestCommon with PropertyChecks
     name <- genName
     arr <- arrVecColTupleGen
     histories <- featHistTupleGen
-    sensitiveCols <- Gen.containerOf[Seq, SensitiveTuple](sensitiveGen)
+    sensitiveCols <- Gen.containerOf[Seq, SensitiveTuple](sensitiveNameGen)
   } yield {
     (name, arr, histories, sensitiveCols)
   }