Skip to content

Commit

Permalink
Rename whitelist/blacklist to allowlist/blocklist (#491)
Browse files Browse the repository at this point in the history
* remove potentially offensive terms from Boston data set description

* rename whitelist/blacklist to allowlist/denylist

* use SF standard terminology: blocklist

* use SF standard terminology: blocklist

* typo

* use SF standard terminology: blocklist

* keep caps

* add TODOs for backward compatibility

* ensure backward compatibility with "blacklist" terms in older saved models

* flatMap!

* postpone changes to Boston dataset

* postpone changes to Boston dataset
  • Loading branch information
nicodv committed Jul 17, 2020
1 parent 95a1f39 commit f764842
Show file tree
Hide file tree
Showing 31 changed files with 1,121 additions and 359 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ New features / updates:
- Improve WeekOfMonth in date transformers [#323](https://github.com/salesforce/TransmogrifAI/pull/323)
- Improved datetime unit transformer shortcuts - Part 2 [#319](https://github.com/salesforce/TransmogrifAI/pull/319)
- Correctly pass main class for CLI sub project [#321](https://github.com/salesforce/TransmogrifAI/pull/321)
- Serialize blacklisted map keys with the model + updated access on workflow/model members [#320](https://github.com/salesforce/TransmogrifAI/pull/320)
- Serialize blocklisted map keys with the model + updated access on workflow/model members [#320](https://github.com/salesforce/TransmogrifAI/pull/320)
- Improved datetime unit transformer shortcuts [#316](https://github.com/salesforce/TransmogrifAI/pull/316)
- Improved OpScalarStandardScalerTest [#317](https://github.com/salesforce/TransmogrifAI/pull/317)
- improved PercentileCalibratorTest [#318](https://github.com/salesforce/TransmogrifAI/pull/318)
Expand All @@ -101,7 +101,7 @@ New features / updates:
Dependency updates:
- Bump shadowjar plugin to 5.0.0 [#306](https://github.com/salesforce/TransmogrifAI/pull/306)
- Bump Apache Tika to 1.21 [#331](https://github.com/salesforce/TransmogrifAI/pull/331)
- Enable CicleCI version 2.1 [#353](https://github.com/salesforce/TransmogrifAI/pull/353)
- Enable CircleCI version 2.1 [#353](https://github.com/salesforce/TransmogrifAI/pull/353)

## 0.5.3

Expand Down
22 changes: 11 additions & 11 deletions core/src/main/scala/com/salesforce/op/ModelInsights.scala
Original file line number Diff line number Diff line change
Expand Up @@ -436,17 +436,17 @@ case object ModelInsights {
* @param stages stages used to make the feature
* @param rawFeatures raw features in the workflow
* @param trainingParams parameters used to create the workflow model
* @param blacklistedFeatures blacklisted features from use in DAG
* @param blacklistedMapKeys blacklisted map keys from use in DAG
* @param blocklistedFeatures blocklisted features from use in DAG
* @param blocklistedMapKeys blocklisted map keys from use in DAG
* @param rawFeatureFilterResults results of raw feature filter
* @return
*/
private[op] def extractFromStages(
stages: Array[OPStage],
rawFeatures: Array[features.OPFeature],
trainingParams: OpParams,
blacklistedFeatures: Array[features.OPFeature],
blacklistedMapKeys: Map[String, Set[String]],
blocklistedFeatures: Array[features.OPFeature],
blocklistedMapKeys: Map[String, Set[String]],
rawFeatureFilterResults: RawFeatureFilterResults
): ModelInsights = {

Expand Down Expand Up @@ -523,7 +523,7 @@ case object ModelInsights {
ModelInsights(
label = labelSummary,
features = getFeatureInsights(vectorInput, checkerSummary, model, rawFeatures,
blacklistedFeatures, blacklistedMapKeys, rawFeatureFilterResults, labelSummary),
blocklistedFeatures, blocklistedMapKeys, rawFeatureFilterResults, labelSummary),
selectedModelInfo = getModelInfo(model),
trainingParams = trainingParams,
stageInfo = RawFeatureFilterConfig.toStageInfo(rawFeatureFilterResults.rawFeatureFilterConfig)
Expand Down Expand Up @@ -571,8 +571,8 @@ case object ModelInsights {
summary: Option[SanityCheckerSummary],
model: Option[Model[_]],
rawFeatures: Array[features.OPFeature],
blacklistedFeatures: Array[features.OPFeature],
blacklistedMapKeys: Map[String, Set[String]],
blocklistedFeatures: Array[features.OPFeature],
blocklistedMapKeys: Map[String, Set[String]],
rawFeatureFilterResults: RawFeatureFilterResults = RawFeatureFilterResults(),
label: LabelSummary
): Seq[FeatureInsights] = {
Expand Down Expand Up @@ -678,21 +678,21 @@ case object ModelInsights {
case (None, _) => Seq.empty
}

val blacklistInsights = blacklistedFeatures.map{ f =>
val blocklistInsights = blocklistedFeatures.map{ f =>
Seq(f.name) -> Insights(derivedFeatureName = f.name, stagesApplied = Seq.empty, derivedFeatureGroup = None,
derivedFeatureValue = None, excluded = Some(true))
}

val blacklistMapInsights = blacklistedMapKeys.toArray.flatMap { case (mname, keys) =>
val blocklistMapInsights = blocklistedMapKeys.toArray.flatMap { case (mname, keys) =>
keys.toArray.map(key => {
Seq(mname) ->
Insights(derivedFeatureName = key, stagesApplied = Seq.empty, derivedFeatureGroup = Some(key),
derivedFeatureValue = None, excluded = Some(true))
})
}

val allInsights = featureInsights ++ blacklistInsights ++ blacklistMapInsights
val allFeatures = rawFeatures ++ blacklistedFeatures
val allInsights = featureInsights ++ blocklistInsights ++ blocklistMapInsights
val allFeatures = rawFeatures ++ blocklistedFeatures

allInsights
.flatMap { case (feature, insights) => feature.map(_ -> insights) }
Expand Down
56 changes: 28 additions & 28 deletions core/src/main/scala/com/salesforce/op/OpWorkflow.scala
Original file line number Diff line number Diff line change
Expand Up @@ -111,64 +111,64 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {


/**
* Will set the blacklisted features variable and if list is non-empty it will
* @param features list of features to blacklist
* Will set the blocklisted features variable and if list is non-empty it will
* @param features list of features to blocklist
* @param distributions feature distributions calculated in raw feature filter
*/
private[op] def setBlacklist(features: Array[OPFeature], distributions: Seq[FeatureDistribution]): Unit = {
// TODO: Figure out a way to keep track of raw features that aren't explicitly blacklisted, but can't be used
// TODO: because they're inputs into an explicitly blacklisted feature. Eg. "height" in ModelInsightsTest
private[op] def setBlocklist(features: Array[OPFeature], distributions: Seq[FeatureDistribution]): Unit = {
// TODO: Figure out a way to keep track of raw features that aren't explicitly blocklisted, but can't be used
// TODO: because they're inputs into an explicitly blocklisted feature. Eg. "height" in ModelInsightsTest

def finalResultFeaturesCheck(resultFeatures: Array[OPFeature], blacklisted: List[OPFeature]): Unit = {
def finalResultFeaturesCheck(resultFeatures: Array[OPFeature], blocklisted: List[OPFeature]): Unit = {
if (resultFeaturePolicy == ResultFeatureRetention.Strict) {
resultFeatures.foreach{ f => if (blacklisted.contains(f)) {
throw new IllegalArgumentException(s"Blacklist of features (${blacklisted.map(_.name).mkString(", ")})" +
resultFeatures.foreach{ f => if (blocklisted.contains(f)) {
throw new IllegalArgumentException(s"Blocklist of features (${blocklisted.map(_.name).mkString(", ")})" +
s" from RawFeatureFilter contained the result feature ${f.name}") } }
} else if (resultFeaturePolicy == ResultFeatureRetention.AtLeastOne) {
if (resultFeatures.forall(blacklisted.contains)) throw new IllegalArgumentException(s"Blacklist of features" +
s" (${blacklisted.map(_.name).mkString(", ")}) from RawFeatureFilter removed all result features")
if (resultFeatures.forall(blocklisted.contains)) throw new IllegalArgumentException(s"Blocklist of features" +
s" (${blocklisted.map(_.name).mkString(", ")}) from RawFeatureFilter removed all result features")
} else throw new IllegalArgumentException(s"result feature retention policy $resultFeaturePolicy not supported")
}

blacklistedFeatures = features
if (blacklistedFeatures.nonEmpty) {
val allBlacklisted: MList[OPFeature] = MList(getBlacklist(): _*)
blocklistedFeatures = features
if (blocklistedFeatures.nonEmpty) {
val allBlocklisted: MList[OPFeature] = MList(getBlocklist(): _*)
val allUpdated: MList[OPFeature] = MList.empty

val initialResultFeatures = getResultFeatures()
finalResultFeaturesCheck(initialResultFeatures, allBlacklisted.toList)
finalResultFeaturesCheck(initialResultFeatures, allBlocklisted.toList)

val initialStages = getStages() // ordered by DAG so dont need to recompute DAG
// for each stage remove anything blacklisted from the inputs and update any changed input features
// for each stage remove anything blocklisted from the inputs and update any changed input features
initialStages.foreach { stg =>
val inFeatures = stg.getInputFeatures()
val blacklistRemoved = inFeatures
.filterNot { f => allBlacklisted.exists(bl => bl.sameOrigin(f)) }
val blocklistRemoved = inFeatures
.filterNot { f => allBlocklisted.exists(bl => bl.sameOrigin(f)) }
.map { f =>
if (f.isRaw) f.withDistributions(distributions.collect { case d if d.name == f.name => d }) else f
}
val inputsChanged = blacklistRemoved.map{ f => allUpdated.find(u => u.sameOrigin(f)).getOrElse(f) }
val inputsChanged = blocklistRemoved.map{ f => allUpdated.find(u => u.sameOrigin(f)).getOrElse(f) }
val oldOutput = stg.getOutput()
Try(stg.setInputFeatureArray(inputsChanged).setOutputFeatureName(oldOutput.name).getOutput()) match {
case Success(out) => allUpdated += out
case Failure(e) =>
log.info(s"Issue updating inputs for stage $stg: $e")
allBlacklisted += oldOutput
finalResultFeaturesCheck(initialResultFeatures, allBlacklisted.toList)
allBlocklisted += oldOutput
finalResultFeaturesCheck(initialResultFeatures, allBlocklisted.toList)
}
}

// Update the whole DAG with the blacklisted features expunged
// Update the whole DAG with the blocklisted features expunged
val updatedResultFeatures = initialResultFeatures
.filterNot(allBlacklisted.contains)
.filterNot(allBlocklisted.contains)
.map{ f => allUpdated.find(u => u.sameOrigin(f)).getOrElse(f) }
setResultFeatures(updatedResultFeatures: _*)
}
}


protected[op] def setBlacklistMapKeys(mapKeys: Map[String, Set[String]]): Unit = {
blacklistedMapKeys = mapKeys
protected[op] def setBlocklistMapKeys(mapKeys: Map[String, Set[String]]): Unit = {
blocklistedMapKeys = mapKeys
}

/**
Expand Down Expand Up @@ -253,8 +253,8 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
rf.generateFilteredRaw(rawFeatures, parameters)

setRawFeatureFilterResults(rawFeatureFilterResults)
setBlacklist(featuresToDrop, rawFeatureFilterResults.rawFeatureDistributions)
setBlacklistMapKeys(mapKeysToDrop)
setBlocklist(featuresToDrop, rawFeatureFilterResults.rawFeatureDistributions)
setBlocklistMapKeys(mapKeysToDrop)
cleanedData
}
}
Expand Down Expand Up @@ -366,8 +366,8 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
.setStages(fittedStages)
.setFeatures(newResultFeatures)
.setParameters(getParameters())
.setBlacklist(getBlacklist())
.setBlacklistMapKeys(getBlacklistMapKeys())
.setBlocklist(getBlocklist())
.setBlocklistMapKeys(getBlocklistMapKeys())
.setRawFeatureFilterResults(getRawFeatureFilterResults())

reader.map(model.setReader).getOrElse(model)
Expand Down
20 changes: 10 additions & 10 deletions core/src/main/scala/com/salesforce/op/OpWorkflowCore.scala
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,11 @@ private[op] trait OpWorkflowCore {
// raw features generated after data is read in and aggregated
protected var rawFeatures: Array[OPFeature] = Array[OPFeature]()

// features that have been blacklisted from use in dag
protected var blacklistedFeatures: Array[OPFeature] = Array[OPFeature]()
// features that have been blocklisted from use in dag
protected var blocklistedFeatures: Array[OPFeature] = Array[OPFeature]()

// map keys that were blacklisted from use in dag
protected var blacklistedMapKeys: Map[String, Set[String]] = Map[String, Set[String]]()
// map keys that were blocklisted from use in dag
protected var blocklistedMapKeys: Map[String, Set[String]] = Map[String, Set[String]]()

// raw feature filter results calculated in raw feature filter
protected var rawFeatureFilterResults: RawFeatureFilterResults = RawFeatureFilterResults()
Expand Down Expand Up @@ -201,18 +201,18 @@ private[op] trait OpWorkflowCore {
}

/**
* Get the list of raw features which have been blacklisted
* Get the list of raw features which have been blocklisted
*
* @return blacklisted features
* @return blocklisted features
*/
final def getBlacklist(): Array[OPFeature] = blacklistedFeatures
final def getBlocklist(): Array[OPFeature] = blocklistedFeatures

/**
* Get the list of Map Keys which have been blacklisted
* Get the list of Map Keys which have been blocklisted
*
* @return blacklisted map keys
* @return blocklisted map keys
*/
final def getBlacklistMapKeys(): Map[String, Set[String]] = blacklistedMapKeys
final def getBlocklistMapKeys(): Map[String, Set[String]] = blocklistedMapKeys

/**
* Get the parameter settings passed into the workflow
Expand Down
16 changes: 8 additions & 8 deletions core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,13 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams
this
}

protected[op] def setBlacklist(features: Array[OPFeature]): this.type = {
blacklistedFeatures = features
protected[op] def setBlocklist(features: Array[OPFeature]): this.type = {
blocklistedFeatures = features
this
}

protected[op] def setBlacklistMapKeys(mapKeys: Map[String, Set[String]]): this.type = {
blacklistedMapKeys = mapKeys
protected[op] def setBlocklistMapKeys(mapKeys: Map[String, Set[String]]): this.type = {
blocklistedMapKeys = mapKeys
this
}

Expand Down Expand Up @@ -138,7 +138,7 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams
* @return Updated instance of feature
*/
def getUpdatedFeatures(features: Array[OPFeature]): Array[OPFeature] = {
val allFeatures = getRawFeatures() ++ getBlacklist() ++ getStages().map(_.getOutput())
val allFeatures = getRawFeatures() ++ getBlocklist() ++ getStages().map(_.getOutput())
features.map { f =>
allFeatures.find(_.sameOrigin(f))
.getOrElse(throw new IllegalArgumentException(s"feature $f is not a part of this workflow"))
Expand Down Expand Up @@ -175,7 +175,7 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams
val parentStageIds = feature.traverse[Set[String]](Set.empty[String])((s, f) => s + f.originStage.uid)
val modelStages = stages.filter(s => parentStageIds.contains(s.uid))
ModelInsights.extractFromStages(modelStages, rawFeatures, trainingParams,
getBlacklist(), getBlacklistMapKeys(), getRawFeatureFilterResults())
getBlocklist(), getBlocklistMapKeys(), getRawFeatureFilterResults())
}
}

Expand Down Expand Up @@ -440,8 +440,8 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams
val copy =
new OpWorkflowModel(uid = uid, trainingParams = trainingParams.copy())
.setFeatures(copyFeatures(resultFeatures))
.setBlacklist(copyFeatures(blacklistedFeatures))
.setBlacklistMapKeys(blacklistedMapKeys)
.setBlocklist(copyFeatures(blocklistedFeatures))
.setBlocklistMapKeys(blocklistedMapKeys)
.setRawFeatureFilterResults(rawFeatureFilterResults.copy())
.setStages(stages.map(_.copy(ParamMap.empty)))
.setParameters(parameters.copy())
Expand Down
51 changes: 30 additions & 21 deletions core/src/main/scala/com/salesforce/op/OpWorkflowModelReader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ package com.salesforce.op

import com.salesforce.op.OpWorkflowModelReadWriteShared.{FieldNames => FN}
import com.salesforce.op.OpWorkflowModelReadWriteShared.FieldNames._
import com.salesforce.op.OpWorkflowModelReadWriteShared.DeprecatedFieldNames._
import com.salesforce.op.features.{FeatureJsonHelper, OPFeature, TransientFeature}
import com.salesforce.op.filters.{FeatureDistribution, RawFeatureFilterResults}
import com.salesforce.op.stages.OpPipelineStageReaderWriter._
Expand Down Expand Up @@ -95,15 +96,15 @@ class OpWorkflowModelReader(val workflowOpt: Option[OpWorkflow]) extends MLReade
stages <- loadStages(json, workflowOpt, path)
resolvedFeatures <- resolveFeatures(json, stages)
resultFeatures <- resolveResultFeatures(json, resolvedFeatures)
blacklist <- resolveBlacklist(json, workflowOpt, resolvedFeatures, path)
blacklistMapKeys <- resolveBlacklistMapKeys(json)
blocklist <- resolveBlocklist(json, workflowOpt, resolvedFeatures, path)
blocklistMapKeys <- resolveBlocklistMapKeys(json)
rffResults <- resolveRawFeatureFilterResults(json)
} yield model
.setStages(stages.filterNot(_.isInstanceOf[FeatureGeneratorStage[_, _]]))
.setFeatures(resultFeatures)
.setParameters(params)
.setBlacklist(blacklist)
.setBlacklistMapKeys(blacklistMapKeys)
.setBlocklist(blocklist)
.setBlocklistMapKeys(blocklistMapKeys)
.setRawFeatureFilterResults(rffResults)
}

Expand Down Expand Up @@ -166,31 +167,39 @@ class OpWorkflowModelReader(val workflowOpt: Option[OpWorkflow]) extends MLReade
features.filter(f => resultIds.contains(f.uid))
}

private def resolveBlacklist
private def resolveBlocklist
(
json: JValue,
wfOpt: Option[OpWorkflow],
features: Array[OPFeature],
path: String
): Try[Array[OPFeature]] = {
if ((json \ BlacklistedFeaturesUids.entryName) != JNothing) { // for backwards compatibility
for {
feats <- wfOpt
.map(wf => Success(wf.getAllFeatures() ++ wf.getBlacklist()))
.getOrElse(loadStages(json, BlacklistedStages, path).map(_._2))
allFeatures = features ++ feats
blacklistIds = (json \ BlacklistedFeaturesUids.entryName).extract[Array[String]]
} yield blacklistIds.flatMap(uid => allFeatures.find(_.uid == uid))
} else {
Success(Array.empty[OPFeature])
}
// For backward compatibility. The relevant field name is determined
// by the max length of the blocklist found for each name.
val potentialNames = Seq(
(json \ BlocklistedFeaturesUids.entryName, BlocklistedStages),
(json \ OldBlocklistedFeaturesUids.entryName, OldBlocklistedStages)
)
potentialNames.map { names =>
if (names._1 != JNothing) { // for backwards compatibility
for {
feats <- wfOpt
.map(wf => Success(wf.getAllFeatures() ++ wf.getBlocklist()))
.getOrElse(loadStages(json, names._2, path).map(_._2))
allFeatures = features ++ feats
blocklistIds = names._1.extract[Array[String]]
} yield blocklistIds.flatMap(uid => allFeatures.find(_.uid == uid))
} else {
Success(Array.empty[OPFeature])
}
}.maxBy(_.getOrElse(Array()).length)
}

private def resolveBlacklistMapKeys(json: JValue): Try[Map[String, Set[String]]] = Try {
(json \ BlacklistedMapKeys.entryName).extractOpt[Map[String, List[String]]] match {
case Some(blackMapKeys) => blackMapKeys.map { case (k, vs) => k -> vs.toSet }
case None => Map.empty
}
private def resolveBlocklistMapKeys(json: JValue): Try[Map[String, Set[String]]] = Try {
// For backward compatibility we combine new and deprecated keys.
Seq(json \ BlocklistedMapKeys.entryName, json \ OldBlocklistedMapKeys.entryName)
.flatMap(_.extractOpt[Map[String, List[String]]])
.flatMap(_.map { case (k, vs) => k -> vs.toSet }).toMap
}

private def resolveRawFeatureFilterResults(json: JValue): Try[RawFeatureFilterResults] = {
Expand Down
Loading

0 comments on commit f764842

Please sign in to comment.