-
Notifications
You must be signed in to change notification settings - Fork 393
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Detect and remove IDs disguised in text features #415
Changes from all commits
681d98a
ef7cdfb
6a57693
bb858c7
cae7fe4
aa55539
7950924
8f3befe
9ed2a02
90350d0
8640d6d
91285b6
61d26b1
a7a0781
6c887f3
7829dfd
b611289
a30eca6
c0ceaa6
b3930dc
33afe00
b7f050b
d79456e
b99b395
ac8757e
2a4ccbc
febdc13
0c016b8
93267ab
d73f3c5
7c1f262
88b5867
a133392
095a180
7031d16
1e767c1
72ce224
df5562e
c3cc3b0
b022921
9bad875
9f7bc99
126ddcd
7753280
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -142,7 +142,8 @@ case class RawFeatureFilterMetrics | |
scoringFillRate: Option[Double], | ||
jsDivergence: Option[Double], | ||
fillRateDiff: Option[Double], | ||
fillRatioDiff: Option[Double] | ||
fillRatioDiff: Option[Double], | ||
trainingCardSize: Option[Int] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. let's name it fully, i.e |
||
) | ||
|
||
/** | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
package com.salesforce.op.stages.impl.feature | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. License header is missing |
||
|
||
import com.salesforce.op.UID | ||
import com.salesforce.op.features.types.{TextMap} | ||
import com.salesforce.op.stages.base.unary.UnaryTransformer | ||
|
||
class IdMapRemover( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. docs please |
||
minUniqueTokLen: Int, | ||
uid: String = UID[IdMapRemover] | ||
) extends UnaryTransformer[TextMap, TextMap](operationName = "IdMapRemover", uid = uid) { | ||
|
||
private var dropMap: Map[String, Boolean] = Map() | ||
|
||
/** | ||
 * Rebuilds `dropMap` from the distributions attached to the input feature. | ||
 * | ||
 * For every distribution that carries both a key and a cardinality estimate, the key is | ||
 * flagged for dropping when the number of entries in the estimate's `valueCounts` is below | ||
 * `minUniqueTokLen`. | ||
 * | ||
 * The key and its flag are derived from the same distribution in a single pass. Building the | ||
 * keys and the flags in two separate passes and zipping them would shift the pairing whenever | ||
 * a distribution lacks either field, attaching flags to the wrong keys. | ||
 * | ||
 * NOTE(review): assumes `key` and `cardEstimate` are `Option`s, as implied by their use with | ||
 * `flatMap` - confirm against the distribution type. | ||
 */ | ||
override protected def onSetInput(): Unit = { | ||
  super.onSetInput() | ||
  val flaggedKeys = for { | ||
    dist <- in1.asFeatureLike.distributions | ||
    key <- dist.key | ||
    card <- dist.cardEstimate | ||
  } yield key -> (card.valueCounts.size < minUniqueTokLen) | ||
  dropMap = flaggedKeys.toMap | ||
} | ||
|
||
/** | ||
 * Blanks out the values of flagged keys. | ||
 * | ||
 * Every key of the incoming map is kept. A key whose `dropMap` entry is `true` gets an empty | ||
 * string as its value; keys that are flagged `false` or absent from `dropMap` keep their value. | ||
 */ | ||
override def transformFn: TextMap => TextMap = { textMap => | ||
  val blanked = textMap.value.map { case (key, text) => | ||
    if (dropMap.getOrElse(key, false)) key -> "" else key -> text | ||
  } | ||
  TextMap(blanked) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package com.salesforce.op.stages.impl.feature | ||
|
||
import com.salesforce.op.UID | ||
import com.salesforce.op.features.types.Text | ||
import com.salesforce.op.stages.base.unary.UnaryTransformer | ||
|
||
class IdRemover( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. docs please |
||
minUniqueTokLen: Int, | ||
uid: String = UID[IdRemover], | ||
operationName: String = "IDremover" | ||
) extends UnaryTransformer[Text, Text] (operationName = operationName, uid = uid) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. no need to expose operationName on the |
||
|
||
private var drop: Boolean = false | ||
|
||
/** | ||
 * Recomputes the `drop` flag from the distributions attached to the input feature. | ||
 * | ||
 * Only the first available cardinality estimate is consulted: the feature is flagged when the | ||
 * number of entries in its `valueCounts` is below `minUniqueTokLen`. With no estimate | ||
 * available the flag stays `false`. | ||
 */ | ||
override protected def onSetInput(): Unit = { | ||
  super.onSetInput() | ||
  val firstEstimate = in1.asFeatureLike.distributions.flatMap(_.cardEstimate).headOption | ||
  drop = firstEstimate.exists(_.valueCounts.size < minUniqueTokLen) | ||
} | ||
|
||
/** | ||
 * Returns `Text.empty` when the feature has been flagged via `drop`; otherwise returns the | ||
 * input unchanged. | ||
 */ | ||
override def transformFn: Text => Text = { text => | ||
  if (drop) Text.empty else text | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about backwards compatibility?