Skip to content

Commit

Permalink
Refactor boxplot (#790)
Browse files Browse the repository at this point in the history
* Refactor boxplot jfx demo.

* First step of the geom_boxplot() splitting.

* Fixes after merging with master.

* Fix outlier positions when there is additional grouping.

* Add geometry for the boxplot outlier.

* Update Python function for the geom_boxplot().

* Remove Aes.Y from GeomMeta for the boxplot geometry.

* Remove the BoxplotOutlierGeom and connected code.

* Remove sampling parameter from API of the geom_boxplot() function.

* Small improvements of code in the BoxplotStat.

* Tiny fix in the BoxplotStat.

* Small fixes of boxplot in different places.

* Remove extra mentions of outliers.

* Increase default value of the midline fatten for the boxplot geometry.

* Add dodge width to default position for the outlier geometry.
  • Loading branch information
ASmirnov-HORIS committed Jun 8, 2023
1 parent abc950d commit 3aa5d09
Show file tree
Hide file tree
Showing 18 changed files with 409 additions and 308 deletions.
4 changes: 4 additions & 0 deletions future_changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,14 @@

### Changed

- [BREAKING] `geom_boxplot()` no longer supports the `sampling` parameter.


- Reduce the default `width`/`height` values for `geom_errorbar()`.


### Fixed

- ggsave: saving geomImshow() to SVG produces fuzzy picture [[LPK-188](https://github.com/JetBrains/lets-plot-kotlin/issues/188)].
- ggsave: saving geomImshow() to raster format produces fuzzy picture.
- geom_livemap: memory leak when re-running cells without reloading the page
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,11 @@ object GeomMeta {
)

GeomKind.BOX_PLOT -> listOf(
Aes.LOWER, // NaN for 'outlier' data-point
Aes.MIDDLE, // NaN for 'outlier' data-point
Aes.UPPER, // NaN for 'outlier' data-point
Aes.LOWER,
Aes.MIDDLE,
Aes.UPPER,

Aes.X,
Aes.Y, // NaN for 'box' data-point (used for outliers)
Aes.YMAX,
Aes.YMIN,

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,19 @@ package jetbrains.datalore.plot.base.geom

import jetbrains.datalore.base.geometry.DoubleRectangle
import jetbrains.datalore.base.geometry.DoubleVector
import jetbrains.datalore.base.values.Color
import jetbrains.datalore.plot.base.*
import jetbrains.datalore.plot.base.aes.AestheticsDefaults
import jetbrains.datalore.plot.base.geom.util.*
import jetbrains.datalore.plot.base.geom.util.GeomUtil.extendHeight
import jetbrains.datalore.plot.base.geom.util.HintColorUtil.colorWithAlpha
import jetbrains.datalore.plot.base.interact.NullGeomTargetCollector
import jetbrains.datalore.plot.base.interact.TipLayoutHint
import jetbrains.datalore.plot.base.render.LegendKeyElementFactory
import jetbrains.datalore.plot.base.render.SvgRoot
import jetbrains.datalore.plot.base.render.point.PointShape
import jetbrains.datalore.vis.svg.SvgLineElement

class BoxplotGeom : GeomBase() {

var fattenMidline: Double = 1.0
var whiskerWidth: Double = 0.5

var outlierColor: Color? = null
var outlierFill: Color? = null
var outlierShape: PointShape? = null
var outlierSize: Double? = null
var outlierStroke: Double? = null
var fattenMidline: Double = DEF_FATTEN_MIDLINE
var whiskerWidth: Double = DEF_WHISKER_WIDTH

override val legendKeyElementFactory: LegendKeyElementFactory
get() = LEGEND_FACTORY
Expand All @@ -47,7 +37,6 @@ class BoxplotGeom : GeomBase() {
clientRectByDataPoint(ctx, geomHelper, isHintRect = false)
)
buildLines(root, aesthetics, ctx, geomHelper)
buildOutliers(root, aesthetics, pos, coord, ctx)
BarTooltipHelper.collectRectangleTargets(
listOf(Aes.YMAX, Aes.UPPER, Aes.MIDDLE, Aes.LOWER, Aes.YMIN),
aesthetics, pos, coord, ctx,
Expand Down Expand Up @@ -121,59 +110,12 @@ class BoxplotGeom : GeomBase() {
}
}

private fun buildOutliers(
root: SvgRoot,
aesthetics: Aesthetics,
pos: PositionAdjustment,
coord: CoordinateSystem,
ctx: GeomContext
) {
val outlierAesthetics = getOutliersAesthetics(aesthetics)
PointGeom()
.buildIntern(root, outlierAesthetics, pos, coord, ctx.withTargetCollector(NullGeomTargetCollector()))
}

private fun getOutliersAesthetics(aesthetics: Aesthetics): Aesthetics {
return MappedAesthetics(aesthetics) { p ->
toOutlierDataPointAesthetics(p)
}
}

/**
* The geom `Aesthetics` contains both: regular data-points and "outlier" data-points.
* Regular data-points do not have Y defined. We use this feature to
* detect regular data-points and ignore them.
*/
private fun toOutlierDataPointAesthetics(p: DataPointAesthetics): DataPointAesthetics {
if (!p.defined(Aes.Y)) {
// not an "outlier" data-point
return p
}

return object : DataPointAestheticsDelegate(p) {
override operator fun <T> get(aes: Aes<T>): T? {
val value: Any? = when (aes) {
Aes.COLOR -> outlierColor ?: super.get(aes)
Aes.FILL -> outlierFill ?: super.get(aes)
Aes.SHAPE -> outlierShape ?: super.get(aes)
Aes.SIZE -> outlierSize ?: OUTLIER_DEF_SIZE // 'size' of 'super' is line thickness on box-plot
Aes.STROKE -> outlierStroke ?: OUTLIER_DEF_STROKE // other elements of boxplot has no 'stroke' aes
Aes.ALPHA -> 1.0 // Don't apply boxplot' alpha to outlier points.
else -> super.get(aes)
}
@Suppress("UNCHECKED_CAST")
return value as T?
}
}
}


companion object {
const val DEF_FATTEN_MIDLINE = 2.5
const val DEF_WHISKER_WIDTH = 0.5
const val HANDLES_GROUPS = false

private val LEGEND_FACTORY = CrossBarHelper.legendFactory(true)
private val OUTLIER_DEF_SIZE = AestheticsDefaults.point().defaultValue(Aes.SIZE)
private val OUTLIER_DEF_STROKE = AestheticsDefaults.point().defaultValue(Aes.STROKE)

private fun clientRectByDataPoint(
ctx: GeomContext,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Copyright (c) 2023. JetBrains s.r.o.
* Use of this source code is governed by the MIT license that can be found in the LICENSE file.
*/

package jetbrains.datalore.plot.base.stat

import jetbrains.datalore.plot.base.Aes
import jetbrains.datalore.plot.base.DataFrame
import jetbrains.datalore.plot.base.StatContext
import jetbrains.datalore.plot.base.data.TransformVar
import jetbrains.datalore.plot.common.data.SeriesUtil
import kotlin.math.sqrt

/**
 * Stat that emits the "outlier" data-points of a box plot.
 *
 * Observations are binned by their X value. Within each bin, every Y lying below
 * `firstQuartile - IQR * whiskerIQRRatio` or above `thirdQuartile + IQR * whiskerIQRRatio`
 * becomes one row of the resulting dataframe.
 *
 * Resulting variables:
 *  - [Stats.X] - X of the bin;
 *  - [Stats.Y] - Y of the outlier (`NaN` for the placeholder row of a bin without outliers);
 *  - [Stats.WIDTH] - only when [computeWidth] is `true`: `sqrt(binCount) / sqrt(maxBinCount)`.
 *
 * Note: outliers need the 'width' value for the 'dodge' positioning to work correctly for all data-points.
 *
 * @param whiskerIQRRatio multiplier of the IQR that defines the fences (ggplot: 'coef').
 * @param computeWidth whether to compute the [Stats.WIDTH] variable (ggplot: 'varWidth').
 */
class BoxplotOutlierStat(
    private val whiskerIQRRatio: Double, // ggplot: 'coef'
    private val computeWidth: Boolean // ggplot: 'varWidth'
) : BaseStat(DEF_MAPPING) {

    /** In addition to the static defaults, `Aes.WIDTH` has a default mapping when the width is computed. */
    override fun hasDefaultMapping(aes: Aes<*>): Boolean {
        return super.hasDefaultMapping(aes) ||
                aes == Aes.WIDTH && computeWidth
    }

    /** Maps `Aes.WIDTH` to [Stats.WIDTH]; every other aesthetic is resolved by the base class. */
    override fun getDefaultMapping(aes: Aes<*>): DataFrame.Variable {
        return if (aes == Aes.WIDTH) {
            Stats.WIDTH
        } else {
            super.getDefaultMapping(aes)
        }
    }

    override fun consumes(): List<Aes<*>> {
        return listOf(Aes.X, Aes.Y)
    }

    /**
     * Builds the dataframe of outliers for [data].
     *
     * When X is absent in [data], all observations are placed into a single bin at `x = 0.0`.
     */
    override fun apply(data: DataFrame, statCtx: StatContext, messageConsumer: (s: String) -> Unit): DataFrame {
        if (!hasRequiredValues(data, Aes.Y)) {
            return withEmptyStatValues()
        }

        val ys = data.getNumeric(TransformVar.Y)
        val xs = if (data.has(TransformVar.X)) {
            data.getNumeric(TransformVar.X)
        } else {
            List(ys.size) { 0.0 }
        }

        val statData = buildStat(xs, ys, whiskerIQRRatio)
        // The per-row bin size is an intermediate value: it is used for the width only
        // and is not a part of the resulting dataframe.
        val statCount = statData.remove(Stats.COUNT)

        // 'statCount' is null when there are no finite (x, y) pairs at all:
        // in this case there is nothing to compute the width for.
        if (computeWidth && statCount != null) {
            // 'width' is in range 0..1
            val norm = sqrt(statCount.maxOrNull() ?: 0.0)
            statData[Stats.WIDTH] = statCount.map { count -> sqrt(count) / norm }
        }

        val builder = DataFrame.Builder()
        for ((variable, series) in statData) {
            builder.putNumeric(variable, series)
        }
        return builder.build()
    }

    companion object {
        private val DEF_MAPPING: Map<Aes<*>, DataFrame.Variable> = mapOf(
            Aes.X to Stats.X,
            Aes.Y to Stats.Y
        )

        /**
         * Collects outliers of each X-bin.
         *
         * Returns an empty map when there are no finite (x, y) pairs; otherwise a map with
         * [Stats.X], [Stats.Y] and [Stats.COUNT] series of equal length, where [Stats.COUNT]
         * holds the size of the bin the row belongs to.
         */
        private fun buildStat(
            xs: List<Double?>,
            ys: List<Double?>,
            whiskerIQRRatio: Double
        ): MutableMap<DataFrame.Variable, List<Double>> {
            val xyPairs = SeriesUtil.filterFinite(xs, ys)
                .let { (xs, ys) -> xs zip ys }
            if (xyPairs.isEmpty()) {
                return mutableMapOf()
            }

            val binnedData: MutableMap<Double, MutableList<Double>> = HashMap()
            for ((x, y) in xyPairs) {
                binnedData.getOrPut(x) { ArrayList() }.add(y)
            }

            val statX = ArrayList<Double>()
            val statY = ArrayList<Double>()
            val statCount = ArrayList<Double>()

            for ((x, bin) in binnedData) {
                val count = bin.size.toDouble()
                val summary = FiveNumberSummary(bin)
                val lowerHinge = summary.firstQuartile
                val upperHinge = summary.thirdQuartile
                val iqr = upperHinge - lowerHinge
                val lowerFence = lowerHinge - iqr * whiskerIQRRatio
                val upperFence = upperHinge + iqr * whiskerIQRRatio
                val outliers = bin.filter { y -> y < lowerFence || y > upperFence }

                // If there are no outliers, add a fake one (Y = NaN) to correct splitting for additional grouping.
                // A bin is never empty here: it is created only together with its first value.
                val binYs = outliers.ifEmpty { listOf(Double.NaN) }
                for (y in binYs) {
                    statX.add(x)
                    statY.add(y)
                    statCount.add(count)
                }
            }

            return mutableMapOf(
                Stats.X to statX,
                Stats.Y to statY,
                Stats.COUNT to statCount
            )
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,14 @@ import kotlin.math.sqrt
/**
* Calculate components of box and whisker plot.
*
* Creates a "stat" dataframe containing:
* a) "box" data-points
* x
* y = NaN
* width - width of box
* ymin - lower whisker = smallest observation greater than or equal to lower hinge - 1.5 * IQR
* lower - lower hinge, 25% quantile
* middle - median, 50% quantile
* upper - upper hinge, 75% quantile
* ymax - upper whisker = largest observation less than or equal to upper hinge + 1.5 * IQR
*
* b) "outlier" data-points
* x, y, width
* ymin, lower... = NaN
* Creates a "stat" dataframe with:
* x
* width - width of box
* ymin - lower whisker = smallest observation greater than or equal to lower hinge - 1.5 * IQR
* lower - lower hinge, 25% quantile
* middle - median, 50% quantile
* upper - upper hinge, 75% quantile
* ymax - upper whisker = largest observation less than or equal to upper hinge + 1.5 * IQR
*
* Not implemented:
* notchlower - lower edge of notch = median - 1.58 * IQR / sqrt(n)
Expand Down Expand Up @@ -66,7 +60,7 @@ class BoxplotStat(
val xs = if (data.has(TransformVar.X)) {
data.getNumeric(TransformVar.X)
} else {
List<Double>(ys.size) { 0.0 }
List(ys.size) { 0.0 }
}

val statData = buildStat(xs, ys, whiskerIQRRatio)
Expand Down Expand Up @@ -96,34 +90,31 @@ class BoxplotStat(

private val DEF_MAPPING: Map<Aes<*>, DataFrame.Variable> = mapOf(
Aes.X to Stats.X,
Aes.Y to Stats.Y,
Aes.YMIN to Stats.Y_MIN,
Aes.YMAX to Stats.Y_MAX,
Aes.LOWER to Stats.LOWER,
Aes.MIDDLE to Stats.MIDDLE,
Aes.UPPER to Stats.UPPER
)

fun buildStat(
private fun buildStat(
xs: List<Double?>,
ys: List<Double?>,
whiskerIQRRatio: Double
): MutableMap<DataFrame.Variable, List<Double>> {

val xyPairs = xs.zip(ys).filter { (x, y) ->
SeriesUtil.allFinite(x, y)
}
val xyPairs = SeriesUtil.filterFinite(xs, ys)
.let { (xs, ys) -> xs zip ys }
if (xyPairs.isEmpty()) {
return mutableMapOf()
}

val binnedData: MutableMap<Double, MutableList<Double>> = HashMap()
for ((x, y) in xyPairs) {
binnedData.getOrPut(x!!) { ArrayList() }.add(y!!)
binnedData.getOrPut(x) { ArrayList() }.add(y)
}

val statX = ArrayList<Double>()
val statY = ArrayList<Double>()
val statMiddle = ArrayList<Double>()
val statLower = ArrayList<Double>()
val statUpper = ArrayList<Double>()
Expand All @@ -146,36 +137,15 @@ class BoxplotStat(
var lowerWhisker = lowerFence
var upperWhisker = upperFence
if (SeriesUtil.allFinite(lowerFence, upperFence)) {
val boxed = bin.filter { y -> y >= lowerFence && y <= upperFence }
val boxed = bin.filter { y -> y in lowerFence..upperFence }
val range = SeriesUtil.range(boxed)
if (range != null) {
lowerWhisker = range.lowerEnd
upperWhisker = range.upperEnd
}
}

// add outliers first
val outliers = bin.filter { y -> y < lowerFence || y > upperFence }
for (y in outliers) {
// 'outlier' data-point
statX.add(x)
statY.add(y)
// no 'box' data
statMiddle.add(Double.NaN)
statLower.add(Double.NaN)
statUpper.add(Double.NaN)
statMin.add(Double.NaN)
statMax.add(Double.NaN)

statCount.add(count)

// Note: outliers will also need 'width' value,
// for the 'dodge' positioning to work correctly for all data-points.
}

// add 'box' data-point
statX.add(x)
statY.add(Double.NaN) // no Y for 'box' data-point
statMiddle.add(middle)
statLower.add(lowerHinge)
statUpper.add(upperHinge)
Expand All @@ -187,7 +157,6 @@ class BoxplotStat(

return mutableMapOf(
Stats.X to statX,
Stats.Y to statY,
Stats.MIDDLE to statMiddle,
Stats.LOWER to statLower,
Stats.UPPER to statUpper,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,13 @@ object Stats {
return BoxplotStat(whiskerIQRRatio, computeWidth)
}

/**
 * Creates the stat computing box plot outliers.
 *
 * Both parameters default to the corresponding `BoxplotStat` defaults.
 */
fun boxplotOutlier(
    whiskerIQRRatio: Double = BoxplotStat.DEF_WHISKER_IQR_RATIO,
    computeWidth: Boolean = BoxplotStat.DEF_COMPUTE_WIDTH
): BoxplotOutlierStat = BoxplotOutlierStat(whiskerIQRRatio, computeWidth)

fun density(
trim: Boolean = DensityStat.DEF_TRIM,
bandWidth: Double? = null,
Expand Down
Loading

0 comments on commit 3aa5d09

Please sign in to comment.