Skip to content

Commit

Permalink
Reordering (#411)
Browse files Browse the repository at this point in the history
* - When ordering by a variable whose values are equal, also compare the ordered variable's own values. Add test.
- Add new tests: check ordering in data, data with null values.

* Add merging of groups after stat applying according to ordering settings.

* Minor code improvement.

* Add checking of order direction: require -1 or 1.
Fix comparator for groups. Add new tests.

* Add order settings from discrete to non-discrete mappings.
Fix test.

* Code cleanup.

* Apply order options from layer's discrete variable to plot's non-discrete variable and vice versa.
Add test.
Update notebook.

* Update the data in tests (it will improve examples with 'count').
Add new examples in the notebook.

* Add 'sum' as aggregate operation for position='stack'.

* Update notebook: add sampling_pick + order_by='..count..'.

* Make class GroupsMerger a top-level internal class.
  • Loading branch information
OLarionova-HORIS authored Jul 29, 2021
1 parent 99bc39e commit df2c7cd
Show file tree
Hide file tree
Showing 12 changed files with 1,153 additions and 160 deletions.
479 changes: 423 additions & 56 deletions docs/examples/jupyter-notebooks-dev/ordering_examples.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -226,13 +226,13 @@ class DataFrame private constructor(builder: Builder) {
get(orderSpec.variable)
.zip(getNumeric(orderSpec.orderBy))
.groupBy({ (value) -> value }) { (_, byValue) -> byValue }
.mapValues { (_, byValues) -> orderSpec.aggregateOperation.invoke(byValues) }
.mapValues { (_, byValues) -> orderSpec.aggregateOperation.invoke(byValues.filter(::isValueComparable)) }
.toList()
} else {
get(orderSpec.variable).zip(get(orderSpec.orderBy))
}
.filter { isValueComparable(it.second) }
.sortedWith(compareBy { it.second as Comparable<*> })
.filter { isValueComparable(it.second) && isValueComparable(it.first)}
.sortedWith(compareBy({ it.second as Comparable<*> }, { it.first as Comparable<*> }))
.mapNotNull { it.first }

// the values corresponding to non-comparable values will be placed at the end of the result
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class DataFrameDistinctValuesTest {
run {
// Ascending
val orderSpecs = listOf(
OrderSpec(variable, orderBy = variable, direction = 1, aggregateOperation = null),
OrderSpec(variable, orderBy = variable, direction = 1),
OrderSpec(orderByVariable, orderBy = orderByVariable, direction = 1)
)
val df = builder()
Expand All @@ -56,7 +56,7 @@ class DataFrameDistinctValuesTest {
run {
// Descending
val orderSpecs = listOf(
OrderSpec(variable, orderBy = variable, direction = -1, aggregateOperation = null),
OrderSpec(variable, orderBy = variable, direction = -1),
OrderSpec(orderByVariable, orderBy = orderByVariable, direction = -1)
)
val df = builder()
Expand All @@ -72,12 +72,7 @@ class DataFrameDistinctValuesTest {
run {
// order by ascending orderByVariable
val df = builder()
.addOrderSpec(
OrderSpec(
variable, orderByVariable, direction = 1,
aggregateOperation = { v: List<Double?> -> v.filterNotNull().minOrNull() }
)
)
.addOrderSpec(OrderSpec(variable, orderByVariable, direction = 1))
.build()

assertDistinctValues(df, mapOf(
Expand All @@ -101,7 +96,7 @@ class DataFrameDistinctValuesTest {
@Test
fun `correct ordering should be kept after dataframe rebuilding`() {
val orderSpecs = listOf(
OrderSpec(variable, variable, direction = 1, aggregateOperation = null),
OrderSpec(variable, variable, direction = 1),
OrderSpec(orderByVariable, orderByVariable, direction = -1)
)
// Build dataFrame with ordering specifications
Expand Down Expand Up @@ -134,7 +129,7 @@ class DataFrameDistinctValuesTest {
run {
// Add ordering specs
val df = builder
.addOrderSpec(OrderSpec(variable, orderBy = variable, direction = 1, aggregateOperation = null))
.addOrderSpec(OrderSpec(variable, orderBy = variable, direction = 1))
.build()
assertDistinctValues(df, mapOf(variable to listOf("A", "B", "C")))
}
Expand Down Expand Up @@ -176,7 +171,7 @@ class DataFrameDistinctValuesTest {
val df = builder()
.addOrderSpec(OrderSpec(variable, orderByVariable, direction = -1))
.build()
assertDistinctValues(df, mapOf(variable to listOf("A", "B", "D", "C")))
assertDistinctValues(df, mapOf(variable to listOf("B", "A", "D", "C")))
}
run {
val df = DataFrame.Builder()
Expand Down Expand Up @@ -205,7 +200,7 @@ class DataFrameDistinctValuesTest {
val df = builder()
.addOrderSpec(OrderSpec(variable, orderByVariable, direction = -1))
.build()
assertDistinctValues(df, mapOf(variable to listOf("A", "B", "D", "C")))
assertDistinctValues(df, mapOf(variable to listOf("B", "A", "D", "C")))
}
}

Expand Down Expand Up @@ -309,6 +304,22 @@ class DataFrameDistinctValuesTest {
}
}

@Test
fun `order by the same values - check also the variable values`() {
    // Every 'order by' value is the same, so the expected order of the variable's
    // distinct values is determined by the variable's own values.
    val df = DataFrame.Builder()
        .put(variable, listOf("B", "A", "C"))
        .put(orderByVariable, listOf(0.0, 0.0, 0.0))
        .addOrderSpec(OrderSpec(variable, orderBy = orderByVariable, direction = 1))
        .build()

    assertDistinctValues(
        df,
        mapOf(
            variable to listOf("A", "B", "C"),
            orderByVariable to listOf(0.0)
        )
    )
}

private fun assertDistinctValues(df: DataFrame, expectedDistinctValues: Map<DataFrame.Variable, List<Any>>) {
expectedDistinctValues.forEach { (variable, expected) ->
assertEquals(expected, df.distinctValues(variable).toList())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ class GeomLayerBuilder {
statCtx,
varsWithoutBinding = emptyList(),
orderOptions = emptyList(),
aggregateOperation = null,
::println
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,35 +58,34 @@ object DataProcessing {
statCtx: StatContext,
varsWithoutBinding: List<String>,
orderOptions: List<OrderOptionUtil.OrderOption>,
aggregateOperation: ((List<Double?>) -> Double?)?,
messageConsumer: Consumer<String>
): DataAndGroupingContext {
if (stat === Stats.IDENTITY) {
return DataAndGroupingContext(emptyFrame(), groupingContext)
}

val groups = groupingContext.groupMapper
val resultSeries = HashMap<Variable, List<Any>>()

val groupSizeListAfterStat = ArrayList<Int>()
val resultSeries: Map<Variable, List<Any?>>
val groupSizeListAfterStat: List<Int>

// if only one group no need to modify
if (groups === GroupUtil.SINGLE_GROUP) {
val sd = applyStat(data, stat, bindings, scaleMap, facets, statCtx, varsWithoutBinding, messageConsumer)
groupSizeListAfterStat.add(sd.rowCount())
for (variable in sd.variables()) {
@Suppress("UNCHECKED_CAST")
val list = sd[variable] as List<Any>
resultSeries[variable] = list
}
groupSizeListAfterStat = listOf(sd.rowCount())
resultSeries = sd.variables().associateWith { variable -> sd[variable] }
} else { // add offset to each group
val groupMerger = GroupsMerger()
var lastStatGroupEnd = -1
for (d in splitByGroup(data, groups)) {
var sd = applyStat(d, stat, bindings, scaleMap, facets, statCtx, varsWithoutBinding, messageConsumer)
if (sd.isEmpty) {
continue
}
groupMerger.initOrderSpecs(orderOptions, sd.variables(), bindings, aggregateOperation)

groupSizeListAfterStat.add(sd.rowCount())
val curGroupSizeAfterStat = sd.rowCount()

// update 'stat group' to avoid collisions as stat is applied independently to each original data group
if (sd.has(Stats.GROUP)) {
Expand All @@ -112,15 +111,12 @@ object DataProcessing {
}
}

// merge results
for (variable in sd.variables()) {
if (!resultSeries.containsKey(variable)) {
resultSeries[variable] = ArrayList()
}
@Suppress("UNCHECKED_CAST")
(resultSeries[variable] as MutableList).addAll(sd[variable] as List<Any>)
}
// Add group's data
groupMerger.addGroup(sd, curGroupSizeAfterStat)
}
// Get merged series
resultSeries = groupMerger.getResultSeries()
groupSizeListAfterStat = groupMerger.getGroupSizes()
}

val dataAfterStat = Builder().run {
Expand All @@ -131,7 +127,7 @@ object DataProcessing {

// set ordering specifications
val orderSpecs = orderOptions.map { orderOption ->
OrderOptionUtil.createOrderSpec(resultSeries.keys, bindings, orderOption)
OrderOptionUtil.createOrderSpec(resultSeries.keys, bindings, orderOption, aggregateOperation)
}
addOrderSpecs(orderSpecs)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Copyright (c) 2021. JetBrains s.r.o.
* Use of this source code is governed by the MIT license that can be found in the LICENSE file.
*/

package jetbrains.datalore.plot.builder.data

import jetbrains.datalore.plot.base.Aes
import jetbrains.datalore.plot.base.DataFrame
import jetbrains.datalore.plot.builder.VarBinding

/**
 * Collects per-group data frames and keeps them ordered according to the ordering
 * specifications, so that the merged series ([getResultSeries]) and the group sizes
 * ([getGroupSizes]) are produced in the same, ordered, sequence of groups.
 *
 * Groups that compare as equal (or all groups, when there are no ordering specifications)
 * keep the order in which they were added.
 */
internal class GroupsMerger {
    // Created by the first call of initOrderSpecs(); null until then.
    private var myOrderSpecs: List<DataFrame.OrderSpec>? = null

    // Kept sorted by Group.compareTo() when order specs are present, otherwise in insertion order.
    private val myOrderedGroups = ArrayList<Group>()

    /**
     * Builds the ordering specifications used to compare groups.
     * Only the first call has an effect; subsequent calls are ignored.
     *
     * @param orderOptions ordering options to convert to order specs.
     * @param variables variables available for resolving the option's variable names.
     * @param bindings variable bindings; options for a variable bound to [Aes.X] are skipped.
     * @param aggregateOperation aggregation passed through to [OrderOptionUtil.createOrderSpec].
     */
    fun initOrderSpecs(
        orderOptions: List<OrderOptionUtil.OrderOption>,
        variables: Set<DataFrame.Variable>,
        bindings: List<VarBinding>,
        aggregateOperation: ((List<Double?>) -> Double?)?
    ) {
        if (myOrderSpecs != null) return
        myOrderSpecs = orderOptions
            .filter { orderOption ->
                // no need to reorder groups by X
                bindings.find { it.variable.name == orderOption.variableName && it.aes == Aes.X } == null
            }
            .map { OrderOptionUtil.createOrderSpec(variables, bindings, it, aggregateOperation) }
    }

    /**
     * Concatenates the series of all added groups, variable by variable,
     * following the current order of the groups.
     */
    fun getResultSeries(): HashMap<DataFrame.Variable, MutableList<Any?>> {
        val resultSeries = HashMap<DataFrame.Variable, MutableList<Any?>>()
        myOrderedGroups.forEach { group ->
            group.df.variables().forEach { variable ->
                resultSeries.getOrPut(variable, ::ArrayList).addAll(group.df[variable])
            }
        }
        return resultSeries
    }

    /**
     * Returns the sizes of the added groups, in the same group order as [getResultSeries].
     */
    fun getGroupSizes(): List<Int> {
        return myOrderedGroups.map(Group::groupSize)
    }

    /**
     * A group's data together with its size.
     * Groups are compared using the order specs of the enclosing [GroupsMerger].
     */
    inner class Group(
        val df: DataFrame,
        val groupSize: Int
    ) : Comparable<Group> {
        /**
         * Compares groups spec by spec: first by the (optionally aggregated) 'order by' value,
         * then, on a tie, by the value of the ordered variable itself. The first non-zero
         * result wins; 0 is returned when all specs tie or there are no specs.
         *
         * @throws IllegalArgumentException if a spec has an aggregate operation and its
         * 'order by' variable is non-numeric or contains nulls.
         */
        override fun compareTo(other: Group): Int {
            fun compareGroupValue(v1: Any?, v2: Any?, dir: Int): Int {
                // null value is always greater - will be at the end of the result
                // (the direction is intentionally not applied to nulls)
                if (v1 == null && v2 == null) return 0
                if (v1 == null) return 1
                if (v2 == null) return -1
                return compareValues(v1 as Comparable<*>, v2 as Comparable<*>) * dir
            }

            fun getValue(
                df: DataFrame,
                variable: DataFrame.Variable,
                aggregateOperation: ((List<Double?>) -> Double?)? = null
            ): Any? {
                return if (aggregateOperation != null) {
                    require(df.isNumeric(variable)) { "Can't apply aggregate operation to non-numeric values" }
                    // requireNoNulls() throws IllegalArgumentException when the series has a null
                    aggregateOperation.invoke(df.getNumeric(variable).requireNoNulls())
                } else {
                    // group has no more than one unique element
                    df[variable].firstOrNull()
                }
            }

            myOrderSpecs?.forEach { spec ->
                var cmp = compareGroupValue(
                    getValue(df, spec.orderBy, spec.aggregateOperation),
                    getValue(other.df, spec.orderBy, spec.aggregateOperation),
                    spec.direction
                )
                if (cmp == 0) {
                    // ensure the order as in the legend
                    cmp = compareGroupValue(
                        getValue(df, spec.variable),
                        getValue(other.df, spec.variable),
                        spec.direction
                    )
                }
                if (cmp != 0) {
                    return cmp
                }
            }
            return 0
        }
    }

    /**
     * Adds a group's data, placing it at the position defined by the order specs.
     *
     * @param d the group's data.
     * @param groupSize the size reported for this group by [getGroupSizes].
     */
    fun addGroup(d: DataFrame, groupSize: Int) {
        val group = Group(d, groupSize)
        val indexToInsert = findIndexToInsert(group)
        myOrderedGroups.add(indexToInsert, group)
    }

    /**
     * Finds the position for a new group: the end of the list when there are no order specs,
     * otherwise the index right after the last stored group that is not greater than [group].
     * Inserting after equal groups keeps the insertion order of ties; a plain
     * `binarySearch(group)` gives no guarantee which of several equal elements it finds.
     */
    private fun findIndexToInsert(group: Group): Int {
        if (myOrderSpecs.isNullOrEmpty()) {
            return myOrderedGroups.size
        }
        // The comparison never returns 0, so the search never reports a match and always
        // yields the inverted insertion point: the index of the first strictly greater group.
        return myOrderedGroups.binarySearch { stored -> if (stored <= group) -1 else 1 }.inv()
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import jetbrains.datalore.plot.base.Aes
import jetbrains.datalore.plot.base.DataFrame
import jetbrains.datalore.plot.builder.VarBinding
import jetbrains.datalore.plot.builder.sampling.method.SamplingUtil
import jetbrains.datalore.plot.common.data.SeriesUtil

object OrderOptionUtil {
class OrderOption internal constructor(
Expand All @@ -27,7 +26,7 @@ object OrderOptionUtil {
if (orderBy == null && order == null) {
return null
}
require(order == null || order is Number) {
require(order == null || (order is Number && order.toInt() in listOf(-1, 1))) {
"Unsupported `order` value: $order. Use 1 (ascending) or -1 (descending)."
}

Expand Down Expand Up @@ -56,7 +55,8 @@ object OrderOptionUtil {
fun createOrderSpec(
variables: Set<DataFrame.Variable>,
varBindings: List<VarBinding>,
orderOption: OrderOption
orderOption: OrderOption,
aggregateOperation: ((List<Double?>) -> Double?)?
): DataFrame.OrderSpec {
fun getVariableByName(varName: String): DataFrame.Variable {
return variables.find { it.name == varName }
Expand All @@ -73,21 +73,14 @@ object OrderOptionUtil {
getVariableByName(orderOption.variableName)
}

// TODO Need to define the aggregate operation
val aggregateOperation =
if (orderOption.byVariable != null && orderOption.byVariable != orderOption.variableName) {
// Use ordering by the 'order_by' variable with the specified aggregation
{ v: List<Double?> -> SeriesUtil.mean(v, defaultValue = null) }
} else {
// Use ordering by the 'variable' without aggregation
null
}

return DataFrame.OrderSpec(
variable,
orderOption.byVariable?.let(::getVariableByName) ?: getVariableByName(orderOption.variableName),
orderOption.getOrderDir(),
aggregateOperation
aggregateOperation.takeIf {
// Use the aggregation for ordering by the specified 'order_by' variable
orderOption.byVariable != null && orderOption.byVariable != orderOption.variableName
}
)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,26 @@ object DataMetaUtil {
parameters?.getString(ORDER_BY),
parameters?.read(ORDER)
)
} ?: emptyList()
}
?: emptyList()
}

/**
 * Returns these order options extended with options for the non-discrete variables of [mappings]:
 * a non-discrete variable inherits the settings of the order option defined for its discrete counterpart.
 */
fun List<OrderOptionUtil.OrderOption>.inheritToNonDiscrete(mappings: Map<*, *>): List<OrderOptionUtil.OrderOption> {
    // Only the options defined for discrete variables can be inherited.
    val discreteOptions = filter { isDiscrete(it.variableName) }

    val inheritedOptions = mappings.variables()
        .filterNot(::isDiscrete)
        .mapNotNull { varName ->
            discreteOptions
                .firstOrNull { fromDiscrete(it.variableName) == varName }
                ?.let { source ->
                    OrderOptionUtil.OrderOption.create(
                        varName,
                        // 'order by' pointing to the option's own variable is not carried over
                        orderBy = source.byVariable.takeIf { it != source.variableName },
                        source.getOrderDir()
                    )
                }
        }

    return this + inheritedOptions
}
}

Expand Down
Loading

0 comments on commit df2c7cd

Please sign in to comment.