Pretty print model summaries #25

Closed · wants to merge 23 commits

Changes from 1 commit

some renames
tovbinm committed Jun 22, 2018
commit 64c91c53fe1dbc0f258b9012962ed85934e657ea
core/src/main/scala/com/salesforce/op/ModelInsights.scala (61 changes: 40 additions & 21 deletions)
@@ -39,13 +39,13 @@ import com.salesforce.op.stages.impl.classification.ClassificationModelsToTry.{D
 import com.salesforce.op.stages.impl.preparators._
 import com.salesforce.op.stages.impl.regression.RegressionModelsToTry
 import com.salesforce.op.stages.impl.regression.RegressionModelsToTry.{DecisionTreeRegression, GBTRegression, LinearRegression, RandomForestRegression}
-import com.salesforce.op.stages.impl.selector.{ModelSelectorBase, ModelSelectorBaseNames, SelectedModel}
+import com.salesforce.op.stages.impl.selector.ModelSelectorBaseNames._
+import com.salesforce.op.stages.impl.selector.{ModelSelectorBase, SelectedModel}
 import com.salesforce.op.stages.{OPStage, OpPipelineStageParams, OpPipelineStageParamsNames}
 import com.salesforce.op.utils.json.JsonUtils
 import com.salesforce.op.utils.spark.OpVectorMetadata
 import com.salesforce.op.utils.spark.RichMetadata._
-import enumeratum.EnumEntry
+import enumeratum._
 import org.apache.spark.ml.classification._
 import org.apache.spark.ml.regression._
 import org.apache.spark.ml.{Model, PipelineStage, Transformer}
@@ -79,30 +79,42 @@ case class ModelInsights
 ) {
 
   /**
-   * Best model UID
+   * Selected model UID
    */
-  def bestModelUid: String = selectedModelInfo(BestModelUid).toString
+  def selectedModelUID: String = selectedModelInfo(BestModelUid).toString
 
   /**
-   * Best model name
+   * Selected model name
    */
-  def bestModelName: String = selectedModelInfo(BestModelName).toString
+  def selectedModelName: String = selectedModelInfo(BestModelName).toString
 
   /**
-   * Best model type, i.e. LogisticRegression, RandomForest etc.
+   * Selected model type, i.e. LogisticRegression, RandomForest etc.
    */
-  def bestModelType: EnumEntry = {
-    classificationModelTypeOfUID.orElse(regressionModelTypeOfUID).lift(bestModelUid).getOrElse(
-      throw new Exception(s"Unsupported model type for best model '$bestModelUid'"))
+  def selectedModelType: EnumEntry = {
+    classificationModelTypeOfUID.orElse(regressionModelTypeOfUID).lift(selectedModelUID).getOrElse(
+      throw new Exception(s"Unsupported model type for best model '$selectedModelUID'"))
   }
 
   /**
-   * Best model validation results computed during Cross Validation or Train Validation Split
+   * Selected model validation results computed during Cross Validation or Train Validation Split
    */
-  def bestModelValidationResults: Map[String, String] = validationResults(bestModelName)
+  def selectedModelValidationResults: Map[String, String] = validationResults(selectedModelName)
 
   /**
-   * Validation results computed during Cross Validation or Train Validation Split
+   * Train set evaluation metrics for selected model
+   */
+  def selectedModelTrainEvalMetrics: EvaluationMetrics = evaluationMetrics(TrainingEval)
+
+  /**
+   * Test set evaluation metrics (if any) for selected model
+   */
+  def selectedModelTestEvalMetrics: Option[EvaluationMetrics] = {
+    selectedModelInfo.get(HoldOutEval).map(_ => evaluationMetrics(HoldOutEval))
+  }
+
+  /**
+   * Validation results for all models computed during Cross Validation or Train Validation Split
    *
    * @return validation results keyed by model name
    */
@@ -119,15 +131,13 @@
   }
 
   /**
-   * Train set evaluation metrics
-   */
-  def trainEvaluationMetrics: EvaluationMetrics = evaluationMetrics(TrainingEval)
-
-  /**
-   * Test set evaluation metrics (if any)
+   * Problem type, i.e. Binary Classification, Multi Classification or Regression
    */
-  def testEvaluationMetrics: Option[EvaluationMetrics] = {
-    selectedModelInfo.get(HoldOutEval).map(_ => evaluationMetrics(HoldOutEval))
+  def problemType: ProblemType = selectedModelTrainEvalMetrics match {
+    case _: BinaryClassificationMetrics => ProblemType.BinaryClassification
+    case _: MultiClassificationMetrics => ProblemType.MultiClassification
+    case _: RegressionMetrics => ProblemType.Regression
+    case _ => ProblemType.Unknown
   }
 
   /**
@@ -192,6 +202,15 @@
   }
 }
 
+sealed trait ProblemType extends EnumEntry with Serializable
+object ProblemType extends Enum[ProblemType] {
+  val values = findValues
+  case object BinaryClassification extends ProblemType
+  case object MultiClassification extends ProblemType
+  case object Regression extends ProblemType
+  case object Unknown extends ProblemType
+}
+
 /**
  * Summary information about label used in model creation (all fields will be empty if no label is found)
  *
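A minimal usage sketch of the renamed accessors and the new ProblemType enum (illustrative only: `model` and `prediction` stand in for an already-fitted OpWorkflowModel and its prediction feature, neither of which is part of this diff):

import com.salesforce.op.{ModelInsights, ProblemType}

// Hypothetical setup: insights extracted from a fitted workflow model
val insights: ModelInsights = model.modelInsights(prediction)

// Renamed in this commit: bestModelUid / bestModelName / bestModelType -> selectedModel*
val uid: String = insights.selectedModelUID
val name: String = insights.selectedModelName

// New in this commit: problem type derived from the train set evaluation metrics
insights.problemType match {
  case ProblemType.BinaryClassification => println(s"binary classifier: $name")
  case ProblemType.MultiClassification => println(s"multiclass classifier: $name")
  case ProblemType.Regression => println(s"regressor: $name")
  case ProblemType.Unknown => println(s"unknown problem type for: $name")
}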
core/src/main/scala/com/salesforce/op/OpWorkflowModel.scala (53 changes: 40 additions & 13 deletions)
@@ -35,14 +35,13 @@ import com.salesforce.op.evaluators.{EvaluationMetrics, OpEvaluatorBase}
 import com.salesforce.op.features.types.FeatureType
 import com.salesforce.op.features.{FeatureLike, OPFeature}
 import com.salesforce.op.readers.DataFrameFieldNames._
-import com.salesforce.op.stages.impl.selector.StageParamNames
 import com.salesforce.op.stages.{OPStage, OpPipelineStage, OpTransformer}
 import com.salesforce.op.utils.spark.RichDataset._
 import com.salesforce.op.utils.spark.RichMetadata._
 import org.apache.spark.ml._
 import org.apache.spark.rdd.RDD
+import com.salesforce.op.utils.table.Alignment._
+import com.salesforce.op.utils.table.Table
-import org.apache.spark.sql.types.Metadata
-import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.json4s.JValue
 import org.json4s.JsonAST.{JField, JObject}
 import org.json4s.jackson.JsonMethods.{pretty, render}
@@ -190,15 +189,43 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams
    * @return compact print friendly string
    */
   def summaryPretty(): String = {
-    val prediction = resultFeatures.find(_.name == StageParamNames.outputParam1Name).orElse(
-      stages.map(_.getOutput()).find(_.name == StageParamNames.outputParam1Name)
-    ).getOrElse(
-      throw new Exception("No prediction feature is defined")
-    )
-    val insights = modelInsights(prediction)
-
-    // TODO
-    throw new NotImplementedError
+    val response = resultFeatures.find(_.isResponse).getOrElse(throw new Exception("No response feature is defined"))
+    val insights = modelInsights(response)
+    val summary = new ArrayBuffer[String]()
+
+    // Selected model information
+    summary += {
+      val bestModelType = insights.selectedModelType
+      val name = s"Selected model - $bestModelType"
+      val validationResults = insights.selectedModelValidationResults.toSeq ++ Seq(
+        "name" -> insights.selectedModelName,
+        "uid" -> insights.selectedModelUID,
+        "modelType" -> insights.selectedModelType
+      )
+      val table = Table(name = name, columns = Seq("Model Param", "Value"), rows = validationResults.sortBy(_._1))
+      table.prettyString()
+    }
+
+    // Model evaluation metrics
+    summary += {
+      val name = "Model Evaluation Metrics"
+      val trainEvaluationMetrics = insights.selectedModelTrainEvalMetrics
+      val testEvaluationMetrics = insights.selectedModelTestEvalMetrics
+      val (metricNameCol, holdOutCol, trainingCol) = ("Metric Name", "Hold Out Set Value", "Training Set Value")
+      val trainMetrics = trainEvaluationMetrics.toMap.map { case (k, v) => k -> v.toString }.toSeq.sortBy(_._1)
+      val table = testEvaluationMetrics match {
+        case Some(testMetrics) =>
+          val testMetricsMap = testMetrics.toMap
+          val rows = trainMetrics.map { case (k, v) => (k, v.toString, testMetricsMap(k).toString) }
+          Table(name = name, columns = Seq(metricNameCol, trainingCol, holdOutCol), rows = rows)
+        case None =>
+          Table(name = name, columns = Seq(metricNameCol, trainingCol), rows = trainMetrics)
+      }
+      table.prettyString(columnAlignments = Map(holdOutCol -> Right, trainingCol -> Right))
+    }
+
+
+    summary.mkString("\n\n")
   }
 
   /**
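The tables above are rendered with the com.salesforce.op.utils.table.Table utility now imported in this file. A small illustrative sketch of that API as summaryPretty() uses it (the metric values are invented; only the Table(...) and prettyString(...) call shapes mirror the diff above):

import com.salesforce.op.utils.table.Alignment._
import com.salesforce.op.utils.table.Table

// Two-column table with right-aligned values, as in the evaluation metrics block above
val rows = Seq("AuPR" -> "0.93", "AuROC" -> "0.95").sortBy(_._1)
val table = Table(name = "Model Evaluation Metrics", columns = Seq("Metric Name", "Training Set Value"), rows = rows)
println(table.prettyString(columnAlignments = Map("Training Set Value" -> Right)))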
core/src/test/scala/com/salesforce/op/ModelInsightsTest.scala (15 changes: 8 additions & 7 deletions)
@@ -241,22 +241,23 @@ class ModelInsightsTest extends FlatSpec with PassengerSparkFixtureTest {
 
   it should "return best model information" in {
     val insights = workflowModel.modelInsights(prob)
-    insights.bestModelUid should startWith("logreg_")
-    insights.bestModelName should startWith("logreg_")
-    insights.bestModelType shouldBe LogisticRegression
-    val bestModelValidationResults = insights.bestModelValidationResults
+    insights.selectedModelUID should startWith("logreg_")
+    insights.selectedModelName should startWith("logreg_")
+    insights.selectedModelType shouldBe LogisticRegression
+    val bestModelValidationResults = insights.selectedModelValidationResults
     bestModelValidationResults.size shouldBe 15
     bestModelValidationResults.get("area under PR") shouldBe Some("0.0")
     val validationResults = insights.validationResults
     validationResults.size shouldBe 2
-    validationResults.get(insights.bestModelName) shouldBe Some(bestModelValidationResults)
+    validationResults.get(insights.selectedModelName) shouldBe Some(bestModelValidationResults)
   }
 
   it should "return test/train evaluation metrics" in {
     val insights = workflowModel.modelInsights(prob)
-    insights.trainEvaluationMetrics shouldBe
+    insights.problemType shouldBe ProblemType.BinaryClassification
+    insights.selectedModelTrainEvalMetrics shouldBe
       BinaryClassificationMetrics(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0)
-    insights.testEvaluationMetrics shouldBe Some(
+    insights.selectedModelTestEvalMetrics shouldBe Some(
       BinaryClassificationMetrics(0.0, 0.0, 0.0, 0.5, 0.75, 0.5, 0.0, 1.0, 0.0, 1.0)
     )
   }
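End to end, the behavior exercised by these tests might be invoked as follows (a sketch under the test fixture's assumptions: `workflow` and `prob` are taken from the surrounding test class, not from this diff):

// Assumed fixture, mirroring ModelInsightsTest: a fitted binary classification workflow
val workflowModel: OpWorkflowModel = workflow.train()
val insights = workflowModel.modelInsights(prob)

insights.problemType                     // ProblemType.BinaryClassification
insights.selectedModelTrainEvalMetrics   // train set BinaryClassificationMetrics
insights.selectedModelTestEvalMetrics    // Some(...) when a holdout set was evaluated

// Pretty-printed summary combining the selected model table and the metrics table
println(workflowModel.summaryPretty())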