Skip to content

Commit

Permalink
Cleanup Helloworld examples (#230)
Browse files Browse the repository at this point in the history
  • Loading branch information
tovbinm committed Feb 20, 2019
1 parent 25061c7 commit 14f697d
Show file tree
Hide file tree
Showing 9 changed files with 34 additions and 148 deletions.
24 changes: 0 additions & 24 deletions helloworld/src/main/avro/Iris.avsc

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -34,35 +34,19 @@ import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._

/**
 * Feature declarations for the Boston housing example.
 *
 * Every feature is extracted from a [[BostonHouse]] record via FeatureBuilder and
 * tagged as a predictor, except `medv`, which is the response the model predicts.
 */
trait BostonFeatures extends Serializable {
  val rowId = FeatureBuilder.Integral[BostonHouse].extract(house => house.rowId.toIntegral).asPredictor
  val crim = FeatureBuilder.RealNN[BostonHouse].extract(house => house.crim.toRealNN).asPredictor
  val zn = FeatureBuilder.RealNN[BostonHouse].extract(house => house.zn.toRealNN).asPredictor
  val indus = FeatureBuilder.RealNN[BostonHouse].extract(house => house.indus.toRealNN).asPredictor
  // chas is wrapped in Option(...) before conversion, so a null value yields an empty pick list
  val chas = FeatureBuilder.PickList[BostonHouse].extract(house => Option(house.chas).toPickList).asPredictor
  val nox = FeatureBuilder.RealNN[BostonHouse].extract(house => house.nox.toRealNN).asPredictor
  val rm = FeatureBuilder.RealNN[BostonHouse].extract(house => house.rm.toRealNN).asPredictor
  val age = FeatureBuilder.RealNN[BostonHouse].extract(house => house.age.toRealNN).asPredictor
  val dis = FeatureBuilder.RealNN[BostonHouse].extract(house => house.dis.toRealNN).asPredictor
  val rad = FeatureBuilder.Integral[BostonHouse].extract(house => house.rad.toIntegral).asPredictor
  val tax = FeatureBuilder.RealNN[BostonHouse].extract(house => house.tax.toRealNN).asPredictor
  val ptratio = FeatureBuilder.RealNN[BostonHouse].extract(house => house.ptratio.toRealNN).asPredictor
  val b = FeatureBuilder.RealNN[BostonHouse].extract(house => house.b.toRealNN).asPredictor
  val lstat = FeatureBuilder.RealNN[BostonHouse].extract(house => house.lstat.toRealNN).asPredictor
  // Response feature (presumably the median home value, per the Boston dataset convention — confirm)
  val medv = FeatureBuilder.RealNN[BostonHouse].extract(house => house.medv.toRealNN).asResponse
}

This file was deleted.

33 changes: 14 additions & 19 deletions helloworld/src/main/scala/com/salesforce/hw/boston/OpBoston.scala
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ import com.salesforce.op.readers.CustomReader
import com.salesforce.op.stages.impl.regression.RegressionModelSelector
import com.salesforce.op.stages.impl.regression.RegressionModelsToTry._
import com.salesforce.op.stages.impl.tuning.DataSplitter
import com.salesforce.op.utils.kryo.OpKryoRegistrator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

Expand All @@ -45,52 +44,48 @@ import org.apache.spark.sql.{Dataset, SparkSession}
*/
object OpBoston extends OpAppWithRunner with BostonFeatures {

override def kryoRegistrator: Class[_ <: OpKryoRegistrator] = classOf[BostonKryoRegistrator]

////////////////////////////////////////////////////////////////////////////////
// READERS DEFINITION
/////////////////////////////////////////////////////////////////////////////////

val randomSeed = 112233L
val randomSeed = 42L

def customRead(path: Option[String], spark: SparkSession): RDD[BostonHouse] = {
require(path.isDefined, "The path is not set")
val myFile = spark.sparkContext.textFile(path.get)
def customRead(path: String)(implicit spark: SparkSession): RDD[BostonHouse] = {
val myFile = spark.sparkContext.textFile(path)

myFile.filter(_.nonEmpty).zipWithIndex.map { case (x, number) =>
myFile.filter(_.nonEmpty).zipWithIndex.map { case (x, id) =>
val words = x.replaceAll("\\s+", " ").replaceAll(s"^\\s+(?m)", "").replaceAll(s"(?m)\\s+$$", "").split(" ")
BostonHouse(number.toInt, words(0).toDouble, words(1).toDouble, words(2).toDouble, words(3), words(4).toDouble,
BostonHouse(id.toInt, words(0).toDouble, words(1).toDouble, words(2).toDouble, words(3), words(4).toDouble,
words(5).toDouble, words(6).toDouble, words(7).toDouble, words(8).toInt, words(9).toDouble,
words(10).toDouble, words(11).toDouble, words(12).toDouble, words(13).toDouble)
}
}

val trainingReader = new CustomReader[BostonHouse](key = _.rowId.toString) {
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = {
val Array(train, _) = customRead(Some(getFinalReadPath(params)), spark).randomSplit(weights = Array(0.9, 0.1),
seed = randomSeed)
Left(train)
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = Left {
val Array(train, _) = customRead(getFinalReadPath(params)).randomSplit(weights = Array(0.9, 0.1), randomSeed)
train
}
}

val scoringReader = new CustomReader[BostonHouse](key = _.rowId.toString) {
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = {
val Array(_, test) = customRead(Some(getFinalReadPath(params)), spark).randomSplit(weights = Array(0.9, 0.1),
seed = randomSeed)
Left(test)
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[BostonHouse], Dataset[BostonHouse]] = Left {
val Array(_, test) = customRead(getFinalReadPath(params)).randomSplit(weights = Array(0.9, 0.1), randomSeed)
test
}
}


////////////////////////////////////////////////////////////////////////////////
// WORKFLOW DEFINITION
/////////////////////////////////////////////////////////////////////////////////

val houseFeatures = Seq(crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat).transmogrify()

val splitter = DataSplitter(seed = randomSeed)

val prediction = RegressionModelSelector
.withCrossValidation(
dataSplitter = Some(DataSplitter(seed = randomSeed)), seed = randomSeed,
dataSplitter = Some(splitter), seed = randomSeed,
modelTypesToUse = Seq(OpGBTRegressor, OpRandomForestRegressor)
).setInput(medv, houseFeatures).getOutput()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,11 @@

package com.salesforce.hw.iris

import com.esotericsoftware.kryo.Kryo
import com.salesforce.op.utils.kryo.OpKryoRegistrator

/**
 * Custom Kryo registrator for the Iris example: registers the
 * `com.salesforce.hw.iris.Iris` class via `doAvroRegistration` so instances
 * can be serialized by Kryo.
 */
class IrisKryoRegistrator extends OpKryoRegistrator {
  // Single registration — expression body, no braces needed
  override def registerCustomClasses(kryo: Kryo): Unit =
    doAvroRegistration[com.salesforce.hw.iris.Iris](kryo)
}
/**
 * One observation from the Iris dataset.
 *
 * @param sepalLength sepal length measurement
 * @param sepalWidth  sepal width measurement
 * @param petalLength petal length measurement
 * @param petalWidth  petal width measurement
 * @param irisClass   class label for this flower
 */
case class Iris(
  sepalLength: Double,
  sepalWidth: Double,
  petalLength: Double,
  petalWidth: Double,
  irisClass: String
)
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,9 @@ import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._

/**
 * Feature declarations for the Iris example.
 *
 * Fix: the original span defined `sepalLength`, `sepalWidth`, `petalLength`,
 * `petalWidth` and `irisClass` twice — once through Avro-style `getX`/`getID`
 * accessors and once through plain case-class fields. Duplicate vals do not
 * compile, and the `get*` accessors do not exist on the plain `Iris` case class
 * declared in this file (fields `sepalLength`, ..., `irisClass`). Only the
 * field-accessor definitions are kept.
 */
trait IrisFeatures extends Serializable {
  // Predictors: the four flower measurements
  val sepalLength = FeatureBuilder.Real[Iris].extract(_.sepalLength.toReal).asPredictor
  val sepalWidth = FeatureBuilder.Real[Iris].extract(_.sepalWidth.toReal).asPredictor
  val petalLength = FeatureBuilder.Real[Iris].extract(_.petalLength.toReal).asPredictor
  val petalWidth = FeatureBuilder.Real[Iris].extract(_.petalWidth.toReal).asPredictor
  // Response: the class label to predict
  val irisClass = FeatureBuilder.Text[Iris].extract(_.irisClass.toText).asResponse
}
23 changes: 6 additions & 17 deletions helloworld/src/main/scala/com/salesforce/hw/iris/OpIris.scala
Original file line number Diff line number Diff line change
Expand Up @@ -32,37 +32,24 @@ package com.salesforce.hw.iris

import com.salesforce.op._
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.readers.CustomReader
import com.salesforce.op.readers.DataReaders
import com.salesforce.op.stages.impl.classification.MultiClassificationModelSelector
import com.salesforce.op.stages.impl.tuning.DataCutter
import com.salesforce.op.utils.kryo.OpKryoRegistrator
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.Encoders

/**
* TransmogrifAI MultiClass Classification example on the Iris Dataset
*/
object OpIris extends OpAppWithRunner with IrisFeatures {

override def kryoRegistrator: Class[_ <: OpKryoRegistrator] = classOf[IrisKryoRegistrator]
implicit val irisEncoder = Encoders.product[Iris]

////////////////////////////////////////////////////////////////////////////////
// READER DEFINITIONS
/////////////////////////////////////////////////////////////////////////////////

val randomSeed = 42

val irisReader = new CustomReader[Iris](key = _.getID.toString){
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[Iris], Dataset[Iris]] = {
val path = getFinalReadPath(params)
val myFile = spark.sparkContext.textFile(path)

Left(myFile.filter(_.nonEmpty).zipWithIndex.map { case (x, id) =>
val Array(sepalLength, sepalWidth, petalLength, petalWidth, klass) = x.split(",")
new Iris(id.toInt, sepalLength.toDouble, sepalWidth.toDouble, petalLength.toDouble, petalWidth.toDouble, klass)
})
}
}
val irisReader = DataReaders.Simple.csvCase[Iris]()

////////////////////////////////////////////////////////////////////////////////
// WORKFLOW DEFINITION
Expand All @@ -72,6 +59,8 @@ object OpIris extends OpAppWithRunner with IrisFeatures {

val features = Seq(sepalLength, sepalWidth, petalLength, petalWidth).transmogrify()

val randomSeed = 42L

val cutter = DataCutter(reserveTestFraction = 0.2, seed = randomSeed)

val prediction = MultiClassificationModelSelector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ object OpTitanic extends OpAppWithRunner with TitanicFeatures {
// READER DEFINITION
/////////////////////////////////////////////////////////////////////////////////

val randomSeed = 112233L
val randomSeed = 42L
val simpleReader = DataReaders.Simple.csv[Passenger](
schema = Passenger.getClassSchema.toString, key = _.getPassengerId.toString
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,27 +34,15 @@ import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._

/**
 * Feature declarations for the Titanic example.
 *
 * Features are extracted from [[Passenger]] records through their `getX`
 * accessors; nullable columns are passed through `Option(...)` first, so a
 * null value becomes `None` instead of throwing.
 */
trait TitanicFeatures extends Serializable {
  // Response: survival flag converted to a non-nullable real
  val survived = FeatureBuilder.RealNN[Passenger].extract(p => p.getSurvived.toDouble.toRealNN).asResponse
  // Categorical predictors are rendered as pick lists, null-safe via Option(...)
  val pClass = FeatureBuilder.PickList[Passenger].extract(p => Option(p.getPclass).map(_.toString).toPickList).asPredictor // scalastyle:off
  val name = FeatureBuilder.Text[Passenger].extract(p => Option(p.getName).toText).asPredictor
  val sex = FeatureBuilder.PickList[Passenger].extract(p => Option(p.getSex).toPickList).asPredictor
  // getAge yields a boxed value; Double.unbox before wrapping in Option
  val age = FeatureBuilder.Real[Passenger].extract(p => Option(Double.unbox(p.getAge)).toReal).asPredictor
  val sibSp = FeatureBuilder.PickList[Passenger].extract(p => Option(p.getSibSp).map(_.toString).toPickList).asPredictor
  val parch = FeatureBuilder.PickList[Passenger].extract(p => Option(p.getParch).map(_.toString).toPickList).asPredictor
  val ticket = FeatureBuilder.PickList[Passenger].extract(p => Option(p.getTicket).toPickList).asPredictor
  val fare = FeatureBuilder.Real[Passenger].extract(p => Option(Double.unbox(p.getFare)).toReal).asPredictor
  val cabin = FeatureBuilder.PickList[Passenger].extract(p => Option(p.getCabin).toPickList).asPredictor
  val embarked = FeatureBuilder.PickList[Passenger].extract(p => Option(p.getEmbarked).toPickList).asPredictor
}

0 comments on commit 14f697d

Please sign in to comment.