[WIP] Scala 2.12 / Spark 3 upgrade #550

Open · wants to merge 100 commits into base: master

Changes from 85 commits (100 commits total)

Commits
f6264a7
Update to Spark 2.4.3 and XGBoost 0.90
tovbinm May 30, 2019
685d6e1
special double serializer fix
tovbinm May 30, 2019
e62772d
fix serialization
tovbinm May 30, 2019
69247ac
fix serialization
tovbinm May 30, 2019
330bf50
docs
tovbinm May 30, 2019
d6b0723
fixed missng value for test
wsuchy May 30, 2019
63b77b5
meta fix
tovbinm May 30, 2019
4e46e31
Merge branch 'mt/spark-2.4' of github.com:salesforce/TransmogrifAI in…
tovbinm May 30, 2019
5a528e1
Updated DecisionTreeNumericMapBucketizer test to deal with the change…
Jauntbox May 31, 2019
5f39603
Merge branch 'mt/spark-2.4' of github.com:salesforce/TransmogrifAI in…
Jauntbox May 31, 2019
0d1a0c0
fix params meta test
tovbinm May 31, 2019
0a4f906
FIxed failing xgboost test
wsuchy May 31, 2019
660db62
Merge branch 'mt/spark-2.4' of github.com:salesforce/TransmogrifAI in…
wsuchy May 31, 2019
3ecca64
ident
tovbinm May 31, 2019
507503a
cleanup
tovbinm May 31, 2019
348a392
added dataframe reader and writer extensions
tovbinm Jun 3, 2019
f43cb26
added const
tovbinm Jun 3, 2019
4455034
Merge branch 'master' into mt/spark-2.4
tovbinm Jun 3, 2019
a0978bf
Merge branch 'master' into mt/spark-2.4
tovbinm Jun 10, 2019
82aa188
build for scala 2.12
koertkuipers Jun 20, 2019
b27b47a
Merge branch 'master' of github.com:salesforce/TransmogrifAI into mt/…
tovbinm Jun 21, 2019
6535e4e
added xgboost params + update models to use public predict method
tovbinm Jun 21, 2019
d1d7b9a
blarg
tovbinm Jun 21, 2019
ac75e15
double ser test
tovbinm Jun 21, 2019
761b889
Merge remote-tracking branch 'upstream/mt/spark-2.4' into feat-scala212
koertkuipers Jun 21, 2019
95095ed
fix unit tests by have lambdas implement concrete classes
koertkuipers Jul 9, 2019
76b411b
Merge branch 'master' into feat-scala212
koertkuipers Aug 5, 2019
ecfb902
remove unnecessary method defaultMatches
koertkuipers Aug 5, 2019
a1a2579
Merge branch 'master' into feat-scala212
koertkuipers Aug 8, 2019
785ddc5
Merge branch 'master' into feat-scala212
koertkuipers Aug 27, 2019
aacf00c
Merge branch 'master' into feat-scala212
koertkuipers Aug 28, 2019
c0a888f
Merge branch 'master' into feat-scala212
koertkuipers Aug 31, 2019
9ececc9
use mleap release
koertkuipers Sep 3, 2019
25a4449
Merge commit '51037a80ee6ef48c5c905ee967187288d78559cb' into feat-sca…
koertkuipers Sep 14, 2019
53df597
Merge commit '95a77b17269a71bf0d53c54df7d76f0bfe862275' into feat-sca…
koertkuipers Sep 14, 2019
4460fe5
Merge branch 'master' into feat-scala212
koertkuipers Sep 14, 2019
5b29d8b
Merge branch 'master' into feat-scala212
koertkuipers Sep 25, 2019
713a9f4
Merge branch 'master' into feat-scala212
koertkuipers Oct 6, 2019
69a3678
Merge branch 'feat-scala212' of server02:oss/TransmogrifAI into feat-…
koertkuipers Oct 7, 2019
f4b3f01
Merge branch 'master' into feat-scala212
koertkuipers Oct 10, 2019
142f121
Merge branch 'master' into feat-scala212
koertkuipers Oct 23, 2019
5ee32b1
Merge branch 'master' into feat-scala212
koertkuipers Nov 22, 2019
6e8e130
Merge branch 'feat-scala212' of server02:oss/TransmogrifAI into feat-…
koertkuipers Nov 22, 2019
c3ccdee
Merge branch 'master' into feat-scala212
koertkuipers Jan 23, 2020
ae1dfcf
Merge branch 'master' into feat-scala212
koertkuipers Feb 24, 2020
fd723d6
Increment scala hotfix prompted test change for random based doubles
tresata-gbernard Feb 28, 2020
e0f0bd8
Merge branch 'master' into feat-scala212
koertkuipers May 6, 2020
98dafde
fix random numbers somehow being different in scala 2.12
koertkuipers May 6, 2020
accd2ba
Merge branch 'master' into feat-scala212
koertkuipers Jun 17, 2020
27fdd3e
Merge branch 'master' into feat-scala212
koertkuipers Aug 21, 2020
f0cbc9e
WIP scala-multiversion-plugin
nicodv Sep 11, 2020
7fb9f0a
Merge remote-tracking branch 'tresata/feat-scala212' into ndv/scala212
nicodv Sep 11, 2020
ff29d1b
upgrade xgboost to version that has 2.11 and 2.12 versions published
nicodv Sep 11, 2020
20b8584
version string fixes
nicodv Sep 11, 2020
ca30345
add TODO
nicodv Sep 11, 2020
e2078e1
update TODO
nicodv Sep 11, 2020
fb16bd9
Merge branch 'master' into ndv/scala212
nicodv Mar 10, 2021
5b61508
update version strings
nicodv Mar 11, 2021
807eca9
update several versions to be scala 2.12 and spark 3 compatible
nicodv Mar 11, 2021
3fba576
various compilation fixes
nicodv Mar 11, 2021
dc4adbc
stack is deprecated, use var List
nicodv Mar 11, 2021
2cca254
use new udf interface
nicodv Mar 11, 2021
d3fbf8f
fix test
nicodv Mar 11, 2021
9fbc9da
compilation fix
nicodv Mar 11, 2021
e8c5b7a
compilation fix
nicodv Mar 11, 2021
c61a5b7
deal with moved csv utils
nicodv Mar 11, 2021
017676a
deal with deprecated operator
nicodv Mar 11, 2021
0538892
disable test for now
nicodv Mar 11, 2021
3e252db
add TODO
nicodv Mar 18, 2021
4fe2fdf
Merge branch 'master' into ndv/scala212
tovbinm Mar 18, 2021
c1941e1
be explicit about xgboost4j dependency
nicodv Mar 18, 2021
bdfae00
Merge remote-tracking branch 'origin/ndv/scala212' into ndv/scala212
nicodv Mar 18, 2021
fe4f2fb
drop support for joined data readers and update docs accordingly
nicodv Mar 21, 2021
c649974
deal with deprecated operator
nicodv Mar 22, 2021
c391aac
refactor for Spark API changes to bin. class. metrics
nicodv Mar 22, 2021
5f55dd9
use new 2.12 optimization options
nicodv Mar 22, 2021
642d27c
adhere to new xgboost interface
nicodv Mar 22, 2021
1605bd4
deal with deprecated syntax
nicodv Mar 22, 2021
64ea9d2
update TODO
nicodv Mar 22, 2021
51806fd
fix tree param overrides
crupley Mar 23, 2021
09b2960
replace deprecated range with bigdecimal range
crupley Mar 23, 2021
a946ffb
Use public wrapper to SparkUserDefinedFunction (SparkUDFFactory) to g…
emitc2h Apr 9, 2021
ec7da39
update stack in while loop in FeatureLike.prettyParentStages
emitc2h Apr 16, 2021
5b555e3
re-enabling @JSONdeserialize annotations while preserving the missing…
emitc2h Apr 17, 2021
30e61a3
ensuring consistent behavior between FeatureDistribution equals and h…
emitc2h Apr 23, 2021
9363b20
Merge branch 'master' into ndv/scala212
tovbinm Apr 23, 2021
6f7c841
Added MomentsSerializer to allow json4s to serialize Algebird's Momen…
emitc2h Apr 28, 2021
9a04faf
Merge branch 'ndv/scala212' of github.com:salesforce/TransmogrifAI in…
emitc2h Apr 28, 2021
4f752ab
Fix random seed issues + coefficient ordering issues in ModelInsights
emitc2h Apr 28, 2021
6731b9d
Fix expected results that changed due to changes in random number gen…
emitc2h Apr 28, 2021
b9e18ce
handle nulls and missing keys in cardinality calculations in SmartTex…
emitc2h Apr 28, 2021
c42163d
make test hash function consistent with OpHashingTF hashing (both now…
emitc2h Apr 28, 2021
7082707
Don't shut down sparkContext after running a test suite, clear cache …
emitc2h Apr 29, 2021
355bbe2
fixing unit tests in features
emitc2h May 3, 2021
2cb1827
fixing unit test failures in testkit due to rng outcome changes
emitc2h May 10, 2021
fc5cdc8
Allow for some tolerance when comparing scores after model write/read…
emitc2h May 10, 2021
dc014fa
use legacy mode to read parquet files written with Spark 2.x (SPARK-3…
emitc2h May 10, 2021
f31ce9f
Store input schema column metadata in its own param during stage exec…
emitc2h May 17, 2021
421b9bc
remove debug line
emitc2h May 17, 2021
0038823
Rolling back most of the ColumnMetadata infra since inputSchema metad…
emitc2h May 18, 2021
11 changes: 6 additions & 5 deletions README.md
@@ -128,7 +128,8 @@ Start by picking TransmogrifAI version to match your project dependencies from t

| TransmogrifAI Version | Spark Version | Scala Version | Java Version |
|-------------------------------------------------------|:-------------:|:-------------:|:------------:|
| 0.7.1 (unreleased, master), **0.7.0 (stable)** | **2.4** | **2.11** | **1.8** |
| 0.8.0 (unreleased, master) | 3.1 | 2.12 | 1.8 |
| **0.7.1 (stable)**, 0.7.0 | **2.4** | **2.11** | **1.8** |
| 0.6.1, 0.6.0, 0.5.3, 0.5.2, 0.5.1, 0.5.0 | 2.3 | 2.11 | 1.8 |
| 0.4.0, 0.3.4 | 2.2 | 2.11 | 1.8 |

@@ -140,10 +141,10 @@ repositories {
}
dependencies {
// TransmogrifAI core dependency
compile 'com.salesforce.transmogrifai:transmogrifai-core_2.11:0.7.0'
compile 'com.salesforce.transmogrifai:transmogrifai-core_2.12:0.8.0'

// TransmogrifAI pretrained models, e.g. OpenNLP POS/NER models etc. (optional)
// compile 'com.salesforce.transmogrifai:transmogrifai-models_2.11:0.7.0'
// compile 'com.salesforce.transmogrifai:transmogrifai-models_2.12:0.8.0'
}
```

@@ -154,10 +155,10 @@ scalaVersion := "2.11.12"
resolvers += Resolver.jcenterRepo

// TransmogrifAI core dependency
libraryDependencies += "com.salesforce.transmogrifai" %% "transmogrifai-core" % "0.7.0"
libraryDependencies += "com.salesforce.transmogrifai" %% "transmogrifai-core" % "0.8.0"

// TransmogrifAI pretrained models, e.g. OpenNLP POS/NER models etc. (optional)
// libraryDependencies += "com.salesforce.transmogrifai" %% "transmogrifai-models" % "0.7.0"
// libraryDependencies += "com.salesforce.transmogrifai" %% "transmogrifai-models" % "0.8.0"
```

Then import TransmogrifAI into your code:
63 changes: 32 additions & 31 deletions build.gradle
@@ -7,6 +7,7 @@ buildscript {
dependencies {
classpath 'org.github.ngbinh.scalastyle:gradle-scalastyle-plugin_2.11:1.0.1'
classpath 'com.commercehub.gradle.plugin:gradle-avro-plugin:0.16.0'
classpath 'com.adtran:scala-multiversion-plugin:1.+'
Contributor Author:

Initially I wanted to go with cross-compilation for 2.11/2.12, but now that we're upgrading to Spark 3 too we'll only build for 2.12.

This cross-compilation plugin seems to work well, though, so keeping it around for future 2.13 support.

}
}

@@ -46,6 +47,7 @@ configure(allProjs) {
apply plugin: 'net.minecrell.licenser'
apply plugin: 'com.github.jk1.dependency-license-report'
apply plugin: 'com.github.johnrengelman.shadow'
apply plugin: 'com.adtran.scala-multiversion-plugin'

sourceCompatibility = 1.8
targetCompatibility = 1.8
@@ -54,23 +56,21 @@ configure(allProjs) {
mainClassName = "please.set.main.class.in.build.gradle"

ext {
scalaVersion = '2.11'
scalaVersionRevision = '12'
scalaTestVersion = '3.0.5'
scalaCheckVersion = '1.14.0'
junitVersion = '4.12'
avroVersion = '1.8.2'
sparkVersion = '2.4.5'
sparkVersion = '3.1.1'
scalaGraphVersion = '1.12.5'
scalafmtVersion = '1.5.1'
hadoopVersion = 'hadoop2'
json4sVersion = '3.5.3' // matches Spark dependency version
json4sVersion = '3.7.0-M5' // matches Spark dependency version
jodaTimeVersion = '2.9.4'
jodaConvertVersion = '1.8.1'
algebirdVersion = '0.13.4'
jacksonVersion = '2.7.3'
jacksonVersion = '2.12.2'
luceneVersion = '7.3.0'
enumeratumVersion = '1.4.12'
enumeratumVersion = '1.4.18'
scoptVersion = '3.5.0'
googleLibPhoneNumberVersion = '8.8.5'
googleGeoCoderVersion = '2.82'
@@ -80,15 +80,15 @@ configure(allProjs) {
collectionsVersion = '3.2.2'
optimaizeLangDetectorVersion = '0.0.1'
tikaVersion = '1.22'
sparkTestingBaseVersion = '2.4.3_0.12.0'
sparkTestingBaseVersion = '3.0.1_1.0.0'
sourceCodeVersion = '0.1.3'
pegdownVersion = '1.4.2'
commonsValidatorVersion = '1.6'
commonsIOVersion = '2.6'
scoveragePluginVersion = '1.3.1'
xgboostVersion = '0.90'
akkaSlf4jVersion = '2.3.11'
mleapVersion = '0.16.0'
xgboostVersion = '1.3.1'
akkaSlf4jVersion = '2.5.23'
mleapVersion = '0.16.0' // TODO: upgrade to Spark 3-compatibel 0.17 when ready: https://github.com/combust/mleap/issues/727
Collaborator: typo

memoryFilesystemVersion = '2.1.0'
}

@@ -100,37 +100,37 @@ configure(allProjs) {
dependencies {
// Scala
zinc 'com.typesafe.zinc:zinc:0.3.15'
scoverage "org.scoverage:scalac-scoverage-plugin_$scalaVersion:$scoveragePluginVersion"
scoverage "org.scoverage:scalac-scoverage-runtime_$scalaVersion:$scoveragePluginVersion"
scalaLibrary "org.scala-lang:scala-library:$scalaVersion.$scalaVersionRevision"
scalaCompiler "org.scala-lang:scala-compiler:$scalaVersion.$scalaVersionRevision"
compile "org.scala-lang:scala-library:$scalaVersion.$scalaVersionRevision"
scoverage "org.scoverage:scalac-scoverage-plugin_%%:$scoveragePluginVersion"
scoverage "org.scoverage:scalac-scoverage-runtime_%%:$scoveragePluginVersion"
scalaLibrary "org.scala-lang:scala-library:$scalaVersion"
scalaCompiler "org.scala-lang:scala-compiler:$scalaVersion"
compile "org.scala-lang:scala-library:$scalaVersion"

// Spark
compileOnly "org.apache.spark:spark-core_$scalaVersion:$sparkVersion"
testCompile "org.apache.spark:spark-core_$scalaVersion:$sparkVersion"
compileOnly "org.apache.spark:spark-mllib_$scalaVersion:$sparkVersion"
testCompile "org.apache.spark:spark-mllib_$scalaVersion:$sparkVersion"
compileOnly "org.apache.spark:spark-sql_$scalaVersion:$sparkVersion"
testCompile "org.apache.spark:spark-sql_$scalaVersion:$sparkVersion"
compileOnly "org.apache.spark:spark-core_%%:$sparkVersion"
testCompile "org.apache.spark:spark-core_%%:$sparkVersion"
compileOnly "org.apache.spark:spark-mllib_%%:$sparkVersion"
testCompile "org.apache.spark:spark-mllib_%%:$sparkVersion"
compileOnly "org.apache.spark:spark-sql_%%:$sparkVersion"
testCompile "org.apache.spark:spark-sql_%%:$sparkVersion"

// Test
compileOnly "org.scalatest:scalatest_$scalaVersion:$scalaTestVersion"
testCompile "org.scalatest:scalatest_$scalaVersion:$scalaTestVersion"
compileOnly "org.scalacheck:scalacheck_$scalaVersion:$scalaCheckVersion"
testCompile "org.scoverage:scalac-scoverage-plugin_$scalaVersion:$scoveragePluginVersion"
testCompile "org.scoverage:scalac-scoverage-runtime_$scalaVersion:$scoveragePluginVersion"
testCompile "org.scalacheck:scalacheck_$scalaVersion:$scalaCheckVersion"
testCompile ("com.holdenkarau:spark-testing-base_$scalaVersion:$sparkTestingBaseVersion") { transitive = false }
compileOnly "org.scalatest:scalatest_%%:$scalaTestVersion"
testCompile "org.scalatest:scalatest_%%:$scalaTestVersion"
compileOnly "org.scalacheck:scalacheck_%%:$scalaCheckVersion"
testCompile "org.scoverage:scalac-scoverage-plugin_%%:$scoveragePluginVersion"
testCompile "org.scoverage:scalac-scoverage-runtime_%%:$scoveragePluginVersion"
testCompile "org.scalacheck:scalacheck_%%:$scalaCheckVersion"
testCompile ("com.holdenkarau:spark-testing-base_%%:$sparkTestingBaseVersion") { transitive = false }
testCompile "junit:junit:$junitVersion"
testRuntime "org.pegdown:pegdown:$pegdownVersion"
}

configurations.all {
resolutionStrategy {
force "commons-collections:commons-collections:$collectionsVersion",
"org.scala-lang:scala-library:$scalaVersion.$scalaVersionRevision",
"org.scala-lang:scala-reflect:$scalaVersion.$scalaVersionRevision"
"org.scala-lang:scala-library:$scalaVersion",
"org.scala-lang:scala-reflect:$scalaVersion"
}
}
configurations.zinc {
@@ -149,7 +149,7 @@ configure(allProjs) {
"-language:implicitConversions", "-language:existentials", "-language:postfixOps"
]
}
compileScala.scalaCompileOptions.additionalParameters += "-optimize"
Collaborator: why remove optimization option?

Contributor Author: -optimize is deprecated. I've now added the new optimization flags that replace it.

compileScala.scalaCompileOptions.additionalParameters += ["-opt:l:inline", "-opt-inline-from:**"]
[compileJava, compileTestJava]*.options.collect { options -> options.encoding = 'UTF-8' }

jar {
@@ -161,6 +161,7 @@ }
}

scalaStyle {
scalaVersion = '$scalaVersion'
configLocation = "$rootProject.rootDir/gradle/scalastyle-config.xml"
includeTestSourceDirectory = true
source = "src/main/scala"
7 changes: 3 additions & 4 deletions cli/build.gradle
@@ -1,14 +1,14 @@
dependencies {
// scopt
compile "com.github.scopt:scopt_$scalaVersion:$scoptVersion"
compile "com.github.scopt:scopt_%%:$scoptVersion"

// scalafmt
compile "com.geirsson:scalafmt-core_$scalaVersion:$scalafmtVersion"
compile "com.geirsson:scalafmt-core_%%:$scalafmtVersion"

// Reflections
compile "org.reflections:reflections:$reflectionsVersion"

compile "org.apache.spark:spark-sql_$scalaVersion:$sparkVersion"
compile "org.apache.spark:spark-sql_%%:$sparkVersion"

testCompile project(':utils')

@@ -71,7 +71,6 @@ task copyTemplates(type: Copy) {
expand([
version: scalaVersion,
scalaVersion: scalaVersion,
scalaVersionRevision: scalaVersionRevision,
scalaTestVersion: scalaTestVersion,
junitVersion: junitVersion,
sparkVersion: sparkVersion,
7 changes: 4 additions & 3 deletions core/build.gradle
@@ -21,13 +21,14 @@ dependencies {
compile "org.apache.lucene:lucene-suggest:$luceneVersion"

// Scopt
compile "com.github.scopt:scopt_$scalaVersion:$scoptVersion"
compile "com.github.scopt:scopt_%%:$scoptVersion"

// Zip util
compile 'org.zeroturnaround:zt-zip:1.14'

// XGBoost
compile ("ml.dmlc:xgboost4j-spark:$xgboostVersion") { exclude group: 'com.esotericsoftware.kryo', module: 'kryo' }
compile ("ml.dmlc:xgboost4j_%%:$xgboostVersion") { exclude group: 'com.esotericsoftware.kryo', module: 'kryo' }
compile ("ml.dmlc:xgboost4j-spark_%%:$xgboostVersion") { exclude group: 'com.esotericsoftware.kryo', module: 'kryo' }
Collaborator: I think we need both xgboost4j and xgboost4j-spark here, according to this. However, can we also use the GPU-enabled version of xgboost now? The artifact names are xgboost4j-gpu_2.12 and xgboost4j-spark-gpu_2.12.

Contributor Author: xgboost4j already gets pulled in, but let me add it to be explicit.

W.r.t. GPUs: let's leave that baby alone for now and get this upgrade done first. 😅

Contributor: When you finish this PR you can aim at an e2e on GPU throughout data transformations and XGBoost 😎

Collaborator: yes @gerashegalov i was trying to get people to use rapids ;)

Contributor: On the Spark side we require 3.x, so you are on the right track @TuanNguyen27

// Akka slfj4 logging (version matches XGBoost dependency)
testCompile "com.typesafe.akka:akka-slf4j_$scalaVersion:$akkaSlf4jVersion"
testCompile "com.typesafe.akka:akka-slf4j_%%:$akkaSlf4jVersion"
}
@@ -453,7 +453,7 @@ case object ModelInsights
): ModelInsights = {

// TODO support other model types?
val models = stages.collect{
val models: Array[OPStage with Model[_]] = stages.collect{
case s: SelectedModel => s
case s: OpPredictorWrapperModel[_] => s
case s: SelectedCombinerModel => s
8 changes: 3 additions & 5 deletions core/src/main/scala/com/salesforce/op/OpWorkflow.scala
@@ -39,12 +39,11 @@ import com.salesforce.op.stages.impl.preparators.CorrelationType
import com.salesforce.op.stages.impl.selector.ModelSelector
import com.salesforce.op.utils.reflection.ReflectionUtils
import com.salesforce.op.utils.spark.{JobGroupUtil, OpStep}
import com.salesforce.op.utils.spark.RichDataset._
import com.salesforce.op.utils.stages.FitStagesUtil
import com.salesforce.op.utils.stages.FitStagesUtil.{CutDAG, FittedDAG, Layer, StagesDAG}
import enumeratum.{Enum, EnumEntry}
import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.{Estimator, Transformer}
import org.apache.spark.ml.Transformer
import org.apache.spark.sql.{DataFrame, SparkSession}

import scala.collection.mutable.{MutableList => MList}
@@ -91,7 +90,6 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
val featuresArr = features.toArray
resultFeatures = featuresArr
rawFeatures = featuresArr.flatMap(_.rawFeatures).distinct.sortBy(_.name)
checkUnmatchedFeatures()
setStagesDAG(features = featuresArr)
validateStages()

@@ -238,7 +236,7 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
case (None, None) => throw new IllegalArgumentException(
"Data reader must be set either directly on the workflow or through the RawFeatureFilter")
case (Some(r), None) =>
checkReadersAndFeatures()
checkFeatures()
r.generateDataFrame(rawFeatures, parameters).persist()
case (rd, Some(rf)) =>
rd match {
Expand All @@ -247,7 +245,7 @@ class OpWorkflow(val uid: String = UID[OpWorkflow]) extends OpWorkflowCore {
"Workflow data reader and RawFeatureFilter training reader do not match! " +
"The RawFeatureFilter training reader will be used to generate the data for training")
}
checkReadersAndFeatures()
checkFeatures()

val FilteredRawData(cleanedData, featuresToDrop, mapKeysToDrop, rawFeatureFilterResults) =
rf.generateFilteredRaw(rawFeatures, parameters)
36 changes: 2 additions & 34 deletions core/src/main/scala/com/salesforce/op/OpWorkflowCore.scala
@@ -122,7 +122,6 @@ private[op] trait OpWorkflowCore {
*/
final def setReader(r: Reader[_]): this.type = {
reader = Option(r)
checkUnmatchedFeatures()
this
}

@@ -149,7 +148,6 @@
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[T], Dataset[T]] = Right(ds)
}
reader = Option(newReader)
checkUnmatchedFeatures()
this
}

@@ -166,7 +164,6 @@
def readFn(params: OpParams)(implicit spark: SparkSession): Either[RDD[T], Dataset[T]] = Left(rdd)
}
reader = Option(newReader)
checkUnmatchedFeatures()
this
}

@@ -247,40 +244,11 @@
*/
final def getRawFeatureFilterResults(): RawFeatureFilterResults = rawFeatureFilterResults


/**
* Determine if any of the raw features do not have a matching reader
* Check that features are set and that params match them
*/
protected def checkUnmatchedFeatures(): Unit = {
if (rawFeatures.nonEmpty && reader.nonEmpty) {
val readerInputTypes = reader.get.subReaders.map(_.fullTypeName).toSet
val unmatchedFeatures = rawFeatures.filterNot(f =>
readerInputTypes
.contains(f.originStage.asInstanceOf[FeatureGeneratorStage[_, _ <: FeatureType]].tti.tpe.toString)
)
require(
unmatchedFeatures.isEmpty,
s"No matching data readers for ${unmatchedFeatures.length} input features:" +
s" ${unmatchedFeatures.mkString(",")}. Readers had types: ${readerInputTypes.mkString(",")}"
)
}
}

/**
* Check that readers and features are set and that params match them
*/
protected def checkReadersAndFeatures() = {
protected def checkFeatures() = {
require(rawFeatures.nonEmpty, "Result features must be set")
checkUnmatchedFeatures()

val subReaderTypes = reader.get.subReaders.map(_.typeName).toSet
val unmatchedReaders = subReaderTypes.filterNot { t => parameters.readerParams.contains(t) }

if (unmatchedReaders.nonEmpty) {
log.info(
"Readers for types: {} do not have an override path in readerParams, so the default will be used",
unmatchedReaders.mkString(","))
}
}

/**
@@ -94,7 +94,7 @@ class OpWorkflowModel(val uid: String = UID[OpWorkflowModel], val trainingParams
protected def generateRawData()(implicit spark: SparkSession): DataFrame = {
JobGroupUtil.withJobGroup(OpStep.DataReadingAndFiltering) {
require(reader.nonEmpty, "Data reader must be set")
checkReadersAndFeatures()
checkFeatures()
reader.get.generateDataFrame(rawFeatures, parameters).persist() // don't want to redo this
}
}
@@ -116,9 +116,10 @@ private[op] class OpBinaryClassificationEvaluator
val aUPR = sparkMLMetrics.areaUnderPR()

val confusionMatrixByThreshold = sparkMLMetrics.confusionMatrixByThreshold().collect()
// Since we're not using sample weights, we simply cast the counts back to Longs.
val (copiedTupPos, copiedTupNeg) = confusionMatrixByThreshold.map { case (_, confusionMatrix) =>
((confusionMatrix.numTruePositives, confusionMatrix.numFalsePositives),
(confusionMatrix.numTrueNegatives, confusionMatrix.numFalseNegatives))
((confusionMatrix.weightedTruePositives.toLong, confusionMatrix.weightedFalsePositives.toLong),
(confusionMatrix.weightedTrueNegatives.toLong, confusionMatrix.weightedFalseNegatives.toLong))
}.unzip
val (tpByThreshold, fpByThreshold) = copiedTupPos.unzip
val (tnByThreshold, fnByThreshold) = copiedTupNeg.unzip
@@ -67,7 +67,9 @@ private[op] class OpRegressionEvaluator
isValid = l => l.nonEmpty && (l sameElements l.sorted)
)
setDefault(signedPercentageErrorHistogramBins,
Array(Double.NegativeInfinity) ++ (-100.0 to 100.0 by 10) ++ Array(Double.PositiveInfinity)
Array(Double.NegativeInfinity)
++ (Range.BigDecimal(-100, 100, 10)).map(_.toDouble)
++ Array(Double.PositiveInfinity)
)

def setPercentageErrorHistogramBins(v: Array[Double]): this.type = set(signedPercentageErrorHistogramBins, v)
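The histogram-bin change above replaces the deprecated fractional `Double` range with `Range.BigDecimal`, which steps in exact decimals and converts to `Double` only at the end. A minimal standalone sketch of the idea (not code from this PR; the PR uses the end-exclusive `Range.BigDecimal(...)` form, while this sketch uses `inclusive` to keep the upper edge, and `bins` is just an illustrative name):

```scala
// Standalone sketch: building histogram bin edges without Double-range drift.
// `-100.0 to 100.0 by 10.0` is deprecated on Scala 2.12 because repeated Double
// addition can accumulate rounding error in the generated steps.
val bins: Array[Double] =
  Array(Double.NegativeInfinity) ++
    Range.BigDecimal.inclusive(-100, 100, 10).map(_.toDouble) ++ // exact decimal steps
    Array(Double.PositiveInfinity)
```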
@@ -180,8 +180,8 @@ case class FeatureDistribution
case _ => false
}

override def hashCode(): Int = Objects.hashCode(name, key, count, nulls, distribution,
summaryInfo, moments, cardEstimate, `type`)
override def hashCode(): Int = Objects.hashCode((name, key, count, nulls, distribution.deep,
summaryInfo.deep, moments, cardEstimate, `type`))
}

object FeatureDistribution {
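The `hashCode` change above (wrapping the fields in a tuple and calling `.deep` on the array members) lines up with the earlier `equals` fix: arrays compare by reference, so two `FeatureDistribution`s with identical contents could otherwise compare equal but hash differently. A small standalone sketch of the underlying behaviour (not code from this PR):

```scala
// Standalone sketch: Scala/Java arrays use reference equality and identity hashes.
val a = Array(1.0, 2.0, 3.0)
val b = Array(1.0, 2.0, 3.0)

a == b                              // false: compares references, not contents
a.sameElements(b)                   // true: element-wise comparison
a.deep == b.deep                    // true on Scala 2.12: structural view of the array
a.deep.hashCode == b.deep.hashCode  // true: hashing the deep view is content-based
```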