Change name GradientBoostingTree to GradientBoostedTree

jtengyp · jtengyp · commit c31c908d75f8 · 2017-10-31T15:32:53.000+08:00
diff --git a/bin/workloads/ml/gbt/prepare/prepare.sh b/bin/workloads/ml/gbt/prepare/prepare.sh
@@ -26,7 +26,7 @@ show_bannar start
 rmr_hdfs $INPUT_HDFS || true
 START_TIME=`timestamp`
 
-run_spark_job com.intel.hibench.sparkbench.ml.GradientBoostingTreeDataGenerator $INPUT_HDFS $NUM_EXAMPLES_GBT $NUM_FEATURES_GBT
+run_spark_job com.intel.hibench.sparkbench.ml.GradientBoostedTreeDataGenerator $INPUT_HDFS $NUM_EXAMPLES_GBT $NUM_FEATURES_GBT
 
 END_TIME=`timestamp`
 
diff --git a/bin/workloads/ml/gbt/spark/run.sh b/bin/workloads/ml/gbt/spark/run.sh
@@ -26,7 +26,9 @@ rmr_hdfs $OUTPUT_HDFS || true
 
 SIZE=`dir_size $INPUT_HDFS`
 START_TIME=`timestamp`
-run_spark_job com.intel.hibench.sparkbench.ml.GradientBoostingTree ${INPUT_HDFS} ${NUM_ITERATIONS_GBT}
+# GradientBoostedTree takes its tuning values as named options and the data path as
+# its only positional argument, so the iteration count has to be passed by name.
+run_spark_job com.intel.hibench.sparkbench.ml.GradientBoostedTree --numIterations ${NUM_ITERATIONS_GBT} ${INPUT_HDFS}
 END_TIME=`timestamp`
 
 gen_report ${START_TIME} ${END_TIME} ${SIZE}
diff --git a/conf/workloads/ml/gbt.conf b/conf/workloads/ml/gbt.conf
@@ -1,21 +1,26 @@
-hibench.gbt.tiny.examples                        10
-hibench.gbt.tiny.features                        100
-hibench.gbt.small.examples                       100
-hibench.gbt.small.features                       500
-hibench.gbt.large.examples                       1000
-hibench.gbt.large.features                       2000
-hibench.gbt.huge.examples                        1000
-hibench.gbt.huge.features                        4000
-hibench.gbt.gigantic.examples                    1000
-hibench.gbt.gigantic.features                    8000
-hibench.gbt.bigdata.examples                     1000
-hibench.gbt.bigdata.features                     12000
+hibench.gbt.tiny.examples               10
+hibench.gbt.tiny.features               100
+hibench.gbt.small.examples              100
+hibench.gbt.small.features              500
+hibench.gbt.large.examples              1000
+hibench.gbt.large.features              2000
+hibench.gbt.huge.examples               1000
+hibench.gbt.huge.features               4000
+hibench.gbt.gigantic.examples           1000
+hibench.gbt.gigantic.features           8000
+hibench.gbt.bigdata.examples            1000
+hibench.gbt.bigdata.features            12000
 
 
 hibench.gbt.examples                    ${hibench.gbt.${hibench.scale.profile}.examples}
 hibench.gbt.features                    ${hibench.gbt.${hibench.scale.profile}.features}
 hibench.gbt.partitions                  ${hibench.default.map.parallelism}
-hibench.gbt.numIterations               100
+
+hibench.gbt.numClasses                  2
+hibench.gbt.maxDepth                    30
+hibench.gbt.maxBins                     32
+hibench.gbt.numIterations               20
+hibench.gbt.learningRate                0.1
 
 hibench.workload.input                  ${hibench.hdfs.data.dir}/GBT/Input
 hibench.workload.output                 ${hibench.hdfs.data.dir}/GBT/Output
diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/GradientBoostedTree.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/GradientBoostedTree.scala
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.hibench.sparkbench.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.mllib.tree.GradientBoostedTrees
+import org.apache.spark.mllib.tree.configuration.BoostingStrategy
+import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.regression.LabeledPoint
+
+import scopt.OptionParser
+
+/**
+ * HiBench Spark workload that trains an MLlib GradientBoostedTrees classifier
+ * and prints its error rate on a held-out split of the input data.
+ */
+object GradientBoostedTree {
+
+  /** Command-line settings; the defaults match conf/workloads/ml/gbt.conf. */
+  case class Params(
+    numClasses: Int = 2,
+    maxDepth: Int = 30,
+    maxBins: Int = 32,
+    numIterations: Int = 20,
+    learningRate: Double = 0.1,
+    // Always overwritten by the required <dataPath> argument before run() is called.
+    dataPath: String = null
+  )
+
+  /** Parses `args` into [[Params]] and runs the workload; exits with status 1 if parsing fails. */
+  def main(args: Array[String]): Unit = {
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("GBT"){
+      head("GBT: an example of Gradient Boosted Tree for classification")
+      opt[Int]("numClasses")
+        .text(s"numClasses, default: ${defaultParams.numClasses}")
+        .action((x,c) => c.copy(numClasses = x))
+      opt[Int]("maxDepth")
+        .text(s"maxDepth, default: ${defaultParams.maxDepth}")
+        .action((x,c) => c.copy(maxDepth = x))
+      opt[Int]("maxBins")
+        .text(s"maxBins, default: ${defaultParams.maxBins}")
+        .action((x,c) => c.copy(maxBins = x))
+      opt[Int]("numIterations")
+        .text(s"numIterations, default: ${defaultParams.numIterations}")
+        .action((x,c) => c.copy(numIterations = x))
+      opt[Double]("learningRate")
+        .text(s"learningRate, default: ${defaultParams.learningRate}")
+        .action((x,c) => c.copy(learningRate = x))
+      arg[String]("<dataPath>")
+        .required()
+        .text("data path for Gradient Boosted Tree")
+        .action((x,c) => c.copy(dataPath = x))
+    }
+    parser.parse(args, defaultParams) match {
+      case Some(params) => run(params)
+      case _ => sys.exit(1)
+    }
+  }
+
+  /** Trains on a random 70% of the data at `params.dataPath` and prints the error on the rest. */
+  def run(params: Params): Unit = {
+    val conf = new SparkConf().setAppName(s"Gradient Boosted Tree with $params")
+    val sc = new SparkContext(conf)
+
+    // Load the LabeledPoint records stored as a Spark object file.
+    val data: RDD[LabeledPoint] = sc.objectFile(params.dataPath)
+
+    // Split the data into training and test sets (30% held out for testing)
+    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
+
+    // Train a GradientBoostedTrees model.
+    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
+    boostingStrategy.numIterations = params.numIterations
+    boostingStrategy.learningRate = params.learningRate
+    boostingStrategy.treeStrategy.numClasses = params.numClasses
+    boostingStrategy.treeStrategy.maxDepth = params.maxDepth
+    boostingStrategy.treeStrategy.maxBins = params.maxBins
+    // Empty categoricalFeaturesInfo indicates all features are continuous.
+    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()
+
+    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)
+
+    // Evaluate model on test instances and compute test error
+    val labelAndPreds = testData.map { point =>
+      val prediction = model.predict(point.features)
+      (point.label, prediction)
+    }
+    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
+    println("Test Error = " + testErr)
+
+    sc.stop()
+  }
+}
diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/GradientBoostedTreeDataGenerator.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/GradientBoostedTreeDataGenerator.scala
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.hibench.sparkbench.ml
+
+import com.intel.hibench.sparkbench.common.IOCommon
+
+import scala.util.Random
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.annotation.{DeveloperApi, Since}
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+
+/**
+ * Generates synthetic test data for the Gradient Boosted Tree workload. Labels alternate
+ * between 0.0 and 1.0 by example index, and every feature of a positive example is a
+ * standard Gaussian sample shifted by `eps`.
+ */
+object GradientBoostedTreeDataGenerator {
+
+  /**
+   * Generate an RDD containing test data for Gradient Boosted Tree.
+   *
+   * @param sc SparkContext to use for creating the RDD.
+   * @param nexamples Number of examples that will be contained in the RDD.
+   * @param nfeatures Number of features to generate for each example.
+   * @param eps Offset added to each feature of a positive (label 1.0) example.
+   * @param nparts Number of partitions of the generated RDD. Default value is 2.
+   * @param probOne Currently unused: labels are assigned by index parity, so about half of
+   *                the examples are positive. Kept so existing callers still compile.
+   */
+  def generateGBTRDD(
+    sc: SparkContext,
+    nexamples: Int,
+    nfeatures: Int,
+    eps: Double,
+    nparts: Int = 2,
+    probOne: Double = 0.5): RDD[LabeledPoint] = {
+    sc.parallelize(0 until nexamples, nparts).map { idx =>
+      // Seeding by index makes each example reproducible regardless of partitioning.
+      val rnd = new Random(42 + idx)
+
+      val y = if (idx % 2 == 0) 0.0 else 1.0
+      val x = Array.fill[Double](nfeatures) {
+        rnd.nextGaussian() + (y * eps)
+      }
+      LabeledPoint(y, Vectors.dense(x))
+    }
+  }
+
+  /** Writes the generated data set to `<OUTPUT_PATH>` as a Spark object file. */
+  def main(args: Array[String]): Unit = {
+    // Check the arguments before starting Spark so a usage error never leaves a context open.
+    if (args.length != 3) {
+      System.err.println(
+        "Usage: GradientBoostedTreeDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"
+      )
+      System.exit(1)
+    }
+    val outputPath = args(0)
+    val numExamples = args(1).toInt
+    val numFeatures = args(2).toInt
+    println(s"Output Path: $outputPath")
+    println(s"Num of Examples: $numExamples")
+    println(s"Num of Features: $numFeatures")
+
+    val conf = new SparkConf().setAppName("GradientBoostedTreeDataGenerator")
+    val sc = new SparkContext(conf)
+
+    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
+    // Without an explicit setting use half the default parallelism, but at least one partition.
+    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
+      .getOrElse(math.max(1, parallel / 2).toString).toInt
+    val eps = 0.3
+
+    val data = generateGBTRDD(sc, numExamples, numFeatures, eps, numPartitions)
+
+    data.saveAsObjectFile(outputPath)
+
+    sc.stop()
+  }
+}