CrossValidator with Pipeline Example-spark技术分享

CrossValidator with Pipeline Example

Caution

FIXME The example below does NOT work. Being investigated.

Caution

FIXME Can k-means be crossvalidated? Does it make any sense? Does it only applies to supervised learning?



// Let's create a pipeline with transformers and estimator
import org.apache.spark.ml.feature._

val tok = new Tokenizer().setInputCol("text")

val hashTF = new HashingTF()
  .setInputCol(tok.getOutputCol)
  .setOutputCol("features")
  .setNumFeatures(10)

import org.apache.spark.ml.classification.RandomForestClassifier
val rfc = new RandomForestClassifier

import org.apache.spark.ml.Pipeline
val pipeline = new Pipeline()
  .setStages(Array(tok, hashTF, rfc))

// CAUTION: label must be double
// 0 = scientific text
// 1 = non-scientific text
val trainDS = Seq(
  (0L, "[science] hello world", 0d),
  (1L, "long text", 1d),
  (2L, "[science] hello all people", 0d),
  (3L, "[science] hello hello", 0d)).toDF("id", "text", "label").cache

// Check out the train dataset
// Values in label and prediction columns should be alike
val sampleModel = pipeline.fit(trainDS)
sampleModel
  .transform(trainDS)
  .select('text, 'label, 'features, 'prediction)
  .show(truncate = false)

+--------------------------+-----+--------------------------+----------+
|text                      |label|features                  |prediction|
+--------------------------+-----+--------------------------+----------+
|[science] hello world     |0.0  |(10,[0,8],[2.0,1.0])      |0.0       |
|long text                 |1.0  |(10,[4,9],[1.0,1.0])      |1.0       |
|[science] hello all people|0.0  |(10,[0,6,8],[1.0,1.0,2.0])|0.0       |
|[science] hello hello     |0.0  |(10,[0,8],[1.0,2.0])      |0.0       |
+--------------------------+-----+--------------------------+----------+

val input = Seq("Hello ScienCE").toDF("text")
sampleModel
  .transform(input)
  .select('text, 'rawPrediction, 'prediction)
  .show(truncate = false)

+-------------+--------------------------------------+----------+
|text         |rawPrediction                         |prediction|
+-------------+--------------------------------------+----------+
|Hello ScienCE|[12.666666666666668,7.333333333333333]|0.0       |
+-------------+--------------------------------------+----------+

import org.apache.spark.ml.tuning.ParamGridBuilder
val paramGrid = new ParamGridBuilder().build

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
val binEval = new BinaryClassificationEvaluator

import org.apache.spark.ml.tuning.CrossValidator
val cv = new CrossValidator()
  .setEstimator(pipeline) // <-- pipeline is the estimator
  .setEvaluator(binEval)  // has to match the estimator
  .setEstimatorParamMaps(paramGrid)

// WARNING: It does not work!!!
val cvModel = cv.fit(trainDS)

// Let's create a pipeline with transformers and estimator

import org.apache.spark.ml.feature._

val tok = new Tokenizer().setInputCol("text")

val hashTF = new HashingTF()

.setInputCol(tok.getOutputCol)

.setOutputCol("features")

.setNumFeatures(10)

import org.apache.spark.ml.classification.RandomForestClassifier

val rfc = new RandomForestClassifier

import org.apache.spark.ml.Pipeline

val pipeline = new Pipeline()

.setStages(Array(tok, hashTF, rfc))

// CAUTION: label must be double

// 0 = scientific text

// 1 = non-scientific text

val trainDS = Seq(

(0L, "[science] hello world", 0d),

(1L, "long text", 1d),

(2L, "[science] hello all people", 0d),

(3L, "[science] hello hello", 0d)).toDF("id", "text", "label").cache

// Check out the train dataset

// Values in label and prediction columns should be alike

val sampleModel = pipeline.fit(trainDS)

sampleModel

.transform(trainDS)

.select('text, 'label, 'features, 'prediction)

.show(truncate = false)

+--------------------------+-----+--------------------------+----------+

+--------------------------+-----+--------------------------+----------+

|[science] hello world |0.0 |(10,[0,8],[2.0,1.0]) |0.0 |

|long text |1.0 |(10,[4,9],[1.0,1.0]) |1.0 |

|[science] hello all people|0.0 |(10,[0,6,8],[1.0,1.0,2.0])|0.0 |

|[science] hello hello |0.0 |(10,[0,8],[1.0,2.0]) |0.0 |

+--------------------------+-----+--------------------------+----------+

val input = Seq("Hello ScienCE").toDF("text")

sampleModel

.transform(input)

.select('text, 'rawPrediction, 'prediction)

.show(truncate = false)

+-------------+--------------------------------------+----------+

|text |rawPrediction |prediction|

+-------------+--------------------------------------+----------+

|Hello ScienCE|[12.666666666666668,7.333333333333333]|0.0 |

+-------------+--------------------------------------+----------+

import org.apache.spark.ml.tuning.ParamGridBuilder

val paramGrid = new ParamGridBuilder().build

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

val binEval = new BinaryClassificationEvaluator

import org.apache.spark.ml.tuning.CrossValidator

val cv = new CrossValidator()

.setEstimator(pipeline) // <-- pipeline is the estimator

.setEvaluator(binEval) // has to match the estimator

.setEstimatorParamMaps(paramGrid)

// WARNING: It does not work!!!

val cvModel = cv.fit(trainDS)

CrossValidator with Pipeline Example

CrossValidator with Pipeline Example

相关推荐

欢迎关注：spark技术分享

热门标签

近期文章

分类目录

关注公众号：spark技术分享

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏

QQ咨询

回顶部