-
Notifications
You must be signed in to change notification settings - Fork 0
/
hashingTF_pg15and16.txt
51 lines (34 loc) · 1.41 KB
/
hashingTF_pg15and16.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
//Hashing TF
import org.apache.spark.ml.feature.HashingTF
//Stop Words Removal
import org.apache.spark.ml.feature.StopWordsRemover
//Tokenization
import org.apache.spark.ml.feature.Tokenizer
// Build the input corpus: five (id, sentence) rows to feed the feature pipeline.
val lines = Seq(
  (1, "Hello there, how do you like the book so far?"),
  (2, "I am new to Machine Learning"),
  (3, "Maybe i should get some coffee before starting"),
  (4, "Coffee is best when you drink it hot"),
  (5, "Book stores have coffee too so i should go to a book store")
)
val sentenceDF = spark.createDataFrame(lines).toDF("id", "sentence")

// Stage 1 — tokenization: split each "sentence" into a "words" array column.
val tokenizer = new Tokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
val wordsDF = tokenizer.transform(sentenceDF)
// Untruncated view of id, sentence and the new words column.
wordsDF.show(truncate = false)

// Stage 2 — stop-word removal: filter the token arrays into "filteredWords".
val remover = new StopWordsRemover()
  .setInputCol("words")
  .setOutputCol("filteredWords")
val noStopWordsDF = remover.transform(wordsDF)
noStopWordsDF.show(truncate = false)
// Side-by-side comparison of the raw sentence and its surviving tokens.
noStopWordsDF.select("sentence", "filteredWords").show(numRows = 5, truncate = false)

// Stage 3 — hashing TF: map filteredWords into a sparse term-frequency vector.
// NOTE(review): 100 buckets is small, so hash collisions between distinct
// terms are likely — acceptable for a demo, raise for real feature extraction.
val hashingTF = new HashingTF()
  .setInputCol("filteredWords")
  .setOutputCol("rawFeatures")
  .setNumFeatures(100)
val rawFeaturesDF = hashingTF.transform(noStopWordsDF)
rawFeaturesDF.show(truncate = false)