-
Notifications
You must be signed in to change notification settings - Fork 0
/
Lecture-9.Rhistory
103 lines (84 loc) · 2.65 KB
/
Lecture-9.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Attach the text-mining package "tm". VCorpus(), DirSource(), tm_map() and
# DocumentTermMatrix() used throughout this script all come from it, so the
# script cannot run in a fresh session without this call.
library(tm)

# Build the path to the directory that holds the corpus.
# Note this is a path relative to the working directory of the original
# session; change it to wherever the text files live on your machine.
cname <- file.path("Data", "TextData")
cname
# List the files that will be read, as a quick check that the path is right.
dir(cname)

# Read the files found in that directory into a corpus and summarise it.
docs <- VCorpus(DirSource(cname))
summary(docs)
# We do some preprocessing on the corpus ...
# The order of the steps matters:
#   * text is lower-cased first, because the stop-word list is lower case;
#   * stop words are removed while apostrophes are still in the text, because
#     the English list contains contractions such as "don't" and "i'm" that
#     would turn into "dont" / "im" and survive if punctuation were stripped
#     beforehand.
# Convert to lowercase
docs <- tm_map(docs, content_transformer(tolower))
# Remove stop words
docs <- tm_map(docs, removeWords, stopwords("english"))
# Now remove punctuation, and look at one document to check the result
docs <- tm_map(docs, removePunctuation)
inspect(docs[[5]])
# We can also remove special chars ...
# And numbers ...
docs <- tm_map(docs, removeNumbers)
# This applies stemming ...
docs <- tm_map(docs, stemDocument)
# Strip the extra whitespace left behind by the removals above
docs <- tm_map(docs, stripWhitespace)
# Finally, indicate that this is plaintext. Kept from the original session,
# whose author already doubted that it is needed.
# NOTE(review): this looks redundant for a VCorpus whose documents are already
# plain text - confirm, then drop.
docs <- tm_map(docs, content_transformer(PlainTextDocument))
# Look at one processed document: the object, its components, and its text
docs[[3]]
names(docs[[3]])
docs[[3]]$content
# Build the document-term matrix from the processed corpus and look at it:
# first the summary, then the top-left 5 x 20 corner.
dtm <- DocumentTermMatrix(docs)
dtm
inspect(dtm[seq_len(5), seq_len(20)])

# Total count of every term across all documents (column sums of the dense
# matrix), plus the ordering of those totals from smallest to largest.
freq <- colSums(as.matrix(dtm))
length(freq)
ord <- order(freq)

# The DTM is large - it can be reduced by dropping "sparse" terms.
# The call below keeps only terms whose columns are at most 10% empty.
dtms <- removeSparseTerms(dtm, sparse = 0.1)
inspect(dtms)

# Least frequently occurring terms ...
head(freq[ord])
# ... and the most frequently occurring terms
tail(freq[ord])

# How many terms share each frequency value: low end of the table first.
# In the original session, 1514 words appeared only once, etc.
head(table(freq), n = 20)
# High end of the same table. In the original session there was one word
# that appeared 126 times.
tail(table(freq), n = 20)
# Switch to the less sparse matrix, dtms: recompute the per-term totals from
# it (this overwrites the totals computed from the full dtm) and print them.
freq <- colSums(as.matrix(dtms))
freq

# Another way to get at frequent terms: ask the full dtm directly for
# every term that appears 20 or more times.
findFreqTerms(dtm, lowfreq = 20)

# One more approach: put the terms and their totals side by side in a
# data frame, which is the shape the plotting code expects.
wf <- data.frame(
  word = names(freq),
  freq = freq
)
head(wf)
# Attach the package "ggplot2", which provides ggplot(), aes(), geom_bar(),
# theme() and element_text() used below.
library(ggplot2)

# Let's plot the words that appear at least 20 times.
# The filter uses >= so that a word with a count of exactly 20 is included,
# which is what "at least 20" means.
p <- ggplot(subset(wf, freq >= 20), aes(word, freq)) +
  # Bar heights are the counts already stored in wf, not tallies of rows
  geom_bar(stat = "identity") +
  # Tilt the x-axis labels so that the words do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
# Which terms tend to occur together with a given term? For each of the two
# terms below, list the terms in the full dtm whose correlation with it is
# at least 0.98.
findAssocs(
  dtm,
  terms = c("piano", "cartoon"),
  corlimit = 0.98
)
# Word clouds are another way to show term frequencies; they can be created
# with the wordcloud package.
# Let's try an H-Cluster on the corpus ...
# dist() measures distances between the rows of a matrix. The reduced DTM is
# converted to an ordinary matrix explicitly and transposed, so that each row
# is a term. The method is spelled "euclidean", as in the dist() documentation.
d <- dist(t(as.matrix(dtms)), method = "euclidean")
fit <- hclust(d = d, method = "ward.D2")
fit
plot(fit)
# Notice in this hclust, we used the transpose of dtms,
# so we actually clustered the terms. Can we cluster
# the docs? Yes: without the transpose the rows are the documents, so the
# same two steps cluster the documents instead. Note that d and fit are
# overwritten by this second run.
d <- dist(as.matrix(dtms), method = "euclidean")
fit <- hclust(d = d, method = "ward.D2")
fit
plot(fit)