Lecture-9.Rhistory

# First, attach the package "tm"
# First we load in the corpus - create a path to the directory
# Note this is my path, you need to create the one that
# works on your machine.
cname <- file.path("Data", "TextData")
cname
dir(cname)
docs <- VCorpus(DirSource(cname))
summary(docs)

# We do some preprocessing on the corpus ...
# First, remove punctuation
docs <- tm_map(docs, removePunctuation)
inspect(docs[[5]])

# We can also remove special chars ...
# And numbers ...
docs <- tm_map(docs, removeNumbers)

# Convert to lowercase
docs <- tm_map(docs, content_transformer(tolower))

# Finally, stop words
docs <- tm_map(docs, removeWords, stopwords("english"))

# This applies stemming ...
docs <- tm_map(docs, stemDocument)

# Strip whitespace
docs <- tm_map(docs, stripWhitespace)
# Finally, indicate that this is plaintext - not really needed?
docs <- tm_map(docs, content_transformer(PlainTextDocument))
docs[[3]]
names(docs[[3]])
docs[[3]]$content
dtm <- DocumentTermMatrix(docs)
dtm
inspect(dtm[1:5,1:20])

# Here we organize terms by frequency
freq <- colSums(as.matrix(dtm))
length(freq)
ord <- order(freq)

# The DTM is large - we can reduce it by removing "sparse" words
# The following command creates a DTM that is a maximum
# of 10% empty space
dtms <- removeSparseTerms(dtm, 0.1)
inspect(dtms)

# Here are the least frequently occurring terms ...
freq[head(ord)]

# ..and the most frequently occurring terms
freq[tail(ord)]

# Here is a table of the frequencies ...
head(table(freq), 20)

# There are 1514 words that appear only once, etc.
# We can also do
tail(table(freq), 20)

# This is the other side - there is one word that appears 126 
# times
# Now let's look at the less sparse DTM, DTMS
freq <- colSums(as.matrix(dtms))
freq

# We can also do something like this ...
findFreqTerms(dtm, lowfreq=20)

# These are words that appear 20 or more times
# One more approach ...
wf <- data.frame(word=names(freq), freq=freq)
head(wf)

# Attach the package "ggplot2"
# Let's plot the words that appear at least 20 times
p <- ggplot(subset(wf, freq>20), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p

# One thing we may want to do is see how often terms
# appear together
findAssocs(dtm, c("piano" , "cartoon"), corlimit=0.98)

# Word Clouds can be created using the
# wordcloud package
# Let's try an H-Cluster on the corpus ...
d <- dist(t(dtms), method="euclidian")
fit <- hclust(d=d, method="ward.D2")
fit
plot(fit)

# Notice in this hclust, we used the tranpose of dtms
# So we actually clustered the terms. Can we cluster
# the docs?
d <- dist(dtms, method="euclidian")
fit <- hclust(d=d, method="ward.D2")
fit
plot(fit)