-
Notifications
You must be signed in to change notification settings - Fork 0
/
Apriori.R
73 lines (53 loc) · 2.61 KB
/
Apriori.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Association Rules Mining ----
#
# Packages this script depends on:
#   arules    - transactions class and apriori()
#   arulesViz - methods for visualizing the rulesets generated by apriori()
#   readr     - CSV reader (read_csv())
required_packages <- c("arules", "arulesViz", "readr")

# Install only the packages that are not available yet, so that re-running
# the script does not re-download and reinstall everything each time.
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(arules)
library(arulesViz)
library(readr)
# Show the working directory: the CSV below is read via a relative path, so
# this makes a "file not found" failure easy to diagnose.
getwd()

# Reading Neethu's cleansed/prepared data
sample1 <- read_csv("unmatched_clean.csv")
summary(sample1)
str(sample1)

# Columns that take part in the association rules mining.
mining_cols <- c(
  "oclcnum", "oclcnums", "title", "publisher", "url", "author", "content",
  "coverage", "openaccess", "holdings_regid", "unique_identifier",
  "linkingkey"
)

# Converting the character/string columns to factors, as we have
# present/absent categorical vars in most cases. The identifier columns are
# converted as well, although they are not part of the mining data set.
factor_cols <- c("id", "provider_uid", mining_cols)
sample1[factor_cols] <- lapply(sample1[factor_cols], as.factor)

# Putting the needed columns into a new data frame of variables that are now
# factors
sampleMiningDS <- sample1[, mining_cols]
summary(sampleMiningDS)
str(sampleMiningDS)
# Preparing the data set for apriori().
# In order to use the apriori() based association mining method, the data
# needs to have the rows as containers and the columns as items. Coercing the
# factor columns to "transactions" lets the association rules mining examine
# each row as a "shopping cart".
# (The previous intermediate as.matrix() copy was never used and has been
# dropped; the transactions are built directly from the data frame.)
sampleMiningMatrix <- as(sampleMiningDS, "transactions")

# Look at the transactions to validate that the coercion has completed
# successfully. View() needs an interactive session with a data viewer, so it
# is skipped when the script runs non-interactively (e.g. via Rscript).
if (interactive()) {
  View(sampleMiningMatrix)
}
# Invoking the data mining function apriori() on the transactions in order to
# find association rules. apriori() uses the Apriori algorithm, which performs
# a level-wise search for frequent itemsets. Only rules with support >= 0.80
# and confidence >= 0.5 are kept.
ruleset <- apriori(
  sampleMiningMatrix,
  parameter = list(support = 0.80, confidence = 0.5)
)
rules.sorted <- sort(ruleset, by = "support")

# Filtering the ruleset to focus on what factors may have caused the ocn to be
# absent: keep the rules whose right-hand side partially matches the item
# label "oclcnum=match_absent".
rules.ocn_absent_rhs <- subset(
  rules.sorted,
  subset = rhs %pin% "oclcnum=match_absent"
)

# With such a high support threshold the filtered ruleset may well be empty,
# in which case there is nothing to list or draw; report that instead of
# handing an empty ruleset to plot().
if (length(rules.ocn_absent_rhs) > 0) {
  # view the rules
  inspect(rules.ocn_absent_rhs)
  # scatter plot of support versus confidence
  plot(rules.ocn_absent_rhs)
} else {
  message(
    "No rules with 'oclcnum=match_absent' on the right-hand side were found ",
    "at support >= 0.80 and confidence >= 0.5."
  )
}