-
Notifications
You must be signed in to change notification settings - Fork 0
/
predict.r
119 lines (98 loc) · 5.25 KB
/
predict.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
predict.writer <- function(original, writers, char.dict, word.dict) {
if (nchar(original) == 0) {
writers$probability <- .1
writers$likelihood <- 0
writers$normalized <- .1
return(writers)
}
# Turn probabilities into logs
writers$probability <- log(writers$probability)
# Make copy of original text
text <- original
# Modify text to consolidate similar characters
text <- gsub("[a-z]", "a", text) # Make all lowercase letters a's
text <- gsub("[A-Z]", "A", text) # Make all uppercase letters A's
text <- gsub("\\d+", "0", text) # Make all numbers 0's
text <- gsub("\n", " ", text) # Change new lines to spaces
text <- gsub("–", "-", text) # Change en dashes to hyphens
text <- gsub("—", "-", text) # Change em dashes to hyphens
text <- gsub("'", "‘", text) # Change ugly apostrophe to open apostrophe
text <- gsub("’", "‘", text) # Change close apostrophe to open apostrophe
text <- gsub("″", "“", text) # Change ugly quote to open quote
text <- gsub("”", "“", text) # Change close quote to open quote
text <- gsub(")", "(", text) # Change close parenthesis to open parenthesis
text <- gsub("]", "[", text) # Change close bracket to open bracket
text <- gsub("}", "{", text) # Change close curly brace to open curly brace
text <- gsub(">", "<", text) # Change greater than to less than
text <- gsub("\\", "*", text, fixed = TRUE) # Change forward slash to backslash
text <- gsub("&", "*", text) # Change ampersand to asterisk
text <- gsub("#", "*", text) # Change pound to asterisk
text <- gsub("%", "*", text) # Change percent to asterisk
text <- gsub("`", "*", text) # Change weird thing to asterisk
text <- gsub("^", "*", text, fixed = TRUE) # Change caret to asterisk
text <- gsub("+", "*", text, fixed = TRUE) # Change plus to asterisk
text <- gsub("|", "*", text, fixed = TRUE) # Change horizontal line to asterisk
text <- gsub("<", "*", text) # Change less than to asterisk
text <- gsub("=", "*", text) # Change equals to asterisk
text <- gsub("@", "*", text) # Change at to asterisk
text <- gsub("$", "*", text, fixed = TRUE) # Change dollar to asterisk
text <- gsub("[", "*", text, fixed = TRUE) # Change open bracket to asterisk
text <- gsub("{", "*", text, fixed = TRUE) # Change open curly brace to asterisk
# Split text into characters
text <- strsplit(text, split = "")[[1]]
# Rotate through characters and calculate probability
for (char.index in 1:length(text)) {
char <- text[char.index]
row <- which(char.dict$char == char)
# Skip character if doesn't exist for any writer
if (length(row) > 0) {
# Calculate likelihoods for each writer
for (writer.index in 1:nrow(writers)) {
column <- which(colnames(char.dict) == writers[writer.index, 1])
writers[writer.index, 3] <- char.dict[row, column] / sum(char.dict[, column])
}
# If likelihood = 0, just make it a bit smaller than the smallest one
writers[writers$likelihood == 0, 3] <- min(writers[writers$likelihood > 0, 3]) * .95
# Calculate posterior
writers$probability <- writers$probability + log(writers$likelihood)
}
}
# Reload text
text <- original
# Modify text to consolidate similar characters
text <- gsub("'", "", text) # Change ugly apostrophe to blank
text <- gsub("’", "", text) # Change close apostrophe to blank
text <- gsub("‘", "", text) # Change open apostrophe to blank
text <- gsub("[^a-zA-Z]", " ", text) # Remove everything but letters
# Split text into words
text <- strsplit(text, split = " ")[[1]]
# Rotate through words and calculate probability
for (word.index in 1:length(text)) {
word <- text[word.index]
row <- which(word.dict$len == nchar(word))
# Skip word if length doesn't exist for any writer
if (length(row) > 0) {
# Calculate likelihoods for each writer
for (writer.index in 1:nrow(writers)) {
column <- which(colnames(word.dict) == writers[writer.index, 1])
writers[writer.index, 3] <- word.dict[row, column] / sum(word.dict[, column])
}
# If likelihood = 0, just make it a bit smaller than the smallest one
writers[writers$likelihood == 0, 3] <- min(writers[writers$likelihood > 0, 3]) * .95
# Calculate posterior
writers$probability <- writers$probability + log(writers$likelihood)
}
}
# Normalize probabilities for each writer
# Use ratios, initialize first one as 1 and calculate other values
writers[1, 4] <- 1
for (index in 2:nrow(writers)) {
writers[index, 4] <- exp(writers[index, 2] - writers[1, 2])
}
# Make probability hold updated probabilities
# Make log likelihood and normalized back to 0
writers$normalized <- writers$normalized / sum(writers$normalized)
writers$probability <- .1
writers$likelihood <- 0
return(writers)
}