-
Notifications
You must be signed in to change notification settings - Fork 1
/
markymarkov.py
102 lines (83 loc) · 2.84 KB
/
markymarkov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import sys
import collections
import random
import bisect
class MarkyMarkov(object):
    """Character-level Markov chain that learns from words and generates new ones.

    For every run of ``order`` consecutive characters seen in the training
    words, the model counts how often each character follows it.  Each word
    is padded on the left with ``order`` spaces (the start state) and
    terminated with ``end``, so word beginnings and endings are learned too.
    """

    # Sentinel character appended to every word to mark where it stops.
    end = '\n'

    def __init__(self, order, allow_duplicates=True):
        """Create an empty model.

        Args:
            order: Number of preceding characters that make up a state.
            allow_duplicates: When false, ``add_word`` ignores a word that
                has already been added instead of counting it again.
        """
        self.order = order
        # state -> Counter of {next character: number of occurrences}
        self.counts = collections.defaultdict(collections.Counter)
        # state -> True when counts changed since the state's CDF was built
        self.dirty = {}
        # state -> running totals of the counts, parallel to state_list
        self.cdf = {}
        # state -> total number of observations for that state
        self.cumulative_sum = {}
        # state -> candidate next characters, parallel to cdf
        self.state_list = {}
        # every distinct word added so far
        self.words = set()
        self.allow_duplicates = allow_duplicates

    def add_word(self, word):
        """Record the character transitions of ``word`` in the model.

        Does nothing when duplicates are disallowed and ``word`` was
        already added.  Always returns ``None``.
        """
        if not self.allow_duplicates and word in self.words:
            return None
        padded = ' ' * self.order + word + self.end
        for index in range(self.order, len(padded)):
            key = padded[index - self.order:index]
            self.counts[key][padded[index]] += 1
            # The cached CDF for this state is now stale.
            self.dirty[key] = True
        self.words.add(word)
        return None

    def _create_cdf(self, key):
        """Rebuild the cumulative count table used to sample from state ``key``."""
        states = []
        running_totals = []
        total = 0
        for state, count in self.counts[key].items():
            total += count
            states.append(state)
            running_totals.append(total)
        self.state_list[key] = states
        self.cdf[key] = running_totals
        self.cumulative_sum[key] = total
        self.dirty[key] = False

    def choose(self, state):
        """Return a random next character for ``state``, weighted by its counts.

        Raises:
            KeyError: If ``state`` was never recorded by ``add_word``.
        """
        if self.dirty[state]:
            self._create_cdf(state)
        # random.random() is in [0, 1), so the threshold is below the final
        # running total and bisect always lands on a valid index.
        threshold = random.random() * self.cumulative_sum[state]
        index = bisect.bisect(self.cdf[state], threshold)
        return self.state_list[state][index]

    def generate_word(self):
        """Generate one word by walking the chain from the start state.

        Returns:
            The generated characters, without the padding or the end marker.

        Raises:
            KeyError: If the start state was never recorded, e.g. when no
                word has been added yet.
        """
        word = ' ' * self.order
        while True:
            # Slice from an explicit start index: for order 0, ``word[-0:]``
            # would be the whole word rather than the empty state that
            # add_word uses as the key.
            state = word[len(word) - self.order:]
            next_char = self.choose(state)
            if next_char == self.end:
                return word[self.order:]
            word += next_char

    @classmethod
    def from_file(cls, filename, order=1, allow_duplicates=False):
        """Build a model from a text file containing one word per line.

        Each line is stripped of surrounding whitespace and lower-cased;
        lines that are empty after stripping are skipped.

        Args:
            filename: Path of the file to read.
            order: Number of preceding characters that make up a state.
            allow_duplicates: Passed through to the constructor.

        Returns:
            A new instance of ``cls`` trained on the words in the file.
        """
        result = cls(order, allow_duplicates=allow_duplicates)
        with open(filename) as infile:
            for line in infile:
                stripped = line.strip().lower()
                if stripped:
                    result.add_word(stripped)
        return result
def ikea():
    """Print generated words that appear in neither input word list.

    Trains an order-4 model on the file named by ``sys.argv[1]`` and loads
    the words of the file named by ``sys.argv[2]``, then generates
    1,000,000 words and prints each one that is in neither word set.
    """
    model = MarkyMarkov.from_file(sys.argv[1], order=4)
    # Only the word set of this second model is consulted below.
    known = MarkyMarkov.from_file(sys.argv[2], order=0)
    for _ in range(1000000):
        candidate = model.generate_word()
        if candidate in model.words or candidate in known.words:
            continue
        print(candidate)
def generate():
    """Print ten generated words that appear in none of the input word lists.

    Trains an order-4 model on the file named by ``sys.argv[1]``.  When a
    second command-line argument is given, the words of that file are
    excluded from the output as well.  Generation repeats until ten words
    have been printed.
    """
    model = MarkyMarkov.from_file(sys.argv[1], order=4)
    if len(sys.argv) > 2:
        known = MarkyMarkov.from_file(sys.argv[2], order=0)
    else:
        # No reference list supplied: an empty model excludes nothing.
        known = MarkyMarkov(0)
    remaining = 10
    while remaining > 0:
        candidate = model.generate_word()
        if candidate in model.words or candidate in known.words:
            continue
        print(candidate)
        remaining -= 1
# Run the word generator only when executed as a script, not on import.
if __name__ == "__main__":
    generate()