# corpus.py -- forked from blei-lab/onlineldavb
import os
import re

# read and organize data
#
# Each line describes one document: the number of unique terms, followed by
# word_id:count pairs, e.g.
#   3 2:3 4:5 5:3 --- document info (word: count)

class document:
    ''' the class for a single document '''
    def __init__(self):
        self.words = []
        self.counts = []
        self.length = 0
        self.total = 0
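
# For example, the sample line "3 2:3 4:5 5:3" above parses into a document
# with words = [2, 4, 5], counts = [3, 5, 3], length = 3, and total = 11.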

class corpus:
    ''' the class for the whole corpus'''
    def __init__(self):
        self.size_vocab = 0
        self.docs = []
        self.num_docs = 0

    def read_data(self, filename):
        if not os.path.exists(filename):
            print('no data file, please check it')
            return
        print('reading data from %s.' % filename)
        for line in open(filename):
            ss = line.strip().split()
            if len(ss) == 0:
                continue
            doc = document()
            doc.length = int(ss[0])
            doc.words = [0 for w in range(doc.length)]
            doc.counts = [0 for w in range(doc.length)]
            # fill in the word_id:count pairs
            for w, pair in enumerate(re.finditer(r"(\d+):(\d+)", line)):
                doc.words[w] = int(pair.group(1))
                doc.counts[w] = int(pair.group(2))
            doc.total = sum(doc.counts)
            self.docs.append(doc)
            # track the vocabulary size as the largest word id seen so far
            if doc.length > 0:
                max_word = max(doc.words)
                if max_word >= self.size_vocab:
                    self.size_vocab = max_word + 1
            # cap the corpus at 10000 documents
            if len(self.docs) >= 10000:
                break
        self.num_docs = len(self.docs)
        print('finished reading %d docs.' % self.num_docs)

# def read_data(filename):
#     c = corpus()
#     c.read_data(filename)
#     return c
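
# A minimal usage sketch for the class interface above; 'docs.txt' is a
# hypothetical path to a file of lines in the format shown at the top.
#
#   c = corpus()
#   c.read_data('docs.txt')
#   print(c.num_docs, c.size_vocab, c.docs[0].words)
#
# Note that corpus.read_data stops after 10000 documents; the module-level
# read_data below reads the whole file.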

def read_stream_data(f, num_docs):
    ''' read at most num_docs documents from an already-open file f '''
    c = corpus()
    splitexp = re.compile(r'[ :]')
    for i in range(num_docs):
        line = f.readline()
        line = line.strip()
        if len(line) == 0:
            break
        d = document()
        splitline = [int(x) for x in splitexp.split(line)]
        wordids = splitline[1::2]
        wordcts = splitline[2::2]
        d.words = wordids
        d.counts = wordcts
        d.total = sum(d.counts)
        d.length = len(d.words)
        c.docs.append(d)

    c.num_docs = len(c.docs)
    return c
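
# A minimal streaming sketch, assuming 'docs.txt' is a hypothetical data file
# in the same format: repeatedly pull mini-batches from one open handle until
# read_stream_data returns an empty corpus.
#
#   with open('docs.txt') as f:
#       while True:
#           batch = read_stream_data(f, 256)
#           if batch.num_docs == 0:
#               break
#           # ... update the model with this mini-batch ...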

# This version is about 33% faster than corpus.read_data above.
def read_data(filename):
    ''' read an entire data file into a corpus '''
    c = corpus()
    splitexp = re.compile(r'[ :]')
    for line in open(filename):
        line = line.strip()
        if len(line) == 0:
            continue  # skip blank lines
        d = document()
        splitline = [int(x) for x in splitexp.split(line)]
        wordids = splitline[1::2]
        wordcts = splitline[2::2]
        d.words = wordids
        d.counts = wordcts
        d.total = sum(d.counts)
        d.length = len(d.words)
        c.docs.append(d)
        if d.length > 0:
            max_word = max(d.words)
            if max_word >= c.size_vocab:
                c.size_vocab = max_word + 1

    c.num_docs = len(c.docs)
    return c

def count_tokens(filename):
    ''' count the total number of tokens (sum of all counts) in a data file '''
    num_tokens = 0
    splitexp = re.compile(r'[ :]')
    for line in open(filename):
        line = line.strip()
        if len(line) == 0:
            continue  # skip blank lines
        splitline = [int(x) for x in splitexp.split(line)]
        wordcts = splitline[2::2]
        num_tokens += sum(wordcts)

    return num_tokens

splitexp = re.compile(r'[ :]')

def parse_line(line):
    ''' parse a single line in the format above into a document '''
    line = line.strip()
    d = document()
    splitline = [int(x) for x in splitexp.split(line)]
    wordids = splitline[1::2]
    wordcts = splitline[2::2]
    d.words = wordids
    d.counts = wordcts
    d.total = sum(d.counts)
    d.length = len(d.words)
    return d
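
# A small self-contained smoke test; the sample lines below are illustrative
# documents in the word_id:count format documented at the top of this file.
# It exercises parse_line, the module-level read_data, and count_tokens.
if __name__ == '__main__':
    import tempfile

    sample = "3 2:3 4:5 5:3\n2 0:1 7:2\n"

    d = parse_line("3 2:3 4:5 5:3")
    print('parse_line: words=%s counts=%s total=%d' % (d.words, d.counts, d.total))

    # write the sample documents to a temporary file and read them back
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as tmp:
        tmp.write(sample)
        path = tmp.name

    c = read_data(path)
    print('read_data: %d docs, vocab size %d' % (c.num_docs, c.size_vocab))
    print('count_tokens: %d tokens' % count_tokens(path))

    os.remove(path)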