-
Notifications
You must be signed in to change notification settings - Fork 0
/
tut1.py
36 lines (29 loc) · 1.41 KB
/
tut1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import polyglotdb.io as pgio
from polyglotdb import CorpusContext
# Tutorial corpus settings. These point at the LibriSpeech-aligned subset; to
# run against the full aligned corpus instead, use
# './data/LibriSpeech-aligned/' as the root and 'tutorial' as the name.
corpus_name = 'tutorial-subset'
corpus_root = './data/LibriSpeech-aligned-subset/'

# Build a parser by inspecting the corpus directory with inspect_mfa, and
# register print as the parser's call_back.
parser = pgio.inspect_mfa(corpus_root)
parser.call_back = print
# Import (load) the corpus into pgdb. This only has to be done once per
# corpus: if this step fails with the error
# "The discourse ... already exists in this corpus", the corpus has already
# been loaded and this `with` block can be commented out or deleted.
with CorpusContext(corpus_name) as corpus:
    corpus.load(parser, corpus_root)
# Simple queries: print the corpus speakers and discourses, then query the
# lexicon for phone labels ordered by label.
with CorpusContext(corpus_name) as corpus:
    print('Speakers:', corpus.speakers)
    print('Discourses:', corpus.discourses)

    # Lexicon query over phones; the label column is output under the
    # name 'phone'.
    phone_query = (
        corpus.query_lexicon(corpus.lexicon_phone)
        .order_by(corpus.lexicon_phone.label)
        .columns(corpus.lexicon_phone.label.column_name('phone'))
    )
    results = phone_query.all()
    print(results)
from polyglotdb.query.base.func import Count, Average
with CorpusContext(corpus_name) as corpus:
    # Group phones by label. The order_by step is optional: it enforces an
    # ordering on the output for easier comparison with the sample output.
    phone_query = corpus.query_graph(corpus.phone)
    phone_query = phone_query.order_by(corpus.phone.label)
    phone_query = phone_query.group_by(corpus.phone.label.column_name('phone'))

    # Aggregate columns computed for each phone group.
    occurrences = Count().column_name('count')
    mean_duration = Average(corpus.phone.duration).column_name('average_duration')
    results = phone_query.aggregate(occurrences, mean_duration)

    report = 'The phone {} had {} occurrences and an average duration of {}.'
    for row in results:
        print(report.format(row['phone'], row['count'], row['average_duration']))