-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentences.py
159 lines (113 loc) · 5.72 KB
/
sentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# Gets example sentences from the internet
import codecs
import json
import logging
import os
import re
import shutil
import tempfile
import unittest
import urllib.parse

import requests

import osutils
class CachedDownloader:
    """Fetches URLs over HTTP, caching each response body on disk.

    Repeated requests for the same URL are served from ``cache_folder``
    instead of hitting the network again. ``download_count`` counts the
    real HTTP requests made (the unit tests use it to verify cache hits).
    """

    def __init__(self, cache_folder):
        # Folder holding one "<url-as-filename>.txt" file per cached URL.
        self.cache_folder = cache_folder
        # Number of actual network requests performed (i.e. cache misses).
        self.download_count = 0
        osutils.ensure_dir(self.cache_folder)

    def download_with_cache(self, url):
        """Return the body of ``url`` as text, downloading it at most once.

        :param url: absolute URL to fetch
        :return: response body decoded as text (from the on-disk cache when present)
        """
        cache_file = os.path.join(self.cache_folder, osutils.make_valid_filename(url) + ".txt")

        # Cache hit: serve the previously saved copy.
        if os.path.exists(cache_file):
            with codecs.open(cache_file, 'r', 'utf-8') as file:
                return file.read()

        # Cache miss: download, then persist for next time.
        logging.info("Requesting content from '%s'", url)
        response = requests.get(url)
        self.download_count += 1
        # Use response.text rather than content.decode(response.encoding):
        # when the server declares no charset, response.encoding is None and
        # bytes.decode(encoding=None) raises TypeError; response.text falls
        # back to an apparent encoding instead.
        response_as_text = response.text
        with codecs.open(cache_file, 'w', 'utf-8') as file:
            file.write(response_as_text)
        logging.info("Saved content to '%s'", cache_file)
        return response_as_text
class SentenceDownloader:
    """Scrapes English/Chinese example-sentence pairs for a word from
    online dictionaries (Bing dict and iCIBA)."""

    _BING_SENTENCES_BASE_URL = "http://cn.bing.com/dict/search"
    _ICIBA_BASE_URL = "http://dj.iciba.com"

    # iCIBA prefixes each English sentence with "1. ", "12. ", etc.;
    # group 1 captures the sentence text without that number.
    _ICIBA_NUMBER_RE = re.compile(r"^\d{1,2}\.\s(.*)$")

    def __init__(self, downloader):
        # Any object exposing download_with_cache(url) -> html text
        # (normally a CachedDownloader).
        self.downloader = downloader

    def _parse_sentences_bing(self, dom):
        """Extract sentence pairs from a parsed Bing dictionary page.

        :param dom: lxml element tree of the result page
        :return: list of {"english": ..., "chinese": ...} dicts
        """
        sentences = []
        for item in dom.xpath('//*[@class="se_li"]'):
            english = "".join(item.xpath('.//*[@class="sen_en"]//text()'))
            chinese = "".join(item.xpath('.//*[@class="sen_cn"]//text()'))
            sentences.append({"english": english, "chinese": chinese})
        return sentences

    def _parse_sentences_iciba(self, dom):
        """Extract sentence pairs from a parsed iCIBA page, stripping the
        leading sentence numbers iCIBA embeds in the English text.

        :param dom: lxml element tree of the result page
        :return: list of {"english": ..., "chinese": ...} dicts
        """
        # Can't use tokenize a la http://stackoverflow.com/questions/20692303/xslt-select-elements-that-have-multiple-class-values-based-off-of-1-of-those-v
        # Get an unregistered function exception
        sentences = []
        for item in dom.xpath('//*[@class="dj_li"]'):
            raw_english = "".join(item.xpath('.//*[@class="stc_en_txt font_arial"]//text()')).strip()
            matches = SentenceDownloader._ICIBA_NUMBER_RE.search(raw_english)
            # Fall back to the raw text when the "N. " prefix is absent,
            # instead of crashing with AttributeError on a None match.
            english = matches.group(1) if matches else raw_english
            chinese = "".join(item.xpath('.//*[@class="stc_cn_txt"]//text()')).strip()
            sentences.append({"english": english, "chinese": chinese})
        return sentences

    def get_sentences_bing(self, word):
        """Download example sentences for ``word`` from Bing's dictionary.

        :param word: hanzi of the word to search for
        :return: list of {"english": ..., "chinese": ...} dicts
        """
        from lxml import html
        # TODO: urllib.parse.urlencode seems to order its parameters however
        # TODO: it feels like it, which messes with the caching logic. So, yeah
        query_string = urllib.parse.urlencode({"q": word})
        url = SentenceDownloader._BING_SENTENCES_BASE_URL + "?" + query_string + "&mkt=zh-CN"
        dom = html.fromstring(self.downloader.download_with_cache(url))
        return self._parse_sentences_bing(dom)

    def get_sentences_iciba(self, word):
        """Download example sentences for ``word`` from iCIBA.

        :param word: hanzi of the word to search for
        :return: list of {"english": ..., "chinese": ...} dicts
        """
        from lxml import html
        url = SentenceDownloader._ICIBA_BASE_URL + "/" + word + "-1.html"
        dom = html.fromstring(self.downloader.download_with_cache(url))
        return self._parse_sentences_iciba(dom)
class UnitTests(unittest.TestCase):
    """Integration tests: on a cache miss these hit the live Bing/iCIBA sites."""

    # Use the platform temp directory instead of expanding the Windows-only
    # "$Temp" environment variable (which does not expand on POSIX, leaving
    # a literal "$Temp\..." path), so the tests run on any OS.
    _TEMP_SENTENCE_CACHE_FOLDER = os.path.join(
        tempfile.gettempdir(), "sentences_py_unittest_cache")

    def setUp(self):
        # Start every test with an empty cache so download_count is predictable.
        osutils.clear_dir(UnitTests._TEMP_SENTENCE_CACHE_FOLDER)

    def tearDown(self):
        pass

    def test_bing_download(self):
        """Should download sentences for a given word, and use cache"""
        downloader = CachedDownloader(UnitTests._TEMP_SENTENCE_CACHE_FOLDER)
        me = SentenceDownloader(downloader)
        test_word = u"\u4fdd\u62a4"  # bao3hu4, protect
        sentences = me.get_sentences_bing(test_word)
        self.assertEqual(len(sentences), 10, "sentence count")
        self.assertTrue(sentences[0]['chinese'].find(test_word) >= 0, "downloaded sentence contains word")
        # Second fetch of the same word must be served from the cache.
        sentences2 = me.get_sentences_bing(test_word)
        self.assertEqual(1, downloader.download_count, "download count")
        self.assertEqual(json.dumps(sentences), json.dumps(sentences2), "sentences downloaded should be identical")

    def test_iciba_download(self):
        """Should download sentences for a given word, and use cache"""
        downloader = CachedDownloader(UnitTests._TEMP_SENTENCE_CACHE_FOLDER)
        me = SentenceDownloader(downloader)
        test_word = u"\u4fdd\u62a4"  # bao3hu4, protect
        sentences = me.get_sentences_iciba(test_word)
        self.assertEqual(len(sentences), 10, "sentence count")
        self.assertTrue(sentences[0]['chinese'].find(test_word) >= 0, "downloaded sentence contains word")
        self.assertTrue(
            sentences[0]['english'].find("protection") >= 0 or
            sentences[0]['english'].find("cover") >= 0 or
            sentences[0]['english'].find("protect") >= 0,
            "downloaded sentence contains English word")
        # The parser should have stripped whitespace and the "N." prefix.
        first_char = sentences[0]['chinese'][0]
        self.assertTrue(first_char != '\t' and first_char != '\r')
        self.assertTrue(first_char != "1")
        # Second fetch of the same word must be served from the cache.
        sentences2 = me.get_sentences_iciba(test_word)
        self.assertEqual(1, downloader.download_count, "download count")
        self.assertEqual(json.dumps(sentences), json.dumps(sentences2), "sentences downloaded should be identical")