-
Notifications
You must be signed in to change notification settings - Fork 13
/
extract.py
88 lines (76 loc) · 4.16 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import html2text
# import jieba.analyse
# Shared html2text converter; extract() calls h.handle() on each page's HTML
# to obtain the Markdown text it then cleans up.
h = html2text.HTML2Text()
# NOTE(review): going by the html2text option names, these two flags make the
# converter leave out link and emphasis markup - confirm against the
# html2text documentation.
h.ignore_links = True
h.ignore_emphasis = True
# Line prefixes that identify forum chrome (navigation, toolbars, footers)
# rather than post content. Any converted line starting with one is dropped.
_NOISE_PREFIXES = (
    'Processed in', 'CopyRight', '浏览器支持说明', '查看详细资料', '查看个人网站',
    '顾问:', '编辑:', '技术:', '内容', '##### 标题', '#### 快速回复主题',
    ' * 发布投票', ' * 发新话题',
    '![](images/default/reply.gif)', ' * 多功能小黑屋', '收藏 订阅 推荐 打印', '. ',
    '![](http://messenger.services.live.com/', 'cn)',
    '![QQ](images/default/qq.gif)',
    '> ![](http://www.1984bbs.com/images/common/back.gif)',
    '> ![](http://doubans.com/images/common/back.gif)',
    '站务', ' * Live!大讲堂', ' * 罗马假日公寓', ' * 国家局域网研究所',
    ' * 开放社会资料室', ' * 雅典学院',
    ' * 自由新闻社', '沙龙', '发表话题[完成后可按', '‹‹ 上一主题', '引用 回复 TOP',
    '|', '---|',
)

# File extensions of the saved pages; removed from the END of the title only.
_PAGE_SUFFIXES = ('.php', '.htm', '.mht')

# Number of leading lines of every converted page that are discarded.
# NOTE(review): presumably the forum page header - confirm on a sample page.
_HEADER_LINES = 32

# Replacement for the forum's own "1984bbs是一个非营利项目" footer line.
_FOOTER = ('---' + '\n\n'
           + '[Terminusbot](https://github.com/TerminusBot) 整理,讨论请前往 [2049bbs.xyz](http://2049bbs.xyz/)'
           + '\n\n' + '---' + '\n')


def _split_name(filename):
    """Return ``(date, title)`` from a ``'<date> <title><ext>'`` file name.

    Returns ``None`` when the name contains no space, i.e. there is no
    title part to read.
    """
    parts = filename.split(' ')
    if len(parts) < 2:
        return None
    # Only the second space-separated token is used as the title.
    # NOTE(review): titles that themselves contain spaces are cut at the
    # first one - confirm this matches the naming of the saved pages.
    date, title = parts[0], parts[1]
    # Remove the extension as a suffix. str.strip('.php') would instead strip
    # any of the characters '.', 'p', 'h' from BOTH ends of the title.
    for suffix in _PAGE_SUFFIXES:
        if title.endswith(suffix):
            title = title[:-len(suffix)]
            break
    title = title.replace('[', '【').replace(']', '】')
    return date, title


def _clean_lines(lines):
    """Drop forum chrome from converted Markdown lines and tidy the rest."""
    cleaned = []
    for line in lines:
        if line.startswith(_NOISE_PREFIXES):
            continue
        if line.startswith('##'):
            # Turn headings into plain text by removing every '#'.
            line = line.replace('#', '').lstrip()
        if line.startswith('1984bbs是一个非营利项目'):
            line = _FOOTER
        line = line.replace('|', ' ')
        cleaned.append(line)
        # NOTE(review): a blank line is added after every kept line; confirm
        # this per-line placement matches the intended output spacing.
        cleaned.append('\n')
    return cleaned


def extract(input_folder: str, middle_folder: str, output_folder: str,
            category: str) -> None:
    """Convert saved forum pages into Markdown posts with Jekyll front matter.

    Every entry of ``input_folder`` whose name ends in ``.htm`` / ``.mht`` or
    starts with ``20`` is read as GBK text and converted with the module-level
    html2text converter ``h``. The raw conversion is written to
    ``middle_folder``; a cleaned copy, prefixed with front matter (layout,
    date, title, categories), is written to ``output_folder``. Both output
    files are named ``<date>-<title>.md`` and are written as UTF-8. No
    ``tags`` field is written.

    Args:
        input_folder: Folder holding the saved pages, named ``<date> <title><ext>``.
        middle_folder: Folder receiving the unfiltered Markdown conversion.
        output_folder: Folder receiving the cleaned posts.
        category: Value written to the ``categories`` front-matter field.

    Matching entries whose name has no space are reported and skipped. The
    middle and output folders are created when missing.
    """
    source_dir = os.path.abspath(input_folder)
    middle_dir = os.path.abspath(middle_folder)
    output_dir = os.path.abspath(output_folder)

    entries = os.listdir(source_dir)
    os.makedirs(middle_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # Progress counter, printed for every directory entry.
    # NOTE(review): it advances only for converted pages - confirm intent.
    n = 1
    for entry in entries:
        print(n)
        if not (entry.endswith(('.htm', '.mht')) or entry.startswith('20')):
            continue

        parsed = _split_name(entry)
        if parsed is None:
            print('skipped, no "<date> <title>" in name: ' + entry)
            continue
        date, title = parsed
        md_name = date + '-' + title + '.md'
        print(md_name)

        with open(os.path.join(source_dir, entry), 'r', encoding='GBK') as page:
            content = h.handle(page.read())

        # Keep the unfiltered conversion in the middle folder, then read it
        # back line by line for cleaning. Explicit UTF-8 keeps the output
        # independent of the platform's locale encoding.
        middle_path = os.path.join(middle_dir, md_name)
        with open(middle_path, 'w', encoding='utf-8') as raw_md:
            raw_md.write(content)
        with open(middle_path, 'r', encoding='utf-8') as raw_md:
            body = _clean_lines(raw_md.readlines()[_HEADER_LINES:])

        # Write the post in the layout expected by Jekyll: front matter
        # between two '---' lines, followed by the cleaned body.
        with open(os.path.join(output_dir, md_name), 'w', encoding='utf-8') as post:
            post.write('---' + '\n')
            post.write('layout: default' + '\n\n')
            post.write("date: " + date + '\n\n')
            post.write("title: " + title + '\n\n')
            post.write("categories: " + category + '\n\n')
            post.write('---' + '\n\n')
            post.writelines(body)
        n += 1
# Run the conversion once per forum board. Each tuple holds the arguments of
# one extract() call: (input folder, middle folder, output folder, category).
for _board_args in (
    ('sbb4891/1自由新闻社', 'sbb4891/1', 'sbb4891/11', '自由新闻社'),
    ('sbb4891/2雅典学院', 'sbb4891/2', 'sbb4891/22', '雅典学院'),
    ('sbb4891/3开放社会资料室', 'sbb4891/3', 'sbb4891/33', '开放社会资料室'),
    ('sbb4891/4国家局域网研究所', 'sbb4891/4', 'sbb4891/44', '国家局域网研究所'),
    ('sbb4891/5罗马假日公寓', 'sbb4891/5', 'sbb4891/55', '罗马假日公寓'),
    # Disabled board: ('sbb4891/大讲堂', 'sbb4891/6', 'sbb4891/66', '大讲堂'),
):
    extract(*_board_args)