forked from miaopei/ffmpeg-webrtc
-
Notifications
You must be signed in to change notification settings - Fork 2
/
blog2markdown.py
202 lines (169 loc) · 5.89 KB
/
blog2markdown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
'''
A script that converts blog posts to markdown.

Currently supports Jianshu, Zhihu, CSDN, SegmentFault and Juejin.
Usage: python html2md.py -u <url>

No longer maintained, because blog sites keep changing their page
rendering and anti-crawling techniques.

Basic idea: locate the article body in the page, extract its HTML with
BeautifulSoup, then convert it to markdown (via tomd.py, per the original
author's note — the code below actually uses html2text).
'''
import os
import sys
import getopt
import requests
import random
import re
import html2text
from bs4 import BeautifulSoup
# Pool of desktop browser User-Agent strings; each request picks one at
# random (via random.choice) to look less like an automated client.
useragents = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
]
def jinashu(url):
    """Download a Jianshu article at *url* and save it as markdown under ./jianshu/."""
    request_headers = {
        'Host': 'www.jianshu.com',
        'Referer': 'https://www.jianshu.com/',
        'User-Agent': random.choice(useragents),
    }
    # Fetch the raw page and parse it.
    page = requests.get(url, headers=request_headers).text
    doc = BeautifulSoup(page, "html5lib")
    page_title = doc.find_all("title")[0].get_text()
    body_html = str(doc.find_all("div", class_="show-content")[0])
    # Prefix image src attributes with "https:" so the images stay reachable.
    body_html = re.sub('(src=")|(data-original-src=")', 'src="https:', body_html)
    # Write the result below the current working directory.
    target_dir = os.getcwd() + '/jianshu/'
    write2md(target_dir, page_title, body_html)
def csdn(url):
    """Download a CSDN blog post at *url* and save it as markdown under ./CSDN/."""
    request_headers = {
        'Host': 'blog.csdn.net',
        'Referer': 'http://blog.csdn.net/',
        'User-Agent': random.choice(useragents),
    }
    # Fetch and parse the page.
    page = requests.get(url, headers=request_headers).text
    doc = BeautifulSoup(page, 'html5lib')
    page_title = doc.find_all('title')[0].get_text()
    # CSDN wraps the post body in an <article> element.
    body_html = str(doc.find_all('article')[0])
    # Write the result below the current working directory.
    target_dir = os.getcwd() + '/CSDN/'
    write2md(target_dir, page_title, body_html)
def zhihu(url):
    """Download a Zhihu column article at *url* and save it as markdown under ./ZhiHu/."""
    request_headers = {
        'Host': 'zhuanlan.zhihu.com',
        'Referer': 'https://www.zhihu.com/',
        'User-Agent': random.choice(useragents),
    }
    # Fetch and parse the page.
    page = requests.get(url, headers=request_headers).text
    doc = BeautifulSoup(page, 'html5lib')
    page_title = doc.find_all('title')[0].get_text()
    # Zhihu renders the article body inside div.Post-RichText.
    body_html = str(doc.find_all('div', class_='Post-RichText')[0])
    # Write the result below the current working directory.
    target_dir = os.getcwd() + '/ZhiHu/'
    write2md(target_dir, page_title, body_html)
def segmentfault(url):
    """Download a SegmentFault article at *url* and save it as markdown under ./segmentfault/."""
    request_headers = {
        'Referer': 'https://segmentfault.com/',
        'User-Agent': random.choice(useragents),
    }
    # Fetch and parse the page.
    page = requests.get(url, headers=request_headers).text
    doc = BeautifulSoup(page, 'html5lib')
    page_title = doc.find('title').text
    body_html = str(doc.find(class_='article__content'))
    # Rewrite lazy-loaded data-src attributes into absolute src URLs so
    # images load in the converted markdown.
    body_html = re.sub('data-src="', 'src="https://segmentfault.com', body_html)
    print(body_html)
    # Write the result below the current working directory.
    target_dir = os.getcwd() + '/segmentfault/'
    write2md(target_dir, page_title, body_html)
def juejin(url):
    """Download a Juejin post at *url* and save it as markdown under ./juejin/.

    Fetches the page with a site-appropriate Host/Referer, extracts the
    element with class ``post-content-container`` and hands the HTML to
    :func:`write2md`.
    """
    headers = {
        'Host': 'juejin.im',
        'Referer': 'https://juejin.im/',
        'User-Agent': random.choice(useragents),
    }
    res = requests.get(url=url, headers=headers).text  # whole HTML page
    soup = BeautifulSoup(res, 'html5lib')
    title = soup.find('title').text
    article = str(soup.find(class_='post-content-container'))
    # Save below the current working directory.
    pwd = os.getcwd()
    # BUG FIX: this previously wrote into '/segmentfault/' (copy-paste from
    # the segmentfault() downloader); Juejin posts belong in their own dir.
    dirpath = pwd + '/juejin/'
    write2md(dirpath, title, article)
def doelse(url):
    """Fallback downloader: convert the whole <body> of *url* to markdown under ./Else/."""
    request_headers = {'User-Agent': random.choice(useragents)}
    response = requests.get(url=url, headers=request_headers)
    # Convert the full page body ourselves, since there is no site-specific
    # selector to narrow the content down.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    doc = BeautifulSoup(response.text, 'html5lib')
    page_title = doc.title.text
    markdown = converter.handle(str(doc.body))
    target_dir = os.getcwd() + '/Else/'
    # Create the output directory on first use.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    write2md(target_dir, page_title, markdown)
"""
传入文件路径,title,article
"""
def write2md(dirpath,title,article):
## 创建转换器
h2md = html2text.HTML2Text()
h2md.ignore_links = False
## 转换文档
article = h2md.handle(article)
## 写入文件
if not os.path.exists(dirpath):# 判断目录是否存在,不存在则创建新的目录
os.makedirs(dirpath)
# 创建md文件
with open(dirpath+title+'.md','w',encoding="utf8") as f:
lines = article.splitlines()
for line in lines:
if line.endswith('-'):
f.write(line)
else:
f.write(line+"\n")
print(title+"下载完成....")
def main(argv):
    """Parse command-line options and dispatch the URL to a downloader.

    Usage: python html2md.py -u <url>

    Exits with status 2 on a bad option (previously it printed usage but
    then iterated the undefined ``opts`` and crashed with NameError), and
    with status 0 after -h (help is not an error).
    """
    usage = "python html2md.py -u <url>"
    try:
        # "url=" (trailing '=') declares that the long option takes a value;
        # the original "url" silently made --url value-less.
        opts, args = getopt.getopt(argv, "hu:", ["url="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == "-h":
            print(usage)
            sys.exit(0)
        elif opt in ("-u", "--url"):  # was "-url", which getopt never produces
            checkSite(arg)
        else:
            print(usage)
## Check the site and pick which downloader to use.
def checkSite(url):
    """Dispatch *url* to the matching site-specific downloader.

    Falls back to doelse() for unrecognized sites. Matching is a simple
    substring test on the URL.
    """
    if url.find('csdn') != -1:
        csdn(url)
    elif url.find('jianshu') != -1:
        jinashu(url)
    elif url.find('zhihu') != -1:
        zhihu(url)
    elif url.find('segmentfault') != -1:
        segmentfault(url)
    elif url.find('juejin') != -1:
        # BUG FIX: juejin() was defined but never dispatched, so Juejin URLs
        # silently fell through to the generic doelse() handler.
        juejin(url)
    else:
        doelse(url)
if __name__ == "__main__":
main(sys.argv[1:])