#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 8 17:07:53 2017
@author: tic
"""
#%% get urls of posts
from bs4 import BeautifulSoup
import requests

url = "https://yunity.atlassian.net/wiki/pages/viewrecentblogposts.action?key=YUN"
urls = []
while url:
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    for post in soup.find_all('div', class_='blog-post-listing'):
        link = post.find('a', class_='blogHeading')
        urls.append(link.get('href'))
        print(link)
    # follow the month navigation to older posts; stop when the link is gone
    nav = soup.find('a', class_='blogmonthnavigation')
    url = 'https://yunity.atlassian.net' + nav.get('href') if nav else None
print(urls)
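#%% (optional) cache the collected urls
# Not part of the original script: a minimal sketch that saves the url list
# to disk so the later cells can be re-run without re-scraping the index.
# The filename is chosen here, not taken from the original.
import json
with open('old_heartbeat_urls.json', 'w') as f:
    json.dump(urls, f)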
#%% get posts
import os
import re

urls = list(set(urls))
os.mkdir('old_heartbeats')
for u in urls:
    r = requests.get('https://yunity.atlassian.net' + u)
    soup = BeautifulSoup(r.text, 'html.parser')
    title = soup.find(id='title-text').a.get_text()
    print(title)
    date = re.search(r'\d{4}-\d{2}-\d{2}', title).group()
    html = soup.find(id='main-content').div.prettify()
    os.mkdir(os.path.join('old_heartbeats', date))
    with open(os.path.join('old_heartbeats', date, 'source.html'), 'w+') as f:
        f.write('<h1>{}</h1>'.format(title))
        f.write(html)
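# The cell above leaves one directory per post, named by the ISO date found
# in the post title: old_heartbeats/<YYYY-MM-DD>/source.html.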
#%% turn each source.html into an item.md with YAML front matter
import os
from bs4 import BeautifulSoup

#os.chdir('user/pages/01.heartbeat')
for d in os.listdir():
    if d == 'blog.md':
        continue
    if 'item.md' in os.listdir(d):
        continue
    print(d)
    with open(os.path.join(d, 'source.html')) as f:
        soup = BeautifulSoup(f, 'html.parser')
    # pull the <h1> out of the body and use its plain text as the page title
    header = """---
title: "{}"
date: "{}"
continue_link: true
taxonomy:
    category: blog
    tag: [heartbeat]
published: false
---
""".format(soup.h1.extract().get_text().strip(), d)
    # absolutize wiki-relative links
    for url in soup.find_all('a', href=True):
        if not url['href'].startswith('http'):
            url['href'] = 'https://yunity.atlassian.net' + url['href']
    with open(os.path.join(d, 'item.md'), 'w+') as dest:
        dest.write(header)
        dest.write(soup.prettify())
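# The directory layout (user/pages/01.heartbeat/<date>/item.md plus a
# blog.md index) and the front matter above match the flat-file convention
# of the Grav CMS, which is presumably the import target; `published: false`
# keeps the imported posts hidden until they have been reviewed.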
#%% localize images: download them and rewrite the <img> tags
import os
from bs4 import BeautifulSoup
import subprocess

#os.chdir('user/pages/01.heartbeat')
for d in os.listdir():
    if d == 'blog.md':
        continue
    print(d)
    with open(os.path.join(d, 'item.md')) as f:
        # item.md is "---<yaml header>---<html body>"
        [_, header, body] = f.read().split("---", maxsplit=2)
    if 'class="wiki-content"' not in body:
        continue
    soup = BeautifulSoup(body, 'html.parser')
    for img in soup.find_all('img'):
        src = img.get('data-image-src')
        if not src:
            print('skipping', img)
            continue
        if not src.startswith('https://yunity.atlassian.net'):
            print('other source', img)
            continue
        filename = src.split('/')[-1].split('?')[0]
        if 'emoticons/' in src:
            # emoticons ship with the theme, no download needed
            img['src'] = '/user/themes/twentyfifteen/images/emoticons/' + filename
        else:
            if filename not in os.listdir(d):
                print('downloading', src)
                subprocess.run(['wget', src, '-O', filename], cwd=d)
            else:
                print('already exists', filename)
            img['src'] = d + '/' + filename
        del img['srcset']
        for k in [a for a in img.attrs if a.startswith('data-')]:
            del img[k]
    # write atomically: tmp file first, then rename over item.md
    with open(os.path.join(d, 'item.md.tmp'), 'w+') as dst:
        dst.write('---')
        dst.write(header)
        dst.write('---\n\n')
        dst.write(soup.prettify())
    os.rename(os.path.join(d, 'item.md.tmp'), os.path.join(d, 'item.md'))
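#%% (optional) pure-Python download helper
# A minimal sketch of an alternative to shelling out to wget above, for
# systems without wget; `fetch_image` is a hypothetical helper, not part of
# the original script.
import os
from urllib.request import urlretrieve

def fetch_image(src, directory, filename):
    """Download src into directory/filename unless it already exists."""
    target = os.path.join(directory, filename)
    if not os.path.exists(target):
        urlretrieve(src, target)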