# pr_date_range.py

import glob
import re
from datetime import datetime, timedelta

import utils
from config import tokens

MAX_TOTAL_NUM = 10000
# MAX_EACH_NUM = 1000
# MIN_STARS = 20  # min star number of a repository
RE_REPO_NAME = re.compile(r'https://api\.github\.com/repos/([^/]+/[^/]+)/')


def get_repo_stars(repo_name: str):
    '''
    Return the number of stars of the given repository.
    :param repo_name: repository name in the form 'owner/repo'
    :return: the number of stars, or -1 if the request fails
    '''
    ulink = f'https://api.github.com/repos/{repo_name}'
    resp = utils.send(ulink, tokens[0], 3)
    if not resp or resp.status_code != 200:
        return -1
    return resp.json()['stargazers_count']


def get_repo_name(pr_link: str):
    '''Extract the 'owner/repo' part from a GitHub API link, or return None if it does not match.'''
    m = RE_REPO_NAME.search(pr_link)
    if not m:
        return None
    return m.group(1)
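

# Illustrative example (repository name is hypothetical, not from this project):
# for a search-API item URL such as
#   https://api.github.com/repos/apache/kafka/issues/123
# RE_REPO_NAME matches the 'owner/repo' segment, so get_repo_name(...) returns
# 'apache/kafka', which get_repo_stars(...) can then pass to the repos endpoint.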


def search_pr(language: str, start_date: str, end_date=''):
    '''
    Query pull requests created from start_date to end_date and save their /files links to a CSV file.
    :param language: repository language
    :param start_date: string in the format YYYY-MM-DD
    :param end_date: string in the format YYYY-MM-DD (optional; if empty, only start_date is used)
    :return: the number of pull requests found
    '''
    out_path = 'out/' + language + '/links/{}_{}.csv'
    utils.create_missing_dirs(out_path)
    pr_cnt = 0
    # ulink example: https://api.github.com/search/issues?q=language:Java+is:pr+is:open+created:2020-09-10..2020-09-15
    query = f'https://api.github.com/search/issues?q=language:{language}+is:pr+is:open+created:{start_date}'
    if end_date:
        query += f'..{end_date}'
    ulink = query + '&page={}&per_page=100'
    file_list = []
    for page_cnt in range(1, 11):
        # if pr_cnt >= MAX_EACH_NUM:
        #     break
        resp = utils.send(ulink.format(page_cnt), tokens[0], 3)
        if not resp or resp.status_code != 200:
            utils.logger.warning(f'[No response] {ulink.format(page_cnt)}\naccess_token={tokens[0]}')
            break
        jresp = resp.json()
        if 'items' in jresp:
            for item in jresp['items']:
                if 'url' in item:
                    repo_name = get_repo_name(item['url'])
                    if not repo_name:
                        continue  # should not happen for well-formed issue URLs
                    # if get_repo_stars(repo_name) <= MIN_STARS:
                    #     continue
                    link_files = item['url'].replace('/issues/', '/pulls/') + '/files\n'
                    file_list.append(link_files)
                    pr_cnt += 1
    if file_list:
        with open(out_path.format(start_date, end_date), 'w') as outfile:
            outfile.writelines(file_list)
            outfile.flush()
    utils.logger.warning(f'pr count {pr_cnt}')
    return pr_cnt
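

# Minimal usage sketch for search_pr (the dates are illustrative, not from this project):
# search open Java PRs created in a two-day window and write their /files links to
# out/java/links/2020-09-10_2020-09-12.csv.
#
#   search_pr('java', '2020-09-10', '2020-09-12')
#
# With end_date omitted, the 'created:' qualifier filters on start_date alone.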


def save_files(csv_file: str, language: str):
    """
    Request each link listed in csv_file and save the response to disk.
    :param csv_file: path to a CSV file generated by search_pr
    :param language: repository language
    :return: None
    """
    out_path = f'out/{language}/files/'
    with open(csv_file, 'r') as f:
        lines = f.readlines()
    token_len = len(tokens)
    # Rotate through the available tokens to spread requests across rate limits.
    for idx, line in enumerate(lines):
        link = line.strip()
        savepath = link.replace('https://api.github.com/repos/', out_path) + '.json'
        if utils.exists_file(savepath):  # if the file exists, do not send an unnecessary request
            continue
        resp = utils.send(link, tokens[idx % token_len], 3)
        if not resp or resp.status_code != 200:
            continue
        utils.save(resp.text, savepath)
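

# Example of the link-to-path mapping used above (URL is illustrative): a CSV line such as
#   https://api.github.com/repos/apache/kafka/pulls/123/files
# is saved to
#   out/java/files/apache/kafka/pulls/123/files.json
# assuming utils.save creates any missing intermediate directories for the target path.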


def step1():
    # Collect PR links going back in time, one two-day window per query, until MAX_TOTAL_NUM is reached.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=1)
    total_cnt = 0
    while total_cnt < MAX_TOTAL_NUM:
        total_cnt += search_pr('java', start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'))
        start_date -= timedelta(days=2)
        end_date -= timedelta(days=2)
    utils.logger.warning(f'total:{total_cnt}')
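

# step1 walks backwards through non-overlapping two-day windows, e.g. (assuming today
# is 2020-09-15): [2020-09-14..2020-09-15], [2020-09-12..2020-09-13], and so on,
# stopping once the accumulated PR count reaches MAX_TOTAL_NUM.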


def step2():
    root = 'out/java/links'
    paths = glob.glob(f'{root}/**/*.csv', recursive=True)
    for p in paths:
        save_files(p, 'java')


def normal_search(language: str):
    '''
    Query currently open pull requests for a language (no date filter) and save their /files links to a CSV file.
    '''
    out_path = 'out/' + language + '/links/{}.csv'
    utils.create_missing_dirs(out_path)
    pr_cnt = 0
    ulink = f'https://api.github.com/search/issues?q=language:{language}+is:pr+is:open'
    ulink = ulink + '&page={}&per_page=100'
    file_list = []
    for page_num in range(1, 11):
        resp = utils.send(ulink.format(page_num), tokens[0], 3)
        if not resp or resp.status_code != 200:
            break
        jresp = resp.json()
        if 'items' in jresp:
            for item in jresp['items']:
                if 'url' in item:
                    repo_name = get_repo_name(item['url'])
                    if not repo_name:
                        continue  # should not happen for well-formed issue URLs
                    link_files = item['url'].replace('/issues/', '/pulls/') + '/files\n'
                    file_list.append(link_files)
                    pr_cnt += 1
    if file_list:
        with open(out_path.format('link'), 'w') as outfile:
            outfile.writelines(file_list)
            outfile.flush()
    utils.logger.warning(f'pr count {pr_cnt}')


if __name__ == '__main__':
    step1()
    # step2()