-
Notifications
You must be signed in to change notification settings - Fork 3
/
github_user_details.py
226 lines (195 loc) · 8.9 KB
/
github_user_details.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import pandas as pd
from github import Github
import time
import concurrent.futures
from evadb.catalog.catalog_type import ColumnType
from evadb.functions.abstract.abstract_function import AbstractFunction
from evadb.functions.decorators.decorators import forward, setup
from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe
class GithubUserDetails(AbstractFunction):
    """
    Retrieve profile details for GitHub users through the GitHub API.

    Arguments:
        None

    Input Signatures:
        github_username (str) : The GitHub username for the user whose details you want to retrieve.
                                One user is looked up per input row.
        github_token (str)    : GitHub personal access token for authentication. Only the token
                                in the first row is used; when it is empty or not a string an
                                unauthenticated client is created.

    Output Signatures:
        user_name (str)             : The name of the GitHub user.
        user_login (str)            : The login (username) of the GitHub user.
        user_following (int)        : The number of users the GitHub user is following.
        user_followers (int)        : The number of followers the GitHub user has.
        user_email (str)            : The email address of the GitHub user.
        user_id (int)               : The unique ID of the GitHub user.
        user_location (str)         : The location of the GitHub user.
        user_bio (str)              : The bio of the GitHub user.
        user_company (str)          : The company associated with the GitHub user.
        user_blog (str)             : The blog URL of the GitHub user.
        user_url (str)              : The URL of the GitHub user's profile.
        user_twitter_username (str) : The Twitter username of the GitHub user.
        user_repos (str)            : String representation of a list of dictionaries
                                      (name, description, html_url, language) for the user's
                                      non-fork repositories with more than 10 stars.
        user_starred (str)          : String representation of a list of dictionaries
                                      (name, description, html_url, language) for those of the
                                      first 10 repositories starred by the user that have more
                                      than 100 stars.

    Rows whose lookup keeps failing are omitted from the output.

    Example Usage:
        You can use this function to retrieve details about a GitHub user as follows:
        github_username = "username"
        github_token = "your_personal_access_token"
        cursor.function("GithubUserDetails", github_username, github_token)
    """

    # Output column names, in the order declared to the framework.
    _OUTPUT_COLUMNS = (
        "user_name", "user_login",
        "user_following", "user_followers",
        "user_email",
        "user_id",
        "user_location", "user_bio",
        "user_company", "user_blog", "user_url", "user_twitter_username",
        "user_repos", "user_starred",
    )

    # An own repository is reported only with strictly more stars than this.
    _MIN_OWN_REPO_STARS = 10
    # A starred repository is reported only with strictly more stars than this.
    _MIN_STARRED_REPO_STARS = 100
    # Only this many of the user's starred repositories are examined.
    _MAX_STARRED_REPOS_SCANNED = 10
    # Pause for _PAUSE_SECONDS on every _USERS_PER_PAUSE-th user to stay under the API limit.
    _USERS_PER_PAUSE = 10
    _PAUSE_SECONDS = 30
    # Back-off after a failed lookup (5 minutes), and the cap on attempts per user
    # so that a permanent failure cannot stall the whole batch forever.
    _ERROR_BACKOFF_SECONDS = 300
    _MAX_ATTEMPTS_PER_USER = 3

    @property
    def name(self) -> str:
        return "GithubUserDetails"

    @setup(cacheable=False, function_type="web-scraping")
    def setup(self) -> None:
        # No initialization is required; the API client is created per forward() call.
        pass

    @forward(
        input_signatures=[
            PandasDataframe(
                columns=["github_username", "github_token"],
                column_types=[ColumnType.TEXT, ColumnType.TEXT],
                column_shapes=[(1,), (1,)],
            )
        ],
        output_signatures=[
            PandasDataframe(
                columns=list(_OUTPUT_COLUMNS),
                # One type per output column (14 in total).
                column_types=[
                    ColumnType.TEXT, ColumnType.TEXT,
                    ColumnType.INTEGER, ColumnType.INTEGER,
                    ColumnType.TEXT,
                    ColumnType.INTEGER,
                    ColumnType.TEXT, ColumnType.TEXT,
                    ColumnType.TEXT, ColumnType.TEXT, ColumnType.TEXT, ColumnType.TEXT,
                    ColumnType.TEXT, ColumnType.TEXT,
                ],
                column_shapes=[(1,)] * len(_OUTPUT_COLUMNS),
            )
        ],
    )
    def forward(self, input_df):
        """
        Look up every username in ``input_df`` and return one row of details per user.

        Args:
            input_df: DataFrame with a ``github_username`` column (first column) and a
                ``github_token`` column (second column).

        Returns:
            pandas.DataFrame with the columns listed in ``_OUTPUT_COLUMNS``. Users that
            could not be fetched are left out; the columns are present even when no
            user could be fetched.

        Raises:
            ValueError: if ``input_df`` is empty or its first username is ``None``.
        """
        # Imported locally so the file-level import block does not need to change.
        from github import UnknownObjectException

        # Ensure the GitHub username is provided
        if input_df.empty or input_df.iloc[0, 0] is None:
            raise ValueError("GitHub username must be provided.")

        # Initialize the GitHub API client. A missing token may arrive as None or NaN,
        # so only a non-blank string is treated as a usable token.
        github_token = input_df.iloc[0, 1]
        if isinstance(github_token, str) and github_token.strip():
            github = Github(github_token)
        else:
            github = Github()

        def fetch_user_details_range(start_index, end_index):
            """Fetch details for rows ``start_index`` (inclusive) to ``end_index`` (exclusive)."""
            fetched = []
            for processed, index in enumerate(range(start_index, end_index), start=1):
                # Avoid hitting API limit
                if processed % self._USERS_PER_PAUSE == 0:
                    print(f"Downloading details of user: {processed}")
                    time.sleep(self._PAUSE_SECONDS)

                github_username = input_df.iloc[index]["github_username"]
                if not isinstance(github_username, str) or not github_username.strip():
                    # Nothing to look up for this row; retrying would only waste time.
                    print(f"Error: invalid GitHub username in row {index}: {github_username!r}")
                    continue

                # Retry the SAME row after a failure instead of moving on and
                # losing a row, and give up after a bounded number of attempts.
                for _attempt in range(self._MAX_ATTEMPTS_PER_USER):
                    try:
                        user_details = self._fetch_user_details(github, github_username)
                    except UnknownObjectException as e:
                        # The user does not exist (404): waiting will not help.
                        print(f"Error: {str(e)}")
                        break
                    except Exception as e:
                        # Best effort: treat any other failure as a possible API
                        # limit, back off, then try this user again.
                        print(f"Error: {str(e)}")
                        time.sleep(self._ERROR_BACKOFF_SECONDS)
                    else:
                        fetched.append(user_details)
                        break
            return fetched

        num_workers = 1
        num_rows = len(input_df)
        rows_per_worker = num_rows // num_workers
        print(f"Downloading details of {num_rows} users")

        # Create a list of tuples defining the ranges for each worker
        # Include any remaining rows in the last worker's range
        worker_ranges = [
            (i * rows_per_worker, (i + 1) * rows_per_worker)
            for i in range(num_workers)
        ]
        worker_ranges[-1] = (worker_ranges[-1][0], num_rows)

        user_details_list = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
            futures = [
                executor.submit(fetch_user_details_range, start_index, end_index)
                for start_index, end_index in worker_ranges
            ]
            # result() re-raises anything that escaped a worker, so a failure is
            # not silently turned into an empty result; collecting in submission
            # order keeps the output in input-row order.
            for future in futures:
                user_details_list.extend(future.result())

        # Create a DataFrame from the list of user details; passing the column
        # list keeps the declared columns present even when the list is empty.
        return pd.DataFrame(user_details_list, columns=list(self._OUTPUT_COLUMNS))

    @staticmethod
    def _repo_summary(repo):
        """Return the reported fields of a repository as a dictionary."""
        return {
            "name": repo.name,
            "description": repo.description,
            "html_url": repo.html_url,
            "language": repo.language,
        }

    def _fetch_user_details(self, github, github_username):
        """Fetch one user's profile and repository summaries as an output-row dictionary."""
        # Retrieve the user object
        user = github.get_user(github_username)

        # Repositories created by the user (forks excluded) above the star threshold
        user_created_repos = [
            self._repo_summary(repo)
            for repo in user.get_repos()
            if not repo.fork and repo.stargazers_count > self._MIN_OWN_REPO_STARS
        ]

        # Only the first few starred repositories are examined, to bound API usage
        user_starred_repos = []
        for position, repo in enumerate(user.get_starred()):
            if position >= self._MAX_STARRED_REPOS_SCANNED:
                break
            if repo.stargazers_count > self._MIN_STARRED_REPO_STARS:
                user_starred_repos.append(self._repo_summary(repo))

        # Keys must match _OUTPUT_COLUMNS, i.e. the declared output signature.
        return {
            "user_name": user.name,
            "user_login": user.login,
            "user_following": user.following,
            "user_followers": user.followers,
            "user_email": user.email,
            "user_id": user.id,
            "user_location": user.location,
            "user_bio": user.bio,
            "user_company": user.company,
            "user_blog": user.blog,
            "user_url": user.url,
            "user_twitter_username": user.twitter_username,
            "user_repos": f"{user_created_repos}",
            "user_starred": f"{user_starred_repos}",
        }