-
Notifications
You must be signed in to change notification settings - Fork 0
/
gosac.py
148 lines (132 loc) · 5.7 KB
/
gosac.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import getopt, sys, time, os
from pathlib import Path
# Module-level Firefox options used by main() when it creates the driver.
firefox_opt = Options()
# Run Firefox without a visible window. The `headless` property setter was
# deprecated in Selenium 4.8 and removed in later 4.x releases; passing the
# flag as a browser argument works across Selenium 4 versions.
firefox_opt.add_argument("-headless")
# Translation table that deletes the punctuation/symbol characters ignored
# when comparing names. Built once so each call is a single pass.
_CLEAN_TEXT_DELETE_TABLE = str.maketrans("", "", "\\;:/`*_{}[]()><#+-.,!@#$%\"'\"&'´?")


def clean_text(text) -> str:
    """Return *text* lower-cased with punctuation and symbol characters removed.

    Whitespace, letters (including accented ones) and digits are kept as-is;
    only the characters listed in the deletion table are dropped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised, lower-cased string.
    """
    return text.translate(_CLEAN_TEXT_DELETE_TABLE).lower()
_HELP_TEXT = """
[-f filename] INPUT FILE
[-n] LOG NOT FOUND AUTHOR TO STDOUT
[-y] LOG FOUND AUTHOR DO STDOUT
[-r] SAVE THE AUTHOR NAME SCRAPPED TO FINAL RESULT DATAFRAME CSV
[-i] SAVE DATA IN DATAFRAME CSV WHILE IT IS BEEN CAPTURED [default]
[-e] COLLECT ALL DATA AND SAVE AT THE END
[-d] REQUESTS DELAY
[-h] THIS MESSAGE
"""


def _parse_args(args_in):
    """Parse the command line and return ``(author_file, an, ay, ar, ae, delay)``.

    Exits the process with status 0 after printing the help text, 1 on a
    non-integer delay, 2 on a getopt parsing error, 3 when no input file was
    given and 4 when --iter-save and --end-save are combined.
    """
    options = "f:nyried:h"
    # getopt requires a trailing "=" on long options that take a value;
    # without it "--file=x" and "--delay=5" are rejected.
    long_options = ["file=", "not-log", "yes-log", "real-name", "iter-save", "end-save", "delay=", "help"]
    ai = an = ay = ar = ae = False
    delay = 3
    author_file = None

    try:
        args, _ = getopt.getopt(args_in, options, long_options)
    except getopt.error as err:
        print(str(err))
        sys.exit(2)

    for curr_arg, curr_value in args:
        if curr_arg in ("-f", "--file"):
            author_file = curr_value
        elif curr_arg in ("-n", "--not-log"):
            an = True
        elif curr_arg in ("-y", "--yes-log"):
            ay = True
        elif curr_arg in ("-r", "--real-name"):
            ar = True
        elif curr_arg in ("-i", "--iter-save"):
            ai = True
        elif curr_arg in ("-e", "--end-save"):
            ae = True
        elif curr_arg in ("-d", "--delay"):
            try:
                delay = int(curr_value)
            except ValueError:
                print("opt err: --delay must be a integer")
                sys.exit(1)
        elif curr_arg in ("-h", "--help"):
            print(_HELP_TEXT)
            sys.exit(0)
        else:
            print("opt err: undefined option " + str(curr_arg))
            sys.exit(1)

    if not author_file:
        print("file err: author file not present in argv")
        sys.exit(3)
    if ai and ae:
        print("opt err: Can't use --iter-save and --end-save simultaneously")
        sys.exit(4)
    # Iterative saving is the default: anything other than --end-save saves per row.
    return author_file, an, ay, ar, ae, delay


def _find_author_tag(soup, author):
    """Return the first ``h3.gs_ai_name`` tag whose text matches *author*, else None.

    A candidate matches on exact equality or on equality after clean_text()
    normalisation; the candidate text is stripped before both comparisons.
    """
    wanted = clean_text(author)
    for candidate in soup.find_all("h3", {"class": "gs_ai_name"}):
        name = candidate.text.strip()
        if not name:
            continue
        if author == name or wanted == clean_text(name):
            return candidate
    return None


def _extract_citations(soup):
    """Return the citations cell text from the ``gsc_rsb_st`` table, or None.

    The value is the second ``<td>`` of the first table row whose text
    contains "citations" (case-insensitive) and that has at least two cells.
    """
    table = soup.find("table", {"id": "gsc_rsb_st"})
    if table is None:
        return None
    for row in table.find_all("tr"):
        if "citations" in row.text.lower():
            cells = row.find_all("td")
            if len(cells) > 1:
                return cells[1].text.strip()
    return None


def main():
    """Scrape citation counts for the authors listed in a file into a CSV.

    Options are read from ``sys.argv`` (see ``-h``). The input file holds one
    author name per line; blank lines are ignored. Each name is searched on
    Google Scholar through a Firefox driver configured by the module-level
    ``firefox_opt``; for the first matching result the profile page is opened
    and its citations value is recorded. Results go to
    ``<input file stem>_out.csv`` relative to the current directory, either
    appended row by row (default / ``-i``) or written once at the end (``-e``).

    Exits with status 5 when the input file contains no author names; see
    ``_parse_args`` for the argument-related exit codes.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import quote_plus

    author_file, an, ay, ar, ae, delay = _parse_args(sys.argv[1:])

    with open(author_file) as file:
        # Drop blank lines so they do not turn into empty searches.
        authors = [name for name in (line.strip() for line in file) if name]
    if not authors:
        print("empty err: empty file provided")
        sys.exit(5)

    out_path = Path(author_file).stem + "_out.csv"
    columns = ["Author", "AuthorSN", "Citations"] if ar else ["Author", "Citations"]
    collected = []  # rows gathered for --end-save mode

    driver = webdriver.Firefox(options=firefox_opt, service=FirefoxService(GeckoDriverManager().install()))
    try:
        for author in authors:
            # URL-encode the name so characters such as "&" or "#" cannot
            # corrupt the query string.
            outer_url = (
                "https://scholar.google.com/citations?view_op=search_authors&mauthors="
                + quote_plus(author)
                + "&hl=en-US"
            )
            driver.get(outer_url)
            time.sleep(delay)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            tag = _find_author_tag(soup, author)
            if tag is None:
                if an:
                    print("NOT FOUND: " + author)
                continue
            author_found = tag.text.strip()
            if ay:
                print("FOUND: " + author + " --> " + author_found)

            link = tag.find('a')
            href = link.get('href') if link is not None else None
            citations = None
            if href:
                driver.get("https://scholar.google.com" + href)
                time.sleep(delay)
                citations = _extract_citations(BeautifulSoup(driver.page_source, 'html.parser'))
            if citations is None:
                # Profile link or metrics table missing: skip the author
                # instead of crashing the whole run.
                if an:
                    print("NOT FOUND: " + author + " (no citation data)")
                continue

            row = {"Author": author, "Citations": citations}
            if ar:
                row["AuthorSN"] = author_found

            if ae:
                collected.append(row)
            else:
                # Append to an existing CSV; write the header only when the
                # file is being created.
                write_header = not os.path.exists(out_path)
                pd.DataFrame([row], columns=columns).to_csv(
                    out_path, index=False, mode='a', header=write_header
                )

        if ae:
            pd.DataFrame(collected, columns=columns).to_csv(out_path, index=False, mode='w')
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()
# Run the scraper only when this file is executed as a script, so that
# importing the module does not print the banner or launch a browser.
if __name__ == "__main__":
    print("""
██████ ██████ ███████ █████ ██████
██ ██ ██ ██ ██ ██ ██
██ ███ ██ ██ ███████ ███████ ██
██ ██ ██ ██ ██ ██ ██ ██
██████ ██████ ███████ ██ ██ ██████ v0.1
""")
    main()