-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCmcTop10Scrape.py
59 lines (47 loc) · 2.48 KB
/
CmcTop10Scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import time
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# Location of the chromedriver binary; change "user" to your own account name.
chrome_driver_path = '/home/user/Downloads/chromedriver-linux64/chromedriver'
# Page to extract the data from.
url = 'https://coinmarketcap.com/gainers-losers/'
# XPath of the table, copied from the element selection in the browser's
# inspect mode. It is tied to the current page layout and will need updating
# whenever the site markup changes.
TABLE_XPATH = '//*[@id="__next"]/div[2]/div/div[2]/div/div[2]/div/div[1]/div/table'

# The table text starts with the header cells
# '#', 'Name', 'Price', '24h', 'Volume(24h)', one per line.
HEADER_CELLS = 5
# Each coin then takes four lines: rank, name, symbol and a single line in
# which price, 24h change and volume are merged, separated by whitespace.
CELLS_PER_ROW = 4
COLUMNS = ['CmcRank', 'Name', 'Symbol', 'Price', '24h%', 'Vol(24h)']


def parse_table_text(table_text):
    """Turn the newline-separated text of the table into six-field rows.

    The scraped text arrives as one flat sequence of cells, so it has to be
    regrouped into rows before it can be put into a DataFrame.

    Args:
        table_text: The ``.text`` of the table element, header included.

    Returns:
        A list of ``[rank, name, symbol, price, change_24h, volume_24h]``
        lists of strings, in page order.

    Raises:
        ValueError: If the cells do not group into complete rows, or a
            merged price/change/volume line does not hold exactly three
            whitespace-separated values.
    """
    cells = table_text.split('\n')[HEADER_CELLS:]
    if len(cells) % CELLS_PER_ROW:
        raise ValueError(
            f"expected a multiple of {CELLS_PER_ROW} cells after the header, "
            f"got {len(cells)}"
        )

    rows = []
    for start in range(0, len(cells), CELLS_PER_ROW):
        rank, name, symbol, figures = cells[start:start + CELLS_PER_ROW]
        parts = figures.split()
        if len(parts) != 3:
            raise ValueError(
                f"expected price, 24h% and volume in {figures!r}, "
                f"got {len(parts)} values"
            )
        rows.append([rank, name, symbol, *parts])
    return rows


def build_top10_frame(rows, limit=10):
    """Build a DataFrame of the first ``limit`` rows, indexed from 1.

    The index is sized from the rows actually available, so a page with
    fewer than ``limit`` entries still yields a valid (shorter) table.
    """
    top = rows[:limit]
    return pd.DataFrame(top, index=np.arange(1, len(top) + 1), columns=COLUMNS)


def scrape_table_text(driver_path=chrome_driver_path, page_url=url, load_wait=5):
    """Open the page in Chrome and return the text of the table.

    Args:
        driver_path: Path to the chromedriver executable.
        page_url: Address of the page to load.
        load_wait: Seconds to sleep after navigation so the page can load.

    Returns:
        The text of the last element matching ``TABLE_XPATH``.

    Raises:
        RuntimeError: If no element matches ``TABLE_XPATH``.
    """
    service = Service(driver_path)  # Create a Service object
    driver = webdriver.Chrome(service=service)  # Pass it to webdriver.Chrome
    try:
        driver.get(page_url)
        # Sleep for a few seconds so that the webpage has time to load.
        time.sleep(load_wait)
        tables = driver.find_elements(By.XPATH, TABLE_XPATH)
        if not tables:
            raise RuntimeError(f"no table found at {page_url} with XPath {TABLE_XPATH}")
        # The text must be read while the browser session is still alive.
        return tables[-1].text
    finally:
        # Always close the browser, even when scraping fails, so no Chrome
        # or chromedriver process is left running.
        driver.quit()


def main():
    """Scrape the page and print the titled top-10 table."""
    df = build_top10_frame(parse_table_text(scrape_table_text()))
    timestamp = datetime.now(timezone.utc).strftime('%b %d, %Y %H:%M:%S UTC')
    title = f"Top 10 CMC 24h% Increase for {timestamp}"
    print(title)
    print(df)


if __name__ == "__main__":
    main()