-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbanks_project.py
111 lines (79 loc) · 3.3 KB
/
banks_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import pandas as pd
import numpy as np
import requests
import sqlite3
from bs4 import BeautifulSoup
from datetime import datetime
def log_progress(msg):
timeformat = '%Y-%h-%d-%H:%M:%S'
now = datetime.now()
timestamp = now.strftime(timeformat)
with open(logfile, 'a') as f:
f.write(timestamp + ' : ' + msg + '\n')
def extract(url, table_attribs):
df = pd.DataFrame(columns = table_attribs)
page = requests.get(url).text
data = BeautifulSoup(page, 'html.parser')
tables = data.find_all('tbody')[0]
rows = tables.find_all('tr')
for row in rows:
col = row.find_all('td')
if len(col) != 0:
ancher_data = col[1].find_all('a')[1]
if ancher_data is not None:
data_dict = {
'Name': ancher_data.contents[0],
'MC_USD_Billion': col[2].contents[0]
}
df1 = pd.DataFrame(data_dict, index = [0])
df = pd.concat([df, df1], ignore_index = True)
USD_list = list(df['MC_USD_Billion'])
USD_list = [float(''.join(x.split('\n'))) for x in USD_list]
df['MC_USD_Billion'] = USD_list
return df
def transform(df, exchange_rate_path):
csvfile = pd.read_csv(exchange_rate_path)
# i made here the content for currenct is the keys and the content of
# the rate is the values to the crossponding keys
dict = csvfile.set_index('Currency').to_dict()['Rate']
df['MC_GBP_Billion'] = [np.round(x * dict['GBP'],2) for x in df['MC_USD_Billion']]
df['MC_INR_Billion'] = [np.round(x * dict['INR'],2) for x in df['MC_USD_Billion']]
df['MC_EUR_Billion'] = [np.round(x * dict['EUR'],2) for x in df['MC_USD_Billion']]
return df
def load_to_csv(df, output_path):
df.to_csv(output_path)
def load_to_db(df, sql_connection, table_name):
df.to_sql(table_name, sql_connection, if_exists = 'replace', index = False)
def run_query(query_statements, sql_connection):
for query in query_statements:
print(query)
print(pd.read_sql(query, sql_connection), '\n')
''' Here, you define the required entities and call the relevant
functions in the correct order to complete the project. Note that this
portion is not inside any function.'''
url = 'https://web.archive.org/web/20230908091635/https://en.wikipedia.org/wiki/List_of_largest_banks'
exchange_rate_path = 'exchange_rate.csv'
table_attribs = ['Name', 'MC_USD_Billion']
db_name = 'Banks.db'
table_name = 'Largest_banks'
conn = sqlite3.connect(db_name)
query_statements = [
'SELECT * FROM Largest_banks',
'SELECT AVG(MC_GBP_Billion) FROM Largest_banks',
'SELECT Name from Largest_banks LIMIT 5'
]
logfile = 'code_log.txt'
output_csv_path = 'Largest_banks_data.csv'
log_progress('Preliminaries complete. Initiating ETL process.')
df = extract(url, table_attribs)
log_progress('Data extraction complete. Initiating Transformation process.')
df = transform(df, exchange_rate_path)
log_progress('Data transformation complete. Initiating loading process.')
load_to_csv(df, output_csv_path)
log_progress('Data saved to CSV file.')
log_progress('SQL Connection initiated.')
load_to_db(df, conn, table_name)
log_progress('Data loaded to Database as table. Running the query.')
run_query(query_statements, conn)
conn.close()
log_progress('Process Complete.')