-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_scraping_example.py
105 lines (93 loc) · 4.39 KB
/
data_scraping_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pandas as pd
import requests
from bs4 import BeautifulSoup
import xmltodict, json
import matplotlib.pyplot as plt
import os
import numpy as np
import matplotlib.pyplot as plt
import imageio
# database URL (SDMX XML file; the path suggests the GSO industrial
# production index feed for Vietnam)
url = 'http://nsdp.gso.gov.vn/GSO-chung/SDMXFiles/GSO/GSO.%20Chi%20so%20cong%20nghiep.IIP_Vietnam.xml'

# read xml file from url
# The timeout keeps the script from hanging forever on an unresponsive
# server, and raise_for_status() reports HTTP errors here rather than as a
# confusing KeyError below when an error page fails to look like SDMX.
r = requests.get(url, timeout=30)
r.raise_for_status()
# Hand the raw bytes to the parser so it honours the encoding declared in the
# XML itself; r.text would rely on requests' guess, which falls back to
# ISO-8859-1 for text/* responses that carry no charset header.
soup = BeautifulSoup(r.content, 'xml')

# convert xml to python dict
data = xmltodict.parse(str(soup))

# explore data structure
# Maps an index to {series code: human-readable description}.
# NOTE(review): presumably these indices mirror the order of the Series
# elements in the feed, which is what the positional lookups below rely on -
# confirm against the live file.
structure = {
    0: {'AIP_ISIC4_IX': 'Industry (2015=100)'},
    1: {'AIP_ISIC4_B_IX': 'Mining and Quarying (2015=100)'},
    2: {'AIP_ISIC4_C_IX': 'Manufacturing (2015=100)'},
    3: {'AIP_ISIC4_D_IX': 'Electricity, Gas (2015=100)'},
    4: {'AIP_ISIC4_E_IX': 'Water Supply: Sewerage, Waste Mgt/Remediation Activities(2015=100)'},
}
data_series = data['message:StructureSpecificData']['message:DataSet']['Series']

# Each df_* name holds one parsed series mapping (not a pandas DataFrame).
df_industry = data_series[0]
df_mining = data_series[1]
df_manufacturing = data_series[2]
df_electricity = data_series[3]
df_water = data_series[4]
# get industry data
def get_industry_data(dataframe):
    """Extract the time periods and observation values of one series.

    Args:
        dataframe: Mapping for a single series as produced by xmltodict
            (despite the parameter name, not a pandas DataFrame). Its
            ``'Obs'`` entry holds the observations, each a mapping with
            ``'@TIME_PERIOD'`` and ``'@OBS_VALUE'`` entries.

    Returns:
        A tuple ``(periods, values)`` of two equally long lists: the periods
        formatted as ``'YYYY-MM'`` strings and the observation values as
        floats, in the order they appear in the series.

    Raises:
        KeyError: If ``'Obs'`` or one of the observation attributes is
            missing.
        ValueError: If an observation value cannot be converted to float
            (``pd.to_datetime`` may likewise raise for an unparsable period).
    """
    observations = dataframe['Obs']
    # xmltodict collapses a child element that occurs only once into a single
    # mapping instead of a one-item list; normalise so a series with exactly
    # one observation is handled like any other.
    if isinstance(observations, dict):
        observations = [observations]

    periods = [
        pd.to_datetime(obs['@TIME_PERIOD']).strftime('%Y-%m')
        for obs in observations
    ]
    values = [float(obs['@OBS_VALUE']) for obs in observations]
    return periods, values
# Parse every series exactly once; the plotting calls below reuse these
# results instead of re-parsing the same observations for each axis argument.
x, y = get_industry_data(df_industry)
sector_series = [
    (get_industry_data(df_manufacturing), 'Manufacturing'),
    (get_industry_data(df_electricity), 'Electricity, Gas'),
    (get_industry_data(df_water), 'Water/Sewage'),
    (get_industry_data(df_mining), 'Mining and Quarying'),
]

# plot
fig, ax = plt.subplots(figsize=(10,5))
fontsize = 14

# plot data
# barplot for industry (overall index as faint background bars)
plt.bar(x, y, color='#ff7f0e', width=0.5, label='All Industry', alpha=0.2)
# one line per sector, drawn in a fixed order so colours stay stable
for (periods, values), label in sector_series:
    plt.plot(periods, values, label=label)

# legend outside the axes, anchored to the top-right corner
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1, fontsize=fontsize)

# The x values are strings, so they sit at consecutive integer positions;
# a locator step of 4 therefore puts a major tick on every 4th month.
ax.xaxis.set_major_locator(plt.MultipleLocator(4))
plt.xlabel('Month', fontsize=fontsize)
plt.xticks(rotation=60, fontsize=12)
plt.ylabel('Index, 2015=100', fontsize=fontsize)
plt.title('Vietnam Industrial Production Index (monthly)', fontsize=16)
# make gif
# ======== gif for iip ======================
# Renders one PNG per month (each frame shows the data up to that month),
# stitches the frames into 'iip.gif', then deletes the temporary PNGs.

# Parse every series once up front. Re-parsing inside the frame loop would
# repeat identical work for every frame and make the run quadratic.
x, industry_values = get_industry_data(df_industry)
gif_series = [
    (get_industry_data(df_manufacturing), 'Manufacturing'),
    (get_industry_data(df_electricity), 'Electricity, Gas'),
    (get_industry_data(df_water), 'Water/Sewage'),
    (get_industry_data(df_mining), 'Mining and Quarying'),
]

filenames = []
gif_filename = 'iip.gif'
# NOTE(review): newer imageio releases appear to interpret GIF `duration` in
# milliseconds rather than seconds - confirm against the installed version.
duration = 0.15
fontsize = 14

try:
    # Frame i shows the first i observations. Starting at 1 and running
    # through len(x) avoids an empty first frame and makes sure the final
    # frame includes the most recent observation.
    for i in range(1, len(x) + 1):
        fig, ax = plt.subplots(figsize=(10,5))

        # plot data
        plt.bar(x[:i], industry_values[:i], color='#ff7f0e', width=0.5, label='All Industry', alpha=0.2)
        for (periods, values), label in gif_series:
            plt.plot(periods[:i], values[:i], label=label)

        # legend outside the axes, anchored to the top-right corner
        plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1, fontsize=fontsize)

        # String x values sit at consecutive integer positions, so a locator
        # step of 4 puts a major tick on every 4th month.
        ax.xaxis.set_major_locator(plt.MultipleLocator(4))
        plt.xlabel('Month', fontsize=fontsize)
        plt.xticks(rotation=45, fontsize=10)
        plt.ylabel('Index, 2015=100', fontsize=fontsize)
        plt.title('Vietnam Industrial Production Index (monthly)', fontsize=16)

        # create file name and append it to a list
        filename = f'{i}.png'
        filenames.append(filename)

        # save frame, then release the figure so open figures do not pile up
        plt.savefig(filename, dpi=300, bbox_inches='tight')
        plt.close(fig)

    # build gif
    images = [imageio.imread(filename) for filename in filenames]
    imageio.mimsave(gif_filename, images, duration=duration, loop=1)
finally:
    # Remove files - also when rendering or GIF assembly failed, so stray
    # frame PNGs are not left behind in the working directory.
    for filename in filenames:
        # A name is recorded before its frame is saved, so the last one may
        # not exist on disk if savefig raised.
        if os.path.exists(filename):
            os.remove(filename)