From b3768c68c88cabdb5dba403c24fda6c6ac6526a1 Mon Sep 17 00:00:00 2001
From: "Quang-Thanh Tran (tedd)" <66583019+thanhqtran@users.noreply.github.com>
Date: Wed, 20 Nov 2024 18:59:47 +0900
Subject: [PATCH] Create scrape_all.py

---
 scrape_all.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 scrape_all.py

diff --git a/scrape_all.py b/scrape_all.py
new file mode 100644
index 0000000..6500f6f
--- /dev/null
+++ b/scrape_all.py
@@ -0,0 +1,55 @@
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+import xmltodict
+import json
+import numpy as np
+
+# read the indicator description database
+database_df = pd.read_csv('https://raw.githubusercontent.com/thanhqtran/gso-macro-monitor/refs/heads/main/dsbb_indicator_desc.csv')
+
+# fetch an SDMX-ML document and parse it into a Python dictionary
+def get_data(url):
+    r = requests.get(url)
+    soup = BeautifulSoup(r.text, 'xml')
+    data = xmltodict.parse(str(soup))
+    return data
+
+# extract the observation times and values from one Series entry
+def get_obs_data(series):
+    x_vals = []
+    y_vals = []
+    # xmltodict returns a dict (not a list) when a series has a single observation
+    obs_list = series['Obs'] if isinstance(series['Obs'], list) else [series['Obs']]
+    for obs in obs_list:
+        x = pd.to_datetime(obs['@TIME_PERIOD'])  # convert the time period to datetime
+        try:
+            y = float(obs['@OBS_VALUE'])
+        except (TypeError, ValueError):
+            y = np.nan  # non-numeric observations become NaN
+        x_vals.append(x)
+        y_vals.append(y)
+    return x_vals, y_vals
+
+# extract the identifying attributes of one Series entry
+def get_meta_data(series):
+    meta_data = {}
+    meta_data['REF_AREA'] = series['@REF_AREA']
+    meta_data['INDICATOR'] = series['@INDICATOR']
+    meta_data['FREQ'] = series['@FREQ']
+    meta_data['DATA_DOMAIN'] = series['@DATA_DOMAIN']
+    return meta_data
+
+# keep one row per unique database / database_link / database_link_archive triple
+database_df = database_df.drop_duplicates(subset=['database', 'database_link', 'database_link_archive']).reset_index(drop=True)
+
+# extract the Series data from every archived database link
+extracted_database = []
+for url in database_df['database_link_archive']:
+    data = get_data(url)
+    database_raw = data['message:StructureSpecificData']['message:DataSet']['Series']
+    extracted_database.append(database_raw)
+
+# save the extracted data to json
+with open('extracted_database.json', 'w') as f:
+    json.dump(extracted_database, f)
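
Note for reviewers: get_obs_data() and get_meta_data() are defined above but not yet called in this patch, and importing scrape_all would re-run the whole scrape at import time, so the sketch below reads the saved JSON directly instead. It is a minimal, illustrative example of consuming extracted_database.json; the script name plot_series.py, the choice of extracted_database[0], and the matplotlib plotting step are assumptions, not part of this patch.

# plot_series.py -- illustrative sketch only (assumed name); expects scrape_all.py
# to have been run so that extracted_database.json exists in the working directory
import json
import pandas as pd
import matplotlib.pyplot as plt

with open('extracted_database.json') as f:
    extracted_database = json.load(f)

# each entry holds the 'Series' node of one database: either a single series
# (dict) or a list of series, depending on the source document
series = extracted_database[0]
if isinstance(series, list):
    series = series[0]

obs = series['Obs'] if isinstance(series['Obs'], list) else [series['Obs']]
x_vals = [pd.to_datetime(o['@TIME_PERIOD']) for o in obs]
y_vals = [float(o['@OBS_VALUE']) for o in obs]

plt.plot(x_vals, y_vals)
plt.title(f"{series['@INDICATOR']} ({series['@REF_AREA']}, {series['@FREQ']})")
plt.tight_layout()
plt.show()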