Create scrape_all.py
thanhqtran authored Nov 20, 2024 · commit b3768c6 (parent ae7d74a)

scrape_all.py (62 additions, 0 deletions)
import json

import numpy as np
import pandas as pd
import requests
import xmltodict

# read the indicator description database (read_csv already returns a DataFrame)
database_df = pd.read_csv('https://raw.githubusercontent.com/thanhqtran/gso-macro-monitor/refs/heads/main/dsbb_indicator_desc.csv')
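# assumption: the CSV maps each GSO/DSBB indicator to a database name plus a
# live and an archived SDMX link; only the 'database', 'database_link', and
# 'database_link_archive' columns are used below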

# download an SDMX-ML feed and parse the XML into a Python dictionary
def get_data(url):
    r = requests.get(url)
    r.raise_for_status()  # fail loudly on a bad link rather than parsing an error page
    return xmltodict.parse(r.text)
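# usage sketch (hedged): for the archive links used below, the observations
# are expected under
#   data['message:StructureSpecificData']['message:DataSet']['Series']
# as in the extraction loop at the bottom; the exact keys depend on the feed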

def get_obs_data(series):
    # series: one 'Series' entry from the parsed SDMX message
    x_vals = []
    y_vals = []
    obs_list = series['Obs']
    if isinstance(obs_list, dict):  # xmltodict returns a bare dict when there is a single Obs
        obs_list = [obs_list]
    for obs in obs_list:
        x = pd.to_datetime(obs['@TIME_PERIOD'])  # convert the period string to datetime
        try:
            y = float(obs['@OBS_VALUE'])
        except (KeyError, TypeError, ValueError):
            y = np.nan  # missing or non-numeric observation
        x_vals.append(x)
        y_vals.append(y)
    return x_vals, y_vals
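# example usage (hedged; assumes the first extracted dataset holds a list of series):
#   series = extracted_database[0][0]
#   x_vals, y_vals = get_obs_data(series)
#   -> x_vals as pandas Timestamps, y_vals as floats (np.nan where missing)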

def get_meta_data(series):
    # collect the attributes that identify a series
    meta_data = {}
    meta_data['REF_AREA'] = series['@REF_AREA']
    meta_data['INDICATOR'] = series['@INDICATOR']
    meta_data['FREQ'] = series['@FREQ']
    meta_data['DATA_DOMAIN'] = series['@DATA_DOMAIN']
    return meta_data
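# example output (hypothetical values, same series assumption as above):
#   get_meta_data(series)
#   -> {'REF_AREA': 'VN', 'INDICATOR': '...', 'FREQ': 'M', 'DATA_DOMAIN': '...'}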

# keep one row per unique (database, database_link, database_link_archive) combination
database_df = database_df.drop_duplicates(subset=['database', 'database_link', 'database_link_archive']).reset_index(drop=True)

# extract the raw 'Series' block from each archived link (database_link_archive)
extracted_database = []

for url in database_df['database_link_archive']:
    data = get_data(url)
    # a StructureSpecificData message nests the series under message:DataSet
    database_raw = data['message:StructureSpecificData']['message:DataSet']['Series']
    extracted_database.append(database_raw)

# save the extracted data to json
with open('extracted_database.json', 'w') as f:
    json.dump(extracted_database, f)
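# sketch for re-using the dump later (assumes the file written above and that
# matplotlib is installed; a quick visual check, not part of this script):
#   import matplotlib.pyplot as plt
#   with open('extracted_database.json') as f:
#       extracted_database = json.load(f)
#   series = extracted_database[0]
#   if isinstance(series, list):  # datasets with several series come back as lists
#       series = series[0]
#   x_vals, y_vals = get_obs_data(series)
#   plt.plot(x_vals, y_vals)
#   plt.show()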
