Skip to content

Commit

Permalink
feat: Refactored the CSV and image generators and added logging
Browse files Browse the repository at this point in the history
Renamed plots.py to make_plots.py for consistency with make_csv.py.
Lightly refactored both scripts and added logging and progress bars
to aid development work. This required adding tqdm to the requirements files.
  • Loading branch information
chrisarridge committed Oct 1, 2024
1 parent 9ef7fed commit 4206099
Show file tree
Hide file tree
Showing 5 changed files with 243 additions and 220 deletions.
3 changes: 2 additions & 1 deletion requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ xmlschema
lxml
requests
markupsafe
itsdangerous
itsdangerous
tqdm
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ six==1.16.0
# via python-dateutil
sqlparse==0.5.1
# via django
tqdm==4.66.5
# via -r requirements.in
urllib3==2.2.3
# via requests
werkzeug==3.0.4
Expand Down
2 changes: 2 additions & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ sqlparse==0.5.1
# via
# -r requirements.txt
# django
tqdm==4.66.5
# via -r requirements.txt
urllib3==2.2.3
# via
# -r requirements.txt
Expand Down
315 changes: 150 additions & 165 deletions src/make_csv.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,25 @@
# Script to generate CSV files from data in the 'stats-calculated' folder,
# and extra logic in other files in this repository
"""Generates CSV files from data in the 'stats-calculated' folder and using additional logic
"""
import csv
import data
import os
import sys
import argparse
import logging

# Timeliness CSV files (frequency and timelag)
import data
import timeliness

# Forward-looking CSV file
import forwardlooking

# Comprehensiveness CSV files ('summary', 'core', 'financials' and 'valueadded')
import comprehensiveness

# # Coverage CSV file
# import coverage

# Summary Stats CSV file
import summary_stats

# Humanitarian CSV file
import humanitarian

import config

publisher_name = {publisher: publisher_json['result']['title'] for publisher, publisher_json in data.ckan_publishers.items()}

logger = logging.getLogger(__name__)


def publisher_dicts():
publisher_name = {publisher: publisher_json['result']['title'] for publisher, publisher_json in data.ckan_publishers.items()}
for publisher, activities in data.current_stats['inverted_publisher']['activities'].items():
if publisher not in data.ckan_publishers:
continue
Expand All @@ -48,154 +41,146 @@ def publisher_dicts():
}


with open(config.join_out_path('publishers.csv'), 'w') as fp:
writer = csv.DictWriter(fp, [
'Publisher Name',
'Publisher Registry Id',
'Activities',
'Organisations',
'Files',
'Activity Files',
'Organisation Files',
'Total File Size',
'Reporting Org on Registry',
'Reporting Orgs in Data (count)',
'Reporting Orgs in Data',
'Hierarchies (count)',
'Hierarchies',
])
writer.writeheader()
for d in publisher_dicts():
writer.writerow(d)

publishers = list(data.current_stats['inverted_publisher']['activities'].keys())

with open(config.join_out_path('elements.csv'), 'w') as fp:
writer = csv.DictWriter(fp, ['Element'] + publishers)
writer.writeheader()
for element, publisher_dict in data.current_stats['inverted_publisher']['elements'].items():
publisher_dict['Element'] = element
writer.writerow(publisher_dict)

with open(config.join_out_path('elements_total.csv'), 'w') as fp:
writer = csv.DictWriter(fp, ['Element'] + publishers)
writer.writeheader()
for element, publisher_dict in data.current_stats['inverted_publisher']['elements_total'].items():
publisher_dict['Element'] = element
writer.writerow(publisher_dict)

with open(config.join_out_path('registry.csv'), 'w') as fp:
keys = ['name', 'title', 'publisher_frequency', 'publisher_frequency_select', 'publisher_implementation_schedule', 'publisher_ui', 'publisher_field_exclusions', 'publisher_contact', 'image_url', 'display_name', 'publisher_iati_id', 'publisher_units', 'publisher_record_exclusions', 'publisher_data_quality', 'publisher_country', 'publisher_description', 'publisher_refs', 'publisher_thresholds' 'publisher_agencies', 'publisher_constraints', 'publisher_organization_type', 'publisher_segmentation', 'license_id', 'state', 'publisher_timeliness']
writer = csv.DictWriter(fp, keys)
writer.writeheader()
for publisher_json in data.ckan_publishers.values():
writer.writerow({x: publisher_json['result'].get(x) or 0 for x in keys})

previous_months = timeliness.previous_months_reversed

with open(config.join_out_path('timeliness_frequency.csv'), 'w') as fp:
writer = csv.writer(fp)
writer.writerow(['Publisher Name', 'Publisher Registry Id'] + previous_months + ['Frequency', 'First published'])
for publisher, publisher_title, per_month, assessment, hft, first_published_band in timeliness.publisher_frequency_sorted():
writer.writerow([publisher_title, publisher] + [per_month.get(x) or 0 for x in previous_months] + [assessment, first_published_band])

with open(config.join_out_path('timeliness_timelag.csv'), 'w') as fp:
writer = csv.writer(fp)
writer.writerow(['Publisher Name', 'Publisher Registry Id'] + previous_months + ['Time lag'])
for publisher, publisher_title, per_month, assessment, hft in timeliness.publisher_timelag_sorted():
writer.writerow([publisher_title, publisher] + [per_month.get(x) or 0 for x in previous_months] + [assessment])

with open(config.join_out_path('forwardlooking.csv'), 'w') as fp:
writer = csv.writer(fp)
writer.writerow(['Publisher Name', 'Publisher Registry Id'] + ['{} ({})'.format(header, year) for header in forwardlooking.column_headers for year in forwardlooking.years])
for row in forwardlooking.table():
writer.writerow([row['publisher_title'], row['publisher']] + [year_column[year] for year_column in row['year_columns'] for year in forwardlooking.years])

for tab in comprehensiveness.columns.keys():
with open(config.join_out_path('comprehensiveness_{}.csv'.format(tab)), 'w') as fp:
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true", help="Output progress to stdout")
args = parser.parse_args()

if args.verbose:
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))

logger.info("Generating CSV files")
os.makedirs(config.join_out_path('data/csv'), exist_ok=True)

logger.info("Generating publishers.csv")
with open(config.join_out_path('data/csv/publishers.csv'), 'w') as fp:
writer = csv.DictWriter(fp, [
'Publisher Name',
'Publisher Registry Id',
'Activities',
'Organisations',
'Files',
'Activity Files',
'Organisation Files',
'Total File Size',
'Reporting Org on Registry',
'Reporting Orgs in Data (count)',
'Reporting Orgs in Data',
'Hierarchies (count)',
'Hierarchies',
])
writer.writeheader()
for d in publisher_dicts():
writer.writerow(d)

logger.info("Generating elements.csv")
publishers = list(data.current_stats['inverted_publisher']['activities'].keys())
with open(config.join_out_path('data/csv/elements.csv'), 'w') as fp:
writer = csv.DictWriter(fp, ['Element'] + publishers)
writer.writeheader()
for element, publisher_dict in data.current_stats['inverted_publisher']['elements'].items():
publisher_dict['Element'] = element
writer.writerow(publisher_dict)

logger.info("Generating elements_total.csv")
with open(config.join_out_path('data/csv/elements_total.csv'), 'w') as fp:
writer = csv.DictWriter(fp, ['Element'] + publishers)
writer.writeheader()
for element, publisher_dict in data.current_stats['inverted_publisher']['elements_total'].items():
publisher_dict['Element'] = element
writer.writerow(publisher_dict)

logger.info("Generating registry.csv")
with open(config.join_out_path('data/csv/registry.csv'), 'w') as fp:
keys = ['name', 'title', 'publisher_frequency', 'publisher_frequency_select', 'publisher_implementation_schedule', 'publisher_ui', 'publisher_field_exclusions', 'publisher_contact', 'image_url', 'display_name', 'publisher_iati_id', 'publisher_units', 'publisher_record_exclusions', 'publisher_data_quality', 'publisher_country', 'publisher_description', 'publisher_refs', 'publisher_thresholds' 'publisher_agencies', 'publisher_constraints', 'publisher_organization_type', 'publisher_segmentation', 'license_id', 'state', 'publisher_timeliness']
writer = csv.DictWriter(fp, keys)
writer.writeheader()
for publisher_json in data.ckan_publishers.values():
writer.writerow({x: publisher_json['result'].get(x) or 0 for x in keys})

logger.info("Generating timeliness_frequency.csv")
previous_months = timeliness.previous_months_reversed
with open(config.join_out_path('data/csv/timeliness_frequency.csv'), 'w') as fp:
writer = csv.writer(fp)
writer.writerow(['Publisher Name', 'Publisher Registry Id'] + previous_months + ['Frequency', 'First published'])
for publisher, publisher_title, per_month, assessment, hft, first_published_band in timeliness.publisher_frequency_sorted():
writer.writerow([publisher_title, publisher] + [per_month.get(x) or 0 for x in previous_months] + [assessment, first_published_band])

logger.info("Generating timeliness_timelag.csv")
with open(config.join_out_path('data/csv/timeliness_timelag.csv'), 'w') as fp:
writer = csv.writer(fp)
writer.writerow(['Publisher Name', 'Publisher Registry Id'] + previous_months + ['Time lag'])
for publisher, publisher_title, per_month, assessment, hft in timeliness.publisher_timelag_sorted():
writer.writerow([publisher_title, publisher] + [per_month.get(x) or 0 for x in previous_months] + [assessment])

logger.info("Generating forwardlooking.csv")
with open(config.join_out_path('data/csv/forwardlooking.csv'), 'w') as fp:
writer = csv.writer(fp)
writer.writerow(['Publisher Name', 'Publisher Registry Id'] + ['{} ({})'.format(header, year) for header in forwardlooking.column_headers for year in forwardlooking.years])
for row in forwardlooking.table():
writer.writerow([row['publisher_title'], row['publisher']] + [year_column[year] for year_column in row['year_columns'] for year in forwardlooking.years])

for tab in comprehensiveness.columns.keys():
logger.info("Generating comprehensiveness_{}.csv".format(tab))
with open(config.join_out_path('data/csv/comprehensiveness_{}.csv'.format(tab)), 'w') as fp:
writer = csv.writer(fp)
if tab == 'financials':
writer.writerow(['Publisher Name', 'Publisher Registry Id'] +
[x + ' (with valid data)' for x in comprehensiveness.column_headers[tab]] +
[x + ' (with any data)' for x in comprehensiveness.column_headers[tab]] +
['Using budget-not-provided'])
for row in comprehensiveness.table():
writer.writerow([row['publisher_title'], row['publisher']] +
[row[slug + '_valid'] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]] +
[row[slug] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]] +
['Yes' if row['flag'] else '-'])
else:
writer.writerow(['Publisher Name', 'Publisher Registry Id'] +
[x + ' (with valid data)' for x in comprehensiveness.column_headers[tab]] +
[x + ' (with any data)' for x in comprehensiveness.column_headers[tab]])
for row in comprehensiveness.table():
writer.writerow([row['publisher_title'], row['publisher']] +
[row[slug + '_valid'] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]] +
[row[slug] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]])

logger.info("Generating summary_stats.csv")
with open(config.join_out_path('data/csv/summary_stats.csv'), 'w') as fp:
writer = csv.writer(fp)
# Add column headers
writer.writerow(['Publisher Name', 'Publisher Registry Id'] + [header for slug, header in summary_stats.columns])
for row in summary_stats.table():
# Write each row
writer.writerow([row['publisher_title'], row['publisher']] + [row[slug] for slug, header in summary_stats.columns])

logger.info("Generating humanitarian.csv")
with open(config.join_out_path('data/csv/humanitarian.csv'), 'w') as fp:
writer = csv.writer(fp)
if tab == 'financials':
writer.writerow(['Publisher Name', 'Publisher Registry Id'] +
[x + ' (with valid data)' for x in comprehensiveness.column_headers[tab]] +
[x + ' (with any data)' for x in comprehensiveness.column_headers[tab]] +
['Using budget-not-provided'])
for row in comprehensiveness.table():
writer.writerow([row['publisher_title'], row['publisher']] +
[row[slug + '_valid'] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]] +
[row[slug] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]] +
['Yes' if row['flag'] else '-'])
else:
writer.writerow(['Publisher Name', 'Publisher Registry Id'] +
[x + ' (with valid data)' for x in comprehensiveness.column_headers[tab]] +
[x + ' (with any data)' for x in comprehensiveness.column_headers[tab]])
for row in comprehensiveness.table():
writer.writerow([row['publisher_title'], row['publisher']] +
[row[slug + '_valid'] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]] +
[row[slug] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]])

# with open(os.path.join('out', 'coverage.csv'), 'w') as fp:
# writer = csv.writer(fp)
# # Add column headers
# writer.writerow([
# 'Publisher Name',
# 'Publisher Registry Id',
# '2014 IATI Spend (US $m)',
# '2015 IATI Spend (US $m)',
# '2014 Reference Spend (US $m)',
# '2015 Reference Spend (US $m)',
# '2015 Official Forecast (US $m)',
# 'Spend Ratio (%)',
# 'No reference data available (Historic publishers)',
# 'No reference data available (New publishers)',
# 'Data quality issue reported'
# ])
# for row in coverage.table():
# # Write each row
# writer.writerow([
# row['publisher_title'],
# row['publisher'],
# row['iati_spend_2014'],
# row['iati_spend_2015'],
# row['reference_spend_2014'],
# row['reference_spend_2015'],
# row['official_forecast_2015'],
# row['spend_ratio'],
# row['no_data_flag_red'],
# row['no_data_flag_amber'],
# row['spend_data_error_reported_flag']
# ])

with open(config.join_out_path('summary_stats.csv'), 'w') as fp:
writer = csv.writer(fp)
# Add column headers
writer.writerow(['Publisher Name', 'Publisher Registry Id'] + [header for slug, header in summary_stats.columns])
for row in summary_stats.table():
# Write each row
writer.writerow([row['publisher_title'], row['publisher']] + [row[slug] for slug, header in summary_stats.columns])

with open(config.join_out_path('humanitarian.csv'), 'w') as fp:
writer = csv.writer(fp)
# Add column headers
writer.writerow([
'Publisher Name',
'Publisher Registry Id',
'Publisher Type',
'Number of Activities',
'Publishing Humanitarian',
'Using Humanitarian Attribute',
'Appeal or Emergency Details',
'Clusters',
'Humanitarian Score'
])
for row in humanitarian.table():
# Add column headers
writer.writerow([
row['publisher_title'],
row['publisher'],
row['publisher_type'],
row['num_activities'],
row['publishing_humanitarian'],
row['humanitarian_attrib'],
row['appeal_emergency'],
row['clusters'],
row['average']
'Publisher Name',
'Publisher Registry Id',
'Publisher Type',
'Number of Activities',
'Publishing Humanitarian',
'Using Humanitarian Attribute',
'Appeal or Emergency Details',
'Clusters',
'Humanitarian Score'
])
for row in humanitarian.table():
writer.writerow([
row['publisher_title'],
row['publisher'],
row['publisher_type'],
row['num_activities'],
row['publishing_humanitarian'],
row['humanitarian_attrib'],
row['appeal_emergency'],
row['clusters'],
row['average']
])


if __name__ == "__main__":
main()
Loading

0 comments on commit 4206099

Please sign in to comment.