diff --git a/requirements.in b/requirements.in
index 83a6919f8..523b2f0a4 100644
--- a/requirements.in
+++ b/requirements.in
@@ -10,4 +10,5 @@ xmlschema
 lxml
 requests
 markupsafe
-itsdangerous
\ No newline at end of file
+itsdangerous
+tqdm
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index b1cb4aa5a..afde658de 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -73,6 +73,8 @@ six==1.16.0
     # via python-dateutil
 sqlparse==0.5.1
     # via django
+tqdm==4.66.5
+    # via -r requirements.in
 urllib3==2.2.3
     # via requests
 werkzeug==3.0.4
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 53760817b..480e11b87 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -135,6 +135,8 @@ sqlparse==0.5.1
     # via
     #   -r requirements.txt
     #   django
+tqdm==4.66.5
+    # via -r requirements.txt
 urllib3==2.2.3
     # via
     #   -r requirements.txt
diff --git a/src/make_csv.py b/src/make_csv.py
index 5d4410e09..f34e2f9e3 100644
--- a/src/make_csv.py
+++ b/src/make_csv.py
@@ -1,32 +1,25 @@
-# Script to generate CSV files from data in the 'stats-calculated' folder,
-# and extra logic in other files in this repository
+"""Generates CSV files from data in the 'stats-calculated' folder,
+using additional logic from other modules in this repository."""
 import csv
-import data
+import os
+import sys
+import argparse
+import logging
 
-# Timeliness CSV files (frequency and timelag)
+import data
 import timeliness
-
-# Forward-looking CSV file
 import forwardlooking
-
-# Comprehensiveness CSV files ('summary', 'core', 'financials' and 'valueadded')
 import comprehensiveness
-
-# # Coverage CSV file
-# import coverage
-
-# Summary Stats CSV file
 import summary_stats
-
-# Humanitarian CSV file
 import humanitarian
-
 import config
 
-publisher_name = {publisher: publisher_json['result']['title'] for publisher, publisher_json in data.ckan_publishers.items()}
+
+logger = logging.getLogger(__name__)
 
 
 def publisher_dicts():
+    publisher_name = {publisher: publisher_json['result']['title'] for publisher, publisher_json in data.ckan_publishers.items()}
     for publisher, activities in data.current_stats['inverted_publisher']['activities'].items():
         if publisher not in data.ckan_publishers:
             continue
@@ -48,154 +41,146 @@ def publisher_dicts():
         }
 
 
-with open(config.join_out_path('publishers.csv'), 'w') as fp:
-    writer = csv.DictWriter(fp, [
-        'Publisher Name',
-        'Publisher Registry Id',
-        'Activities',
-        'Organisations',
-        'Files',
-        'Activity Files',
-        'Organisation Files',
-        'Total File Size',
-        'Reporting Org on Registry',
-        'Reporting Orgs in Data (count)',
-        'Reporting Orgs in Data',
-        'Hierarchies (count)',
-        'Hierarchies',
-    ])
-    writer.writeheader()
-    for d in publisher_dicts():
-        writer.writerow(d)
-
-publishers = list(data.current_stats['inverted_publisher']['activities'].keys())
-
-with open(config.join_out_path('elements.csv'), 'w') as fp:
-    writer = csv.DictWriter(fp, ['Element'] + publishers)
-    writer.writeheader()
-    for element, publisher_dict in data.current_stats['inverted_publisher']['elements'].items():
-        publisher_dict['Element'] = element
-        writer.writerow(publisher_dict)
-
-with open(config.join_out_path('elements_total.csv'), 'w') as fp:
-    writer = csv.DictWriter(fp, ['Element'] + publishers)
-    writer.writeheader()
-    for element, publisher_dict in data.current_stats['inverted_publisher']['elements_total'].items():
-        publisher_dict['Element'] = element
-        writer.writerow(publisher_dict)
-
-with open(config.join_out_path('registry.csv'), 'w') as fp:
-    keys = ['name', 'title', 'publisher_frequency', 'publisher_frequency_select', 
'publisher_implementation_schedule', 'publisher_ui', 'publisher_field_exclusions', 'publisher_contact', 'image_url', 'display_name', 'publisher_iati_id', 'publisher_units', 'publisher_record_exclusions', 'publisher_data_quality', 'publisher_country', 'publisher_description', 'publisher_refs', 'publisher_thresholds' 'publisher_agencies', 'publisher_constraints', 'publisher_organization_type', 'publisher_segmentation', 'license_id', 'state', 'publisher_timeliness'] - writer = csv.DictWriter(fp, keys) - writer.writeheader() - for publisher_json in data.ckan_publishers.values(): - writer.writerow({x: publisher_json['result'].get(x) or 0 for x in keys}) - -previous_months = timeliness.previous_months_reversed - -with open(config.join_out_path('timeliness_frequency.csv'), 'w') as fp: - writer = csv.writer(fp) - writer.writerow(['Publisher Name', 'Publisher Registry Id'] + previous_months + ['Frequency', 'First published']) - for publisher, publisher_title, per_month, assessment, hft, first_published_band in timeliness.publisher_frequency_sorted(): - writer.writerow([publisher_title, publisher] + [per_month.get(x) or 0 for x in previous_months] + [assessment, first_published_band]) - -with open(config.join_out_path('timeliness_timelag.csv'), 'w') as fp: - writer = csv.writer(fp) - writer.writerow(['Publisher Name', 'Publisher Registry Id'] + previous_months + ['Time lag']) - for publisher, publisher_title, per_month, assessment, hft in timeliness.publisher_timelag_sorted(): - writer.writerow([publisher_title, publisher] + [per_month.get(x) or 0 for x in previous_months] + [assessment]) - -with open(config.join_out_path('forwardlooking.csv'), 'w') as fp: - writer = csv.writer(fp) - writer.writerow(['Publisher Name', 'Publisher Registry Id'] + ['{} ({})'.format(header, year) for header in forwardlooking.column_headers for year in forwardlooking.years]) - for row in forwardlooking.table(): - writer.writerow([row['publisher_title'], row['publisher']] + [year_column[year] for year_column in row['year_columns'] for year in forwardlooking.years]) - -for tab in comprehensiveness.columns.keys(): - with open(config.join_out_path('comprehensiveness_{}.csv'.format(tab)), 'w') as fp: +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--verbose", action="store_true", help="Output progress to stdout") + args = parser.parse_args() + + if args.verbose: + logger.setLevel(logging.INFO) + logger.addHandler(logging.StreamHandler(sys.stdout)) + + logger.info("Generating CSV files") + os.makedirs(config.join_out_path('data/csv'), exist_ok=True) + + logger.info("Generating publishers.csv") + with open(config.join_out_path('data/csv/publishers.csv'), 'w') as fp: + writer = csv.DictWriter(fp, [ + 'Publisher Name', + 'Publisher Registry Id', + 'Activities', + 'Organisations', + 'Files', + 'Activity Files', + 'Organisation Files', + 'Total File Size', + 'Reporting Org on Registry', + 'Reporting Orgs in Data (count)', + 'Reporting Orgs in Data', + 'Hierarchies (count)', + 'Hierarchies', + ]) + writer.writeheader() + for d in publisher_dicts(): + writer.writerow(d) + + logger.info("Generating elements.csv") + publishers = list(data.current_stats['inverted_publisher']['activities'].keys()) + with open(config.join_out_path('data/csv/elements.csv'), 'w') as fp: + writer = csv.DictWriter(fp, ['Element'] + publishers) + writer.writeheader() + for element, publisher_dict in data.current_stats['inverted_publisher']['elements'].items(): + publisher_dict['Element'] = element + writer.writerow(publisher_dict) + + 
logger.info("Generating elements_total.csv") + with open(config.join_out_path('data/csv/elements_total.csv'), 'w') as fp: + writer = csv.DictWriter(fp, ['Element'] + publishers) + writer.writeheader() + for element, publisher_dict in data.current_stats['inverted_publisher']['elements_total'].items(): + publisher_dict['Element'] = element + writer.writerow(publisher_dict) + + logger.info("Generating registry.csv") + with open(config.join_out_path('data/csv/registry.csv'), 'w') as fp: + keys = ['name', 'title', 'publisher_frequency', 'publisher_frequency_select', 'publisher_implementation_schedule', 'publisher_ui', 'publisher_field_exclusions', 'publisher_contact', 'image_url', 'display_name', 'publisher_iati_id', 'publisher_units', 'publisher_record_exclusions', 'publisher_data_quality', 'publisher_country', 'publisher_description', 'publisher_refs', 'publisher_thresholds' 'publisher_agencies', 'publisher_constraints', 'publisher_organization_type', 'publisher_segmentation', 'license_id', 'state', 'publisher_timeliness'] + writer = csv.DictWriter(fp, keys) + writer.writeheader() + for publisher_json in data.ckan_publishers.values(): + writer.writerow({x: publisher_json['result'].get(x) or 0 for x in keys}) + + logger.info("Generating timeliness_frequency.csv") + previous_months = timeliness.previous_months_reversed + with open(config.join_out_path('data/csv/timeliness_frequency.csv'), 'w') as fp: + writer = csv.writer(fp) + writer.writerow(['Publisher Name', 'Publisher Registry Id'] + previous_months + ['Frequency', 'First published']) + for publisher, publisher_title, per_month, assessment, hft, first_published_band in timeliness.publisher_frequency_sorted(): + writer.writerow([publisher_title, publisher] + [per_month.get(x) or 0 for x in previous_months] + [assessment, first_published_band]) + + logger.info("Generating timeliness_timelag.csv") + with open(config.join_out_path('data/csv/timeliness_timelag.csv'), 'w') as fp: + writer = csv.writer(fp) + writer.writerow(['Publisher Name', 'Publisher Registry Id'] + previous_months + ['Time lag']) + for publisher, publisher_title, per_month, assessment, hft in timeliness.publisher_timelag_sorted(): + writer.writerow([publisher_title, publisher] + [per_month.get(x) or 0 for x in previous_months] + [assessment]) + + logger.info("Generating forwardlooking.csv") + with open(config.join_out_path('data/csv/forwardlooking.csv'), 'w') as fp: + writer = csv.writer(fp) + writer.writerow(['Publisher Name', 'Publisher Registry Id'] + ['{} ({})'.format(header, year) for header in forwardlooking.column_headers for year in forwardlooking.years]) + for row in forwardlooking.table(): + writer.writerow([row['publisher_title'], row['publisher']] + [year_column[year] for year_column in row['year_columns'] for year in forwardlooking.years]) + + for tab in comprehensiveness.columns.keys(): + logger.info("Generating comprehensiveness_{}.csv".format(tab)) + with open(config.join_out_path('data/csv/comprehensiveness_{}.csv'.format(tab)), 'w') as fp: + writer = csv.writer(fp) + if tab == 'financials': + writer.writerow(['Publisher Name', 'Publisher Registry Id'] + + [x + ' (with valid data)' for x in comprehensiveness.column_headers[tab]] + + [x + ' (with any data)' for x in comprehensiveness.column_headers[tab]] + + ['Using budget-not-provided']) + for row in comprehensiveness.table(): + writer.writerow([row['publisher_title'], row['publisher']] + + [row[slug + '_valid'] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]] + + [row[slug] if slug in 
row else '-' for slug in comprehensiveness.column_slugs[tab]] + + ['Yes' if row['flag'] else '-']) + else: + writer.writerow(['Publisher Name', 'Publisher Registry Id'] + + [x + ' (with valid data)' for x in comprehensiveness.column_headers[tab]] + + [x + ' (with any data)' for x in comprehensiveness.column_headers[tab]]) + for row in comprehensiveness.table(): + writer.writerow([row['publisher_title'], row['publisher']] + + [row[slug + '_valid'] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]] + + [row[slug] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]]) + + logger.info("Generating summary_stats.csv") + with open(config.join_out_path('data/csv/summary_stats.csv'), 'w') as fp: + writer = csv.writer(fp) + # Add column headers + writer.writerow(['Publisher Name', 'Publisher Registry Id'] + [header for slug, header in summary_stats.columns]) + for row in summary_stats.table(): + # Write each row + writer.writerow([row['publisher_title'], row['publisher']] + [row[slug] for slug, header in summary_stats.columns]) + + logger.info("Generating humanitarian.csv") + with open(config.join_out_path('data/csv/humanitarian.csv'), 'w') as fp: writer = csv.writer(fp) - if tab == 'financials': - writer.writerow(['Publisher Name', 'Publisher Registry Id'] + - [x + ' (with valid data)' for x in comprehensiveness.column_headers[tab]] + - [x + ' (with any data)' for x in comprehensiveness.column_headers[tab]] + - ['Using budget-not-provided']) - for row in comprehensiveness.table(): - writer.writerow([row['publisher_title'], row['publisher']] + - [row[slug + '_valid'] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]] + - [row[slug] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]] + - ['Yes' if row['flag'] else '-']) - else: - writer.writerow(['Publisher Name', 'Publisher Registry Id'] + - [x + ' (with valid data)' for x in comprehensiveness.column_headers[tab]] + - [x + ' (with any data)' for x in comprehensiveness.column_headers[tab]]) - for row in comprehensiveness.table(): - writer.writerow([row['publisher_title'], row['publisher']] + - [row[slug + '_valid'] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]] + - [row[slug] if slug in row else '-' for slug in comprehensiveness.column_slugs[tab]]) - -# with open(os.path.join('out', 'coverage.csv'), 'w') as fp: -# writer = csv.writer(fp) -# # Add column headers -# writer.writerow([ -# 'Publisher Name', -# 'Publisher Registry Id', -# '2014 IATI Spend (US $m)', -# '2015 IATI Spend (US $m)', -# '2014 Reference Spend (US $m)', -# '2015 Reference Spend (US $m)', -# '2015 Official Forecast (US $m)', -# 'Spend Ratio (%)', -# 'No reference data available (Historic publishers)', -# 'No reference data available (New publishers)', -# 'Data quality issue reported' -# ]) -# for row in coverage.table(): -# # Write each row -# writer.writerow([ -# row['publisher_title'], -# row['publisher'], -# row['iati_spend_2014'], -# row['iati_spend_2015'], -# row['reference_spend_2014'], -# row['reference_spend_2015'], -# row['official_forecast_2015'], -# row['spend_ratio'], -# row['no_data_flag_red'], -# row['no_data_flag_amber'], -# row['spend_data_error_reported_flag'] -# ]) - -with open(config.join_out_path('summary_stats.csv'), 'w') as fp: - writer = csv.writer(fp) - # Add column headers - writer.writerow(['Publisher Name', 'Publisher Registry Id'] + [header for slug, header in summary_stats.columns]) - for row in summary_stats.table(): - # Write each row - 
writer.writerow([row['publisher_title'], row['publisher']] + [row[slug] for slug, header in summary_stats.columns])
-
-with open(config.join_out_path('humanitarian.csv'), 'w') as fp:
-    writer = csv.writer(fp)
-    # Add column headers
-    writer.writerow([
-        'Publisher Name',
-        'Publisher Registry Id',
-        'Publisher Type',
-        'Number of Activities',
-        'Publishing Humanitarian',
-        'Using Humanitarian Attribute',
-        'Appeal or Emergency Details',
-        'Clusters',
-        'Humanitarian Score'
-    ])
-    for row in humanitarian.table():
+        # Add column headers
         writer.writerow([
-            row['publisher_title'],
-            row['publisher'],
-            row['publisher_type'],
-            row['num_activities'],
-            row['publishing_humanitarian'],
-            row['humanitarian_attrib'],
-            row['appeal_emergency'],
-            row['clusters'],
-            row['average']
+            'Publisher Name',
+            'Publisher Registry Id',
+            'Publisher Type',
+            'Number of Activities',
+            'Publishing Humanitarian',
+            'Using Humanitarian Attribute',
+            'Appeal or Emergency Details',
+            'Clusters',
+            'Humanitarian Score'
         ])
+        for row in humanitarian.table():
+            writer.writerow([
+                row['publisher_title'],
+                row['publisher'],
+                row['publisher_type'],
+                row['num_activities'],
+                row['publishing_humanitarian'],
+                row['humanitarian_attrib'],
+                row['appeal_emergency'],
+                row['clusters'],
+                row['average']
+            ])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/plots.py b/src/make_plots.py
similarity index 56%
rename from src/plots.py
rename to src/make_plots.py
index a2b8ad3a1..70d266597 100644
--- a/src/plots.py
+++ b/src/make_plots.py
@@ -2,11 +2,15 @@
 """
 Generates static images of stats graphs using matplotlib.
 """
+import logging
 import datetime
-import numpy as np  # noqa: F401
-from collections import defaultdict
+import argparse
 import os  # noqa: F401
+from collections import defaultdict
 import csv
+
+import numpy as np  # noqa: F401
+from tqdm import tqdm
 import common
 import data
 import config
@@ -17,35 +21,37 @@
 import matplotlib.dates as mdates  # noqa: E402
 
 
-# Import failed_downloads as a global
-failed_downloads = csv.reader(open(config.join_data_path('downloads/history.csv')))
-
-gitaggregate_publisher = data.JSONDir(config.join_stats_path('gitaggregate-publisher-dated'))
+logger = logging.getLogger(__name__)
 
 
 class AugmentedJSONDir(data.JSONDir):
+    def __init__(self, folder, failed_downloads, gitaggregate_publisher):
+        super().__init__(folder)
+        self.failed_downloads = failed_downloads
+        self.gitaggregate_publisher = gitaggregate_publisher
+
     def __getitem__(self, key):
         if key == 'failed_downloads':
-            return dict((row[0], row[1]) for row in failed_downloads)
+            return dict((row[0], row[1]) for row in self.failed_downloads)
         elif key == 'publisher_types':
             out = defaultdict(lambda: defaultdict(int))
-            for publisher, publisher_data in gitaggregate_publisher.items():
+            for publisher, publisher_data in self.gitaggregate_publisher.items():
                 if publisher in data.ckan_publishers:
                     organization_type = common.get_publisher_type(publisher)['name']
                     for datestring, count in publisher_data['activities'].items():
                         out[datestring][organization_type] += 1
                 else:
-                    print('Publisher not matched:', publisher)
+                    logger.debug("Publisher <{}> could not be matched when aggregating publisher_types".format(publisher))
             return out
         elif key == 'activities_per_publisher_type':
             out = defaultdict(lambda: defaultdict(int))
-            for publisher, publisher_data in gitaggregate_publisher.items():
+            for publisher, publisher_data in self.gitaggregate_publisher.items():
                 if publisher in data.ckan_publishers:
                     organization_type = common.get_publisher_type(publisher)['name']
                     for 
datestring, count in publisher_data['activities'].items():
                         out[datestring][organization_type] += count
                 else:
-                    print('Publisher not matched:', publisher)
+                    logger.debug("Publisher <{}> could not be matched when aggregating activities_per_publisher_type".format(publisher))
             return out
         else:
             return super(AugmentedJSONDir, self).__getitem__(key)
@@ -136,53 +142,80 @@ def make_plot(stat_path, git_stats, img_prefix=''):
     del writer
 
 
-# Load aggregated stats for all data
-print("All data")
-git_stats = AugmentedJSONDir(config.join_stats_path('gitaggregate-dated'))
-
-for stat_path in [
-    'activities',
-    'publishers',
-    'activity_files',
-    'organisation_files',
-    'file_size',
-    'failed_downloads',
-    'invalidxml',
-    'nonstandardroots',
-    'unique_identifiers',
-    ('validation', lambda x: x == 'fail', ''),
-    ('publishers_validation', lambda x: x == 'fail', ''),
-    ('publisher_has_org_file', lambda x: x == 'no', ''),
-    ('versions', lambda x: x in expected_versions, '_expected'),
-    ('versions', lambda x: x not in expected_versions, '_other'),
-    ('publishers_per_version', lambda x: x in expected_versions, '_expected'),
-    ('publishers_per_version', lambda x: x not in expected_versions, '_other'),
-    ('file_size_bins', lambda x: True, ''),
-    ('publisher_types', lambda x: True, ''),
-    ('activities_per_publisher_type', lambda x: True, '')
-]:
-    make_plot(stat_path, git_stats)
-
-
-# Delete git_stats variable to save memory
-del git_stats
-
-try:
-    os.makedirs(config.join_out_path('publisher_imgs'))
-except OSError:
-    pass
-
-git_stats_publishers = AugmentedJSONDir(config.join_stats_path('gitaggregate-publisher-dated/'))
-for publisher, git_stats_publisher in git_stats_publishers.items():
-    for stat_path in [
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--verbose", action="store_true", help="Output progress to stdout")
+    args = parser.parse_args()
+
+    # Load shared inputs for the stats directories; materialise the CSV so it can be read more than once.
+    failed_downloads = list(csv.reader(open(config.join_data_path('downloads/history.csv'))))
+    gitaggregate_publisher = data.JSONDir(config.join_stats_path('gitaggregate-publisher-dated'))
+
+    # Generate plots for aggregated stats for all data. 
+ logger.info("Generating plots for all aggregated data") + git_stats = AugmentedJSONDir(config.join_stats_path('gitaggregate-dated'), + failed_downloads, + gitaggregate_publisher) + os.makedirs(config.join_out_path('img/aggregate'), exist_ok=True) + + _paths = [ 'activities', + 'publishers', 'activity_files', 'organisation_files', 'file_size', + 'failed_downloads', 'invalidxml', 'nonstandardroots', - 'publisher_unique_identifiers', + 'unique_identifiers', ('validation', lambda x: x == 'fail', ''), - ('versions', lambda x: True, ''), - ]: - make_plot(stat_path, git_stats_publisher, 'publisher_imgs/{0}_'.format(publisher)) + ('publishers_validation', lambda x: x == 'fail', ''), + ('publisher_has_org_file', lambda x: x == 'no', ''), + ('versions', lambda x: x in expected_versions, '_expected'), + ('versions', lambda x: x not in expected_versions, '_other'), + ('publishers_per_version', lambda x: x in expected_versions, '_expected'), + ('publishers_per_version', lambda x: x not in expected_versions, '_other'), + ('file_size_bins', lambda x: True, ''), + ('publisher_types', lambda x: True, ''), + ('activities_per_publisher_type', lambda x: True, '') + ] + with tqdm(total=len(_paths)) as pbar: + if args.verbose: + pbar.set_description("Generate aggregate plots") + for stat_path in _paths: + if args.verbose: + pbar.update() + make_plot(stat_path, git_stats, img_prefix='img/aggregate/') + + # Delete git_stats variable to save memory + del git_stats + + # Generate plots for each publisher. + logger.info("Generating plots for all publishers") + git_stats_publishers = AugmentedJSONDir(config.join_stats_path('gitaggregate-publisher-dated/'), + failed_downloads, + gitaggregate_publisher) + os.makedirs(config.join_out_path('img/publishers'), exist_ok=True) + + with tqdm(total=len(git_stats_publishers)) as pbar: + if args.verbose: + pbar.set_description("Generate plots for all publishers") + for publisher, git_stats_publisher in git_stats_publishers.items(): + if args.verbose: + pbar.update() + for stat_path in [ + 'activities', + 'activity_files', + 'organisation_files', + 'file_size', + 'invalidxml', + 'nonstandardroots', + 'publisher_unique_identifiers', + ('validation', lambda x: x == 'fail', ''), + ('versions', lambda x: True, ''), + ]: + make_plot(stat_path, git_stats_publisher, img_prefix='img/publishers/{0}_'.format(publisher)) + + +if __name__ == "__main__": + main()
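
Note: both scripts wire up `--verbose` the same way: the `logger.info(...)` calls are always executed, but they only reach stdout once the flag attaches a handler and lowers the logger level. A minimal standalone sketch of that pattern, assuming nothing beyond the standard library (the `generate()` function and its message are illustrative placeholders, not code from this diff):

```python
import argparse
import logging
import sys

logger = logging.getLogger(__name__)


def generate():
    # Stand-in for the CSV/plot generation steps in this diff.
    logger.info("Generating report.csv")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--verbose", action="store_true", help="Output progress to stdout")
    args = parser.parse_args()

    if args.verbose:
        # Only attach a handler when asked. Without one, INFO records are
        # dropped: a fresh logger inherits the root's WARNING level, and
        # Python's last-resort handler only emits WARNING and above.
        logger.setLevel(logging.INFO)
        logger.addHandler(logging.StreamHandler(sys.stdout))

    generate()


if __name__ == "__main__":
    main()
```

This keeps the quiet run genuinely quiet while leaving the log calls in place for anyone who runs with `--verbose`.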
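Note: the `tqdm` blocks above gate `set_description()` and `update()` behind `args.verbose`, but a bar created with `total=...` still renders once on stderr (at 0%) even when those calls are skipped. If a fully silent default run is wanted, tqdm's `disable=` parameter expresses the same intent in one place; a sketch of that alternative under the same assumptions, not what this diff currently does (`verbose` and `_paths` stand in for `args.verbose` and the real stat paths):

```python
from tqdm import tqdm

verbose = False  # stand-in for args.verbose
_paths = ['activities', 'publishers', 'activity_files']

# disable=True suppresses the bar entirely, so no per-iteration gating is
# needed; tqdm advances the bar itself as the loop consumes the iterable.
for stat_path in tqdm(_paths, desc="Generate aggregate plots", disable=not verbose):
    print(stat_path)  # stand-in for make_plot(stat_path, git_stats, img_prefix='img/aggregate/')
```

Passing the iterable to `tqdm` also avoids the update-before-work ordering in the manual `pbar.update()` version, where the bar reaches 100% before the final plot is drawn.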