From 0fa6f33c687fbcf800b64ebf77d79baf36180047 Mon Sep 17 00:00:00 2001 From: Josh Stegmaier <104993387+joshuastegmaier@users.noreply.github.com> Date: Tue, 30 Jan 2024 13:02:26 -0500 Subject: [PATCH] Reworked daily data backfill process to update one report at a time and split out into separate tasks to reduce the database load when backfilling. (#2252) --- concordia/admin/__init__.py | 47 +++++++++++- concordia/models.py | 2 +- concordia/tasks.py | 148 +++++++++++++++++++++++++++++------- 3 files changed, 169 insertions(+), 28 deletions(-) diff --git a/concordia/admin/__init__.py b/concordia/admin/__init__.py index b05eb6d06..e54a1e75f 100644 --- a/concordia/admin/__init__.py +++ b/concordia/admin/__init__.py @@ -833,7 +833,43 @@ class SimplePageAdmin(admin.ModelAdmin): @admin.register(SiteReport) class SiteReportAdmin(admin.ModelAdmin): - list_display = ("created_on", "report_name", "campaign", "topic") + list_display = ("created_on", "report_type") + readonly_fields = ("created_on", "report_type") + fieldsets = ( + ("Summary", {"fields": ("created_on", "report_type")}), + ( + "Data", + { + "fields": ( + "report_name", + "campaign", + "topic", + "assets_total", + "assets_published", + "assets_not_started", + "assets_in_progress", + "assets_waiting_review", + "assets_completed", + "assets_unpublished", + "items_published", + "items_unpublished", + "projects_published", + "projects_unpublished", + "anonymous_transcriptions", + "transcriptions_saved", + "daily_review_actions", + "distinct_tags", + "tag_uses", + "campaigns_published", + "campaigns_unpublished", + "users_registered", + "users_activated", + "registered_contributors", + "daily_active_users", + ) + }, + ), + ) list_filter = ( "report_name", @@ -842,6 +878,15 @@ class SiteReportAdmin(admin.ModelAdmin): "topic", ) + @admin.display(description="Report type") + def report_type(self, obj): + if obj.report_name: + return f"Report name: {obj.report_name}" + elif obj.campaign: + return f"Campaign: {obj.campaign}" + elif obj.topic: + return f"Topic: {obj.topic}" + def export_to_csv(self, request, queryset): return export_to_csv_action( self, request, queryset, field_names=SiteReport.DEFAULT_EXPORT_FIELDNAMES diff --git a/concordia/models.py b/concordia/models.py index 21ad58368..1e1b7a894 100644 --- a/concordia/models.py +++ b/concordia/models.py @@ -803,7 +803,7 @@ class ReportName(models.TextChoices): TOTAL = "Active and completed campaigns", "Active and completed campaigns" RETIRED_TOTAL = "Retired campaigns", "Retired campaigns" - created_on = models.DateTimeField(editable=False, auto_now_add=True) + created_on = models.DateTimeField(auto_now_add=True) report_name = models.CharField( max_length=80, blank=True, default="", choices=ReportName.choices ) diff --git a/concordia/tasks.py b/concordia/tasks.py index 5b6b8c4cd..0028e3c45 100644 --- a/concordia/tasks.py +++ b/concordia/tasks.py @@ -10,7 +10,7 @@ from django.contrib.auth.models import User from django.core.management import call_command from django.db import transaction -from django.db.models import Count, F, OuterRef, Q, Subquery +from django.db.models import Count, F, Q from django.utils import timezone from more_itertools.more import chunked @@ -37,6 +37,8 @@ logger = getLogger(__name__) +ONE_DAY = datetime.timedelta(days=1) + @celery_app.task def expire_inactive_asset_reservations(): @@ -427,51 +429,145 @@ def retired_total_report(): total_site_report.save() -ONE_DAY = datetime.timedelta(days=1) +def site_reports_for_date(date): + start = date - ONE_DAY + return 
SiteReport.objects.filter(created_on__gte=start, created_on__lte=date) -@celery_app.task -def backfill_by_date(date, days): - logger.info("Backfilling daily data for %s ", date) +def assets_for_date(date): start = date - ONE_DAY - q_accepted = Q( transcription__accepted__gte=start, transcription__accepted__lte=date ) q_rejected = Q( transcription__rejected__gte=start, transcription__rejected__lte=date ) - assets = Asset.objects.filter(q_accepted | q_rejected) - site_reports = SiteReport.objects.filter( - created_on__gte=start, created_on__lte=date + return Asset.objects.filter(q_accepted | q_rejected) + + +@celery_app.task(ignore_result=True) +def backfill_total(date, days): + logger.info( + "STARTING: Backfilling daily data for %s on %s", + SiteReport.ReportName.TOTAL, + date, ) - topic_assets = assets.filter(item__project__topics=OuterRef("topic__pk")) - subquery = Subquery( - topic_assets.annotate(cnt=Count("transcription")).values("cnt")[:1] + site_report = site_reports_for_date(date).filter( + report_name=SiteReport.ReportName.TOTAL + )[0] + logger.info( + "STARTING: Backfilling daily data for report %s (%s)", site_report.id, date ) - site_reports.filter(topic__isnull=False).update(daily_review_actions=subquery) - campaign_assets = assets.filter(item__project__campaign=OuterRef("campaign__pk")) - subquery = Subquery( - campaign_assets.annotate(cnt=Count("transcription")).values("cnt")[:1] + daily_review_actions = assets_for_date(date).count() + logger.debug( + "%s daily review actions for report %s (%s)", + daily_review_actions, + site_report.id, + date, ) - site_reports.filter(campaign__isnull=False).update(daily_review_actions=subquery) - site_reports.filter(topic__isnull=True, campaign__isnull=True).update( - daily_review_actions=Subquery( - assets.annotate(cnt=Count("transcription")).values("cnt")[:1] - ) + site_report.daily_review_actions = daily_review_actions + site_report.save() + logger.info( + "FINISHED: Backfilling daily data for %s on %s", + SiteReport.ReportName.TOTAL, + date, ) + logger.info("FINISHED: Backfilling daily data for all reports on %s", date) - if days >= 0: - backfill_by_date.delay(start, days - 1) + if days > 0: + return backfill_topics.delay(date - ONE_DAY, days - 1) else: - logger.info("Backfilling daily data finished") + logger.info("Backfilling daily data complete") -@celery_app.task +@celery_app.task(ignore_result=True) +def backfill_next_campaign_report(date, days, site_report_ids): + try: + site_report_id = site_report_ids.pop() + except IndexError: + logger.info("FINISHED: Backfilling daily data for campaigns on %s", date) + backfill_total.delay(date, days) + return + site_report = SiteReport.objects.get(id=site_report_id) + logger.info( + "STARTING: Backfilling daily data for report %s (%s)", site_report.id, date + ) + daily_review_actions = ( + assets_for_date(date) + .filter(item__project__campaign=site_report.campaign) + .count() + ) + logger.debug( + "%s daily review actions for report %s (%s)", + daily_review_actions, + site_report.id, + date, + ) + site_report.daily_review_actions = daily_review_actions + site_report.save() + logger.info( + "FINISHED: Backfilling daily data for report %s (%s)", site_report.id, date + ) + return backfill_next_campaign_report.delay(date, days, site_report_ids) + + +@celery_app.task(ignore_result=True) +def backfill_campaigns(date, days): + site_report_ids = list( + site_reports_for_date(date) + .filter(campaign__isnull=False) + .values_list("id", flat=True) + ) + logger.info("STARTING: Backfilling daily data 
for campaigns on %s", date) + return backfill_next_campaign_report.delay(date, days, site_report_ids) + + +@celery_app.task(ignore_result=True) +def backfill_next_topic_report(date, days, site_report_ids): + try: + site_report_id = site_report_ids.pop() + except IndexError: + logger.info("FINISHED: Backfilling daily data for topics on %s", date) + backfill_campaigns.delay(date, days) + return + site_report = SiteReport.objects.get(id=site_report_id) + logger.info( + "STARTING: Backfilling daily data for report %s (%s)", site_report.id, date + ) + daily_review_actions = ( + assets_for_date(date).filter(item__project__topics=site_report.topic).count() + ) + logger.debug( + "%s daily review actions for report %s (%s)", + daily_review_actions, + site_report.id, + date, + ) + site_report.daily_review_actions = daily_review_actions + site_report.save() + logger.info( + "FINISHED: Backfilling daily data for report %s (%s)", site_report.id, date + ) + return backfill_next_topic_report.delay(date, days, site_report_ids) + + +@celery_app.task(ignore_result=True) +def backfill_topics(date, days): + site_report_ids = list( + site_reports_for_date(date) + .filter(topic__isnull=False) + .values_list("id", flat=True) + ) + logger.info("STARTING: Backfilling daily data for topics on %s", date) + return backfill_next_topic_report.delay(date, days, site_report_ids) + + +@celery_app.task(ignore_result=True) def backfill_daily_data(start, days): date = timezone.make_aware(datetime.datetime(**start)) logger.info("Backfilling daily data for the %s days before %s", days, date) - backfill_by_date.delay(date - ONE_DAY, days - 1) + logger.info("STARTED: Backfilling daily data for all reports on %s", date) + return backfill_topics.delay(date, days - 1) @celery_app.task