Skip to content

Commit

Permalink
Reworked daily data backfill process to update one report at a time a…
Browse files Browse the repository at this point in the history
…nd split out into separate tasks to reduce the database load when backfilling. (#2252)
  • Loading branch information
joshuastegmaier authored Jan 30, 2024
1 parent 77fb8c5 commit 0fa6f33
Show file tree
Hide file tree
Showing 3 changed files with 169 additions and 28 deletions.
47 changes: 46 additions & 1 deletion concordia/admin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,43 @@ class SimplePageAdmin(admin.ModelAdmin):

@admin.register(SiteReport)
class SiteReportAdmin(admin.ModelAdmin):
list_display = ("created_on", "report_name", "campaign", "topic")
list_display = ("created_on", "report_type")
readonly_fields = ("created_on", "report_type")
fieldsets = (
("Summary", {"fields": ("created_on", "report_type")}),
(
"Data",
{
"fields": (
"report_name",
"campaign",
"topic",
"assets_total",
"assets_published",
"assets_not_started",
"assets_in_progress",
"assets_waiting_review",
"assets_completed",
"assets_unpublished",
"items_published",
"items_unpublished",
"projects_published",
"projects_unpublished",
"anonymous_transcriptions",
"transcriptions_saved",
"daily_review_actions",
"distinct_tags",
"tag_uses",
"campaigns_published",
"campaigns_unpublished",
"users_registered",
"users_activated",
"registered_contributors",
"daily_active_users",
)
},
),
)

list_filter = (
"report_name",
Expand All @@ -842,6 +878,15 @@ class SiteReportAdmin(admin.ModelAdmin):
"topic",
)

@admin.display(description="Report type")
def report_type(self, obj):
    """
    Describe which kind of site report ``obj`` is.

    The first truthy attribute wins, checked in this order: report_name,
    campaign, topic. Returns None when none of the three is set.
    """
    if obj.report_name:
        return f"Report name: {obj.report_name}"
    if obj.campaign:
        return f"Campaign: {obj.campaign}"
    if obj.topic:
        return f"Topic: {obj.topic}"
    return None

def export_to_csv(self, request, queryset):
return export_to_csv_action(
self, request, queryset, field_names=SiteReport.DEFAULT_EXPORT_FIELDNAMES
Expand Down
2 changes: 1 addition & 1 deletion concordia/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,7 @@ class ReportName(models.TextChoices):
TOTAL = "Active and completed campaigns", "Active and completed campaigns"
RETIRED_TOTAL = "Retired campaigns", "Retired campaigns"

created_on = models.DateTimeField(editable=False, auto_now_add=True)
created_on = models.DateTimeField(auto_now_add=True)
report_name = models.CharField(
max_length=80, blank=True, default="", choices=ReportName.choices
)
Expand Down
148 changes: 122 additions & 26 deletions concordia/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from django.contrib.auth.models import User
from django.core.management import call_command
from django.db import transaction
from django.db.models import Count, F, OuterRef, Q, Subquery
from django.db.models import Count, F, Q
from django.utils import timezone
from more_itertools.more import chunked

Expand All @@ -37,6 +37,8 @@

logger = getLogger(__name__)

ONE_DAY = datetime.timedelta(days=1)


@celery_app.task
def expire_inactive_asset_reservations():
Expand Down Expand Up @@ -427,51 +429,145 @@ def retired_total_report():
total_site_report.save()


ONE_DAY = datetime.timedelta(days=1)
def site_reports_for_date(date):
    """
    Return the SiteReport queryset created in the day leading up to ``date``.

    The window is inclusive at both ends: from ``date - ONE_DAY`` through
    ``date``.
    """
    window_start = date - ONE_DAY
    return SiteReport.objects.filter(
        created_on__gte=window_start,
        created_on__lte=date,
    )


@celery_app.task
def backfill_by_date(date, days):
logger.info("Backfilling daily data for %s ", date)
def assets_for_date(date):
start = date - ONE_DAY

q_accepted = Q(
transcription__accepted__gte=start, transcription__accepted__lte=date
)
q_rejected = Q(
transcription__rejected__gte=start, transcription__rejected__lte=date
)
assets = Asset.objects.filter(q_accepted | q_rejected)
site_reports = SiteReport.objects.filter(
created_on__gte=start, created_on__lte=date
return Asset.objects.filter(q_accepted | q_rejected)


@celery_app.task(ignore_result=True)
def backfill_total(date, days):
logger.info(
"STARTING: Backfilling daily data for %s on %s",
SiteReport.ReportName.TOTAL,
date,
)
topic_assets = assets.filter(item__project__topics=OuterRef("topic__pk"))
subquery = Subquery(
topic_assets.annotate(cnt=Count("transcription")).values("cnt")[:1]
site_report = site_reports_for_date(date).filter(
report_name=SiteReport.ReportName.TOTAL
)[0]
logger.info(
"STARTING: Backfilling daily data for report %s (%s)", site_report.id, date
)
site_reports.filter(topic__isnull=False).update(daily_review_actions=subquery)
campaign_assets = assets.filter(item__project__campaign=OuterRef("campaign__pk"))
subquery = Subquery(
campaign_assets.annotate(cnt=Count("transcription")).values("cnt")[:1]
daily_review_actions = assets_for_date(date).count()
logger.debug(
"%s daily review actions for report %s (%s)",
daily_review_actions,
site_report.id,
date,
)
site_reports.filter(campaign__isnull=False).update(daily_review_actions=subquery)
site_reports.filter(topic__isnull=True, campaign__isnull=True).update(
daily_review_actions=Subquery(
assets.annotate(cnt=Count("transcription")).values("cnt")[:1]
)
site_report.daily_review_actions = daily_review_actions
site_report.save()
logger.info(
"FINISHED: Backfilling daily data for %s on %s",
SiteReport.ReportName.TOTAL,
date,
)
logger.info("FINISHED: Backfilling daily data for all reports on %s", date)

if days >= 0:
backfill_by_date.delay(start, days - 1)
if days > 0:
return backfill_topics.delay(date - ONE_DAY, days - 1)
else:
logger.info("Backfilling daily data finished")
logger.info("Backfilling daily data complete")


@celery_app.task
@celery_app.task(ignore_result=True)
def backfill_next_campaign_report(date, days, site_report_ids):
    """
    Backfill daily_review_actions for one campaign report, then queue the next.

    Pops one id off ``site_report_ids``, counts the assets returned by
    ``assets_for_date(date)`` that belong to that report's campaign, stores
    the count on the report and re-queues this task with the remaining ids.
    Once the list is empty, ``backfill_total`` is queued for the same date
    instead.

    Args:
        date: End of the one-day window being backfilled.
        days: Remaining-days counter; passed through unchanged.
        site_report_ids: Ids of the campaign SiteReports still to process.
    """
    try:
        site_report_id = site_report_ids.pop()
    except IndexError:
        # Nothing left for this date: hand over to the totals step.
        logger.info("FINISHED: Backfilling daily data for campaigns on %s", date)
        backfill_total.delay(date, days)
        return
    site_report = SiteReport.objects.get(id=site_report_id)
    logger.info(
        "STARTING: Backfilling daily data for report %s (%s)", site_report.id, date
    )
    daily_review_actions = (
        assets_for_date(date)
        .filter(item__project__campaign=site_report.campaign)
        .count()
    )
    logger.debug(
        "%s daily review actions for report %s (%s)",
        daily_review_actions,
        site_report.id,
        date,
    )
    site_report.daily_review_actions = daily_review_actions
    # Write only the column this task changed, so the UPDATE stays small and
    # values changed elsewhere on the row since the fetch are not overwritten.
    site_report.save(update_fields=["daily_review_actions"])
    logger.info(
        "FINISHED: Backfilling daily data for report %s (%s)", site_report.id, date
    )
    return backfill_next_campaign_report.delay(date, days, site_report_ids)


@celery_app.task(ignore_result=True)
def backfill_campaigns(date, days):
    """
    Start the per-campaign backfill for ``date``.

    Collects the ids of every site report for ``date`` that has a campaign
    and queues ``backfill_next_campaign_report`` to work through them.
    """
    campaign_reports = site_reports_for_date(date).filter(campaign__isnull=False)
    # Materialise the ids now; the follow-up task pops them off this list.
    pending_ids = list(campaign_reports.values_list("id", flat=True))
    logger.info("STARTING: Backfilling daily data for campaigns on %s", date)
    return backfill_next_campaign_report.delay(date, days, pending_ids)


@celery_app.task(ignore_result=True)
def backfill_next_topic_report(date, days, site_report_ids):
    """
    Backfill daily_review_actions for one topic report, then queue the next.

    Pops one id off ``site_report_ids``, counts the assets returned by
    ``assets_for_date(date)`` whose project is linked to that report's topic,
    stores the count on the report and re-queues this task with the remaining
    ids. Once the list is empty, ``backfill_campaigns`` is queued for the same
    date instead.

    Args:
        date: End of the one-day window being backfilled.
        days: Remaining-days counter; passed through unchanged.
        site_report_ids: Ids of the topic SiteReports still to process.
    """
    try:
        site_report_id = site_report_ids.pop()
    except IndexError:
        # Nothing left for this date: hand over to the campaigns step.
        logger.info("FINISHED: Backfilling daily data for topics on %s", date)
        backfill_campaigns.delay(date, days)
        return
    site_report = SiteReport.objects.get(id=site_report_id)
    logger.info(
        "STARTING: Backfilling daily data for report %s (%s)", site_report.id, date
    )
    daily_review_actions = (
        assets_for_date(date).filter(item__project__topics=site_report.topic).count()
    )
    logger.debug(
        "%s daily review actions for report %s (%s)",
        daily_review_actions,
        site_report.id,
        date,
    )
    site_report.daily_review_actions = daily_review_actions
    # Write only the column this task changed, so the UPDATE stays small and
    # values changed elsewhere on the row since the fetch are not overwritten.
    site_report.save(update_fields=["daily_review_actions"])
    logger.info(
        "FINISHED: Backfilling daily data for report %s (%s)", site_report.id, date
    )
    return backfill_next_topic_report.delay(date, days, site_report_ids)


@celery_app.task(ignore_result=True)
def backfill_topics(date, days):
    """
    Start the per-topic backfill for ``date``.

    Collects the ids of every site report for ``date`` that has a topic and
    queues ``backfill_next_topic_report`` to work through them.
    """
    topic_reports = site_reports_for_date(date).filter(topic__isnull=False)
    # Materialise the ids now; the follow-up task pops them off this list.
    pending_ids = list(topic_reports.values_list("id", flat=True))
    logger.info("STARTING: Backfilling daily data for topics on %s", date)
    return backfill_next_topic_report.delay(date, days, pending_ids)


@celery_app.task(ignore_result=True)
def backfill_daily_data(start, days):
date = timezone.make_aware(datetime.datetime(**start))
logger.info("Backfilling daily data for the %s days before %s", days, date)
backfill_by_date.delay(date - ONE_DAY, days - 1)
logger.info("STARTED: Backfilling daily data for all reports on %s", date)
return backfill_topics.delay(date, days - 1)


@celery_app.task
Expand Down

0 comments on commit 0fa6f33

Please sign in to comment.