Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Quest/monthly use report #10354

Draft
wants to merge 2 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion api/metrics/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,8 @@ def get(self, request, *args, **kwargs):
'user_summary': reports.UserSummaryReport,
'spam_summary': reports.SpamSummaryReport,
'new_user_domains': reports.NewUserDomainReport,
'monthly_sessionhours': reports.MonthlySessionhoursReport,
'monthly_route_use': reports.MonthlyRouteUseReport,
}


Expand Down Expand Up @@ -344,7 +346,7 @@ def get(self, request, *args, report_name):
range_field_name = 'report_yearmonth'
range_parser = parse_yearmonth_range
else:
raise ValueError(f'report class must subclass DailyReport or MonthlyReport: {report_class}')
raise ValueError(f'VIEWABLE_REPORTS values should subclass DailyReport or MonthlyReport ("{report_name}": {report_class})')
range_filter = range_parser(request.GET)
search_recent = (
report_class.search()
Expand Down Expand Up @@ -409,6 +411,7 @@ def _get_session_id(self, request, client_session_id=None):
session_id_parts = [
client_session_id,
current_date_str,
now.hour,
]
elif user_is_authenticated:
session_id_parts = [
Expand Down
11 changes: 3 additions & 8 deletions osf/metrics/counted_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,18 @@
import logging
from urllib.parse import urlsplit

from elasticsearch_dsl import InnerDoc, analyzer, tokenizer
from elasticsearch_dsl import InnerDoc
from elasticsearch_metrics import metrics
from elasticsearch_metrics.signals import pre_save
from django.dispatch import receiver
import pytz

from osf.metrics.utils import stable_key
from osf.metrics.utils import stable_key, route_prefix_analyzer
from osf.models import Guid


logger = logging.getLogger(__name__)

route_prefix_analyzer = analyzer(
'route_prefix_analyzer',
tokenizer=tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'),
)

class PageviewInfo(InnerDoc):
"""PageviewInfo

Expand All @@ -31,7 +26,7 @@ class PageviewInfo(InnerDoc):
page_title = metrics.Keyword()
route_name = metrics.Keyword(
fields={
'by_prefix': metrics.Text(analyzer=route_prefix_analyzer),
'by_prefix': metrics.Text(analyzer=route_prefix_analyzer, fielddata=True),
},
)

Expand Down
4 changes: 4 additions & 0 deletions osf/metrics/reporters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from .preprint_count import PreprintCountReporter
from .user_count import UserCountReporter
from .spam_count import SpamCountReporter
from .monthly_sessionhours import MonthlySessionhoursReporter
from .monthly_route_use import MonthlyRouteUseReporter


DAILY_REPORTERS = (
Expand All @@ -24,4 +26,6 @@

MONTHLY_REPORTERS = (
SpamCountReporter,
MonthlySessionhoursReporter,
MonthlyRouteUseReporter,
)
38 changes: 38 additions & 0 deletions osf/metrics/reporters/monthly_route_use.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from osf.metrics.counted_usage import CountedUsage
from osf.metrics.reports import MonthlyRouteUseReport
from ._base import MonthlyReporter


class MonthlyRouteUseReporter(MonthlyReporter):
    """Build per-route usage reports for one calendar month.

    For each pageview route recorded in CountedUsage during the given
    month, reports the raw usage count and an approximate number of
    distinct session ids seen on that route.
    """

    def report(self, report_yearmonth):
        """Return a list of MonthlyRouteUseReport, one per observed route.

        :param report_yearmonth: YearMonth-like value for the month to report on
        """
        start = report_yearmonth.as_datetime()
        end = report_yearmonth.next().as_datetime()
        search = (
            CountedUsage.search()
            # upper bound is exclusive ('lt', not 'lte') -- `end` is the first
            # instant of the following month and belongs to the next report
            .filter('range', timestamp={'gte': start, 'lt': end})
            [:0]  # just the aggregations, no hits
        )
        route_agg = search.aggs.bucket(
            'by_route',
            'terms',
            field='pageview_info.route_name',
            # NOTE(review): a terms aggregation returns only the top 10 buckets
            # by default -- confirm whether every route should be reported and
            # pass an explicit `size` if so
        )
        route_agg.metric(
            'total_sessions',
            'cardinality',  # approximate distinct count
            field='session_id',
            precision_threshold=40000,  # maximum precision
        )

        result = search.execute()

        reports = []
        for route_bucket in result.aggs.by_route.buckets:
            report = MonthlyRouteUseReport(
                report_yearmonth=report_yearmonth,
                route_name=route_bucket.key,
                use_count=route_bucket.doc_count,
                sessionhour_count=route_bucket.total_sessions.value,
            )
            reports.append(report)
        return reports
31 changes: 31 additions & 0 deletions osf/metrics/reporters/monthly_sessionhours.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from osf.metrics.counted_usage import CountedUsage
from osf.metrics.reports import MonthlySessionhoursReport
from ._base import MonthlyReporter


class MonthlySessionhoursReporter(MonthlyReporter):
    """Report approximate session-hour totals for one calendar month.

    Counts distinct session ids in CountedUsage for the month (an
    approximate cardinality) and averages that count over the number of
    hours in the month.
    """

    def report(self, report_yearmonth):
        """Return a single-item list with the month's MonthlySessionhoursReport.

        :param report_yearmonth: YearMonth-like value for the month to report on
        """
        start = report_yearmonth.as_datetime()
        end = report_yearmonth.next().as_datetime()
        search = (
            CountedUsage.search()
            # upper bound is exclusive ('lt', not 'lte') -- `end` is the first
            # instant of the following month and belongs to the next report
            .filter('range', timestamp={'gte': start, 'lt': end})
            [:0]  # just the aggregations, no hits
        )
        search.aggs.metric(
            'total_sessionhour_count',
            'cardinality',  # approximate distinct count
            field='session_id',
            precision_threshold=40000,  # maximum precision
        )
        result = search.execute()
        total_sessionhour_count = result.aggs.total_sessionhour_count.value
        # number of whole hours in the reported month (month lengths vary);
        # both bounds are midnight-aligned, so this is exact
        month_hours = int((end - start).total_seconds() // 3600)
        report = MonthlySessionhoursReport(
            report_yearmonth=report_yearmonth,
            total_sessionhour_count=total_sessionhour_count,
            average_sessions_per_hour=total_sessionhour_count / month_hours,
        )
        return [report]
4 changes: 2 additions & 2 deletions osf/metrics/reporters/spam_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
class SpamCountReporter(MonthlyReporter):

def report(self, report_yearmonth):
target_month = report_yearmonth.target_month()
next_month = report_yearmonth.next_month()
target_month = report_yearmonth.as_datetime()
next_month = report_yearmonth.next().as_datetime()

report = SpamSummaryReport(
report_yearmonth=str(report_yearmonth),
Expand Down
55 changes: 37 additions & 18 deletions osf/metrics/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from elasticsearch_metrics import metrics
from elasticsearch_metrics.signals import pre_save as metrics_pre_save

from osf.metrics.utils import stable_key, YearMonth
from osf.metrics.utils import stable_key, YearMonth, route_prefix_analyzer


class ReportInvalid(Exception):
Expand All @@ -20,7 +20,7 @@ class DailyReport(metrics.Metric):
There's something we'd like to know about every so often,
so let's regularly run a report and stash the results here.
"""
DAILY_UNIQUE_FIELD = None # set in subclasses that expect multiple reports per day
UNIQUE_TOGETHER = ('report_date',) # override in subclasses that expect multiple reports per day

report_date = metrics.Date(format='strict_date', required=True)

Expand Down Expand Up @@ -58,6 +58,7 @@ def serialize(self, data):
class MonthlyReport(metrics.Metric):
"""MonthlyReport (abstract base for report-based metrics that run monthly)
"""
UNIQUE_TOGETHER = ('report_yearmonth',) # override in subclasses that expect multiple reports per month

report_yearmonth = YearmonthField()

Expand All @@ -74,18 +75,19 @@ def set_report_id(sender, instance, **kwargs):
# "ON CONFLICT UPDATE" behavior -- if the document
# already exists, it will be updated rather than duplicated.
# Cannot detect/avoid conflicts this way, but that's ok.

if issubclass(sender, DailyReport):
duf_name = instance.DAILY_UNIQUE_FIELD
if duf_name is None:
instance.meta.id = stable_key(instance.report_date)
else:
duf_value = getattr(instance, duf_name)
if not duf_value or not isinstance(duf_value, str):
raise ReportInvalid(f'{sender.__name__}.{duf_name} MUST have a non-empty string value (got {duf_value})')
instance.meta.id = stable_key(instance.report_date, duf_value)
elif issubclass(sender, MonthlyReport):
instance.meta.id = stable_key(instance.report_yearmonth)
if issubclass(sender, (DailyReport, MonthlyReport)):
unique_together_fields = getattr(sender, 'UNIQUE_TOGETHER', None)
if not unique_together_fields:
raise ValueError(f'{sender.__name__}.UNIQUE_TOGETHER must be non-empty!')
unique_together_values = []
for field_name in unique_together_fields:
field_value = getattr(instance, field_name)
field_value_str = str(field_value)
if (field_value is None) or (not field_value_str):
raise ReportInvalid(f'{sender.__name__}.{field_name} must have a non-empty stringable value (got {field_value})')
unique_together_values.append(field_value_str)
assert len(unique_together_values) > 0
instance.meta.id = stable_key(*unique_together_values)


#### BEGIN reusable inner objects #####
Expand Down Expand Up @@ -157,7 +159,7 @@ class DownloadCountReport(DailyReport):


class InstitutionSummaryReport(DailyReport):
DAILY_UNIQUE_FIELD = 'institution_id'
UNIQUE_TOGETHER = ('report_date', 'institution_id',)

institution_id = metrics.Keyword()
institution_name = metrics.Keyword()
Expand All @@ -169,7 +171,7 @@ class InstitutionSummaryReport(DailyReport):


class NewUserDomainReport(DailyReport):
DAILY_UNIQUE_FIELD = 'domain_name'
UNIQUE_TOGETHER = ('report_date', 'domain_name',)

domain_name = metrics.Keyword()
new_user_count = metrics.Integer()
Expand All @@ -187,7 +189,7 @@ class OsfstorageFileCountReport(DailyReport):


class PreprintSummaryReport(DailyReport):
DAILY_UNIQUE_FIELD = 'provider_key'
UNIQUE_TOGETHER = ('report_date', 'provider_key',)

provider_key = metrics.Keyword()
preprint_count = metrics.Integer()
Expand All @@ -212,5 +214,22 @@ class SpamSummaryReport(MonthlyReport):
preprint_confirmed_spam = metrics.Integer()
preprint_confirmed_ham = metrics.Integer()
preprint_flagged = metrics.Integer()
user_marked_as_spam = metrics.Integer()
users_marked_as_spam = metrics.Integer()
user_marked_as_ham = metrics.Integer()


class MonthlySessionhoursReport(MonthlyReport):
    """Monthly metric: session-hour counts derived from counted usage.

    One report per month (inherits the default MonthlyReport uniqueness
    on report_yearmonth).
    """
    # approximate count of distinct session ids seen during the month;
    # presumably one id per (session, hour) given how session ids are
    # built upstream -- confirm against the session-id construction
    total_sessionhour_count = metrics.Integer()
    # total_sessionhour_count divided by the number of hours in the month
    average_sessions_per_hour = metrics.Float()


class MonthlyRouteUseReport(MonthlyReport):
    """Monthly metric: usage counts per pageview route."""
    # multiple reports per month -- one per route_name
    UNIQUE_TOGETHER = ('report_yearmonth', 'route_name',)
    route_name = metrics.Keyword(
        fields={
            # "route_name.by_prefix" subfield for aggregating subroutes
            'by_prefix': metrics.Text(analyzer=route_prefix_analyzer, fielddata=True),
        },
    )
    # number of counted usages on this route during the month
    use_count = metrics.Integer()
    # approximate count of distinct session ids on this route
    sessionhour_count = metrics.Integer()
28 changes: 20 additions & 8 deletions osf/metrics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import typing
from hashlib import sha256

import pytz
from elasticsearch_dsl import analyzer, tokenizer


def stable_key(*key_parts):
Expand All @@ -21,8 +21,8 @@ def stable_key(*key_parts):


class YearMonth(typing.NamedTuple):
year: int
month: int
year: int # assumed >= 1000, < 10000
month: int # assumed >= 1, <= 12

YEARMONTH_RE = re.compile(r'(?P<year>\d{4})-(?P<month>\d{2})')

Expand All @@ -45,10 +45,22 @@ def from_str(cls, input_str):
def __str__(self):
return f'{self.year}-{self.month:0>2}'

def target_month(self):
return datetime.datetime(self.year, self.month, 1, tzinfo=pytz.utc)
def as_datetime(self) -> datetime.datetime:
return datetime.datetime(self.year, self.month, 1, tzinfo=datetime.timezone.utc)

def next_month(self):
def next(self):
if self.month == 12:
return datetime.datetime(self.year + 1, 1, 1, tzinfo=pytz.utc)
return datetime.datetime(self.year, self.month + 1, 1, tzinfo=pytz.utc)
return YearMonth(self.year + 1, 1)
return YearMonth(self.year, self.month + 1)

def prior(self):
    """Return the YearMonth immediately preceding this one.

    Wraps from January back to December of the previous year.
    """
    if self.month == 1:
        return YearMonth(self.year - 1, 12)
    return YearMonth(self.year, self.month - 1)


# for elasticsearch fields that hold dot-delimited paths,
# to allow querying/aggregating by prefix (e.g. 'root.to.leaf'
# yields tokens ['root', 'root.to', 'root.to.leaf'])
route_prefix_tokenizer = tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.')
route_prefix_analyzer = analyzer('route_prefix_analyzer', tokenizer=route_prefix_tokenizer)
11 changes: 6 additions & 5 deletions osf_tests/metrics/test_daily_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,27 +37,28 @@ class Meta:
assert report.meta.id == expected_key
mock_save.reset_mock()

def test_with_duf(self, mock_save):
def test_with_unique_together(self, mock_save):
# multiple reports of this type per day, unique by given field
class UniqueByDateAndField(DailyReport):
DAILY_UNIQUE_FIELD = 'duf'
duf = metrics.Keyword()
UNIQUE_TOGETHER = ('report_date', 'my_uniq_field',)

my_uniq_field = metrics.Keyword()

class Meta:
app_label = 'osf'

today = date(2022, 5, 18)

expected_blah = 'dca57e6cde89b19274ea24bc713971dab137a896b8e06d43a11a3f437cd1d151'
blah_report = UniqueByDateAndField(report_date=today, duf='blah')
blah_report = UniqueByDateAndField(report_date=today, my_uniq_field='blah')
blah_report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is blah_report
assert blah_report.meta.id == expected_blah
mock_save.reset_mock()

expected_fleh = 'e7dd5ff6b087807efcfa958077dc713878f21c65af79b3ccdb5dc2409bf5ad99'
fleh_report = UniqueByDateAndField(report_date=today, duf='fleh')
fleh_report = UniqueByDateAndField(report_date=today, my_uniq_field='fleh')
fleh_report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is fleh_report
Expand Down
Loading