Skip to content

Commit

Permalink
stage MITx Online tracking log table, update to xPro staging tracking…
Browse files Browse the repository at this point in the history
… log (#792)
  • Loading branch information
rachellougee authored Aug 10, 2023
1 parent a6ac4a1 commit af9eeab
Show file tree
Hide file tree
Showing 7 changed files with 223 additions and 20 deletions.
3 changes: 2 additions & 1 deletion src/ol_dbt/macros/extract_course_id_from_tracking_log.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{% macro extract_course_id_from_tracking_log() %}
---course ID format: course-v1:{org}+{course number}+{run_tag}
{% set course_id_regex = 'course-v(\d{1}):([^\/]+)\+([^\/]+)\+([^\/\]]+)' %}
---Course number and run tag can be letters, numbers, period, dashes, underscores
{% set course_id_regex = 'course-v(\d{1}):([\w\.\-\_]+)\+([\w\.\-\_]+)\+([\w\.\-\_]+)' %}
case
when regexp_extract(json_query(context, 'lax $.course_id' omit quotes), '{{ course_id_regex }}') is not null
then json_query(context, 'lax $.course_id' omit quotes)
Expand Down
56 changes: 56 additions & 0 deletions src/ol_dbt/models/staging/mitxonline/_mitxonline__sources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -751,3 +751,59 @@ sources:
on the open edX platform
- name: last_login
description: timestamp, date and time when the user last logged in on the open edX platform

- name: raw__mitxonline__openedx__tracking_logs
description: MITx Online Open edX event data that are emitted by server, the browser,
or the mobile device to capture information about user's interactions with a
course
columns:
- name: username
description: str, username of the open edX user who caused the event to be emitted.
Some events are recorded with a blank username. This can occur when a user
logs out, or the login session times out, while a browser window remains open.
EdX recommends to ignore these events.
- name: context
description: object, it includes member fields that provide contextual information.
Common fields that apply to all events are course_id, org_id, path, user_id. Other
member fields for applicable events are course_user_tags, module.
- name: event_source
description: str, specifies the source of the interaction that triggered the
event. The values are - browser, mobile, server, task
- name: event_type
description: str, type of event triggered. Values depend on event_source.
- name: name
description: str, type of event triggered. When this field is present for an
event, it supersedes the event_type field.
- name: event
description: object, it includes member fields that identify specifics of each
triggered event. Different member fields are supplied for different events.
- name: page
description: str, url of the page the user was visiting when the event was emitted.
- name: session
description: str, 32-character value to identify the user’s session. All browser
events and the server 'enrollment' events include a session value. Other server
events and mobile events do not include a session value.
- name: ip
description: str, IP address of the user who triggered the event. Empty for
mobile events.
- name: host
description: str, the site visited by the user. e.g. courses.mitxonline.mit.edu
- name: time
description: str, time at which the event was emitted. It has inconsistent formats
due to log collector switches, YYYY-MM-DD HH:mm:ss.SSSSSS for older records,
YYYY-MM-DDTHH:mm:ss.SSSSSS for newer records
- name: agent
description: str, browser agent string of the user who triggered the event.
- name: accept_language
description: str, value from the HTTP Accept-Language request-header field
- name: referer
description: str, URI from the HTTP Referer request-header field
- name: log_file
description: str, internally used field for log file location
- name: _ab_source_file_url
description: str, url path for the raw log file
- name: vector_timestamp
description: str, time when Vector processed this record. Blank for old records.
- name: environment
description: str, internally used field to indicate environment for the event.
e.g. mitxonline-production
76 changes: 76 additions & 0 deletions src/ol_dbt/models/staging/mitxonline/_stg_mitxonline__models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1151,3 +1151,79 @@ models:
platform
tests:
- not_null

- name: stg__mitxonline__openedx__tracking_logs__user_activity
description: This table is deduped as raw table has duplicate events. It filters
out blank username events since these don't supply user identifiers and also those
server "exception" events due to server errors
columns:
- name: user_username
description: str, username of the open edX user who caused the event to be emitted.
tests:
- not_null
- name: openedx_user_id
description: int, reference user id in auth_user from open edX. Extracted from
context field.
tests:
- not_null
- name: courserun_readable_id
description: str, Open edX Course ID formatted as course-v1:{org}+{course number}+{run_tag}.
Extracted from various fields - context.course_id, context.path, event_type
and page. The course ID extracted from context field may not be valid, it would
require joining with courserun table to be sure. This field could be blank for
any events that are not for any specific course, e.g. user login/out, visiting
dashboard, some course team events e.g. course export from studio
- name: org_id
description: str, reference name in organizations_organization from open edX.
e.g. MITxT. Extracted from context field
- name: useractivity_path
description: str, URL that generated this event. Extracted from context field
- name: useractivity_context_object
description: object, it includes member fields that provide contextual information.
Common fields that apply to all events are course_id, org_id, path, user_id. Other
member fields for applicable events are course_user_tags, module.
- name: useractivity_event_source
description: str, specifies the source of the interaction that triggered the event.
The values are - browser, mobile, server, task
tests:
- not_null
- name: useractivity_event_type
description: str, type of event triggered. Values depend on event_source.
tests:
- not_null
- name: useractivity_event_name
description: str, type of event triggered. When this field is present for an event,
it supersedes the event_type field.
- name: useractivity_event_object
description: object, it includes member fields that identify specifics of each
triggered event. Different member fields are supplied for different events.
tests:
- not_null
- name: useractivity_page_url
description: str, url of the page the user was visiting when the event was emitted.
- name: useractivity_session_id
description: str, 32-character value to identify the user’s session. All browser
events and the server 'enrollment' events include a session value. Other server
events and mobile events do not include a session value.
- name: useractivity_ip
description: str, IP address of the user who triggered the event. Empty for mobile
events.
- name: useractivity_http_host
description: str, The site visited by the user. e.g. courses.mitxonline.mit.edu
tests:
- not_null
- name: useractivity_http_user_agent
description: str, browser agent string of the user who triggered the event.
- name: useractivity_http_accept_language
description: str, value from the HTTP Accept-Language request-header field
- name: useractivity_http_referer
description: str, URI from the HTTP Referer request-header field
- name: useractivity_timestamp
description: timestamp, time at which the event was emitted, formatted as ISO
8601 string
tests:
- not_null
tests:
- dbt_expectations.expect_compound_columns_to_be_unique:
column_list: ["user_username", "useractivity_context_object", "useractivity_event_source",
"useractivity_event_type", "useractivity_event_object", "useractivity_timestamp"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
-- MITx Online user activities from tracking logs.
--
-- The raw tracking-log table is large and contains duplicate events, so this model
-- is built incrementally: each run reads only raw rows whose "time" value sorts
-- after the newest useractivity_timestamp already loaded, keeps a single row per
-- event within that batch, and loads it with the delete+insert strategy keyed on
-- the same columns that define a duplicate.
{{
    config(
        materialized='incremental',
        incremental_strategy='delete+insert',
        unique_key=[
            'user_username', 'useractivity_context_object', 'useractivity_event_source',
            'useractivity_event_type', 'useractivity_event_object', 'useractivity_timestamp'
        ],
        views_enabled=false,
    )
}}

-- Raw events worth keeping: a non-blank username, a user_id present in the context
-- object, and no 'exception' member in the event object.
with eligible_events as (
    select *
    from {{ source('ol_warehouse_raw_data','raw__mitxonline__openedx__tracking_logs') }}
    where
        json_query(event, 'lax $.exception' omit quotes) is null
        and json_query(context, 'lax $.user_id' omit quotes) is not null
        and username <> ''

        {% if is_incremental() %}
        -- Incremental runs only pick up rows newer than the current high-water mark.
        -- NOTE(review): this compares the raw "time" value with the normalized ISO 8601
        -- string stored in useractivity_timestamp. Raw values reportedly come in two
        -- formats (space vs 'T' separator), so a lexical comparison may mis-order the
        -- space-separated ones -- confirm this is acceptable.
            and "time" > (select max(loaded.useractivity_timestamp) from {{ this }} as loaded) --noqa
        {% endif %}
)

-- Number the copies of each event so that the most recently loaded copy gets rank 1.
, ranked_events as (
    select
        eligible_events.*
        , row_number() over (
            partition by username, context, event_source, event_type, event, "time" --noqa
            order by _airbyte_emitted_at desc, _ab_source_file_last_modified desc, vector_timestamp desc
        ) as duplicate_rank
    from eligible_events
)

-- Keep only the rank-1 copy of each event and expose the staging column names.
, renamed as (
    select
        username as user_username
        , context as useractivity_context_object
        , event as useractivity_event_object
        , event_source as useractivity_event_source
        , page as useractivity_page_url
        , session as useractivity_session_id
        , ip as useractivity_ip
        , host as useractivity_http_host
        , agent as useractivity_http_user_agent
        , accept_language as useractivity_http_accept_language
        , referer as useractivity_http_referer
        , name as useractivity_event_name
        , event_type as useractivity_event_type
        , {{ extract_course_id_from_tracking_log() }} as courserun_readable_id
        --- common fields pulled out of the context object
        , json_query(context, 'lax $.user_id' omit quotes) as openedx_user_id
        , json_query(context, 'lax $.org_id' omit quotes) as org_id
        , json_query(context, 'lax $.path' omit quotes) as useractivity_path
        --- the time field arrives in more than one format because of log collector changes;
        --- force a 'T' between the date and time parts before parsing, then emit ISO 8601
        , to_iso8601(
            from_iso8601_timestamp_nanos(
                regexp_replace("time", '(\d{4}-\d{2}-\d{2})[T ](\d{2}:\d{2}:\d{2}\.\d+)(.*?)', '$1T$2$3') -- noqa
            )
        ) as useractivity_timestamp
    from ranked_events
    where duplicate_rank = 1
)

select * from renamed
5 changes: 3 additions & 2 deletions src/ol_dbt/models/staging/mitxpro/_mitxpro__sources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -887,8 +887,9 @@ sources:
education", "Other education"

- name: raw__xpro__openedx__tracking_logs
description: event data that are emitted by server, the browser, or the mobile
device to capture information about user's interactions with a course
description: MIT xPro Open edX event data that are emitted by server, the browser,
or the mobile device to capture information about user's interactions with a
course
columns:
- name: username
description: str, username of the open edX user who caused the event to be emitted.
Expand Down
9 changes: 5 additions & 4 deletions src/ol_dbt/models/staging/mitxpro/_stg_mitxpro__models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1347,8 +1347,9 @@ models:
column_list: ["user_id", "program_id", "ecommerce_order_id"]

- name: stg__mitxpro__openedx__tracking_logs__user_activity
description: xPro open edX user activity table that tracks student and course team
events
description: This table is deduped as raw table has duplicate events. It filters
out blank username events since these don't supply user identifiers and also those
server "exception" events due to server errors
columns:
- name: user_username
description: str, username of the open edX user who caused the event to be emitted.
Expand Down Expand Up @@ -1418,5 +1419,5 @@ models:
- not_null
tests:
- dbt_expectations.expect_compound_columns_to_be_unique:
column_list: ["user_username", "useractivity_event_type", "useractivity_context_object",
"useractivity_timestamp"]
column_list: ["user_username", "useractivity_context_object", "useractivity_event_source",
"useractivity_event_type", "useractivity_event_object", "useractivity_timestamp"]
Original file line number Diff line number Diff line change
@@ -1,24 +1,31 @@
-- xPro user activities from tracking logs
-- Raw table has duplicate rows introduced by our loading process - Airbyte incremental + append on
-- _ab_source_file_last_modified, thus need to dedupe in staging
{{ config(materialized='incremental', views_enabled=false, ) }}
-- Due to size of the raw table, build this model as incremental and apply dedup on new rows
{{ config(
materialized='incremental',
unique_key = ['user_username', 'useractivity_context_object', 'useractivity_event_source',
'useractivity_event_type', 'useractivity_event_object', 'useractivity_timestamp'],
incremental_strategy='delete+insert',
views_enabled=false,
)
}}

with source as (
select * from {{ source('ol_warehouse_raw_data','raw__xpro__openedx__tracking_logs') }}
-- ignore blank username events since these don't supply user identifiers
-- and those where event object has 'exception' field in it due to server errors
-- e.g.{"exception":"<type 'exceptions.UnicodeEncodeError'>","event-type":"exception"}
where
username != ''
and json_query(context, 'lax $.user_id' omit quotes) is not null
and json_query(event, 'lax $.exception' omit quotes) is null

{% if is_incremental() %}
and "time" > (select max(this.useractivity_timestamp) from {{ this }} as this) --noqa
{% endif %}
)

, source_sorted as (
select
*
, row_number() over (
partition by username, context, event_type, "time" -- noqa
partition by username, context, event_source, event_type, event, "time" --noqa
order by _airbyte_emitted_at desc, _ab_source_file_last_modified desc, vector_timestamp desc
) as row_num
from source
Expand Down Expand Up @@ -58,9 +65,3 @@ with source as (
)

select * from cleaned

{% if is_incremental() %}

where useractivity_timestamp > (select max(useractivity_timestamp) from {{ this }})

{% endif %}

0 comments on commit af9eeab

Please sign in to comment.