From fe65fccc29dfb37054b677fad87a0bd39bc526ff Mon Sep 17 00:00:00 2001 From: pt2302 <1553279+pt2302@users.noreply.github.com> Date: Thu, 21 May 2026 00:00:42 -0400 Subject: [PATCH 1/3] Add mart for OCW resources --- .../intermediate/ocw/_int_ocw__models.yml | 23 +++- .../intermediate/ocw/int__ocw__resources.sql | 28 +++++ .../models/marts/ocw/_marts__ocw__models.yml | 105 +++++++++++++++++- 3 files changed, 152 insertions(+), 4 deletions(-) diff --git a/src/ol_dbt/models/intermediate/ocw/_int_ocw__models.yml b/src/ol_dbt/models/intermediate/ocw/_int_ocw__models.yml index ec4c05284..b0808808b 100644 --- a/src/ol_dbt/models/intermediate/ocw/_int_ocw__models.yml +++ b/src/ol_dbt/models/intermediate/ocw/_int_ocw__models.yml @@ -72,8 +72,8 @@ models: - name: course_year description: int, course year - name: course_level - description: str, course level in comma-separated list. Possible values are Undergraduate - , Graduate, Non-Credit, High School, or blank + description: str, course level in comma-separated list. Possible values are Undergraduate, + Graduate, Non-Credit, High School, or blank - name: course_primary_course_number description: str, the primary course number e.g. 
21A.850J tests: @@ -310,6 +310,25 @@ models: description: str, UUID of the resource tests: - not_null + - name: resource_license + description: str, license URL of the resource (e.g., Creative Commons) + - name: resource_description + description: str, description of the resource + - name: resource_file_type + description: str, file type of the resource file + - name: resource_file_size + description: str, file size of the resource file + - name: resource_ocw_type + description: str, OCW content type classification of the resource + - name: external_resource_status + description: str, link status of the external resource (e.g., valid, unchecked, + broken, check_failed) + - name: external_resource_wayback_url + description: str, Wayback Machine archive URL of the external resource + - name: resource_audience + description: str, intended audience of the resource as a JSON array + - name: resource_level + description: str, level of the resource as a JSON array - name: video_archive_url description: str, archive URL of the video - name: video_captions_file diff --git a/src/ol_dbt/models/intermediate/ocw/int__ocw__resources.sql b/src/ol_dbt/models/intermediate/ocw/int__ocw__resources.sql index 91c3cfd1e..5b2cde4fb 100644 --- a/src/ol_dbt/models/intermediate/ocw/int__ocw__resources.sql +++ b/src/ol_dbt/models/intermediate/ocw/int__ocw__resources.sql @@ -123,6 +123,34 @@ select , nullif(nullif(json_query( websitecontents.websitecontent_metadata, 'lax $.video_files.video_transcript_file' omit quotes ), ''), 'null') as video_transcript_file + -- resource metadata + , nullif(nullif( + {{ json_query_string('websitecontents.websitecontent_metadata', "'$.license'") }}, '' + ), 'null') as resource_license + , nullif(nullif( + {{ json_query_string('websitecontents.websitecontent_metadata', "'$.description'") }}, '' + ), 'null') as resource_description + , nullif(nullif( + {{ json_query_string('websitecontents.websitecontent_metadata', "'$.file_type'") }}, '' + ), 'null') as 
resource_file_type + , nullif(nullif( + {{ json_query_string('websitecontents.websitecontent_metadata', "'$.file_size'") }}, '' + ), 'null') as resource_file_size + , nullif(nullif( + {{ json_query_string('websitecontents.websitecontent_metadata', "'$.ocw_type'") }}, '' + ), 'null') as resource_ocw_type + , nullif(nullif( + {{ json_query_string('websitecontents.websitecontent_metadata', "'$.status'") }}, '' + ), 'null') as external_resource_status + , nullif(nullif( + {{ json_query_string('websitecontents.websitecontent_metadata', "'$.wayback_url'") }}, '' + ), 'null') as external_resource_wayback_url + , nullif(nullif(nullif(json_query( + websitecontents.websitecontent_metadata, 'lax $.audience' omit quotes + ), ''), 'null'), '[]') as resource_audience + , nullif(nullif(nullif(json_query( + websitecontents.websitecontent_metadata, 'lax $.level' omit quotes + ), ''), 'null'), '[]') as resource_level from websites inner join websitecontents on websites.website_uuid = websitecontents.website_uuid diff --git a/src/ol_dbt/models/marts/ocw/_marts__ocw__models.yml b/src/ol_dbt/models/marts/ocw/_marts__ocw__models.yml index d07cc9e0d..d16564efe 100644 --- a/src/ol_dbt/models/marts/ocw/_marts__ocw__models.yml +++ b/src/ol_dbt/models/marts/ocw/_marts__ocw__models.yml @@ -50,8 +50,8 @@ models: - name: course_year description: int, course year - name: course_level - description: str, course level in comma-separated list. Possible values are Undergraduate - , Graduate, Non-Credit, High School, or blank + description: str, course level in comma-separated list. Possible values are Undergraduate, + Graduate, Non-Credit, High School, or blank - name: course_primary_course_number description: str, the primary course number e.g. 21A.850J tests: @@ -72,3 +72,104 @@ models: list. e.g. Lecture Notes, Problem Sets with Solutions... 
Full list can be found at https://github.com/mitodl/ocw-hugo-projects/blob/049c85e6544a36ba69a89602e5014f6085ef8831/ocw-course-v2/ocw-studio.yaml#L163-L207 + +- name: marts__ocw_resources + description: OCW course resources (files, external resources, video, image) for + review and analysis + columns: + - name: course_uuid + description: str, UUID of the course containing the resource + tests: + - not_null + - name: course_name + description: str, name of the course + - name: course_number + description: str, primary course number of the course, from the course metadata + - name: course_title + description: str, title of the course, from the course metadata + - name: course_term + description: str, term course was taught, from the course metadata + - name: course_year + description: str, year course was taught, from the course metadata + - name: course_live_url + description: str, url of the course on production + - name: resource_uuid + description: str, UUID of the resource + tests: + - not_null + - name: resource_title + description: str, title of the resource + - name: content_type + description: str, WebsiteContent type of the resource (e.g., resource, external-resource) + - name: resource_type + description: str, resource type (image, video, etc.) 
+ - name: resource_ocw_type + description: str, OCW content type classification of the resource + - name: resource_filename + description: str, filename of the resource file + - name: resource_file_type + description: str, file type of the resource file + - name: resource_file_size + description: str, file size of the resource file + - name: resource_draft + description: boolean, whether the resource is a draft + - name: resource_live_url + description: str, live URL of the resource on production + - name: studio_url + description: str, OCW Studio URL for the resource + - name: website_title + description: str, title of the course + - name: learning_resource_types + description: str, learning resource types of the resource (e.g., Lecture Notes) + - name: resource_license + description: str, license URL of the resource (e.g., Creative Commons) + - name: resource_description + description: str, description of the resource + - name: resource_audience + description: str, intended audience of the resource as a JSON array + - name: resource_level + description: str, level of the resource as a JSON array + - name: external_resource_url + description: str, URL of the external resource + - name: external_resource_is_broken + description: boolean, whether the external resource link is broken + - name: external_resource_license_warning + description: boolean, whether the external resource has a license warning + - name: external_resource_url_status_code + description: int, status code of the external resource URL + - name: external_resource_backup_url + description: str, backup URL of the external resource + - name: external_resource_backup_url_status_code + description: int, status code of the external resource backup URL + - name: external_resource_status + description: str, link status of the external resource (e.g., valid, unchecked, + broken, check_failed) + - name: external_resource_wayback_url + description: str, Wayback Machine archive URL of the external resource + - 
name: image_alt_text + description: str, alt text of the image resource + - name: image_caption + description: str, caption of the image resource + - name: image_credit + description: str, credit of the image resource + - name: video_youtube_id + description: str, YouTube ID of the video + - name: video_youtube_description + description: str, description of the YouTube video + - name: video_youtube_speakers + description: str, speakers in the YouTube video + - name: video_youtube_tags + description: str, tags of the YouTube video + - name: video_archive_url + description: str, archive URL of the video + - name: video_captions_file + description: str, captions file of the video resource + - name: video_thumbnail_file + description: str, thumbnail file of the video resource + - name: video_transcript_file + description: str, transcript file of the video resource + tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: + - course_uuid + - resource_uuid From efa4981f3a9ad3ad2802bf701ab995e5bbb9a64a Mon Sep 17 00:00:00 2001 From: pt2302 <1553279+pt2302@users.noreply.github.com> Date: Thu, 21 May 2026 00:24:03 -0400 Subject: [PATCH 2/3] Add missing model files --- .../models/dimensional/_dim_ocw_resource.yml | 108 ++++++++++++++++++ .../models/dimensional/dim_ocw_resource.sql | 49 ++++++++ .../models/marts/ocw/marts__ocw_resources.sql | 45 ++++++++ 3 files changed, 202 insertions(+) create mode 100644 src/ol_dbt/models/dimensional/_dim_ocw_resource.yml create mode 100644 src/ol_dbt/models/dimensional/dim_ocw_resource.sql create mode 100644 src/ol_dbt/models/marts/ocw/marts__ocw_resources.sql diff --git a/src/ol_dbt/models/dimensional/_dim_ocw_resource.yml b/src/ol_dbt/models/dimensional/_dim_ocw_resource.yml new file mode 100644 index 000000000..7144ee045 --- /dev/null +++ b/src/ol_dbt/models/dimensional/_dim_ocw_resource.yml @@ -0,0 +1,108 @@ +--- +version: 2 + +models: +- name: dim_ocw_resource + description: > + OCW course resources (files, 
external resources, video, image) sourced from + int__ocw__resources. One row per (course_uuid, resource_uuid). The raw metadata + JSON is excluded; its useful fields are surfaced as scalar columns. + meta: + owner: data_team + columns: + - name: course_uuid + description: str, UUID of the course containing the resource + tests: + - not_null + - name: course_name + description: str, name of the course + - name: course_number + description: str, primary course number of the course, from the course metadata + - name: course_title + description: str, title of the course, from the course metadata + - name: course_term + description: str, term course was taught, from the course metadata + - name: course_year + description: str, year course was taught, from the course metadata + - name: course_live_url + description: str, url of the course on production + - name: resource_uuid + description: str, UUID of the resource + tests: + - not_null + - name: resource_title + description: str, title of the resource + - name: content_type + description: str, WebsiteContent type of the resource (e.g., resource, external-resource) + - name: resource_type + description: str, resource type (image, video, etc.) 
+ - name: resource_ocw_type + description: str, OCW content type classification of the resource + - name: resource_filename + description: str, filename of the resource file + - name: resource_file_type + description: str, file type of the resource file + - name: resource_file_size + description: str, file size of the resource file + - name: resource_draft + description: boolean, whether the resource is a draft + - name: resource_live_url + description: str, live URL of the resource on production + - name: studio_url + description: str, OCW Studio URL for the resource + - name: website_title + description: str, title of the course + - name: learning_resource_types + description: str, learning resource types of the resource (e.g., Lecture Notes) + - name: resource_license + description: str, license URL of the resource (e.g., Creative Commons) + - name: resource_description + description: str, description of the resource + - name: resource_audience + description: str, intended audience of the resource as a JSON array + - name: resource_level + description: str, level of the resource as a JSON array + - name: external_resource_url + description: str, URL of the external resource + - name: external_resource_is_broken + description: boolean, whether the external resource link is broken + - name: external_resource_license_warning + description: boolean, whether the external resource has a license warning + - name: external_resource_url_status_code + description: int, status code of the external resource URL + - name: external_resource_backup_url + description: str, backup URL of the external resource + - name: external_resource_backup_url_status_code + description: int, status code of the external resource backup URL + - name: external_resource_status + description: str, link status of the external resource (e.g., valid, unchecked, + broken, check_failed) + - name: external_resource_wayback_url + description: str, Wayback Machine archive URL of the external resource + - 
name: image_alt_text + description: str, alt text of the image resource + - name: image_caption + description: str, caption of the image resource + - name: image_credit + description: str, credit of the image resource + - name: video_youtube_id + description: str, YouTube ID of the video + - name: video_youtube_description + description: str, description of the YouTube video + - name: video_youtube_speakers + description: str, speakers in the YouTube video + - name: video_youtube_tags + description: str, tags of the YouTube video + - name: video_archive_url + description: str, archive URL of the video + - name: video_captions_file + description: str, captions file of the video resource + - name: video_thumbnail_file + description: str, thumbnail file of the video resource + - name: video_transcript_file + description: str, transcript file of the video resource + tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: + - course_uuid + - resource_uuid diff --git a/src/ol_dbt/models/dimensional/dim_ocw_resource.sql b/src/ol_dbt/models/dimensional/dim_ocw_resource.sql new file mode 100644 index 000000000..1a07266b8 --- /dev/null +++ b/src/ol_dbt/models/dimensional/dim_ocw_resource.sql @@ -0,0 +1,49 @@ +{{ config( + materialized='table' +) }} + +select + course_uuid + , course_name + , course_number + , course_title + , course_term + , course_year + , course_live_url + , resource_uuid + , resource_title + , content_type + , resource_type + , resource_ocw_type + , resource_filename + , resource_file_type + , resource_file_size + , resource_draft + , resource_live_url + , studio_url + , website_title + , learning_resource_types + , resource_license + , resource_description + , resource_audience + , resource_level + , external_resource_url + , external_resource_is_broken + , external_resource_license_warning + , external_resource_url_status_code + , external_resource_backup_url + , external_resource_backup_url_status_code + , external_resource_status 
+ , external_resource_wayback_url + , image_alt_text + , image_caption + , image_credit + , video_youtube_id + , video_youtube_description + , video_youtube_speakers + , video_youtube_tags + , video_archive_url + , video_captions_file + , video_thumbnail_file + , video_transcript_file +from {{ ref('int__ocw__resources') }} diff --git a/src/ol_dbt/models/marts/ocw/marts__ocw_resources.sql b/src/ol_dbt/models/marts/ocw/marts__ocw_resources.sql new file mode 100644 index 000000000..4de2ac464 --- /dev/null +++ b/src/ol_dbt/models/marts/ocw/marts__ocw_resources.sql @@ -0,0 +1,45 @@ +select + course_uuid + , course_name + , course_number + , course_title + , course_term + , course_year + , course_live_url + , resource_uuid + , resource_title + , content_type + , resource_type + , resource_ocw_type + , resource_filename + , resource_file_type + , resource_file_size + , resource_draft + , resource_live_url + , studio_url + , website_title + , learning_resource_types + , resource_license + , resource_description + , resource_audience + , resource_level + , external_resource_url + , external_resource_is_broken + , external_resource_license_warning + , external_resource_url_status_code + , external_resource_backup_url + , external_resource_backup_url_status_code + , external_resource_status + , external_resource_wayback_url + , image_alt_text + , image_caption + , image_credit + , video_youtube_id + , video_youtube_description + , video_youtube_speakers + , video_youtube_tags + , video_archive_url + , video_captions_file + , video_thumbnail_file + , video_transcript_file +from {{ ref('dim_ocw_resource') }} From 57b582dc00cf63e996981486aa675fe5a6118f7c Mon Sep 17 00:00:00 2001 From: pt2302 <1553279+pt2302@users.noreply.github.com> Date: Thu, 21 May 2026 00:33:45 -0400 Subject: [PATCH 3/3] Update expectations test for consistency --- src/ol_dbt/models/marts/ocw/_marts__ocw__models.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git 
a/src/ol_dbt/models/marts/ocw/_marts__ocw__models.yml b/src/ol_dbt/models/marts/ocw/_marts__ocw__models.yml index d16564efe..4f604b8b4 100644 --- a/src/ol_dbt/models/marts/ocw/_marts__ocw__models.yml +++ b/src/ol_dbt/models/marts/ocw/_marts__ocw__models.yml @@ -169,7 +169,6 @@ models: - name: video_transcript_file description: str, transcript file of the video resource tests: - - dbt_utils.unique_combination_of_columns: - combination_of_columns: - - course_uuid - - resource_uuid + - dbt_expectations.expect_compound_columns_to_be_unique: + arguments: + column_list: ["course_uuid", "resource_uuid"]