Skip to content

Commit

Permalink
Merge pull request #1295 from TEAMSchools/refactor-sftp-asset
Browse files Browse the repository at this point in the history
Refactor sftp asset
  • Loading branch information
cbini authored Jul 10, 2024
2 parents 65a2be3 + e8e395d commit 07ceecd
Show file tree
Hide file tree
Showing 65 changed files with 1,286 additions and 1,088 deletions.
2 changes: 0 additions & 2 deletions .github/workflows/deploy-prod-kipptaf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ on:
- src/teamster/libraries/core/**
- src/teamster/libraries/couchdrop/**
- src/teamster/libraries/datagun/**
- src/teamster/libraries/dayforce/**
- src/teamster/libraries/dbt/**
- src/teamster/libraries/deanslist/**
- src/teamster/libraries/fivetran/**
Expand All @@ -44,7 +43,6 @@ on:
- src/teamster/libraries/core/**
- src/teamster/libraries/couchdrop/**
- src/teamster/libraries/datagun/**
- src/teamster/libraries/dayforce/**
- src/teamster/libraries/dbt/**
- src/teamster/libraries/deanslist/**
- src/teamster/libraries/fivetran/**
Expand Down
6 changes: 3 additions & 3 deletions .trunk/trunk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,22 @@ plugins:
uri: https://github.com/trunk-io/plugins
lint:
enabled:
- pyright@1.1.370
- pyright@1.1.371
- actionlint@1.7.1
- bandit@1.7.9
- git-diff-check
- gitleaks@8.18.4
- hadolint@2.12.0
- isort@5.13.2
- markdownlint@0.41.0
- osv-scanner@1.8.1
- osv-scanner@1.8.2
- oxipng@9.1.1
- prettier@3.3.2
- ruff@0.5.1
- shellcheck@0.10.0
- shfmt@3.6.0
- sqlfluff@3.1.0
- sqlfmt@0.21.3
- sqlfmt@0.21.4
- svgo@3.3.2
- taplo@0.8.1
- trufflehog@3.79.0
Expand Down
3 changes: 1 addition & 2 deletions src/teamster/code_locations/dev/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# trunk-ignore-begin(ruff/E402)
from dagster import Definitions, load_assets_from_modules

from teamster.code_locations.kipptaf import ( # adp,; airbyte,; amplify,; couchdrop,; datagun,; dayforce,; deanslist,; fivetran,; google,; ldap,; performance_management,; powerschool,; schoolmint,; smartrecruiters,; tableau,; zendesk,
from teamster.code_locations.kipptaf import ( # adp,; airbyte,; amplify,; couchdrop,; datagun,; deanslist,; fivetran,; google,; ldap,; performance_management,; powerschool,; schoolmint,; smartrecruiters,; tableau,; zendesk,
CODE_LOCATION,
dbt,
overgrad,
Expand All @@ -30,7 +30,6 @@
# airbyte,
# amplify,
# datagun,
# dayforce,
dbt,
# deanslist,
# fivetran,
Expand Down
6 changes: 3 additions & 3 deletions src/teamster/code_locations/kippcamden/edplan/assets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

from teamster.code_locations.kippcamden import CODE_LOCATION, LOCAL_TIMEZONE
from teamster.code_locations.kippcamden.edplan.schema import NJSMART_POWERSCHOOL
from teamster.libraries.sftp.assets import build_sftp_asset
from teamster.libraries.sftp.assets import build_sftp_file_asset

njsmart_powerschool = build_sftp_asset(
njsmart_powerschool = build_sftp_file_asset(
asset_key=[CODE_LOCATION, "edplan", "njsmart_powerschool"],
remote_dir="Reports",
remote_dir_regex=r"Reports",
remote_file_regex=r"NJSMART-Power[Ss]chool\.txt",
ssh_resource_key="ssh_edplan",
avro_schema=NJSMART_POWERSCHOOL,
Expand Down
88 changes: 55 additions & 33 deletions src/teamster/code_locations/kippcamden/pearson/assets.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,28 @@
import pathlib

from dagster import (
MultiPartitionsDefinition,
StaticPartitionsDefinition,
config_from_files,
)
from dagster import MultiPartitionsDefinition, StaticPartitionsDefinition

from teamster.code_locations.kippcamden import CODE_LOCATION
from teamster.code_locations.kippcamden.pearson.schema import ASSET_SCHEMA
from teamster.libraries.sftp.assets import build_sftp_asset
from teamster.code_locations.kippcamden.pearson.schema import (
NJGPA_SCHEMA,
NJSLA_SCHEMA,
NJSLA_SCIENCE_SCHEMA,
PARCC_SCHEMA,
STUDENT_LIST_REPORT_SCHEMA,
)
from teamster.libraries.sftp.assets import build_sftp_file_asset

config_dir = pathlib.Path(__file__).parent / "config"
ssh_resource_key = "ssh_couchdrop"
remote_dir_regex_prefix = f"/data-team/{CODE_LOCATION}/pearson"
key_prefix = [CODE_LOCATION, "pearson"]

njgpa = build_sftp_asset(
asset_key=[CODE_LOCATION, "pearson", "njgpa"],
remote_dir=f"/data-team/{CODE_LOCATION}/pearson/njgpa",
njgpa = build_sftp_file_asset(
asset_key=[*key_prefix, "njgpa"],
remote_dir_regex=rf"{remote_dir_regex_prefix}/njgpa",
remote_file_regex=(
r"pc(?P<administration>[a-z]+)"
r"(?P<fiscal_year>\d+)_NJ-\d+-\d+_\w+GPA\w+\.csv"
),
avro_schema=ASSET_SCHEMA["njgpa"],
ssh_resource_key="ssh_couchdrop",
avro_schema=NJGPA_SCHEMA,
ssh_resource_key=ssh_resource_key,
partitions_def=MultiPartitionsDefinition(
{
"fiscal_year": StaticPartitionsDefinition(["23", "24"]),
Expand All @@ -29,15 +31,17 @@
),
)

student_list_report = build_sftp_asset(
asset_key=[CODE_LOCATION, "pearson", "student_list_report"],
remote_dir=f"/data-team/{CODE_LOCATION}/pearson/student_list_report",
student_list_report = build_sftp_file_asset(
asset_key=[*key_prefix, "student_list_report"],
remote_dir_regex=(
rf"{remote_dir_regex_prefix}/student_list_report/(?P<test_type>[a-z]+)"
),
remote_file_regex=(
r"(?P<test_type>[a-z]+)\/StudentListReport_"
r"(?P<administration_fiscal_year>[A-za-z]+\d+)_\d+_\d+-\d+-\d+\.csv"
r"StudentListReport_(?P<administration_fiscal_year>[A-za-z]+\d+)"
r"_\d+_\d+-\d+-\d+\.csv"
),
avro_schema=ASSET_SCHEMA["student_list_report"],
ssh_resource_key="ssh_couchdrop",
avro_schema=STUDENT_LIST_REPORT_SCHEMA,
ssh_resource_key=ssh_resource_key,
partitions_def=MultiPartitionsDefinition(
{
"test_type": StaticPartitionsDefinition(["njsla", "njgpa"]),
Expand All @@ -48,19 +52,37 @@
),
)

static_partition_assets = [
build_sftp_asset(
asset_key=[CODE_LOCATION, "pearson", a["asset_name"]],
avro_schema=ASSET_SCHEMA[a["asset_name"]],
ssh_resource_key="ssh_couchdrop",
partitions_def=StaticPartitionsDefinition(a["partition_keys"]),
**a,
)
for a in config_from_files([f"{config_dir}/assets.yaml"])["assets"]
]
# SFTP file asset for NJSLA CSV exports found under the pearson/njsla
# directory. The filename regex exposes a `fiscal_year` group; presumably the
# static partition keys below are matched against it (confirm in
# build_sftp_file_asset).
njsla = build_sftp_file_asset(
    ssh_resource_key=ssh_resource_key,
    avro_schema=NJSLA_SCHEMA,
    partitions_def=StaticPartitionsDefinition(["19", "22", "23"]),
    asset_key=[*key_prefix, "njsla"],
    remote_dir_regex=rf"{remote_dir_regex_prefix}/njsla",
    remote_file_regex=r"pcspr(?P<fiscal_year>\d+)_NJ-\d+(-\d+)?_\w+\.csv",
)

# SFTP file asset for NJSLA Science CSV exports found under the
# pearson/njsla_science directory. The filename regex exposes a `fiscal_year`
# group; presumably the static partition keys below are matched against it
# (confirm in build_sftp_file_asset).
njsla_science = build_sftp_file_asset(
    ssh_resource_key=ssh_resource_key,
    avro_schema=NJSLA_SCIENCE_SCHEMA,
    partitions_def=StaticPartitionsDefinition(["22", "23"]),
    asset_key=[*key_prefix, "njsla_science"],
    remote_dir_regex=rf"{remote_dir_regex_prefix}/njsla_science",
    remote_file_regex=r"njs(?P<fiscal_year>\d+)_NJ-\d+_\w+\.csv",
)

# SFTP file asset for PARCC CSV exports found under the pearson/parcc
# directory. The filename regex exposes a `fiscal_year` group; presumably the
# static partition keys below are matched against it (confirm in
# build_sftp_file_asset).
parcc = build_sftp_file_asset(
    ssh_resource_key=ssh_resource_key,
    avro_schema=PARCC_SCHEMA,
    partitions_def=StaticPartitionsDefinition(["16", "17", "18"]),
    asset_key=[*key_prefix, "parcc"],
    remote_dir_regex=rf"{remote_dir_regex_prefix}/parcc",
    remote_file_regex=r"PC_pcspr(?P<fiscal_year>\d+)_NJ-\d+(-\d+)?_\w+\.csv",
)

assets = [
njgpa,
njsla_science,
njsla,
parcc,
student_list_report,
*static_partition_assets,
]
21 changes: 0 additions & 21 deletions src/teamster/code_locations/kippcamden/pearson/config/assets.yaml

This file was deleted.

36 changes: 19 additions & 17 deletions src/teamster/code_locations/kippcamden/pearson/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,22 @@ class njgpa_record(NJGPA):

# Option flags handed to py_avro_schema.generate as `options` for the schemas
# built in this module.
pas_options = (
    py_avro_schema.Option.NO_AUTO_NAMESPACE | py_avro_schema.Option.NO_DOC
)

ASSET_SCHEMA = {
"parcc": json.loads(
py_avro_schema.generate(py_type=parcc_record, options=pas_options)
),
"njsla": json.loads(
py_avro_schema.generate(py_type=njsla_record, options=pas_options)
),
"njsla_science": json.loads(
py_avro_schema.generate(py_type=njsla_science_record, options=pas_options)
),
"njgpa": json.loads(
py_avro_schema.generate(py_type=njgpa_record, options=pas_options)
),
"student_list_report": json.loads(
py_avro_schema.generate(py_type=StudentListReport, options=pas_options)
),
}
def _avro_schema(py_type):
    """Generate the Avro schema for *py_type* and return it parsed from JSON."""
    raw_schema = py_avro_schema.generate(py_type=py_type, options=pas_options)
    return json.loads(raw_schema)


# One module-level schema constant per Pearson record type.
PARCC_SCHEMA = _avro_schema(parcc_record)

NJSLA_SCHEMA = _avro_schema(njsla_record)

NJSLA_SCIENCE_SCHEMA = _avro_schema(njsla_science_record)

NJGPA_SCHEMA = _avro_schema(njgpa_record)

STUDENT_LIST_REPORT_SCHEMA = _avro_schema(StudentListReport)
2 changes: 1 addition & 1 deletion src/teamster/code_locations/kippcamden/titan/assets.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

person_data = build_titan_sftp_asset(
key=[CODE_LOCATION, "titan", "person_data"],
remote_file_regex=r"Person Data(?P<fiscal_year>\d{4})\.csv",
remote_file_regex=r"[Pp]erson\s?[Dd]ata(?P<fiscal_year>\d{4})\.csv",
schema=PERSON_DATA_SCHEMA,
partition_start_date="2020-07-01",
timezone=LOCAL_TIMEZONE,
Expand Down
22 changes: 11 additions & 11 deletions src/teamster/code_locations/kippmiami/fldoe/assets.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
FSA_SCHEMA,
SCIENCE_SCHEMA,
)
from teamster.libraries.sftp.assets import build_sftp_asset, build_sftp_folder_asset
from teamster.libraries.sftp.assets import (
build_sftp_file_asset,
build_sftp_folder_asset,
)

fast = build_sftp_folder_asset(
asset_key=[CODE_LOCATION, "fldoe", "fast"],
remote_dir="/data-team/kippmiami/fldoe/fast",
remote_dir_regex=r"/data-team/kippmiami/fldoe/fast/(?P<school_year_term>\d+/PM\d)",
remote_file_regex=(
r"(?P<school_year_term>\d+\/PM\d)\/"
r"\w+-\w+_(?P<grade_level_subject>Grade\dFAST\w+)_StudentData_.+\.csv"
),
ssh_resource_key="ssh_couchdrop",
Expand Down Expand Up @@ -43,11 +45,10 @@
),
)

eoc = build_sftp_asset(
eoc = build_sftp_file_asset(
asset_key=[CODE_LOCATION, "fldoe", "eoc"],
remote_dir="/data-team/kippmiami/fldoe/eoc",
remote_dir_regex=(r"/data-team/kippmiami/fldoe/eoc/(?P<school_year_term>\d+)"),
remote_file_regex=(
r"(?P<school_year_term>\d+)\/"
r"\w+-\w+_(?P<grade_level_subject>[\w\.]+)EOC_StudentData_\d+\s[AP]M\.csv"
),
ssh_resource_key="ssh_couchdrop",
Expand All @@ -62,11 +63,10 @@
),
)

science = build_sftp_asset(
science = build_sftp_file_asset(
asset_key=[CODE_LOCATION, "fldoe", "science"],
remote_dir="/data-team/kippmiami/fldoe/science",
remote_dir_regex=r"/data-team/kippmiami/fldoe/science/(?P<school_year_term>\d+)",
remote_file_regex=(
r"(?P<school_year_term>\d+)\/"
r"\w+-\w+_Grade(?P<grade_level_subject>\d)Science_StudentData_\d+\s[AP]M\.csv"
),
ssh_resource_key="ssh_couchdrop",
Expand All @@ -79,9 +79,9 @@
),
)

fsa = build_sftp_asset(
fsa = build_sftp_file_asset(
asset_key=[CODE_LOCATION, "fldoe", "fsa"],
remote_dir="/data-team/kippmiami/fldoe/fsa/student_scores",
remote_dir_regex=r"/data-team/kippmiami/fldoe/fsa/student_scores",
remote_file_regex=(
r"FSA_(?P<school_year_term>\d+)SPR_\d+_SRS-E_"
r"(?P<grade_level_subject>\w+)_SCHL\.csv"
Expand Down
Loading

0 comments on commit 07ceecd

Please sign in to comment.