diff --git a/README.md b/README.md index 666bfd0..3d48796 100644 --- a/README.md +++ b/README.md @@ -1 +1 @@ -# dqm-tutorial \ No newline at end of file +# dqm-tutorial diff --git a/feature_store.yaml b/feature_store.yaml index 695110c..9c22f23 100644 --- a/feature_store.yaml +++ b/feature_store.yaml @@ -6,6 +6,4 @@ online_store: path: data/online_store.db offline_store: type: file -flags: - alpha_features: true - on_demand_transforms: true +entity_key_serialization_version: 2 diff --git a/validating-historical-features.ipynb b/validating-historical-features.ipynb index 0a601d1..c105712 100644 --- a/validating-historical-features.ipynb +++ b/validating-historical-features.ipynb @@ -65,14 +65,6 @@ "!pip install 'feast[ge]'" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ca11e6d", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "id": "93dbebd8", @@ -101,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "6787c9fa", "metadata": {}, "outputs": [], @@ -113,19 +105,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "8cad9c80", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/pyalex/projects/feast/venv/lib/python3.7/site-packages/google/auth/_default.py:70: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. We recommend you rerun `gcloud auth application-default login` and make sure a quota project is added. Or you can use service accounts instead. For more information about service accounts, see https://cloud.google.com/docs/authentication/\n", - " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" - ] - } - ], + "outputs": [], "source": [ "bq_client = Client(project='kf-feast')" ] @@ -140,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "a019d23d", "metadata": {}, "outputs": [], @@ -162,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "0aa0ea00", "metadata": {}, "outputs": [], @@ -175,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "aa5d86c4", "metadata": {}, "outputs": [], @@ -192,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "e9cc7a21", "metadata": {}, "outputs": [], @@ -205,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "644201fe", "metadata": {}, "outputs": [], @@ -232,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "id": "19cd609c", "metadata": {}, "outputs": [], @@ -240,25 +223,25 @@ "import pyarrow.parquet\n", "import pandas as pd\n", "\n", - "from feast import Feature, FeatureView, Entity, FeatureStore\n", + "from feast import FeatureView, Entity, FeatureStore, Field, BatchFeatureView\n", + "from feast.types import Float64, Int64\n", "from feast.value_type import ValueType\n", "from feast.data_format import ParquetFormat\n", "from feast.on_demand_feature_view import on_demand_feature_view\n", "from feast.infra.offline_stores.file_source import FileSource\n", "from feast.infra.offline_stores.file import SavedDatasetFileStorage\n", - "\n", - "from google.protobuf.duration_pb2 import Duration" + "from datetime import timedelta" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "id": "79ff6a81", "metadata": {}, "outputs": [], "source": [ "batch_source = FileSource(\n", - " event_timestamp_column=\"day\",\n", + " timestamp_field=\"day\",\n", " path=\"trips_stats.parquet\", # using parquet file that we created on previous step\n", " file_format=ParquetFormat()\n", ")" @@ -266,33 +249,42 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "id": "7cc59092", "metadata": {}, "outputs": [], "source": [ - "taxi_entity = Entity(name='taxi', join_key='taxi_id')" + "taxi_entity = Entity(name='taxi', join_keys=['taxi_id'])" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "id": "d0dc7fda", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/lrangine/Documents/Src/feast/sdk/python/feast/batch_feature_view.py:72: RuntimeWarning: Batch feature views are experimental features in alpha development. Some functionality may still be unstable so functionality can change in the future.\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "trips_stats_fv = FeatureView(\n", + "trips_stats_fv = BatchFeatureView(\n", " name='trip_stats',\n", - " entities=['taxi'],\n", - " features=[\n", - " Feature(\"total_miles_travelled\", ValueType.DOUBLE),\n", - " Feature(\"total_trip_seconds\", ValueType.DOUBLE),\n", - " Feature(\"total_earned\", ValueType.DOUBLE),\n", - " Feature(\"trip_count\", ValueType.INT64),\n", - " \n", + " entities=[taxi_entity],\n", + " schema=[\n", + " Field(name=\"total_miles_travelled\", dtype=Float64),\n", + " Field(name=\"total_trip_seconds\", dtype=Float64),\n", + " Field(name=\"total_earned\", dtype=Float64),\n", + " Field(name=\"trip_count\", dtype=Int64),\n", + "\n", " ],\n", - " ttl=Duration(seconds=86400),\n", - " batch_source=batch_source,\n", + " ttl=timedelta(seconds=86400),\n", + " source=batch_source,\n", ")" ] }, @@ -306,23 +298,23 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "id": "d706f6b6", "metadata": {}, "outputs": [], "source": [ "@on_demand_feature_view(\n", - " features=[\n", - " Feature(\"avg_fare\", ValueType.DOUBLE),\n", - " Feature(\"avg_speed\", ValueType.DOUBLE),\n", - " Feature(\"avg_trip_seconds\", ValueType.DOUBLE),\n", - " Feature(\"earned_per_hour\", ValueType.DOUBLE),\n", + " sources=[\n", + " trips_stats_fv,\n", " ],\n", - " inputs={\n", - " \"stats\": trips_stats_fv\n", - " }\n", + " schema=[\n", + " Field(name=\"avg_fare\", dtype=Float64),\n", + " Field(name=\"avg_speed\", dtype=Float64),\n", + " Field(name=\"avg_trip_seconds\", dtype=Float64),\n", + " Field(name=\"earned_per_hour\", dtype=Float64),\n", + " ]\n", ")\n", - "def on_demand_stats(inp):\n", + "def on_demand_stats(inp: pd.DataFrame) -> pd.DataFrame:\n", " out = pd.DataFrame()\n", " out[\"avg_fare\"] = inp[\"total_earned\"] / inp[\"trip_count\"]\n", " out[\"avg_speed\"] = 3600 * inp[\"total_miles_travelled\"] / inp[\"total_trip_seconds\"]\n", @@ -341,7 +333,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, "id": "be0c72e7", "metadata": {}, "outputs": [], @@ -351,10 +343,21 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 8, "id": "8935e813", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/lrangine/Documents/Src/feast/sdk/python/feast/feature_store.py:534: RuntimeWarning: On demand feature view is an experimental feature. This API is stable, but the functionality does not scale well for offline retrieval\n", + " warnings.warn(\n", + "/Users/lrangine/Documents/Src/feast/sdk/python/feast/batch_feature_view.py:72: RuntimeWarning: Batch feature views are experimental features in alpha development. Some functionality may still be unstable so functionality can change in the future.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "store.apply([taxi_entity, trips_stats_fv, on_demand_stats]) # writing to the registry" ] @@ -377,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 9, "id": "03ba0273", "metadata": {}, "outputs": [], @@ -395,7 +398,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 10, "id": "a558e4b8", "metadata": {}, "outputs": [], @@ -414,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 11, "id": "2f6ca83c", "metadata": {}, "outputs": [ @@ -521,7 +524,7 @@ "[156984 rows x 2 columns]" ] }, - "execution_count": 19, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -549,29 +552,10 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "2c4af1e4", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/pyalex/projects/feast/sdk/python/feast/feature_store.py:853: RuntimeWarning: Saving dataset is an experimental feature. This API is unstable and it could and most probably will be changed in the future. We do not guarantee that future changes will maintain backward compatibility.\n", - " RuntimeWarning,\n" - ] - }, - { - "data": { - "text/plain": [ - ", full_feature_names = False, tags = {}, _retrieval_job = , min_event_timestamp = 2019-06-01 00:00:00, max_event_timestamp = 2019-07-01 00:00:00)>" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "job = store.get_historical_features(\n", " entity_df=entity_df,\n", @@ -630,7 +614,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 13, "id": "a46f571f", "metadata": {}, "outputs": [ @@ -638,9 +622,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "02/02/2022 02:43:45 PM WARNING:/Users/pyalex/projects/feast/venv/lib/python3.7/site-packages/great_expectations/render/view/view.py:116: DeprecationWarning: 'contextfilter' is renamed to 'pass_context', the old name will be removed in Jinja 3.1.\n", - " def add_data_context_id_to_url(self, jinja_context, url, add_datetime=True):\n", - "\n" + "/usr/local/anaconda3/envs/feast-3.11/lib/python3.11/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n", + " if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):\n", + "/usr/local/anaconda3/envs/feast-3.11/lib/python3.11/site-packages/pyspark/sql/connect/utils.py:50: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n", + " if LooseVersion(grpc.__version__) < LooseVersion(minimum_grpc_version):\n" ] } ], @@ -663,18 +648,10 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 14, "id": "e57a229b", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/pyalex/projects/feast/sdk/python/feast/feature_store.py:904: RuntimeWarning: Retrieving datasets is an experimental feature. This API is unstable and it could and most probably will be changed in the future. We do not guarantee that future changes will maintain backward compatibility.\n", - " RuntimeWarning,\n" - ] - }, { "data": { "text/html": [ @@ -696,83 +673,83 @@ " \n", " \n", " \n", - " total_earned\n", - " avg_trip_seconds\n", " taxi_id\n", - " total_miles_travelled\n", - " trip_count\n", - " earned_per_hour\n", " event_timestamp\n", - " total_trip_seconds\n", + " earned_per_hour\n", + " total_earned\n", + " trip_count\n", " avg_fare\n", + " total_miles_travelled\n", " avg_speed\n", + " avg_trip_seconds\n", + " total_trip_seconds\n", " \n", " \n", " \n", " \n", " 0\n", - " 68.25\n", - " 2270.000000\n", - " 91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d...\n", - " 24.70\n", - " 2.0\n", - " 54.118943\n", + " d13c5aaa066f94b4927779ed24cd313b0c686f03407095...\n", " 2019-06-01 00:00:00+00:00\n", - " 4540.0\n", - " 34.125000\n", - " 19.585903\n", + " 45.559701\n", + " 203.50\n", + " 8\n", + " 25.437500\n", + " 69.50\n", + " 15.559701\n", + " 2010.000000\n", + " 16080\n", " \n", " \n", " 1\n", - " 221.00\n", - " 560.500000\n", - " 7a4a6162eaf27805aef407d25d5cb21fe779cd962922cb...\n", - " 54.18\n", - " 24.0\n", - " 59.143622\n", + " 33164e16dd29b1c58cd15cce31df4bfcb75d9903cb66de...\n", " 2019-06-01 00:00:00+00:00\n", - " 13452.0\n", - " 9.208333\n", - " 14.499554\n", + " 36.219512\n", + " 74.25\n", + " 5\n", + " 14.850000\n", + " 15.80\n", + " 7.707317\n", + " 1476.000000\n", + " 7380\n", " \n", " \n", " 2\n", - " 160.50\n", - " 1010.769231\n", - " f4c9d05b215d7cbd08eca76252dae51cdb7aca9651d4ef...\n", - " 41.30\n", - " 13.0\n", - " 43.972603\n", + " 226fe0b00be42932bdff81bc0b318b883bfbf15dd48093...\n", " 2019-06-01 00:00:00+00:00\n", - " 13140.0\n", - " 12.346154\n", - " 11.315068\n", + " 54.212598\n", + " 114.75\n", + " 6\n", + " 19.125000\n", + " 38.50\n", + " 18.188976\n", + " 1270.000000\n", + " 7620\n", " \n", " \n", " 3\n", - " 183.75\n", - " 697.550000\n", - " c1f533318f8480a59173a9728ea0248c0d3eb187f4b897...\n", - " 37.30\n", - " 20.0\n", - " 47.415956\n", + " 5a5bed1b5ced617d0594007d591f10bbbca354d50b19ca...\n", " 2019-06-01 00:00:00+00:00\n", - " 13951.0\n", - " 9.187500\n", - " 9.625116\n", + " 45.000000\n", + " 70.75\n", + " 4\n", + " 17.687500\n", + " 20.22\n", + " 12.860777\n", + " 1415.000000\n", + " 5660\n", " \n", " \n", " 4\n", - " 217.75\n", - " 1054.076923\n", - " 455b6b5cae6ca5a17cddd251485f2266d13d6a2c92f07c...\n", - " 69.69\n", - " 13.0\n", - " 57.206451\n", + " b7f7dbb452c0fb980a0f2050a146147c1006fe5f34e3b0...\n", " 2019-06-01 00:00:00+00:00\n", - " 13703.0\n", - " 16.750000\n", - " 18.308692\n", + " 53.783319\n", + " 104.25\n", + " 5\n", + " 20.850000\n", + " 34.49\n", + " 17.793637\n", + " 1395.600000\n", + " 6978\n", " \n", " \n", " ...\n", @@ -788,132 +765,132 @@ " ...\n", " \n", " \n", - " 156979\n", - " 38.00\n", - " 1980.000000\n", - " 0cccf0ec1f46d1e0beefcfdeaf5188d67e170cdff92618...\n", - " 14.90\n", - " 1.0\n", - " 69.090909\n", + " 119803\n", + " 961263722c1beadafef2355412d672acac35e4054f6aaa...\n", " 2019-07-01 00:00:00+00:00\n", - " 1980.0\n", - " 38.000000\n", - " 27.090909\n", + " 76.369295\n", + " 102.25\n", + " 4\n", + " 25.562500\n", + " 36.98\n", + " 27.619917\n", + " 1205.000000\n", + " 4820\n", " \n", " \n", - " 156980\n", - " 135.00\n", - " 551.250000\n", - " beefd3462e3f5a8e854942a2796876f6db73ebbd25b435...\n", - " 28.40\n", - " 16.0\n", - " 55.102041\n", + " 119804\n", + " 8b07f9156e568a37d362463c84dbd1118b4eeb753bae50...\n", " 2019-07-01 00:00:00+00:00\n", - " 8820.0\n", - " 8.437500\n", - " 11.591837\n", + " 52.677165\n", + " 111.50\n", + " 11\n", + " 10.136364\n", + " 29.00\n", + " 13.700787\n", + " 692.727273\n", + " 7620\n", " \n", " \n", - " 156981\n", - " NaN\n", - " NaN\n", - " 9a3c52aa112f46cf0d129fafbd42051b0fb9b0ff8dcb0e...\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 119805\n", + " a112879f10892d5c698ce150af17aa28615b6d005ca749...\n", " 2019-07-01 00:00:00+00:00\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 54.649682\n", + " 143.00\n", + " 16\n", + " 8.937500\n", + " 31.00\n", + " 11.847134\n", + " 588.750000\n", + " 9420\n", " \n", " \n", - " 156982\n", - " 63.00\n", - " 815.000000\n", - " 08308c31cd99f495dea73ca276d19a6258d7b4c9c88e43...\n", - " 19.96\n", - " 4.0\n", - " 69.570552\n", + " 119806\n", + " 68fe14b9fc2d53de5ac349d47f80f43fea895e201a31e3...\n", " 2019-07-01 00:00:00+00:00\n", - " 3260.0\n", - " 15.750000\n", - " 22.041718\n", + " 73.770492\n", + " 101.25\n", + " 3\n", + " 33.750000\n", + " 37.86\n", + " 27.584699\n", + " 1647.000000\n", + " 4941\n", " \n", " \n", - " 156983\n", - " NaN\n", - " NaN\n", - " 7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf...\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 119807\n", + " 6933327f9cc740e893a882282f7d06460207d6d4e084b3...\n", " 2019-07-01 00:00:00+00:00\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 67.500000\n", + " 218.25\n", + " 21\n", + " 10.392857\n", + " 58.90\n", + " 18.216495\n", + " 554.285714\n", + " 11640\n", " \n", " \n", "\n", - "

156984 rows × 10 columns

\n", + "

119808 rows × 10 columns

\n", "" ], "text/plain": [ - " total_earned avg_trip_seconds \\\n", - "0 68.25 2270.000000 \n", - "1 221.00 560.500000 \n", - "2 160.50 1010.769231 \n", - "3 183.75 697.550000 \n", - "4 217.75 1054.076923 \n", - "... ... ... \n", - "156979 38.00 1980.000000 \n", - "156980 135.00 551.250000 \n", - "156981 NaN NaN \n", - "156982 63.00 815.000000 \n", - "156983 NaN NaN \n", - "\n", " taxi_id \\\n", - "0 91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... \n", - "1 7a4a6162eaf27805aef407d25d5cb21fe779cd962922cb... \n", - "2 f4c9d05b215d7cbd08eca76252dae51cdb7aca9651d4ef... \n", - "3 c1f533318f8480a59173a9728ea0248c0d3eb187f4b897... \n", - "4 455b6b5cae6ca5a17cddd251485f2266d13d6a2c92f07c... \n", + "0 d13c5aaa066f94b4927779ed24cd313b0c686f03407095... \n", + "1 33164e16dd29b1c58cd15cce31df4bfcb75d9903cb66de... \n", + "2 226fe0b00be42932bdff81bc0b318b883bfbf15dd48093... \n", + "3 5a5bed1b5ced617d0594007d591f10bbbca354d50b19ca... \n", + "4 b7f7dbb452c0fb980a0f2050a146147c1006fe5f34e3b0... \n", "... ... \n", - "156979 0cccf0ec1f46d1e0beefcfdeaf5188d67e170cdff92618... \n", - "156980 beefd3462e3f5a8e854942a2796876f6db73ebbd25b435... \n", - "156981 9a3c52aa112f46cf0d129fafbd42051b0fb9b0ff8dcb0e... \n", - "156982 08308c31cd99f495dea73ca276d19a6258d7b4c9c88e43... \n", - "156983 7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... \n", + "119803 961263722c1beadafef2355412d672acac35e4054f6aaa... \n", + "119804 8b07f9156e568a37d362463c84dbd1118b4eeb753bae50... \n", + "119805 a112879f10892d5c698ce150af17aa28615b6d005ca749... \n", + "119806 68fe14b9fc2d53de5ac349d47f80f43fea895e201a31e3... \n", + "119807 6933327f9cc740e893a882282f7d06460207d6d4e084b3... \n", "\n", - " total_miles_travelled trip_count earned_per_hour \\\n", - "0 24.70 2.0 54.118943 \n", - "1 54.18 24.0 59.143622 \n", - "2 41.30 13.0 43.972603 \n", - "3 37.30 20.0 47.415956 \n", - "4 69.69 13.0 57.206451 \n", - "... ... ... ... \n", - "156979 14.90 1.0 69.090909 \n", - "156980 28.40 16.0 55.102041 \n", - "156981 NaN NaN NaN \n", - "156982 19.96 4.0 69.570552 \n", - "156983 NaN NaN NaN \n", + " event_timestamp earned_per_hour total_earned trip_count \\\n", + "0 2019-06-01 00:00:00+00:00 45.559701 203.50 8 \n", + "1 2019-06-01 00:00:00+00:00 36.219512 74.25 5 \n", + "2 2019-06-01 00:00:00+00:00 54.212598 114.75 6 \n", + "3 2019-06-01 00:00:00+00:00 45.000000 70.75 4 \n", + "4 2019-06-01 00:00:00+00:00 53.783319 104.25 5 \n", + "... ... ... ... ... \n", + "119803 2019-07-01 00:00:00+00:00 76.369295 102.25 4 \n", + "119804 2019-07-01 00:00:00+00:00 52.677165 111.50 11 \n", + "119805 2019-07-01 00:00:00+00:00 54.649682 143.00 16 \n", + "119806 2019-07-01 00:00:00+00:00 73.770492 101.25 3 \n", + "119807 2019-07-01 00:00:00+00:00 67.500000 218.25 21 \n", "\n", - " event_timestamp total_trip_seconds avg_fare avg_speed \n", - "0 2019-06-01 00:00:00+00:00 4540.0 34.125000 19.585903 \n", - "1 2019-06-01 00:00:00+00:00 13452.0 9.208333 14.499554 \n", - "2 2019-06-01 00:00:00+00:00 13140.0 12.346154 11.315068 \n", - "3 2019-06-01 00:00:00+00:00 13951.0 9.187500 9.625116 \n", - "4 2019-06-01 00:00:00+00:00 13703.0 16.750000 18.308692 \n", - "... ... ... ... ... \n", - "156979 2019-07-01 00:00:00+00:00 1980.0 38.000000 27.090909 \n", - "156980 2019-07-01 00:00:00+00:00 8820.0 8.437500 11.591837 \n", - "156981 2019-07-01 00:00:00+00:00 NaN NaN NaN \n", - "156982 2019-07-01 00:00:00+00:00 3260.0 15.750000 22.041718 \n", - "156983 2019-07-01 00:00:00+00:00 NaN NaN NaN \n", + " avg_fare total_miles_travelled avg_speed avg_trip_seconds \\\n", + "0 25.437500 69.50 15.559701 2010.000000 \n", + "1 14.850000 15.80 7.707317 1476.000000 \n", + "2 19.125000 38.50 18.188976 1270.000000 \n", + "3 17.687500 20.22 12.860777 1415.000000 \n", + "4 20.850000 34.49 17.793637 1395.600000 \n", + "... ... ... ... ... \n", + "119803 25.562500 36.98 27.619917 1205.000000 \n", + "119804 10.136364 29.00 13.700787 692.727273 \n", + "119805 8.937500 31.00 11.847134 588.750000 \n", + "119806 33.750000 37.86 27.584699 1647.000000 \n", + "119807 10.392857 58.90 18.216495 554.285714 \n", "\n", - "[156984 rows x 10 columns]" + " total_trip_seconds \n", + "0 16080 \n", + "1 7380 \n", + "2 7620 \n", + "3 5660 \n", + "4 6978 \n", + "... ... \n", + "119803 4820 \n", + "119804 7620 \n", + "119805 9420 \n", + "119806 4941 \n", + "119807 11640 \n", + "\n", + "[119808 rows x 10 columns]" ] }, - "execution_count": 22, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -933,7 +910,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 15, "id": "b3a8714f", "metadata": {}, "outputs": [], @@ -949,37 +926,37 @@ " max_value=60,\n", " mostly=0.99 # allow some outliers\n", " )\n", - " \n", + "\n", " ds.expect_column_values_to_be_between(\n", " \"total_miles_travelled\",\n", " min_value=0,\n", " max_value=500,\n", " mostly=0.99 # allow some outliers\n", " )\n", - " \n", + "\n", " # expectation of means based on observed values\n", " observed_mean = ds.trip_count.mean()\n", " ds.expect_column_mean_to_be_between(\"trip_count\",\n", " min_value=observed_mean * (1 - DELTA),\n", " max_value=observed_mean * (1 + DELTA))\n", - " \n", + "\n", " observed_mean = ds.earned_per_hour.mean()\n", " ds.expect_column_mean_to_be_between(\"earned_per_hour\",\n", " min_value=observed_mean * (1 - DELTA),\n", " max_value=observed_mean * (1 + DELTA))\n", - " \n", - " \n", + "\n", + "\n", " # expectation of quantiles\n", " qs = [0.5, 0.75, 0.9, 0.95]\n", " observed_quantiles = ds.avg_fare.quantile(qs)\n", - " \n", + "\n", " ds.expect_column_quantile_values_to_be_between(\n", " \"avg_fare\",\n", " quantile_ranges={\n", " \"quantiles\": qs,\n", " \"value_ranges\": [[None, max_value] for max_value in observed_quantiles]\n", - " }) \n", - " \n", + " })\n", + "\n", " return ds.get_expectation_suite()" ] }, @@ -993,17 +970,10 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 16, "id": "504e5699", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "02/02/2022 02:43:47 PM INFO:\t5 expectation(s) included in expectation_suite. result_format settings filtered.\n" - ] - }, { "data": { "text/plain": [ @@ -1041,8 +1011,8 @@ " \"expectation_type\": \"expect_column_mean_to_be_between\",\n", " \"kwargs\": {\n", " \"column\": \"earned_per_hour\",\n", - " \"min_value\": 52.320624975640214,\n", - " \"max_value\": 63.94743052578249\n", + " \"min_value\": 52.32062497564023,\n", + " \"max_value\": 63.9474305257825\n", " },\n", " \"meta\": {}\n", " },\n", @@ -1082,7 +1052,7 @@ "]>" ] }, - "execution_count": 24, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1109,12 +1079,12 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 17, "id": "3cc27606", "metadata": {}, "outputs": [], "source": [ - "validation_reference = ds.as_reference(profiler=stats_profiler)" + "validation_reference = ds.as_reference(name=\"validation_reference_dataset\", profiler=stats_profiler)" ] }, { @@ -1127,21 +1097,10 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 18, "id": "ba72e02a", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/pyalex/projects/feast/sdk/python/feast/infra/offline_stores/offline_store.py:93: RuntimeWarning: Dataset validation is an experimental feature. This API is unstable and it could and most probably will be changed in the future. We do not guarantee that future changes will maintain backward compatibility.\n", - " RuntimeWarning,\n", - "02/02/2022 02:43:52 PM INFO:\t5 expectation(s) included in expectation_suite. result_format settings filtered.\n", - "02/02/2022 02:43:53 PM INFO:Validating data_asset_name None with expectation_suite_name default\n" - ] - } - ], + "outputs": [], "source": [ "_ = job.to_df(validation_reference=validation_reference)" ] @@ -1180,7 +1139,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 19, "id": "25450aab", "metadata": {}, "outputs": [], @@ -1190,7 +1149,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 20, "id": "d7f6e892", "metadata": {}, "outputs": [], @@ -1201,7 +1160,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 21, "id": "3a5d3b7b", "metadata": {}, "outputs": [ @@ -1308,7 +1267,7 @@ "[35448 rows x 2 columns]" ] }, - "execution_count": 29, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1320,7 +1279,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 22, "id": "5be40a91", "metadata": {}, "outputs": [], @@ -1350,26 +1309,17 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 23, "id": "3279f2ac", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/pyalex/projects/feast/sdk/python/feast/infra/offline_stores/offline_store.py:93: RuntimeWarning: Dataset validation is an experimental feature. This API is unstable and it could and most probably will be changed in the future. We do not guarantee that future changes will maintain backward compatibility.\n", - " RuntimeWarning,\n", - "02/02/2022 02:43:58 PM INFO:\t5 expectation(s) included in expectation_suite. result_format settings filtered.\n", - "02/02/2022 02:43:59 PM INFO:Validating data_asset_name None with expectation_suite_name default\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ "[\n", " {\n", + " \"success\": false,\n", " \"expectation_config\": {\n", " \"expectation_type\": \"expect_column_mean_to_be_between\",\n", " \"kwargs\": {\n", @@ -1380,46 +1330,46 @@ " },\n", " \"meta\": {}\n", " },\n", - " \"meta\": {},\n", " \"result\": {\n", " \"observed_value\": 6.692920555429092,\n", - " \"element_count\": 35448,\n", - " \"missing_count\": 31055,\n", - " \"missing_percent\": 87.6071992778154\n", + " \"element_count\": 4393,\n", + " \"missing_count\": null,\n", + " \"missing_percent\": null\n", " },\n", + " \"meta\": {},\n", " \"exception_info\": {\n", " \"raised_exception\": false,\n", " \"exception_message\": null,\n", " \"exception_traceback\": null\n", - " },\n", - " \"success\": false\n", + " }\n", " },\n", " {\n", + " \"success\": false,\n", " \"expectation_config\": {\n", " \"expectation_type\": \"expect_column_mean_to_be_between\",\n", " \"kwargs\": {\n", " \"column\": \"earned_per_hour\",\n", - " \"min_value\": 52.320624975640214,\n", - " \"max_value\": 63.94743052578249,\n", + " \"min_value\": 52.32062497564023,\n", + " \"max_value\": 63.9474305257825,\n", " \"result_format\": \"COMPLETE\"\n", " },\n", " \"meta\": {}\n", " },\n", - " \"meta\": {},\n", " \"result\": {\n", " \"observed_value\": 68.99268345164135,\n", - " \"element_count\": 35448,\n", - " \"missing_count\": 31055,\n", - " \"missing_percent\": 87.6071992778154\n", + " \"element_count\": 4393,\n", + " \"missing_count\": null,\n", + " \"missing_percent\": null\n", " },\n", + " \"meta\": {},\n", " \"exception_info\": {\n", " \"raised_exception\": false,\n", " \"exception_message\": null,\n", " \"exception_traceback\": null\n", - " },\n", - " \"success\": false\n", + " }\n", " },\n", " {\n", + " \"success\": false,\n", " \"expectation_config\": {\n", " \"expectation_type\": \"expect_column_quantile_values_to_be_between\",\n", " \"kwargs\": {\n", @@ -1454,7 +1404,6 @@ " },\n", " \"meta\": {}\n", " },\n", - " \"meta\": {},\n", " \"result\": {\n", " \"observed_value\": {\n", " \"quantiles\": [\n", @@ -1470,9 +1419,9 @@ " 44.125\n", " ]\n", " },\n", - " \"element_count\": 35448,\n", - " \"missing_count\": 31055,\n", - " \"missing_percent\": 87.6071992778154,\n", + " \"element_count\": 4393,\n", + " \"missing_count\": null,\n", + " \"missing_percent\": null,\n", " \"details\": {\n", " \"success_details\": [\n", " false,\n", @@ -1482,12 +1431,12 @@ " ]\n", " }\n", " },\n", + " \"meta\": {},\n", " \"exception_info\": {\n", " \"raised_exception\": false,\n", " \"exception_message\": null,\n", " \"exception_traceback\": null\n", - " },\n", - " \"success\": false\n", + " }\n", " }\n", "]\n" ] @@ -1536,7 +1485,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.12" + "version": "3.11.8" } }, "nbformat": 4,