diff --git a/_toc.yml b/_toc.yml index cc57b67..7eab632 100644 --- a/_toc.yml +++ b/_toc.yml @@ -17,6 +17,7 @@ parts: chapters: - file: book/02a_buildings_currentness_DuckDB_PyIceberg.ipynb - file: book/02_vandalism_detection.ipynb + - file: book/02_attribute_completeness.ipynb - caption: Data Integration chapters: - file: book/03_hot_tm_project_analysis.ipynb diff --git a/book/02_attribute_completeness.ipynb b/book/02_attribute_completeness.ipynb new file mode 100644 index 0000000..886edbc --- /dev/null +++ b/book/02_attribute_completeness.ipynb @@ -0,0 +1,924 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6878ccac-f936-438b-80f3-43ec4af03f02", + "metadata": {}, + "source": [ + "# Attribute Completeness for Street Surface Tags\n", + "In this notebook we demonstrate how you can assess the completeness of tags in OSM.\n", + "\n", + "These are the steps you see further down:\n", + "\n", + "* Set the connection parameters.\n", + "* Prepare your input parameters, e.g. define area of interest and attribute filters.\n", + "* **Download data** using PyIceberg and DuckDB.\n", + "* Filter and process data with DuckDB.\n", + "* Visualize the results on a map." + ] + }, + { + "cell_type": "markdown", + "id": "e8a6cd1b-89bb-4475-bfab-f9996e019f4a", + "metadata": {}, + "source": [ + "## Getting started\n", + "Set connection params." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6d3a0f6c-f845-4bd6-9f4d-96b330e88729", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "s3_user = os.environ[\"S3_ACCESS_KEY_ID\"] # add your user here\n", + "s3_password = os.environ[\"S3_SECRET_ACCESS_KEY\"] # add your password here" + ] + }, + { + "cell_type": "markdown", + "id": "8e49489b-dfa6-41ef-96c8-b0418904415e", + "metadata": {}, + "source": [ + "Configure DuckDB." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bff7f6db-767b-476b-95a8-7335c248084c", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install duckdb==1.0.0" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "eba19d51-892b-4447-8c69-321799a90e67", + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "\n", + "con = duckdb.connect(\n", + " config={\n", + " 'threads': 8,\n", + " 'max_memory': '8GB',\n", + " }\n", + ")\n", + "con.install_extension(\"spatial\")\n", + "con.load_extension(\"spatial\")" + ] + }, + { + "cell_type": "markdown", + "id": "cb32de22-3022-4568-9a73-28c84e8c9600", + "metadata": {}, + "source": [ + "Set the connection params to Iceberg Rest Catalog." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4de25b6f-c440-4c71-b93b-b26682cd9175", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install \"pyiceberg[s3fs,duckdb,sql-sqlite,pyarrow]\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "de3fedad-a111-4d45-a81e-bc5a4c1cca41", + "metadata": {}, + "outputs": [], + "source": [ + "from pyiceberg.catalog.rest import RestCatalog\n", + "\n", + "catalog = RestCatalog(\n", + " name=\"default\",\n", + " **{\n", + " \"uri\": \"https://sotm2024.iceberg.ohsome.org\",\n", + " \"s3.endpoint\": \"https://sotm2024.minio.heigit.org\",\n", + " \"py-io-impl\": \"pyiceberg.io.pyarrow.PyArrowFileIO\",\n", + " \"s3.access-key-id\": s3_user,\n", + " \"s3.secret-access-key\": s3_password,\n", + " \"s3.region\": \"eu-central-1\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "41123367-d6d7-4b2c-b4a7-400f1513e88d", + "metadata": {}, + "source": [ + "Set connection to MinIO object storage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9f09eb04-35f2-44b4-b69d-aaca90b2b5cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "┌─────────┐\n", + "│ Success │\n", + "│ boolean │\n", + "├─────────┤\n", + "│ true │\n", + "└─────────┘\n", + "\n" + ] + } + ], + "source": [ + "query = f\"\"\"\n", + "DROP SECRET IF EXISTS \"__default_s3\";\n", + "CREATE SECRET (\n", + " TYPE S3,\n", + " KEY_ID '{s3_user}',\n", + " SECRET '{s3_password}',\n", + " REGION 'eu-central-1',\n", + " endpoint 'sotm2024.minio.heigit.org',\n", + " use_ssl true,\n", + " url_style 'path'\n", + " );\n", + "\"\"\"\n", + "con.sql(query).show()" + ] + }, + { + "cell_type": "markdown", + "id": "a70d9e71-0fe8-4ed4-83b6-96f83c93c8aa", + "metadata": {}, + "source": [ + "## Prepare the input parameters for your analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a3ed4fd7-aedb-4c58-9c9d-e9f44918afb4", + "metadata": {}, + "outputs": [], + "source": [ + "# Set iceberg table\n", + "namespace = 'geo_sort'\n", + "tablename = 'contributions'\n", + "icebergtable = catalog.load_table((namespace, tablename))\n", + "\n", + "\n", + "# Define status filter\n", + "status = 'latest'\n", + "\n", + "\n", + "# Define location filter\n", + "bboxes = {\n", + " 'heidelberg': (8.629761, 49.379556, 8.742371, 49.437890),\n", + " 'nairobi': (36.650938, -1.444471, 37.103887, -1.163522),\n", + " 'mannheim': (8.41416, 49.410362, 8.58999, 49.590489)\n", + "}\n", + "\n", + "selected_region = 'mannheim'\n", + "xmin, ymin, xmax, ymax = bboxes[selected_region]\n", + "area_of_interest_file = f\"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson\"\n", + "\n", + "\n", + "# Define geometry type filter\n", + "geometry_type = 'LineString'" + ] + }, + { + "cell_type": "markdown", + "id": "efe47258-eb04-4031-a517-28ef88ba05ed", + "metadata": {}, + "source": [ + "## Get the Data\n", + "First, we do an iceberg table scan with a 
pre-filter. This is a fast way to download all potential OSM elements that are needed for our analysis." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9dbe857e-b355-46cd-b7a3-6bd309914ba2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "download took 18.676 sec.\n" + ] + } + ], + "source": [ + "import time\n", + "start_time = time.time()\n", + "\n", + "icebergtable.scan(\n", + " row_filter=(\n", + " f\"status = '{status}' \"\n", + " f\"and geometry_type = '{geometry_type}' \"\n", + " f\"and (xmax >= {xmin} and xmin <= {xmax}) \"\n", + " f\"and (ymax >= {ymin} and ymin <= {ymax}) \"\n", + " ),\n", + " selected_fields=(\n", + " \"osm_id\",\n", + " \"tags\",\n", + " \"length\",\n", + " \"geometry\",\n", + " ),\n", + ").to_duckdb('raw_osm_data',connection=con)\n", + "\n", + "download_time = round(time.time() - start_time, 3)\n", + "print(f\"download took {download_time} sec.\")" + ] + }, + { + "cell_type": "markdown", + "id": "1d7b4c77-1c03-4371-9eee-f1e242898ea3", + "metadata": {}, + "source": [ + "## Filter and process data with DuckDB\n", + "Here we extract the tag values for highways and their corresponding road surface type (if mapped). We also clip the road geometry to the area of interest and calculate the length for the clipped geometry." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a11509b0-66c4-4c1b-8b04-9cfc34441167", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "962e47d4e319425b8be86f4067868eb8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "processing took 7.358 sec.\n" + ] + } + ], + "source": [ + "import time\n", + "start_time = time.time()\n", + "\n", + "query = f\"\"\"\n", + "DROP TABLE IF EXISTS osm_data;\n", + "CREATE TABLE osm_data AS\n", + "(\n", + "SELECT\n", + " tags['highway'][1] as highway_tag_value,\n", + " tags['surface'][1] as surface_tag_value,\n", + " ST_GeomFromText(a.geometry) as osm_geom,\n", + " CASE\n", + " WHEN ST_Within(osm_geom, aoi.geom) THEN osm_geom\n", + " ELSE ST_Intersection(osm_geom, aoi.geom)\n", + " END as clipped_geometry,\n", + " CASE\n", + " WHEN ST_Within(osm_geom, aoi.geom) THEN length / 1000 -- stored length is assumed to be in meters; convert to km\n", + " ELSE ST_Length_Spheroid(clipped_geometry) / 1000\n", + " END as length_km\n", + "FROM\n", + " raw_osm_data as a,\n", + " st_read('{area_of_interest_file}') as aoi\n", + "WHERE 1=1\n", + " and tags['highway'][1] is not null\n", + " -- spatial filtering part\n", + " and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)\n", + ")\n", + ";\n", + "\"\"\"\n", + "con.sql(query)\n", + "\n", + "processing_time = round(time.time() - start_time, 3)\n", + "print(f\"processing took {processing_time} sec.\")" + ] + }, + { + "cell_type": "markdown", + "id": "51cf91c1-6de4-4081-8ee1-1c0ffe717551", + "metadata": {}, + "source": [ + "## Proportion of Roads with Surface Tag\n", + "Let's inspect how many roads have a surface tag." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4a979e9a-0f5f-404b-879d-cb403328c53d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "┌──────────────────┬────────────────────────────┬─────────────────────────────┐\n", + "│ length_km_total │ length_km_with_surface_tag │ proportion_with_surface_tag │\n", + "│ double │ double │ double │\n", + "├──────────────────┼────────────────────────────┼─────────────────────────────┤\n", + "│ 3285612.61274888 │ 2259704.672583999 │ 0.688 │\n", + "└──────────────────┴────────────────────────────┴─────────────────────────────┘" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = f\"\"\"\n", + "SELECT\n", + " SUM(length_km) as length_km_total,\n", + " SUM(CASE\n", + " WHEN surface_tag_value IS NOT NULL THEN length_km\n", + " ELSE 0\n", + " END) as length_km_with_surface_tag,\n", + " round(length_km_with_surface_tag / length_km_total, 3) as proportion_with_surface_tag\n", + "FROM osm_data\n", + "\"\"\"\n", + "con.sql(query)" + ] + }, + { + "cell_type": "markdown", + "id": "dc8ac015-aa8f-4de6-894a-c7adf964e50a", + "metadata": {}, + "source": [ + "## Proportion of Roads with Surface Tag per Road Type\n", + "We can break down by road type. For some road types the surface tag is mapped better than for others." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bfb223db-ceae-4ec1-a688-dd8f587b8adb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | highway_tag_value | \n", + "length_km_total | \n", + "length_km_with_surface_tag | \n", + "proportion_with_surface_tag | \n", + "
---|---|---|---|---|
0 | \n", + "service | \n", + "694704.602918 | \n", + "278210.148166 | \n", + "0.400 | \n", + "
1 | \n", + "footway | \n", + "546853.269568 | \n", + "307684.651441 | \n", + "0.563 | \n", + "
2 | \n", + "track | \n", + "477267.685579 | \n", + "285274.586297 | \n", + "0.598 | \n", + "
3 | \n", + "residential | \n", + "464907.754558 | \n", + "447053.629756 | \n", + "0.962 | \n", + "
4 | \n", + "path | \n", + "446605.435393 | \n", + "343686.039051 | \n", + "0.770 | \n", + "
5 | \n", + "unclassified | \n", + "118270.021870 | \n", + "97051.021870 | \n", + "0.821 | \n", + "
6 | \n", + "tertiary | \n", + "110620.269124 | \n", + "109950.269124 | \n", + "0.994 | \n", + "
7 | \n", + "secondary | \n", + "83886.049524 | \n", + "83724.049524 | \n", + "0.998 | \n", + "
8 | \n", + "primary | \n", + "70041.951266 | \n", + "69604.951266 | \n", + "0.994 | \n", + "
9 | \n", + "living_street | \n", + "48703.000000 | \n", + "42401.000000 | \n", + "0.871 | \n", + "
10 | \n", + "motorway | \n", + "47689.847847 | \n", + "47511.847847 | \n", + "0.996 | \n", + "
11 | \n", + "cycleway | \n", + "36750.000000 | \n", + "31285.000000 | \n", + "0.851 | \n", + "
12 | \n", + "trunk | \n", + "26581.871319 | \n", + "25673.869133 | \n", + "0.966 | \n", + "
13 | \n", + "bridleway | \n", + "25781.881633 | \n", + "18633.871521 | \n", + "0.723 | \n", + "
14 | \n", + "motorway_link | \n", + "18060.630577 | \n", + "17632.630577 | \n", + "0.976 | \n", + "
15 | \n", + "trunk_link | \n", + "14148.107011 | \n", + "13349.107011 | \n", + "0.944 | \n", + "
16 | \n", + "pedestrian | \n", + "12229.000000 | \n", + "10507.000000 | \n", + "0.859 | \n", + "
17 | \n", + "secondary_link | \n", + "11996.000000 | \n", + "11982.000000 | \n", + "0.999 | \n", + "
18 | \n", + "steps | \n", + "11948.002884 | \n", + "5219.000000 | \n", + "0.437 | \n", + "
19 | \n", + "primary_link | \n", + "10418.000000 | \n", + "10171.000000 | \n", + "0.976 | \n", + "
20 | \n", + "construction | \n", + "2905.231679 | \n", + "620.000000 | \n", + "0.213 | \n", + "
21 | \n", + "tertiary_link | \n", + "1958.000000 | \n", + "1902.000000 | \n", + "0.971 | \n", + "
22 | \n", + "corridor | \n", + "935.000000 | \n", + "339.000000 | \n", + "0.363 | \n", + "
23 | \n", + "busway | \n", + "818.000000 | \n", + "49.000000 | \n", + "0.060 | \n", + "
24 | \n", + "proposed | \n", + "612.000000 | \n", + "0.000000 | \n", + "0.000 | \n", + "
25 | \n", + "platform | \n", + "441.000000 | \n", + "0.000000 | \n", + "0.000 | \n", + "
26 | \n", + "raceway | \n", + "189.000000 | \n", + "189.000000 | \n", + "1.000 | \n", + "
27 | \n", + "fi | \n", + "177.000000 | \n", + "0.000000 | \n", + "0.000 | \n", + "
28 | \n", + "bus_stop | \n", + "82.000000 | \n", + "0.000000 | \n", + "0.000 | \n", + "
29 | \n", + "street_lamp | \n", + "32.000000 | \n", + "0.000000 | \n", + "0.000 | \n", + "