+ + + +
+ +
+
+ +
+
+ +
+ +
+ +
+ + +
+ +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+ + + +
+

Vandalism Detection for LineStrings

+ +
+ +
+
+ + + + +
+ +
+

Vandalism Detection for LineStrings#

+

In this notebook we demonstrate how you detect and visualize vandalism in OSM. We are going to make use of one of the enriched attributes we have added to ohsome-data-insights.

+

These are the steps you see further down:

+
    +
  • Set the connection parameters.

  • +
  • Prepare your input parameters, e.g. define area of interest and shape filters.

  • +
  • Download data using PyIceberg and DuckDB.

  • +
  • Filter and process data with DuckDB.

  • +
  • Visualize the results on a map.

  • +
+
+

Getting started#

+

Set connection params.

+
+
+
import os
+
+s3_user = os.environ["S3_ACCESS_KEY_ID"]  # add your user here
+s3_password = os.environ["S3_SECRET_ACCESS_KEY"]  # add your password here
+
+
+
+
+

Configure DuckDB.

+
+
+
import duckdb
+
+con = duckdb.connect(
+    config={
+        'threads': 8,
+        'max_memory': '8GB',
+    }
+)
+con.install_extension("spatial")
+con.load_extension("spatial")
+
+
+
+
+

Set the connection params to Iceberg Rest Catalog.

+
+
+
!pip install "pyiceberg[s3fs,duckdb,sql-sqlite,pyarrow]"
+
+
+
+
+
+
+
from pyiceberg.catalog.rest import RestCatalog
+
+catalog = RestCatalog(
+    name="default",
+    **{
+        "uri": "https://sotm2024.iceberg.ohsome.org",
+        "s3.endpoint": "https://sotm2024.minio.heigit.org",
+        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
+        "s3.access-key-id": s3_user,
+        "s3.secret-access-key": s3_password,
+        "s3.region": "eu-central-1"
+    }
+)
+
+
+
+
+
+
+

Prepare the input parameters for your analysis#

+
+
+
# Set iceberg table
+namespace = 'geo_sort'
+tablename = 'contributions'
+icebergtable = catalog.load_table((namespace, tablename))
+
+# Define status filter
+status = 'history'
+
+# Define geometry type filter
+geometry_type = 'LineString'
+
+# Define time filter (optional)
+min_timestamp = '2023-01-01T00:00:00'
+
+
+# Define length_delta threshold in meter.
+# This is a 5000 kilometers!
+length_delta_threshold = 5_000_000
+
+
+
+
+
+
+

Get the Data#

+

First, we do an iceberg table scan with a pre-filter. This is a fast way to download all potential OSM elements that are needed for our analysis.

+

Here we use one of the enriched attributes we have calculated in advance for each OSM contribution. We use the length_delta attribute to inspect all the OSM elements with an unusual large change. Here we download all linestrings which changed more than 5000 kilometers in length.

+
+
+
import time
+start_time = time.time()
+
+icebergtable.scan(
+    row_filter=(
+        f"status = '{status}' "
+        f"and geometry_type = '{geometry_type}' "
+        f"and length_delta >= '{length_delta_threshold}' "
+    ),
+    selected_fields=(
+        "user_id",
+        "osm_id",
+        "valid_from",
+        "tags",
+        "geometry",
+    )
+).to_duckdb('raw_osm_data',connection=con)
+
+download_time = round(time.time() - start_time, 3)
+print(f"download took {download_time} sec.")
+
+
+
+
+
download took 10.675 sec.
+
+
+
+
+
+
+

Display OSM user activity on map#

+

Get data from DucDKB into GeoPandas dataframe.

+
+
+
import geopandas as gpd
+
+map_query = """
+    SELECT
+        osm_id,
+        user_id,
+        valid_from::varchar as date,
+        epoch_ms(valid_from) as valid_from,
+        geometry
+    FROM raw_osm_data;
+"""
+
+df = con.sql(map_query).df()
+
+# convert the data to geodata
+gdf = gpd.GeoDataFrame(
+    df,
+    geometry=gpd.GeoSeries.from_wkt(df['geometry'])
+).set_crs('epsg:4326')
+
+
+
+
+

Define map parameters and style.

+
+
+
import datetime
+import lonboard
+from palettable.matplotlib import Viridis_20
+
+# compute lonboard color style for contious color map
+min_valid_from = datetime.datetime(2008,1,1).timestamp()
+max_valid_from = datetime.datetime(2024,7,1).timestamp()
+
+
+# the lonboard map definition
+layer = lonboard.PathLayer.from_geopandas(
+    gdf,
+    extensions=[lonboard.layer_extension.DataFilterExtension(filter_size=1)],
+    get_filter_value=gdf["valid_from"],  # replace with desired column
+    filter_range=[min_valid_from, max_valid_from],  # replace with desired filter range
+    get_color=[255, 0, 0, 125],
+    get_width=1.5,
+    width_units='pixels'
+
+)
+
+currentness_map = lonboard.Map(
+    basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
+    layers=[layer]
+)
+
+
+
+
+

Define Dates Slider Selection Widget and link to map filter range.

+
+
+
from datetime import date, timedelta
+import ipywidgets
+from traitlets import directional_link
+
+start = datetime.datetime(2008,1,1)
+end = datetime.datetime(2024,6,1)
+delta = end - start   # returns timedelta
+dates = [start + timedelta(days=i) for i in range(delta.days + 1)]
+options = [(i.strftime('%d-%b-%Y'), 1000 * i.timestamp()) for i in dates]
+
+date_slider = ipywidgets.SelectionRangeSlider(
+    options=options,
+    index=(0, len(dates)-1),
+    description='Last Edit:',
+    layout=ipywidgets.Layout(width='1000px'),
+    disabled=False
+)
+
+directional_link(
+    (date_slider, 'value'),
+    (layer, 'filter_range')
+)
+
+
+
+
+
<traitlets.traitlets.directional_link at 0x7c7fa6da9ac0>
+
+
+
+
+

Display the map. Have fun exploring and moving around the time slider!

+
+
+
display(currentness_map, date_slider)
+
+
+
+
+
+
+
+
+ + + + +
+ + + + + + + + +
+ + + + + + +
+
+ + +
+ + +