Commit

Merge branch 'main' of github.com:fkroeber/semantique into vis

fkroeber committed Jul 1, 2024
2 parents f0cdbd0 + 4b79cb8 commit 3726c09
Showing 9 changed files with 598 additions and 419 deletions.
128 changes: 79 additions & 49 deletions demo/datacube.ipynb

Large diffs are not rendered by default.

199 changes: 24 additions & 175 deletions demo/extras/cache_tests.ipynb
@@ -21,24 +21,7 @@
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\felix\\AppData\\Roaming\\Python\\Python310\\site-packages\\geopandas\\_compat.py:124: UserWarning: The Shapely GEOS version (3.11.2-CAPI-1.17.2) is incompatible with the GEOS version PyGEOS was compiled with (3.10.1-CAPI-1.16.0). Conversions between both will be slow.\n",
" warnings.warn(\n",
"C:\\Users\\felix\\AppData\\Local\\Temp/ipykernel_7388/2560623581.py:1: DeprecationWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas still uses PyGEOS by default. However, starting with version 0.14, the default will switch to Shapely. To force to use Shapely 2.0 now, you can either uninstall PyGEOS or set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n",
"\n",
"import os\n",
"os.environ['USE_PYGEOS'] = '0'\n",
"import geopandas\n",
"\n",
"In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n",
" import geopandas as gpd\n"
]
}
],
"outputs": [],
"source": [
"import geopandas as gpd\n",
"import json\n",
@@ -55,15 +38,15 @@
"outputs": [],
"source": [
"# Load a mapping.\n",
"with open(\"files/mapping.json\", \"r\") as file:\n",
"with open(\"../files/mapping.json\", \"r\") as file:\n",
" mapping = sq.mapping.Semantique(json.load(file))\n",
"\n",
"# Represent an EO data cube.\n",
"with open(\"files/layout.json\", \"r\") as file:\n",
" dc = sq.datacube.GeotiffArchive(json.load(file), src = \"files/layers.zip\")\n",
"with open(\"../files/layout_gtiff.json\", \"r\") as file:\n",
" dc = sq.datacube.GeotiffArchive(json.load(file), src = \"../files/layers_gtiff.zip\")\n",
"\n",
"# Set the spatio-temporal extent.\n",
"space = sq.SpatialExtent(gpd.read_file(\"files/footprint.geojson\"))\n",
"space = sq.SpatialExtent(gpd.read_file(\"../files/footprint.geojson\"))\n",
"time = sq.TemporalExtent(\"2019-01-01\", \"2020-12-31\")"
]
},
@@ -78,9 +61,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"RAM requirements are proportional to the number of data layers that are stored as intermediate results. Caching data layers in RAM should only be done for those that are needed again when evaluating downstream parts of the recipe. This requires foresight about the evaluation order of the recipe, which accordingly requires a preview run preceding the actual evaluation. This preview run is performed by loading the data with drastically reduced spatial resolution (5x5 pixel grid). It resolves the data references and fills a cache by creating a list of the data references in the order in which they are evaluated. This list is then used dynamically during the actual evaluation of the recipe as a basis for keeping data layers in the cache and reading them from there if they are needed again.\n",
"\n",
"Below the result of the preview run is shown first to demonstrate what the resolved data references look like. The resulting initialised cache can then be fed as a context element to the QueryProcessor in a second step for the actual recipe execution."
"Caching data layers in RAM should only be done for those that are needed again when evaluating downstream parts of the recipe. This requires foresight about the execution order of the recipe, which accordingly requires a simulated run preceding the actual execution. This simulated run is performed by the FakeProcessor. It resolves the data references and fills a cache by creating a list of the data references in the order in which they are evaluated. This list is then used dynamically during the actual execution of the recipe as a basis for keeping data layers in the cache and reading them from there if they are needed again."
]
},
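The caching strategy described in this cell can be illustrated with a minimal, self-contained sketch. This is hypothetical illustration code, not the semantique implementation: a preview/fake run yields the ordered sequence of data-layer references, and during the real run a layer is kept in the cache only if it is referenced again later in that sequence.

```python
# Minimal sketch of the cache-sequencing idea (hypothetical, not the
# semantique implementation). `seq` is the reference order recorded by
# the preview run; a layer stays cached only while it is still needed
# downstream.

class LayerCache:
    def __init__(self, seq):
        self.seq = list(seq)   # reference order from the preview run
        self.pos = 0           # progress of the real run through the sequence
        self.store = {}        # cached layers
        self.loads = 0         # counts actual (expensive) loads

    def get(self, ref, load_fn):
        data = self.store.get(ref)
        if data is None:
            data = load_fn(ref)  # expensive read from the data source
            self.loads += 1
        self.pos += 1
        # keep the layer only if it is referenced again later on
        if ref in self.seq[self.pos:]:
            self.store[ref] = data
        else:
            self.store.pop(ref, None)
        return data

# usage: "B" is referenced twice, so it is loaded only once
seq = ["A", "B", "C", "B"]
cache = LayerCache(seq)
results = [cache.get(r, load_fn=lambda ref: "layer-" + ref) for r in seq]
print(cache.loads)  # prints 3 rather than 4: the second "B" came from the cache
```

Note how the eviction decision needs nothing beyond the reference sequence itself, which is exactly what the preview run provides.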
{
@@ -89,7 +70,7 @@
"metadata": {},
"outputs": [],
"source": [
"from semantique.processor.core import QueryProcessor\n",
"from semantique.processor.core import FakeProcessor, QueryProcessor\n",
"\n",
"# define a simple recipe for a cloudfree composite\n",
"recipe = sq.QueryRecipe()\n",
@@ -133,10 +114,10 @@
}
],
"source": [
"# step I: preview run\n",
"qp = QueryProcessor.parse(recipe, **{**context, \"preview\": True})\n",
"qp.optimize().execute()\n",
"qp.cache.seq"
"# step I: fake run\n",
"fp = FakeProcessor.parse(recipe, **context)\n",
"fp.optimize().execute()\n",
"fp.cache.seq"
]
},
{
@@ -157,7 +138,7 @@
],
"source": [
"# step II: query processor execution\n",
"qp = QueryProcessor.parse(recipe, **{**context, \"cache\": qp.cache})\n",
"qp = QueryProcessor.parse(recipe, **{**context, \"cache\": fp.cache})\n",
"result = qp.optimize().execute()\n",
"result[\"composite\"].shape"
]
@@ -166,9 +147,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"As you can see, the preview run resolves the references to the data layers by looking up the entities' references in the mapping.json. Note that in the current case the result is not that interesting, since four different data layers are to be loaded. There is thus nothing to be cached during recipe execution, and the QueryProcessor will load all data layers from the referenced sources without storing any of them in the cache. \n",
"As you can see, the FakeProcessor run resolves the references to the data layers by looking up the entities' references in the mapping.json. Note that in the current case the result is not that interesting, since four different data layers are to be loaded. There is thus nothing to be cached during recipe execution, and the QueryProcessor will load all data layers from the referenced sources without storing any of them in the cache. \n",
"\n",
"As a user, however, you can directly initiate the entire caching workflow (preview & full resolution recipe execution) by setting the context parameter when calling `recipe.execute(..., cache_data = True)`. "
"As a user, however, you can directly initiate the entire caching workflow (preview & full resolution recipe execution) by setting the context parameter when calling `recipe.execute(..., cache_data = True)`. This is enabled by default."
]
},
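The single-call workflow mentioned above can be sketched as follows. All names here are illustrative stand-ins, not the semantique API: the point is only that one entry point wraps both the preview pass (which fills the cache) and the full-resolution pass (which reuses it).

```python
# Hypothetical sketch of what a combined caching entry point wraps:
# a preview pass that records the data-reference order, followed by a
# full pass that reuses the filled cache. Illustrative names only.

def execute_with_cache(parse, recipe, **context):
    # step I: preview run resolves references and fills the cache
    preview = parse(recipe, **{**context, "preview": True})
    preview.optimize().execute()
    # step II: full-resolution run reuses the preview's cache
    full = parse(recipe, **{**context, "cache": preview.cache})
    return full.optimize().execute()

# tiny stand-in processor to show the call pattern
class Demo:
    def __init__(self, cache=None, preview=False):
        # both passes share the same cache object; preview flag unused here
        self.cache = cache if cache is not None else []
    def optimize(self):
        return self
    def execute(self):
        self.cache.append("ran")
        return self.cache

result = execute_with_cache(
    lambda recipe, preview=False, cache=None, **kw: Demo(cache, preview),
    "recipe",
)
print(result)  # prints ['ran', 'ran']: both passes ran against one shared cache
```

The design choice is simply that the preview pass is cheap enough to run unconditionally, so a single flag can hide the two-step dance from the user.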
{
@@ -196,7 +177,7 @@
"* the redundancy of the data references in the recipe, i.e. if layers are called multiple times, loading them from the cache will reduce the overall time significantly\n",
"* the data source (EO data cube) from which they are loaded\n",
"\n",
"Especially for the latter it should be noted that in this demo only data loaded from a locally stored geotiff (i.e. the GeoTiffArchive layout) are analysed. This is sort of the worst case for demonstrating the benefits of caching, since the data is stored locally and is therefore quickly accessible. Also, geotiffs that are not stored in a cloud-optimised format (COGs) require loading the whole data into memory even when running in preview mode, just to evaluate the sequence of data layers.\n",
"Especially for the latter it should be noted that in this demo only data loaded from a locally stored geotiff (i.e. the GeoTiffArchive layout) are analysed. This is sort of the worst case for demonstrating the benefits of caching, since the data is stored locally and is therefore quickly accessible.\n",
"\n",
"Consequently, you will observe that in almost all of the following cases, caching actually adds a small computational overhead. Keep in mind, however, that caching is designed for, and particularly beneficial in the case of, STACCubes when loading data over the internet."
]
@@ -217,7 +198,7 @@
" \"crs\": 3035, \n",
" \"tz\": \"UTC\", \n",
" \"spatial_resolution\": [-10, 10],\n",
" \"caching\": caching\n",
" \"cache_data\": caching\n",
" }\n",
" res = recipe.execute(**context)"
]
@@ -248,7 +229,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"649 ms ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
"640 ms ± 3.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
@@ -267,7 +248,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"998 ms ± 5.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
"703 ms ± 18.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
@@ -300,7 +281,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"5.09 s ± 61.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
"5.28 s ± 72.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
@@ -319,7 +300,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"5.27 s ± 51.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
"5.51 s ± 106 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
@@ -352,7 +333,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"499 ms ± 5.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
"495 ms ± 7.52 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
@@ -371,7 +352,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"547 ms ± 4.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
"283 ms ± 1.64 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
@@ -385,139 +366,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The more expressive examples for the STACCube are provided below. Note that they can't be executed for now (as STACCube is currently still under development and not yet merged into the main branch). The question of whether caching brings significant advantages when loading data from a well-indexed OpenDataCube stored on quickly accessible hot storage remains to be assessed. "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from pystac_client import Client\n",
"from shapely.geometry import box\n",
"from semantique.processor.core import QueryProcessor\n",
"import warnings\n",
"\n",
"# define temporal & spatial range to perform STAC query\n",
"xmin, ymin, xmax, ymax = 13.25,54.25,13.75,54.75\n",
"aoi = box(xmin, ymin, xmax, ymax)\n",
"t_range = [\"2020-07-15\", \"2020-09-01\"]\n",
"\n",
"# STAC-based metadata retrieval\n",
"import planetary_computer as pc\n",
"platform = \"Planet\"\n",
"catalog = Client.open(\n",
" \"https://planetarycomputer.microsoft.com/api/stac/v1\",\n",
" modifier=pc.sign_inplace,\n",
")\n",
"query = catalog.search(\n",
" collections=\"sentinel-2-l2a\", \n",
" datetime=t_range, \n",
" limit=100, \n",
" intersects=aoi\n",
")\n",
"item_coll = query.item_collection()\n",
"\n",
"# define datacube\n",
"with open(\"layout_planet.json\", \"r\") as file:\n",
" dc = sq.datacube.STACCube(\n",
" json.load(file), \n",
" src = item_coll,\n",
" dtype=\"int8\",\n",
" na_value=0,\n",
" )\n",
" \n",
"# define spatio-temporal context vars \n",
"res = 20\n",
"epsg = 3035\n",
"space = sq.SpatialExtent(gpd.GeoDataFrame(geometry=[aoi], crs = 4326))\n",
"time = sq.TemporalExtent(*t_range)\n",
"\n",
"# load mapping\n",
"with open(\"mapping.json\", \"r\") as file:\n",
" rules = json.load(file)\n",
"mapping = sq.mapping.Semantique(rules)\n",
"\n",
"# define recipe\n",
"recipe = sq.QueryRecipe()\n",
"recipe[\"green_map\"] = (\n",
" sq.entity(\"vegetation\")\n",
" .filter(sq.entity(\"cloud\").evaluate(\"not\"))\n",
" .reduce(\"percentage\", \"time\")\n",
")\n",
"recipe[\"all_count\"] = (\n",
" sq.entity(\"all\")\n",
" .reduce(\"count\", \"time\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# normal execution (no caching/no preview)\n",
"context = {\n",
" \"datacube\": dc,\n",
" \"mapping\": mapping,\n",
" \"space\": space,\n",
" \"time\": time,\n",
" \"crs\": epsg,\n",
" \"tz\": \"UTC\",\n",
" \"spatial_resolution\": [-res, res]\n",
"}\n",
"\n",
"with warnings.catch_warnings():\n",
" warnings.simplefilter(\"ignore\", UserWarning)\n",
" response = recipe.execute(**context)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# preview mode\n",
"context = {\n",
" \"datacube\": dc,\n",
" \"mapping\": mapping,\n",
" \"space\": space,\n",
" \"time\": time,\n",
" \"crs\": epsg,\n",
" \"tz\": \"UTC\",\n",
" \"spatial_resolution\": [-res, res],\n",
" \"preview\": True\n",
"}\n",
"\n",
"with warnings.catch_warnings():\n",
" warnings.simplefilter(\"ignore\", UserWarning)\n",
" response = recipe.execute(**context)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# caching mode\n",
"context = {\n",
" \"datacube\": dc,\n",
" \"mapping\": mapping,\n",
" \"space\": space,\n",
" \"time\": time,\n",
" \"crs\": epsg,\n",
" \"tz\": \"UTC\",\n",
" \"spatial_resolution\": [-res, res],\n",
" \"caching\": True\n",
"}\n",
"\n",
"with warnings.catch_warnings():\n",
" warnings.simplefilter(\"ignore\", UserWarning)\n",
" response = recipe.execute(**context)"
"The more expressive examples for the STACCube are provided below. The question of whether caching brings significant advantages when loading data from a well-indexed OpenDataCube stored on quickly accessible hot storage remains to be assessed. "
]
}
],
@@ -537,7 +386,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
"version": "3.10.1"
}
},
"nbformat": 4,
