From 681c6910c228967cb5fa99b239a214a7525d6da8 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 27 Jun 2023 13:56:32 +0300 Subject: [PATCH 01/37] Straighten the spec for timeline delete (#4538) ## Problem Lets keep 500 for unusual stuff that is not considered normal. Came up during one of the discussions around console logs now seeing this 500's. ## Summary of changes - Return 409 Conflict instead of 500 - Remove 200 ok status because it is not used anymore --- pageserver/src/http/openapi_spec.yml | 10 +++++++--- pageserver/src/http/routes.rs | 1 + pageserver/src/tenant.rs | 18 ++++++++++-------- test_runner/regress/test_timeline_delete.py | 8 ++++---- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 50614653be04..33a2e321419f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -186,10 +186,8 @@ paths: schema: $ref: "#/components/schemas/Error" delete: - description: "Attempts to delete specified timeline. On 500 errors should be retried" + description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" responses: - "200": - description: Ok "400": description: Error when no tenant id found in path or no timeline id content: @@ -214,6 +212,12 @@ paths: application/json: schema: $ref: "#/components/schemas/NotFoundError" + "409": + description: Deletion is already in progress, continue polling + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" "412": description: Tenant is missing, or timeline has children content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8d3bb5552bae..0e12cc2b1e0f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -187,6 +187,7 @@ impl From for ApiError { format!("Cannot delete timeline which has child timelines: {children:?}") .into_boxed_str(), ), + a @ AlreadyInProgress => ApiError::Conflict(a.to_string()), Other(e) => ApiError::InternalServerError(e), } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0e8d6b128703..ce1e5bf51fcc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -440,8 +440,13 @@ pub enum GetTimelineError { pub enum DeleteTimelineError { #[error("NotFound")] NotFound, + #[error("HasChildren")] HasChildren(Vec), + + #[error("Timeline deletion is already in progress")] + AlreadyInProgress, + #[error(transparent)] Other(#[from] anyhow::Error), } @@ -1755,14 +1760,11 @@ impl Tenant { timeline = Arc::clone(timeline_entry.get()); // Prevent two tasks from trying to delete the timeline at the same time. - delete_lock_guard = - DeletionGuard(Arc::clone(&timeline.delete_lock).try_lock_owned().map_err( - |_| { - DeleteTimelineError::Other(anyhow::anyhow!( - "timeline deletion is already in progress" - )) - }, - )?); + delete_lock_guard = DeletionGuard( + Arc::clone(&timeline.delete_lock) + .try_lock_owned() + .map_err(|_| DeleteTimelineError::AlreadyInProgress)?, + ); // If another task finished the deletion just before we acquired the lock, // return success. 
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index ddd9ffd75526..7c3424cf329a 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -463,10 +463,10 @@ def first_call_hit_failpoint(): # make the second call and assert behavior log.info("second call start") - error_msg_re = "timeline deletion is already in progress" + error_msg_re = "Timeline deletion is already in progress" with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err: ps_http.timeline_delete(env.initial_tenant, child_timeline_id) - assert second_call_err.value.status_code == 500 + assert second_call_err.value.status_code == 409 env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*") # the second call will try to transition the timeline into Stopping state as well env.pageserver.allowed_errors.append( @@ -518,9 +518,9 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2) env.pageserver.allowed_errors.append( - f".*{child_timeline_id}.*timeline deletion is already in progress.*" + f".*{child_timeline_id}.*Timeline deletion is already in progress.*" ) - with pytest.raises(PageserverApiException, match="timeline deletion is already in progress"): + with pytest.raises(PageserverApiException, match="Timeline deletion is already in progress"): ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2) # make sure the timeout was due to the failpoint From d748615c1f08e5af105323092eb215b00daaeac4 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Tue, 27 Jun 2023 15:01:32 +0300 Subject: [PATCH 02/37] RemoteTimelineClient::delete_all() to use s3::delete_objects (#4461) ## Problem [#4325](https://github.com/neondatabase/neon/issues/4325) ## Summary of changes Use delete_objects() method --- libs/remote_storage/tests/test_real_s3.rs | 15 +++++++++++++++ pageserver/src/tenant/remote_timeline_client.rs | 6 ++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 6fe65a0362b7..0917bab39cd4 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -173,10 +173,15 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,))) .with_context(|| "RemotePath conversion")?; + let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,))) + .with_context(|| "RemotePath conversion")?; + let data1 = "remote blob data1".as_bytes(); let data1_len = data1.len(); let data2 = "remote blob data2".as_bytes(); let data2_len = data2.len(); + let data3 = "remote blob data3".as_bytes(); + let data3_len = data3.len(); ctx.client .upload(std::io::Cursor::new(data1), data1_len, &path1, None) .await?; @@ -185,8 +190,18 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> .upload(std::io::Cursor::new(data2), data2_len, &path2, None) .await?; + ctx.client + .upload(std::io::Cursor::new(data3), data3_len, &path3, None) + .await?; + ctx.client.delete_objects(&[path1, path2]).await?; + let prefixes = ctx.client.list_prefixes(None).await?; + + assert_eq!(prefixes.len(), 1); + + ctx.client.delete_objects(&[path3]).await?; + Ok(()) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs 
b/pageserver/src/tenant/remote_timeline_client.rs index 7808b64d351c..190512f48f0d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -862,10 +862,8 @@ impl RemoteTimelineClient { "Found {} files not bound to index_file.json, proceeding with their deletion", remaining.len() ); - for file in remaining { - warn!("Removing {}", file.object_name().unwrap_or_default()); - self.storage_impl.delete(&file).await?; - } + warn!("About to remove {} files", remaining.len()); + self.storage_impl.delete_objects(&remaining).await?; } let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME)); From 250a27fb853459fbf6d71316e1f7136e0087015b Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 27 Jun 2023 16:23:22 +0100 Subject: [PATCH 03/37] Upload Postgres Extensions to S3 (#4505) ## Problem We want to store Postgres Extensions in S3 (resolves https://github.com/neondatabase/neon/issues/4493). Proposed solution: - Create a separate docker image (from scratch) that contains only extensions - Do not include extensions into compute-node (except for neon extensions)* - For release and main builds upload extract extension from the image and upload to S3 (`s3://///...`)** *) We're not doing it until the feature is not fully implemented **) This differs from the initial proposal in https://github.com/neondatabase/neon/issues/4493 of putting extensions straight into `s3:////...`, because we can't upload directory atomicly. A drawback of this is that we end up with unnecessary copies of files ~2.1 GB per release (i.e. +2.1 GB for each commit in main also). We don't really need to update extensions for each release if there're no relevant changes, but this requires extra work. ## Summary of changes - Created a separate stage in Dockerfile.compute-node `postgres-extensions` that contains only extensions - Added a separate step in a workflow that builds `postgres-extensions` image (because of a bug in kaniko this step is commented out because it takes way too long to get built) - Extract extensions from the image and upload files to S3 for release and main builds - Upload extenstions only for staging (for now) --- .github/workflows/build_and_test.yml | 91 ++++++++++++++++++++++++++++ Dockerfile.compute-node | 9 +++ 2 files changed, 100 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8215c0229178..d92337e780b6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -722,6 +722,33 @@ jobs: --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + --cleanup + + # Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min) + # During the transition period we need to have extensions in both places (in S3 and in compute-node image), + # so we won't build extension twice, but extract them from compute-node. + # + # - name: Kaniko build extensions only + # run: | + # # Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't. 
+ # # Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0), + # # it still fails with error: + # # error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists + # # + # # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406 + # find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \; + # + # /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \ + # --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \ + # --context . \ + # --build-arg GIT_VERSION=${{ github.sha }} \ + # --build-arg PG_VERSION=${{ matrix.version }} \ + # --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \ + # --dockerfile Dockerfile.compute-node \ + # --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ + # --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ + # --cleanup \ + # --target postgres-extensions # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder @@ -842,6 +869,8 @@ jobs: crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + # TODO: Uncomment when we start to build this image in `compute-node-image` + # crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions:${{needs.tag.outputs.build-tag}} latest - name: Push images to production ECR if: | @@ -854,6 +883,8 @@ jobs: crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest + # TODO: Uncomment when we start to build this image in `compute-node-image` + # crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions:latest - name: Configure Docker Hub login run: | @@ -877,10 +908,70 @@ jobs: crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/extensions:${{needs.tag.outputs.build-tag}} latest - name: Cleanup ECR folder run: rm -rf ~/.ecr + upload-postgres-extensions-to-s3: + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }} + needs: [ tag, promote-images ] + strategy: + fail-fast: false + matrix: + version: [ 
v14, v15 ] + + env: + # While on transition period extract extensions from compute-node image (see a comment above for compute-node-image job) + # EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest + EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest + # In compute image we have a bit different directory layout (/usr/local/pgsql put into /usr/local). + # This variable can be inlined after + # USR_LOCAL_PGSQL_PATH: /usr/local/pgsql + USR_LOCAL_PGSQL_PATH: /usr/local + AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }} + S3_BUCKETS: | + ${{ github.ref_name == 'release' && + '["neon-prod-extensions-ap-southeast-1", "neon-prod-extensions-eu-central-1", "neon-prod-extensions-us-east-1", "neon-prod-extensions-us-east-2", "neon-prod-extensions-us-west-2"]' || + '["neon-dev-extensions-eu-central-1", "neon-dev-extensions-eu-west-1", "neon-dev-extensions-us-east-2"]' }} + + steps: + - name: Pull postgres-extensions image + run: | + docker pull ${EXTENSIONS_IMAGE} + + - name: Create postgres-extensions container + id: create-container + run: | + CID=$(docker create ${EXTENSIONS_IMAGE} true) + echo "CID=${CID}" >> $GITHUB_OUTPUT + + - name: Extract postgres-extensions from container + run: | + rm -rf ./usr-local-pgsql # Just in case + docker cp ${{ steps.create-container.outputs.CID }}:${USR_LOCAL_PGSQL_PATH} ./usr-local-pgsql + + # Delete Neon extensitons (they always present on compute-node image) + rm -rf ./usr-local-pgsql/share/extension/neon* + rm -rf ./usr-local-pgsql/lib/neon* + + - name: Upload postgres-extensions to S3 + run: | + for BUCKET in $(echo ${S3_BUCKETS} | jq --raw-output '.[]'); do + # Source directories are specified in Dockerfile.compute-node for postgres-extensions target + aws s3 cp --recursive --only-show-errors ./usr-local-pgsql/share/extension s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}/share/extension + aws s3 cp --recursive --only-show-errors ./usr-local-pgsql/lib s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}/lib + done + + - name: Cleanup + if: ${{ always() && steps.create-container.outputs.CID }} + run: | + docker rm ${{ steps.create-container.outputs.CID }} + deploy: runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index d69b2cf17470..682d49b9021f 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -698,6 +698,15 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. 
RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### +# +# Extenstion only +# +######################################################################################### +FROM scratch AS postgres-extensions +COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension +COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib + ######################################################################################### # # Final layer From ef2b9ffbcb34d43f968f9c9949732ec55956f4af Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 27 Jun 2023 12:05:27 -0400 Subject: [PATCH 04/37] Basebackup forward compatibility (#4572) --- pageserver/src/page_service.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d32518b513b7..8728559d7287 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -904,7 +904,7 @@ where self.check_permission(Some(tenant_id))?; - let lsn = if params.len() == 3 { + let lsn = if params.len() >= 3 { Some( Lsn::from_str(params[2]) .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, From 7fe0a4bf1a47b03d8f866d0f73799f0cae928355 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 27 Jun 2023 18:05:10 +0100 Subject: [PATCH 05/37] Fix promote-images job (#4577) ## Problem ``` + crane tag neondatabase/extensions:3337 latest Error: fetching "neondatabase/extensions:3337": GET https://index.docker.io/v2/neondatabase/extensions/manifests/3337: MANIFEST_UNKNOWN: manifest unknown; unknown tag=3337 ``` We don't build `neondatabase/extensions` image yet (broken in https://github.com/neondatabase/neon/pull/4505) ## Summary of changes - Do not try to interact with `neondatabase/extensions` --- .github/workflows/build_and_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d92337e780b6..1bb6568d66b6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -908,7 +908,8 @@ jobs: crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/extensions:${{needs.tag.outputs.build-tag}} latest + # TODO: Uncomment when we start to build this image in `compute-node-image` + # crane tag neondatabase/extensions:${{needs.tag.outputs.build-tag}} latest - name: Cleanup ECR folder run: rm -rf ~/.ecr From 195d4932c6d6ec3787b3543a8322ff0025d7daaf Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 28 Jun 2023 09:04:13 +0300 Subject: [PATCH 06/37] Set LwLSN after WAL records when redo is performed or skipped (#4579) ## Problem See #4516 Inspecting log it is possible to notice that if lwlsn is set to the beginning of applied WAL record, then incorrect version of the page is loaded: ``` 2023-06-27 18:36:51.930 GMT [3273945] CONTEXT: WAL redo at 0/14AF6F0 for Heap/INSERT: off 2 flags 0x01; blkref #0: rel 1663/5/1259, blk 0 FPW 2023-06-27 18:36:51.930 GMT [3273945] LOG: Do REDO block 0 of rel 1663/5/1259 fork 0 at LSN 0/**014AF6F0**..0/014AFA60 2023-06-27 18:37:02.173 GMT [3273963] LOG: Read blk 0 in rel 1663/5/1259 fork 0 (request LSN 0/**014AF6F0**): lsn=0/**0143C7F8** at 
character 22 2023-06-27 18:37:47.780 GMT [3273945] LOG: apply WAL record at 0/1BB8F38 xl_tot_len=188, xl_prev=0/1BB8EF8 2023-06-27 18:37:47.780 GMT [3273945] CONTEXT: WAL redo at 0/1BB8F38 for Heap/INPLACE: off 2; blkref #0: rel 1663/5/1259, blk 0 2023-06-27 18:37:47.780 GMT [3273945] LOG: Do REDO block 0 of rel 1663/5/1259 fork 0 at LSN 0/01BB8F38..0/01BB8FF8 2023-06-27 18:37:47.780 GMT [3273945] CONTEXT: WAL redo at 0/1BB8F38 for Heap/INPLACE: off 2; blkref #0: rel 1663/5/1259, blk 0 2023-06-27 18:37:47.780 GMT [3273945] PANIC: invalid lp ``` ## Summary of changes 1. Use end record LSN for both cases 2. Update lwlsn for relation metadata ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- pgxn/neon/pagestore_smgr.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 528d4eb0511f..79dec0881dc4 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -2675,7 +2675,6 @@ bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) { XLogRecPtr end_recptr = record->EndRecPtr; - XLogRecPtr prev_end_recptr = record->ReadRecPtr - 1; RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; @@ -2719,16 +2718,15 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) no_redo_needed = buffer < 0; - /* we don't have the buffer in memory, update lwLsn past this record */ + /* In both cases st lwlsn past this WAL record */ + SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno); + + /* we don't have the buffer in memory, update lwLsn past this record, + * also evict page fro file cache + */ if (no_redo_needed) - { - SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno); lfc_evict(rnode, forknum, blkno); - } - else - { - SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno); - } + LWLockRelease(partitionLock); @@ -2736,7 +2734,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) if (get_cached_relsize(rnode, forknum, &relsize)) { if (relsize < blkno + 1) + { update_cached_relsize(rnode, forknum, blkno + 1); + SetLastWrittenLSNForRelation(end_recptr, rnode, forknum); + } } else { @@ -2768,6 +2769,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) Assert(nbresponse->n_blocks > blkno); set_cached_relsize(rnode, forknum, nbresponse->n_blocks); + SetLastWrittenLSNForRelation(end_recptr, rnode, forknum); elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks); } From 02ef246db6727bbac1564262e473d071cc93a2c8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 28 Jun 2023 09:18:45 +0300 Subject: [PATCH 07/37] refactor: to pattern of await after timeout (#4432) Refactor the `!completed` to be about `Option<_>` instead, side-stepping any boolean true/false or false/true. 
As discussed on https://github.com/neondatabase/neon/pull/4399#discussion_r1219321848 --- pageserver/src/task_mgr.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index d8db12a113a1..13db38d95678 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -506,17 +506,17 @@ pub async fn shutdown_tasks( warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); } } - let completed = tokio::select! { + let join_handle = tokio::select! { biased; - _ = &mut join_handle => { true }, + _ = &mut join_handle => { None }, _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => { // allow some time to elapse before logging to cut down the number of log // lines. info!("waiting for {} to shut down", task.name); - false + Some(join_handle) } }; - if !completed { + if let Some(join_handle) = join_handle { // we never handled this return value, but: // - we don't deschedule which would lead to is_cancelled // - panics are already logged (is_panicked) From ec9b585837d08c607a5ddfae86cc20bd4adce693 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Wed, 28 Jun 2023 12:35:52 +0300 Subject: [PATCH 08/37] Add Activating as a possible state for attaching a tenant in test_tenant_relocation validation (#4581) Fix flakyness of test_tenant_relocation --- test_runner/regress/test_tenant_relocation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 2a5b30803b45..70b931d7f941 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -419,8 +419,6 @@ def test_tenant_relocation( new_pageserver_http.tenant_attach(tenant_id) # wait for tenant to finish attaching - tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id) - assert tenant_status["state"]["slug"] in ["Attaching", "Active"] wait_until( number_of_iterations=10, interval=1, From c19681bc12dc7749e50ad3f119a633daa582a7b0 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 28 Jun 2023 11:39:07 -0400 Subject: [PATCH 09/37] neon_local: support force init (#4363) When we use local SSD for bench and create `.neon` directory before we do `cargo neon init`, the initialization process will error due to directory already exists. This PR adds a flag `--force` that removes everything inside the directory if `.neon` already exists. --------- Signed-off-by: Alex Chi Z. --- control_plane/src/bin/neon_local.rs | 11 +++++++++- control_plane/src/local_env.rs | 34 +++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 52af936d7b7d..8995a1856468 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -308,7 +308,8 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { let mut env = LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - env.init(pg_version) + let force = init_match.get_flag("force"); + env.init(pg_version, force) .context("Failed to initialize neon repository")?; // Initialize pageserver, create initial tenant and timeline. 
@@ -1013,6 +1014,13 @@ fn cli() -> Command { .help("If set, the node will be a hot replica on the specified timeline") .required(false); + let force_arg = Arg::new("force") + .value_parser(value_parser!(bool)) + .long("force") + .action(ArgAction::SetTrue) + .help("Force initialization even if the repository is not empty") + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1028,6 +1036,7 @@ fn cli() -> Command { .value_name("config"), ) .arg(pg_version_arg.clone()) + .arg(force_arg) ) .subcommand( Command::new("timeline") diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index df70cb313928..208eb9e7ec0b 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -364,7 +364,7 @@ impl LocalEnv { // // Initialize a new Neon repository // - pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> { + pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> { // check if config already exists let base_path = &self.base_data_dir; ensure!( @@ -372,11 +372,29 @@ impl LocalEnv { "repository base path is missing" ); - ensure!( - !base_path.exists(), - "directory '{}' already exists. Perhaps already initialized?", - base_path.display() - ); + if base_path.exists() { + if force { + println!("removing all contents of '{}'", base_path.display()); + // instead of directly calling `remove_dir_all`, we keep the original dir but removing + // all contents inside. This helps if the developer symbol links another directory (i.e., + // S3 local SSD) to the `.neon` base directory. + for entry in std::fs::read_dir(base_path)? { + let entry = entry?; + let path = entry.path(); + if path.is_dir() { + fs::remove_dir_all(&path)?; + } else { + fs::remove_file(&path)?; + } + } + } else { + bail!( + "directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)", + base_path.display() + ); + } + } + if !self.pg_bin_dir(pg_version)?.join("postgres").exists() { bail!( "Can't find postgres binary at {}", @@ -392,7 +410,9 @@ impl LocalEnv { } } - fs::create_dir(base_path)?; + if !base_path.exists() { + fs::create_dir(base_path)?; + } // Generate keypair for JWT. // From 44e7d5132fe483c4c0ed1cdfcdfdfef5dfe92eba Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 29 Jun 2023 15:53:16 +0300 Subject: [PATCH 10/37] fix: hide token from logs (#4584) fixes #4583 and also changes all needlessly arg listing places to use `skip_all`. --- compute_tools/src/compute.rs | 16 ++++++++-------- compute_tools/src/configurator.rs | 2 +- compute_tools/src/pg_helpers.rs | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 6b549e198b52..87acefc1bbd8 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -235,7 +235,7 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. - #[instrument(skip(self, compute_state))] + #[instrument(skip_all, fields(%lsn))] fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); let start_time = Utc::now(); @@ -277,7 +277,7 @@ impl ComputeNode { // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. 
- #[instrument(skip(self, storage_auth_token))] + #[instrument(skip_all)] fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); @@ -322,7 +322,7 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. - #[instrument(skip(self, compute_state))] + #[instrument(skip_all)] pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; @@ -380,7 +380,7 @@ impl ComputeNode { /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. - #[instrument(skip(self))] + #[instrument(skip_all)] pub fn start_postgres( &self, storage_auth_token: Option, @@ -404,7 +404,7 @@ impl ComputeNode { } /// Do initial configuration of the already started Postgres. - #[instrument(skip(self, compute_state))] + #[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { // If connection fails, // it may be the old node with `zenith_admin` superuser. @@ -458,7 +458,7 @@ impl ComputeNode { // We could've wrapped this around `pg_ctl reload`, but right now we don't use // `pg_ctl` for start / stop, so this just seems much easier to do as we already // have opened connection to Postgres and superuser access. - #[instrument(skip(self, client))] + #[instrument(skip_all)] fn pg_reload_conf(&self, client: &mut Client) -> Result<()> { client.simple_query("SELECT pg_reload_conf()")?; Ok(()) @@ -466,7 +466,7 @@ impl ComputeNode { /// Similar to `apply_config()`, but does a bit different sequence of operations, /// as it's used to reconfigure a previously started and configured Postgres node. - #[instrument(skip(self))] + #[instrument(skip_all)] pub fn reconfigure(&self) -> Result<()> { let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; @@ -501,7 +501,7 @@ impl ComputeNode { Ok(()) } - #[instrument(skip(self))] + #[instrument(skip_all)] pub fn start_compute(&self) -> Result { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index a07fd0b8cd72..13550e01767d 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -8,7 +8,7 @@ use compute_api::responses::ComputeStatus; use crate::compute::ComputeNode; -#[instrument(skip(compute))] +#[instrument(skip_all)] fn configurator_main_loop(compute: &Arc) { info!("waiting for reconfiguration requests"); loop { diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index b76ed1fd853e..6a78bffd1b6d 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -215,7 +215,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { /// Wait for Postgres to become ready to accept connections. It's ready to /// accept connections when the state-field in `pgdata/postmaster.pid` says /// 'ready'. 
-#[instrument(skip(pg))] +#[instrument(skip_all, fields(pgdata = %pgdata.display()))] pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); From b2a5e91a888c083a06f5b66cb92887b2f7b998b2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 29 Jun 2023 14:33:26 +0100 Subject: [PATCH 11/37] Upload custom extensions to S3 (#4585) ## Problem We want to have a number of custom extensions that will not be available by default, as an example of such is [Postgres Anonymizer](https://postgresql-anonymizer.readthedocs.io/en/stable/) (please note that the extension should be added to `shared_preload_libraries`). To distinguish them, custom extensions should be added to a different S3 path: ``` s3://////share/extensions/ s3://////lib where is an extension name ``` Resolves https://github.com/neondatabase/neon/issues/4582 ## Summary of changes - Add Postgres Anonymizer extension to Dockerfile (it's included only to postgres-extensions target) - Build extensions image from postgres-extensions target in a workflow - Upload custom extensions to S3 (different directory) --- .github/workflows/build_and_test.yml | 110 +++++++++++++++------------ Dockerfile.compute-node | 32 +++++++- 2 files changed, 92 insertions(+), 50 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1bb6568d66b6..46dd89445196 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -728,27 +728,29 @@ jobs: # During the transition period we need to have extensions in both places (in S3 and in compute-node image), # so we won't build extension twice, but extract them from compute-node. # - # - name: Kaniko build extensions only - # run: | - # # Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't. - # # Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0), - # # it still fails with error: - # # error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists - # # - # # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406 - # find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \; - # - # /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \ - # --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \ - # --context . \ - # --build-arg GIT_VERSION=${{ github.sha }} \ - # --build-arg PG_VERSION=${{ matrix.version }} \ - # --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \ - # --dockerfile Dockerfile.compute-node \ - # --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ - # --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ - # --cleanup \ - # --target postgres-extensions + # For now we use extensions image only for new custom extensitons + - name: Kaniko build extensions only + run: | + # Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't. 
+ # Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0), + # it still fails with error: + # error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists + # + # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406 + find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \; + + /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \ + --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \ + --context . \ + --build-arg GIT_VERSION=${{ github.sha }} \ + --build-arg PG_VERSION=${{ matrix.version }} \ + --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \ + --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \ + --dockerfile Dockerfile.compute-node \ + --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ + --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ + --cleanup \ + --target postgres-extensions # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - name: Cleanup ECR folder @@ -867,10 +869,10 @@ jobs: crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - # TODO: Uncomment when we start to build this image in `compute-node-image` - # crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest - name: Push images to production ECR if: | @@ -881,10 +883,10 @@ jobs: crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest - # TODO: Uncomment when we start to build this 
image in `compute-node-image` - # crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest - name: Configure Docker Hub login run: | @@ -906,10 +908,10 @@ jobs: crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - # TODO: Uncomment when we start to build this image in `compute-node-image` - # crane tag neondatabase/extensions:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest - name: Cleanup ECR folder run: rm -rf ~/.ecr @@ -926,57 +928,69 @@ jobs: version: [ v14, v15 ] env: - # While on transition period extract extensions from compute-node image (see a comment above for compute-node-image job) - # EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest - EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest - # In compute image we have a bit different directory layout (/usr/local/pgsql put into /usr/local). - # This variable can be inlined after - # USR_LOCAL_PGSQL_PATH: /usr/local/pgsql - USR_LOCAL_PGSQL_PATH: /usr/local + # While on transition period we extract public extensions from compute-node image and custom extensions from extensions image. + # Later all the extensions will be moved to extensions image. 
+ EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest + COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }} S3_BUCKETS: | ${{ github.ref_name == 'release' && - '["neon-prod-extensions-ap-southeast-1", "neon-prod-extensions-eu-central-1", "neon-prod-extensions-us-east-1", "neon-prod-extensions-us-east-2", "neon-prod-extensions-us-west-2"]' || - '["neon-dev-extensions-eu-central-1", "neon-dev-extensions-eu-west-1", "neon-dev-extensions-us-east-2"]' }} + 'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' || + 'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }} steps: - name: Pull postgres-extensions image run: | docker pull ${EXTENSIONS_IMAGE} + docker pull ${COMPUTE_NODE_IMAGE} - name: Create postgres-extensions container id: create-container run: | - CID=$(docker create ${EXTENSIONS_IMAGE} true) + EID=$(docker create ${EXTENSIONS_IMAGE} true) + echo "EID=${EID}" >> $GITHUB_OUTPUT + + CID=$(docker create ${COMPUTE_NODE_IMAGE} true) echo "CID=${CID}" >> $GITHUB_OUTPUT - name: Extract postgres-extensions from container run: | - rm -rf ./usr-local-pgsql # Just in case - docker cp ${{ steps.create-container.outputs.CID }}:${USR_LOCAL_PGSQL_PATH} ./usr-local-pgsql + rm -rf ./extensions-to-upload ./custom-extensions # Just in case + + # In compute image we have a bit different directory layout + mkdir -p extensions-to-upload/share + docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension + docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib # Delete Neon extensitons (they always present on compute-node image) - rm -rf ./usr-local-pgsql/share/extension/neon* - rm -rf ./usr-local-pgsql/lib/neon* + rm -rf ./extensions-to-upload/share/extension/neon* + rm -rf ./extensions-to-upload/lib/neon* + + docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions + for EXT_NAME in $(ls ./custom-extensions); do + mkdir -p ./extensions-to-upload/${EXT_NAME}/share + + mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension + mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib + done - name: Upload postgres-extensions to S3 run: | - for BUCKET in $(echo ${S3_BUCKETS} | jq --raw-output '.[]'); do - # Source directories are specified in Dockerfile.compute-node for postgres-extensions target - aws s3 cp --recursive --only-show-errors ./usr-local-pgsql/share/extension s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}/share/extension - aws s3 cp --recursive --only-show-errors ./usr-local-pgsql/lib s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}/lib + for BUCKET in $(echo ${S3_BUCKETS}); do + aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }} done - name: Cleanup - if: ${{ always() 
&& steps.create-container.outputs.CID }} + if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }} run: | - docker rm ${{ steps.create-container.outputs.CID }} + docker rm ${{ steps.create-container.outputs.CID }} || true + docker rm ${{ steps.create-container.outputs.EID }} || true deploy: runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - needs: [ promote-images, tag, regress-tests ] + needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ] if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: - name: Fix git ownership diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 682d49b9021f..21877f6f240f 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -515,6 +515,26 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control +######################################################################################### +# +# Layer "pg-anon-pg-build" +# compile anon extension +# +######################################################################################### +FROM build-deps AS pg-anon-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \ + echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \ + mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \ + find /usr/local/pgsql -type f | sort > /before.txt && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ + find /usr/local/pgsql -type f | sort > /after.txt && \ + /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done' + ######################################################################################### # # Layer "rust extensions" @@ -623,6 +643,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz - # ######################################################################################### FROM build-deps AS neon-pg-ext-build +# Public extensions COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=postgis-build /sfcgal/* / COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -704,8 +725,15 @@ RUN rm /usr/local/pgsql/lib/lib*.a # ######################################################################################### FROM scratch AS postgres-extensions -COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension -COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib +# After the transition this layer will include all extensitons. 
+# As for now, it's only for new custom ones +# +# # Default extensions +# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension +# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib +# Custom extensions +COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib +COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension ######################################################################################### # From ca0e0781c878cf78b2872b0abc096dba2a3f0228 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 29 Jun 2023 09:56:17 -0400 Subject: [PATCH 12/37] use const instead of magic number for repartition threshold (#4286) There is a magic number about how often we repartition and therefore affecting how often we compact. This PR makes this number `10` a global constant and add docs. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 060000a01a6d..192aa6e22d97 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1298,6 +1298,9 @@ impl Timeline { } } +/// Number of times we will compute partition within a checkpoint distance. +const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; + // Private functions impl Timeline { fn get_checkpoint_distance(&self) -> u64 { @@ -1492,7 +1495,8 @@ impl Timeline { initial_logical_size_can_start, initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt), }; - result.repartition_threshold = result.get_checkpoint_distance() / 10; + result.repartition_threshold = + result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; result .metrics .last_record_gauge From 032b6030111b18ee5f25a90d03bf1d46784eb5a9 Mon Sep 17 00:00:00 2001 From: Alek Westover Date: Thu, 29 Jun 2023 10:55:02 -0400 Subject: [PATCH 13/37] Fix: Wrong Enum Variant (#4589) --- pageserver/src/pgdatadir_mapping.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 998c199ba6ca..a54cf9f91b70 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -887,7 +887,7 @@ impl<'a> DatadirModification<'a> { ctx: &RequestContext, ) -> Result<(), RelationError> { if rel.relnode == 0 { - return Err(RelationError::AlreadyExists); + return Err(RelationError::InvalidRelnode); } // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. From 7e20b49da473276479340d2e2efd42dc2b336236 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 29 Jun 2023 15:06:07 -0400 Subject: [PATCH 14/37] refactor: use LayerDesc in LayerMap (part 2) (#4437) ## Problem part of https://github.com/neondatabase/neon/issues/4392, continuation of https://github.com/neondatabase/neon/pull/4408 ## Summary of changes This PR removes all layer objects from LayerMap and moves it to the timeline struct. In timeline struct, LayerFileManager maps a layer descriptor to a layer object, and it is stored in the same RwLock as LayerMap to avoid behavior difference. Key changes: * LayerMap now does not have generic, and only stores descriptors. * In Timeline, we add a new struct called layer mapping. * Currently, layer mapping is stored in the same lock with layer map. 
Every time we retrieve data from the layer map, we will need to map the descriptor to the actual object. * Replace_historic is moved to layer mapping's replace, and the return value behavior is different from before. I'm a little bit unsure about this part and it would be good to have some comments on that. * Some test cases are rewritten to adapt to the new interface, and we can decide whether to remove it in the future because it does not make much sense now. * LayerDescriptor is moved to `tests` module and should only be intended for unit testing / benchmarks. * Because we now have a usage pattern like "take the guard of lock, then get the reference of two fields", we want to avoid dropping the incorrect object when we intend to unlock the lock guard. Therefore, a new set of helper function `drop_r/wlock` is added. This can be removed in the future when we finish the refactor. TODOs after this PR: fully remove RemoteLayer, and move LayerMapping to a separate LayerCache. all refactor PRs: ``` #4437 --- #4479 ------------ #4510 (refactor done at this point) \-- #4455 -- #4502 --/ ``` --------- Signed-off-by: Alex Chi Z --- pageserver/benches/bench_layer_map.rs | 31 +- pageserver/src/tenant.rs | 1 + pageserver/src/tenant/layer_map.rs | 345 ++++----------- .../layer_map/historic_layer_coverage.rs | 211 +-------- pageserver/src/tenant/storage_layer.rs | 197 +++++---- .../src/tenant/storage_layer/remote_layer.rs | 9 +- pageserver/src/tenant/timeline.rs | 409 +++++++++++++----- .../src/tenant/timeline/eviction_task.rs | 4 +- test_runner/regress/test_ondemand_download.py | 4 +- 9 files changed, 512 insertions(+), 699 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 45dc9fad4a5b..03bb7a5bfd90 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,22 +1,23 @@ use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName}; +use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName}; +use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::str::FromStr; -use std::sync::Arc; use std::time::Instant; +use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -fn build_layer_map(filename_dump: PathBuf) -> LayerMap { - let mut layer_map = LayerMap::::default(); +fn build_layer_map(filename_dump: PathBuf) -> LayerMap { + let mut layer_map = LayerMap::default(); let mut min_lsn = Lsn(u64::MAX); let mut max_lsn = Lsn(0); @@ -33,7 +34,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { min_lsn = min(min_lsn, lsn_range.start); max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1)); - updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer)); + updates.insert_historic(layer.layer_desc().clone()); } println!("min: {min_lsn}, max: {max_lsn}"); @@ -43,7 +44,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { } /// Construct a layer map query pattern for benchmarks -fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { +fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { // For each image layer we query one of the 
pages contained, at LSN right // before the image layer was created. This gives us a somewhat uniform // coverage of both the lsn and key space because image layers have @@ -69,7 +70,7 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn // Construct a partitioning for testing get_difficulty map when we // don't have an exact result of `collect_keyspace` to work with. -fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { +fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { let mut parts = Vec::new(); // We add a partition boundary at the start of each image layer, @@ -209,13 +210,15 @@ fn bench_sequential(c: &mut Criterion) { for i in 0..100_000 { let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); - let layer = LayerDescriptor { - key: zero.add(10 * i32)..zero.add(10 * i32 + 1), - lsn: Lsn(i)..Lsn(i + 1), - is_incremental: false, - short_id: format!("Layer {}", i), - }; - updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer)); + let layer = LayerDescriptor::from(PersistentLayerDesc::new_img( + TenantId::generate(), + TimelineId::generate(), + zero.add(10 * i32)..zero.add(10 * i32 + 1), + Lsn(i), + false, + 0, + )); + updates.insert_historic(layer.layer_desc().clone()); } updates.flush(); println!("Finished layer map init in {:?}", now.elapsed()); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ce1e5bf51fcc..47e9b5b4ec95 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -590,6 +590,7 @@ impl Tenant { .layers .read() .await + .0 .iter_historic_layers() .next() .is_some(), diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index ca1a71b62394..dee02ac433b3 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,25 +51,23 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; -use anyhow::Context; use anyhow::Result; -use std::collections::HashMap; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; use utils::lsn::Lsn; use historic_layer_coverage::BufferedHistoricLayerCoverage; -pub use historic_layer_coverage::Replacement; +pub use historic_layer_coverage::LayerKey; use super::storage_layer::range_eq; use super::storage_layer::PersistentLayerDesc; -use super::storage_layer::PersistentLayerKey; /// /// LayerMap tracks what layers exist on a timeline. /// -pub struct LayerMap { +#[derive(Default)] +pub struct LayerMap { // // 'open_layer' holds the current InMemoryLayer that is accepting new // records. If it is None, 'next_open_layer_at' will be set instead, indicating @@ -95,24 +93,6 @@ pub struct LayerMap { /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. l0_delta_layers: Vec>, - - /// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and - /// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and - /// RemoteLayer will be removed. 
- mapping: HashMap>, -} - -impl Default for LayerMap { - fn default() -> Self { - Self { - open_layer: None, - next_open_layer_at: None, - frozen_layers: VecDeque::default(), - l0_delta_layers: Vec::default(), - historic: BufferedHistoricLayerCoverage::default(), - mapping: HashMap::default(), - } - } } /// The primary update API for the layer map. @@ -120,24 +100,21 @@ impl Default for LayerMap { /// Batching historic layer insertions and removals is good for /// performance and this struct helps us do that correctly. #[must_use] -pub struct BatchedUpdates<'a, L: ?Sized + Layer> { +pub struct BatchedUpdates<'a> { // While we hold this exclusive reference to the layer map the type checker // will prevent us from accidentally reading any unflushed updates. - layer_map: &'a mut LayerMap, + layer_map: &'a mut LayerMap, } /// Provide ability to batch more updates while hiding the read /// API so we don't accidentally read without flushing. -impl BatchedUpdates<'_, L> -where - L: ?Sized + Layer, -{ +impl BatchedUpdates<'_> { /// /// Insert an on-disk layer. /// // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap` - pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc) { - self.layer_map.insert_historic_noflush(layer_desc, layer) + pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) { + self.layer_map.insert_historic_noflush(layer_desc) } /// @@ -145,31 +122,8 @@ where /// /// This should be called when the corresponding file on disk has been deleted. /// - pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc) { - self.layer_map.remove_historic_noflush(layer_desc, layer) - } - - /// Replaces existing layer iff it is the `expected`. - /// - /// If the expected layer has been removed it will not be inserted by this function. - /// - /// Returned `Replacement` describes succeeding in replacement or the reason why it could not - /// be done. - /// - /// TODO replacement can be done without buffering and rebuilding layer map updates. - /// One way to do that is to add a layer of indirection for returned values, so - /// that we can replace values only by updating a hashmap. - pub fn replace_historic( - &mut self, - expected_desc: PersistentLayerDesc, - expected: &Arc, - new_desc: PersistentLayerDesc, - new: Arc, - ) -> anyhow::Result>> { - fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound)); - - self.layer_map - .replace_historic_noflush(expected_desc, expected, new_desc, new) + pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) { + self.layer_map.remove_historic_noflush(layer_desc) } // We will flush on drop anyway, but this method makes it @@ -185,25 +139,19 @@ where // than panic later or read without flushing. // // TODO maybe warn if flush hasn't explicitly been called -impl Drop for BatchedUpdates<'_, L> -where - L: ?Sized + Layer, -{ +impl Drop for BatchedUpdates<'_> { fn drop(&mut self) { self.layer_map.flush_updates(); } } /// Return value of LayerMap::search -pub struct SearchResult { - pub layer: Arc, +pub struct SearchResult { + pub layer: Arc, pub lsn_floor: Lsn, } -impl LayerMap -where - L: ?Sized + Layer, -{ +impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given /// 'key', with lsn.start < 'end_lsn'. @@ -235,7 +183,7 @@ where /// NOTE: This only searches the 'historic' layers, *not* the /// 'open' and 'frozen' layers! 
/// - pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { + pub fn search(&self, key: Key, end_lsn: Lsn) -> Option { let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); @@ -244,7 +192,6 @@ where (None, None) => None, (None, Some(image)) => { let lsn_floor = image.get_lsn_range().start; - let image = self.get_layer_from_mapping(&image.key()).clone(); Some(SearchResult { layer: image, lsn_floor, @@ -252,7 +199,6 @@ where } (Some(delta), None) => { let lsn_floor = delta.get_lsn_range().start; - let delta = self.get_layer_from_mapping(&delta.key()).clone(); Some(SearchResult { layer: delta, lsn_floor, @@ -263,7 +209,6 @@ where let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; let image_exact_match = img_lsn + 1 == end_lsn; if image_is_newer || image_exact_match { - let image = self.get_layer_from_mapping(&image.key()).clone(); Some(SearchResult { layer: image, lsn_floor: img_lsn, @@ -271,7 +216,6 @@ where } else { let lsn_floor = std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); - let delta = self.get_layer_from_mapping(&delta.key()).clone(); Some(SearchResult { layer: delta, lsn_floor, @@ -282,7 +226,7 @@ where } /// Start a batch of updates, applied on drop - pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> { + pub fn batch_update(&mut self) -> BatchedUpdates<'_> { BatchedUpdates { layer_map: self } } @@ -292,48 +236,32 @@ where /// Helper function for BatchedUpdates::insert_historic /// /// TODO(chi): remove L generic so that we do not need to pass layer object. - pub(self) fn insert_historic_noflush( - &mut self, - layer_desc: PersistentLayerDesc, - layer: Arc, - ) { - self.mapping.insert(layer_desc.key(), layer.clone()); - + pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) { // TODO: See #3869, resulting #4088, attempted fix and repro #4094 - if Self::is_l0(&layer) { + if Self::is_l0(&layer_desc) { self.l0_delta_layers.push(layer_desc.clone().into()); } self.historic.insert( - historic_layer_coverage::LayerKey::from(&*layer), + historic_layer_coverage::LayerKey::from(&layer_desc), layer_desc.into(), ); } - fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc { - let layer = self - .mapping - .get(key) - .with_context(|| format!("{key:?}")) - .expect("inconsistent layer mapping"); - layer - } - /// /// Remove an on-disk layer from the map. 
/// /// Helper function for BatchedUpdates::remove_historic /// - pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc) { + pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) { self.historic - .remove(historic_layer_coverage::LayerKey::from(&*layer)); - if Self::is_l0(&layer) { + .remove(historic_layer_coverage::LayerKey::from(&layer_desc)); + let layer_key = layer_desc.key(); + if Self::is_l0(&layer_desc) { let len_before = self.l0_delta_layers.len(); let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); - l0_delta_layers.retain(|other| { - !Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer) - }); + l0_delta_layers.retain(|other| other.key() != layer_key); self.l0_delta_layers = l0_delta_layers; // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers, // there's a chance that the comparison fails at runtime due to it comparing (pointer, @@ -344,69 +272,6 @@ where "failed to locate removed historic layer from l0_delta_layers" ); } - self.mapping.remove(&layer_desc.key()); - } - - pub(self) fn replace_historic_noflush( - &mut self, - expected_desc: PersistentLayerDesc, - expected: &Arc, - new_desc: PersistentLayerDesc, - new: Arc, - ) -> anyhow::Result>> { - let key = historic_layer_coverage::LayerKey::from(&**expected); - let other = historic_layer_coverage::LayerKey::from(&*new); - - let expected_l0 = Self::is_l0(expected); - let new_l0 = Self::is_l0(&new); - - anyhow::ensure!( - key == other, - "expected and new must have equal LayerKeys: {key:?} != {other:?}" - ); - - anyhow::ensure!( - expected_l0 == new_l0, - "expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}" - ); - - let l0_index = if expected_l0 { - // find the index in case replace worked, we need to replace that as well - let pos = self.l0_delta_layers.iter().position(|slot| { - Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected) - }); - - if pos.is_none() { - return Ok(Replacement::NotFound); - } - pos - } else { - None - }; - - let new_desc = Arc::new(new_desc); - let replaced = self.historic.replace(&key, new_desc.clone(), |existing| { - **existing == expected_desc - }); - - if let Replacement::Replaced { .. } = &replaced { - self.mapping.remove(&expected_desc.key()); - self.mapping.insert(new_desc.key(), new); - if let Some(index) = l0_index { - self.l0_delta_layers[index] = new_desc; - } - } - - let replaced = match replaced { - Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered }, - Replacement::NotFound => Replacement::NotFound, - Replacement::RemovalBuffered => Replacement::RemovalBuffered, - Replacement::Unexpected(x) => { - Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone()) - } - }; - - Ok(replaced) } /// Helper function for BatchedUpdates::drop. 
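For illustration, here is a minimal, self-contained sketch (not from the patch itself) of the descriptor/object split this refactor introduces. The names `Desc`, `DescKey`, `Map`, `FileManager`, and `DummyLayer` are hypothetical stand-ins for `PersistentLayerDesc`, `PersistentLayerKey`, `LayerMap`, `LayerFileManager`, and the concrete layer types; the real signatures are the ones shown in the surrounding diff.

```rust
use std::collections::HashMap;
use std::sync::Arc;

/// Hypothetical stand-ins for `PersistentLayerKey` / `PersistentLayerDesc`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
struct DescKey(String);

#[derive(Clone, Debug)]
struct Desc {
    key: DescKey,
}

/// Hypothetical stand-in for the `PersistentLayer` trait.
trait Layer {
    fn desc(&self) -> &Desc;
}

struct DummyLayer {
    desc: Desc,
}

impl Layer for DummyLayer {
    fn desc(&self) -> &Desc {
        &self.desc
    }
}

/// The coverage structure stores only descriptors (cheap metadata)...
struct Map {
    historic: Vec<Desc>,
}

/// ...while a separate mapping resolves a descriptor to the actual object.
struct FileManager(HashMap<DescKey, Arc<dyn Layer>>);

impl FileManager {
    fn get_from_desc(&self, desc: &Desc) -> Arc<dyn Layer> {
        // Invariant: every descriptor in `Map` has an entry in the mapping.
        self.0
            .get(&desc.key)
            .expect("descriptor present in map but not in mapping")
            .clone()
    }
}

/// Read path: query the map for descriptors, then resolve each to its object.
fn historic_layers(map: &Map, mapping: &FileManager) -> Vec<Arc<dyn Layer>> {
    map.historic.iter().map(|d| mapping.get_from_desc(d)).collect()
}

fn main() {
    let desc = Desc {
        key: DescKey("000...-FFF...__0000000053424D69".into()),
    };
    let layer: Arc<dyn Layer> = Arc::new(DummyLayer { desc: desc.clone() });

    let map = Map { historic: vec![desc.clone()] };
    let mut mapping = FileManager(HashMap::new());
    mapping.0.insert(desc.key.clone(), layer);

    for l in historic_layers(&map, &mapping) {
        println!("resolved layer for {:?}", l.desc().key);
    }
}
```

The point of the split is that the coverage structure only ever handles cheap, cloneable metadata, while the mapping owns the `Arc`s to the actual (possibly remote or evicted) layer objects; `get_from_desc` relies on the invariant that the two are kept in sync.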
@@ -454,10 +319,8 @@ where Ok(true) } - pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { - self.historic - .iter() - .map(|x| self.get_layer_from_mapping(&x.key()).clone()) + pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { + self.historic.iter() } /// @@ -472,7 +335,7 @@ where &self, key_range: &Range, lsn: Lsn, - ) -> Result, Option>)>> { + ) -> Result, Option>)>> { let version = match self.historic.get().unwrap().get_version(lsn.0) { Some(v) => v, None => return Ok(vec![]), @@ -482,36 +345,26 @@ where let end = key_range.end.to_i128(); // Initialize loop variables - let mut coverage: Vec<(Range, Option>)> = vec![]; + let mut coverage: Vec<(Range, Option>)> = vec![]; let mut current_key = start; let mut current_val = version.image_coverage.query(start); // Loop through the change events and push intervals for (change_key, change_val) in version.image_coverage.range(start..end) { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); - coverage.push(( - kr, - current_val - .take() - .map(|l| self.get_layer_from_mapping(&l.key()).clone()), - )); + coverage.push((kr, current_val.take())); current_key = change_key; current_val = change_val.clone(); } // Add the final interval let kr = Key::from_i128(current_key)..Key::from_i128(end); - coverage.push(( - kr, - current_val - .take() - .map(|l| self.get_layer_from_mapping(&l.key()).clone()), - )); + coverage.push((kr, current_val.take())); Ok(coverage) } - pub fn is_l0(layer: &L) -> bool { + pub fn is_l0(layer: &PersistentLayerDesc) -> bool { range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX)) } @@ -537,7 +390,7 @@ where /// TODO The optimal number should probably be slightly higher than 1, but to /// implement that we need to plumb a lot more context into this function /// than just the current partition_range. - pub fn is_reimage_worthy(layer: &L, partition_range: &Range) -> bool { + pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range) -> bool { // Case 1 if !Self::is_l0(layer) { return true; @@ -595,9 +448,7 @@ where let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = - Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key) - as usize; + let base_count = Self::is_reimage_worthy(&val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; @@ -620,9 +471,7 @@ where let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = - Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key) - as usize; + let base_count = Self::is_reimage_worthy(&val, key) as usize; let new_limit = limit.map(|l| l - base_count); let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; max_stacked_deltas = std::cmp::max( @@ -772,12 +621,8 @@ where } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Result>> { - Ok(self - .l0_delta_layers - .iter() - .map(|x| self.get_layer_from_mapping(&x.key()).clone()) - .collect()) + pub fn get_level0_deltas(&self) -> Result>> { + Ok(self.l0_delta_layers.to_vec()) } /// debugging function to print out the contents of the layer map @@ -802,72 +647,51 @@ where println!("End dump LayerMap"); Ok(()) } - - /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables. - /// - /// Returns `true` if the two `Arc` point to the same layer, false otherwise. 
- #[inline(always)] - pub fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { - // "dyn Trait" objects are "fat pointers" in that they have two components: - // - pointer to the object - // - pointer to the vtable - // - // rust does not provide a guarantee that these vtables are unique, but however - // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the - // pointer and the vtable need to be equal. - // - // See: https://github.com/rust-lang/rust/issues/103763 - // - // A future version of rust will most likely use this form below, where we cast each - // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it - // not affect the comparison. - // - // See: https://github.com/rust-lang/rust/pull/106450 - let left = Arc::as_ptr(left) as *const (); - let right = Arc::as_ptr(right) as *const (); - - left == right - } } #[cfg(test)] mod tests { - use super::{LayerMap, Replacement}; - use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName}; + use super::LayerMap; + use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName}; use std::str::FromStr; use std::sync::Arc; mod l0_delta_layers_updated { + use crate::tenant::{ + storage_layer::{PersistentLayer, PersistentLayerDesc}, + timeline::LayerFileManager, + }; + use super::*; #[test] fn for_full_range_delta() { // l0_delta_layers are used by compaction, and should observe all buffered updates l0_delta_layers_updated_scenario( - "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69", - true - ) + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69", + true + ) } #[test] fn for_non_full_range_delta() { // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta l0_delta_layers_updated_scenario( - "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69", - // because not full range - false - ) + "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69", + // because not full range + false + ) } #[test] fn for_image() { l0_delta_layers_updated_scenario( - "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69", - // code only checks if it is a full range layer, doesn't care about images, which must - // mean we should in practice never have full range images - false - ) + "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69", + // code only checks if it is a full range layer, doesn't care about images, which must + // mean we should in practice never have full range images + false + ) } #[test] @@ -883,16 +707,16 @@ mod tests { let not_found = Arc::new(layer.clone()); let new_version = Arc::new(layer); - let mut map = LayerMap::default(); + // after the immutable storage state refactor, the replace operation + // will not use layer map any more. We keep it here for consistency in test cases + // and can remove it in the future. 
+ let _map = LayerMap::default(); - let res = map.batch_update().replace_historic( - not_found.get_persistent_layer_desc(), - ¬_found, - new_version.get_persistent_layer_desc(), - new_version, - ); + let mut mapping = LayerFileManager::new(); - assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}"); + mapping + .replace_and_verify(not_found, new_version) + .unwrap_err(); } fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) { @@ -903,49 +727,44 @@ mod tests { let downloaded = Arc::new(skeleton); let mut map = LayerMap::default(); + let mut mapping = LayerFileManager::new(); // two disjoint Arcs in different lifecycle phases. even if it seems they must be the // same layer, we use LayerMap::compare_arced_layers as the identity of layers. - assert!(!LayerMap::compare_arced_layers(&remote, &downloaded)); + assert_eq!(remote.layer_desc(), downloaded.layer_desc()); let expected_in_counts = (1, usize::from(expected_l0)); map.batch_update() - .insert_historic(remote.get_persistent_layer_desc(), remote.clone()); - assert_eq!(count_layer_in(&map, &remote), expected_in_counts); - - let replaced = map - .batch_update() - .replace_historic( - remote.get_persistent_layer_desc(), - &remote, - downloaded.get_persistent_layer_desc(), - downloaded.clone(), - ) + .insert_historic(remote.layer_desc().clone()); + mapping.insert(remote.clone()); + assert_eq!( + count_layer_in(&map, remote.layer_desc()), + expected_in_counts + ); + + mapping + .replace_and_verify(remote, downloaded.clone()) .expect("name derived attributes are the same"); - assert!( - matches!(replaced, Replacement::Replaced { .. }), - "{replaced:?}" + assert_eq!( + count_layer_in(&map, downloaded.layer_desc()), + expected_in_counts ); - assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts); map.batch_update() - .remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone()); - assert_eq!(count_layer_in(&map, &downloaded), (0, 0)); + .remove_historic(downloaded.layer_desc().clone()); + assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0)); } - fn count_layer_in(map: &LayerMap, layer: &Arc) -> (usize, usize) { + fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) { let historic = map .iter_historic_layers() - .filter(|x| LayerMap::compare_arced_layers(x, layer)) + .filter(|x| x.key() == layer.key()) .count(); let l0s = map .get_level0_deltas() .expect("why does this return a result"); - let l0 = l0s - .iter() - .filter(|x| LayerMap::compare_arced_layers(x, layer)) - .count(); + let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count(); (historic, l0) } diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 49dcbc63c2b7..0f515970272c 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -3,6 +3,8 @@ use std::ops::Range; use tracing::info; +use crate::tenant::storage_layer::PersistentLayerDesc; + use super::layer_coverage::LayerCoverageTuple; /// Layers in this module are identified and indexed by this data. 
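As an illustrative aside (not from the patch): with the coverage key derived from descriptor metadata alone, as the following hunk does by changing `LayerKey`'s `From` impl to take a `PersistentLayerDesc`, two distinct in-memory objects that describe the same layer, say a remote layer and its downloaded counterpart, compare equal without any pointer comparison. `MiniDesc` and `MiniKey` below are hypothetical simplifications of `PersistentLayerDesc` and `LayerKey`.

```rust
use std::ops::Range;

/// Hypothetical, simplified descriptor: just the coordinates of a layer.
#[derive(Clone, Debug)]
struct MiniDesc {
    key_range: Range<u64>,
    lsn_range: Range<u64>,
    is_delta: bool,
}

/// Hypothetical stand-in for `LayerKey`: derived purely from the descriptor.
#[derive(Clone, Debug, PartialEq, Eq)]
struct MiniKey {
    key: Range<u64>,
    lsn: Range<u64>,
    is_image: bool,
}

impl From<&MiniDesc> for MiniKey {
    fn from(d: &MiniDesc) -> Self {
        MiniKey {
            key: d.key_range.clone(),
            lsn: d.lsn_range.clone(),
            is_image: !d.is_delta,
        }
    }
}

fn main() {
    // Two distinct objects in memory that describe the same rectangle
    // produce equal coverage keys.
    let remote = MiniDesc { key_range: 0..10, lsn_range: 100..200, is_delta: true };
    let downloaded = remote.clone();
    assert_eq!(MiniKey::from(&remote), MiniKey::from(&downloaded));
    println!("same coverage key: {:?}", MiniKey::from(&remote));
}
```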
@@ -41,8 +43,8 @@ impl Ord for LayerKey { } } -impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerKey { - fn from(layer: &'a L) -> Self { +impl From<&PersistentLayerDesc> for LayerKey { + fn from(layer: &PersistentLayerDesc) -> Self { let kr = layer.get_key_range(); let lr = layer.get_lsn_range(); LayerKey { @@ -454,59 +456,6 @@ impl BufferedHistoricLayerCoverage { self.buffer.insert(layer_key, None); } - /// Replaces a previous layer with a new layer value. - /// - /// The replacement is conditional on: - /// - there is an existing `LayerKey` record - /// - there is no buffered removal for the given `LayerKey` - /// - the given closure returns true for the current `Value` - /// - /// The closure is used to compare the latest value (buffered insert, or existing layer) - /// against some expectation. This allows to use `Arc::ptr_eq` or similar which would be - /// inaccessible via `PartialEq` trait. - /// - /// Returns a `Replacement` value describing the outcome; only the case of - /// `Replacement::Replaced` modifies the map and requires a rebuild. - pub fn replace( - &mut self, - layer_key: &LayerKey, - new: Value, - check_expected: F, - ) -> Replacement - where - F: FnOnce(&Value) -> bool, - { - let (slot, in_buffered) = match self.buffer.get(layer_key) { - Some(inner @ Some(_)) => { - // we compare against the buffered version, because there will be a later - // rebuild before querying - (inner.as_ref(), true) - } - Some(None) => { - // buffer has removal for this key; it will not be equivalent by any check_expected. - return Replacement::RemovalBuffered; - } - None => { - // no pending modification for the key, check layers - (self.layers.get(layer_key), false) - } - }; - - match slot { - Some(existing) if !check_expected(existing) => { - // unfortunate clone here, but otherwise the nll borrowck grows the region of - // 'a to cover the whole function, and we could not mutate in the other - // Some(existing) branch - Replacement::Unexpected(existing.clone()) - } - None => Replacement::NotFound, - Some(_existing) => { - self.insert(layer_key.to_owned(), new); - Replacement::Replaced { in_buffered } - } - } - } - pub fn rebuild(&mut self) { // Find the first LSN that needs to be rebuilt let rebuild_since: u64 = match self.buffer.iter().next() { @@ -575,22 +524,6 @@ impl BufferedHistoricLayerCoverage { } } -/// Outcome of the replace operation. -#[derive(Debug)] -pub enum Replacement { - /// Previous value was replaced with the new value. - Replaced { - /// Replacement happened for a scheduled insert. - in_buffered: bool, - }, - /// Key was not found buffered updates or existing layers. - NotFound, - /// Key has been scheduled for removal, it was not replaced. - RemovalBuffered, - /// Previous value was rejected by the closure. 
- Unexpected(Value), -} - #[test] fn test_retroactive_regression_1() { let mut map = BufferedHistoricLayerCoverage::new(); @@ -699,139 +632,3 @@ fn test_retroactive_simple() { assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string())); } } - -#[test] -fn test_retroactive_replacement() { - let mut map = BufferedHistoricLayerCoverage::new(); - - let keys = [ - LayerKey { - key: 0..5, - lsn: 100..101, - is_image: true, - }, - LayerKey { - key: 3..9, - lsn: 110..111, - is_image: true, - }, - LayerKey { - key: 4..6, - lsn: 120..121, - is_image: true, - }, - ]; - - let layers = [ - "Image 1".to_string(), - "Image 2".to_string(), - "Image 3".to_string(), - ]; - - for (key, layer) in keys.iter().zip(layers.iter()) { - map.insert(key.to_owned(), layer.to_owned()); - } - - // rebuild is not necessary here, because replace works for both buffered updates and existing - // layers. - - for (key, orig_layer) in keys.iter().zip(layers.iter()) { - let replacement = format!("Remote {orig_layer}"); - - // evict - let ret = map.replace(key, replacement.clone(), |l| l == orig_layer); - assert!( - matches!(ret, Replacement::Replaced { .. }), - "replace {orig_layer}: {ret:?}" - ); - map.rebuild(); - - let at = key.lsn.end + 1; - - let version = map.get().expect("rebuilt").get_version(at).unwrap(); - assert_eq!( - version.image_coverage.query(4).as_deref(), - Some(replacement.as_str()), - "query for 4 at version {at} after eviction", - ); - - // download - let ret = map.replace(key, orig_layer.clone(), |l| l == &replacement); - assert!( - matches!(ret, Replacement::Replaced { .. }), - "replace {orig_layer} back: {ret:?}" - ); - map.rebuild(); - let version = map.get().expect("rebuilt").get_version(at).unwrap(); - assert_eq!( - version.image_coverage.query(4).as_deref(), - Some(orig_layer.as_str()), - "query for 4 at version {at} after download", - ); - } -} - -#[test] -fn missing_key_is_not_inserted_with_replace() { - let mut map = BufferedHistoricLayerCoverage::new(); - let key = LayerKey { - key: 0..5, - lsn: 100..101, - is_image: true, - }; - - let ret = map.replace(&key, "should not replace", |_| true); - assert!(matches!(ret, Replacement::NotFound), "{ret:?}"); - map.rebuild(); - assert!(map - .get() - .expect("no changes to rebuild") - .get_version(102) - .is_none()); -} - -#[test] -fn replacing_buffered_insert_and_remove() { - let mut map = BufferedHistoricLayerCoverage::new(); - let key = LayerKey { - key: 0..5, - lsn: 100..101, - is_image: true, - }; - - map.insert(key.clone(), "Image 1"); - let ret = map.replace(&key, "Remote Image 1", |&l| l == "Image 1"); - assert!( - matches!(ret, Replacement::Replaced { in_buffered: true }), - "{ret:?}" - ); - map.rebuild(); - - assert_eq!( - map.get() - .expect("rebuilt") - .get_version(102) - .unwrap() - .image_coverage - .query(4), - Some("Remote Image 1") - ); - - map.remove(key.clone()); - let ret = map.replace(&key, "should not replace", |_| true); - assert!( - matches!(ret, Replacement::RemovalBuffered), - "cannot replace after scheduled remove: {ret:?}" - ); - - map.rebuild(); - - let ret = map.replace(&key, "should not replace", |_| true); - assert!( - matches!(ret, Replacement::NotFound), - "cannot replace after remove + rebuild: {ret:?}" - ); - - let at_version = map.get().expect("rebuilt").get_version(102); - assert!(at_version.is_none()); -} diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 0af3d4ce39d3..7bc513b3a19a 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ 
b/pageserver/src/tenant/storage_layer.rs @@ -176,13 +176,10 @@ impl LayerAccessStats { /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status. /// /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. - pub(crate) fn for_loading_layer( - layer_map_lock_held_witness: &BatchedUpdates<'_, L>, + pub(crate) fn for_loading_layer( + layer_map_lock_held_witness: &BatchedUpdates<'_>, status: LayerResidenceStatus, - ) -> Self - where - L: ?Sized + Layer, - { + ) -> Self { let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default())); new.record_residence_event( layer_map_lock_held_witness, @@ -197,14 +194,11 @@ impl LayerAccessStats { /// The `new_status` is not recorded in `self`. /// /// See [`record_residence_event`] for why you need to do this while holding the layer map lock. - pub(crate) fn clone_for_residence_change( + pub(crate) fn clone_for_residence_change( &self, - layer_map_lock_held_witness: &BatchedUpdates<'_, L>, + layer_map_lock_held_witness: &BatchedUpdates<'_>, new_status: LayerResidenceStatus, - ) -> LayerAccessStats - where - L: ?Sized + Layer, - { + ) -> LayerAccessStats { let clone = { let inner = self.0.lock().unwrap(); inner.clone() @@ -232,14 +226,12 @@ impl LayerAccessStats { /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock. /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event. /// - pub(crate) fn record_residence_event( + pub(crate) fn record_residence_event( &self, - _layer_map_lock_held_witness: &BatchedUpdates<'_, L>, + _layer_map_lock_held_witness: &BatchedUpdates<'_>, status: LayerResidenceStatus, reason: LayerResidenceEventReason, - ) where - L: ?Sized + Layer, - { + ) { let mut locked = self.0.lock().unwrap(); locked.iter_mut().for_each(|inner| { inner @@ -473,94 +465,125 @@ pub fn downcast_remote_layer( } } -/// Holds metadata about a layer without any content. Used mostly for testing. -/// -/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a -/// LayerDescriptor. -#[derive(Clone, Debug)] -pub struct LayerDescriptor { - pub key: Range, - pub lsn: Range, - pub is_incremental: bool, - pub short_id: String, -} +pub mod tests { + use super::*; -impl LayerDescriptor { - /// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta, - /// and the tenant / timeline id does not matter. - pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc { - PersistentLayerDesc::new_delta( - TenantId::from_array([0; 16]), - TimelineId::from_array([0; 16]), - self.key.clone(), - self.lsn.clone(), - 233, - ) + /// Holds metadata about a layer without any content. Used mostly for testing. + /// + /// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a + /// LayerDescriptor. 
+ #[derive(Clone, Debug)] + pub struct LayerDescriptor { + base: PersistentLayerDesc, } -} -impl Layer for LayerDescriptor { - fn get_key_range(&self) -> Range { - self.key.clone() + impl From for LayerDescriptor { + fn from(base: PersistentLayerDesc) -> Self { + Self { base } + } } - fn get_lsn_range(&self) -> Range { - self.lsn.clone() - } + impl Layer for LayerDescriptor { + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_data: &mut ValueReconstructState, + _ctx: &RequestContext, + ) -> Result { + todo!("This method shouldn't be part of the Layer trait") + } - fn is_incremental(&self) -> bool { - self.is_incremental - } + fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { + todo!() + } - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - _ctx: &RequestContext, - ) -> Result { - todo!("This method shouldn't be part of the Layer trait") - } + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_key_range(&self) -> Range { + self.layer_desc().key_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_lsn_range(&self) -> Range { + self.layer_desc().lsn_range.clone() + } - fn short_id(&self) -> String { - self.short_id.clone() + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn is_incremental(&self) -> bool { + self.layer_desc().is_incremental + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn short_id(&self) -> String { + self.layer_desc().short_id() + } } - fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { - todo!() + impl PersistentLayer for LayerDescriptor { + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.base + } + + fn local_path(&self) -> Option { + unimplemented!() + } + + fn iter(&self, _: &RequestContext) -> Result> { + unimplemented!() + } + + fn key_iter(&self, _: &RequestContext) -> Result> { + unimplemented!() + } + + fn delete_resident_layer_file(&self) -> Result<()> { + unimplemented!() + } + + fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo { + unimplemented!() + } + + fn access_stats(&self) -> &LayerAccessStats { + unimplemented!() + } } -} -impl From for LayerDescriptor { - fn from(value: DeltaFileName) -> Self { - let short_id = value.to_string(); - LayerDescriptor { - key: value.key_range, - lsn: value.lsn_range, - is_incremental: true, - short_id, + impl From for LayerDescriptor { + fn from(value: DeltaFileName) -> Self { + LayerDescriptor { + base: PersistentLayerDesc::new_delta( + TenantId::from_array([0; 16]), + TimelineId::from_array([0; 16]), + value.key_range, + value.lsn_range, + 233, + ), + } } } -} -impl From for LayerDescriptor { - fn from(value: ImageFileName) -> Self { - let short_id = value.to_string(); - let lsn = value.lsn_as_range(); - LayerDescriptor { - key: value.key_range, - lsn, - is_incremental: false, - short_id, + impl From for LayerDescriptor { + fn from(value: ImageFileName) -> Self { + LayerDescriptor { + base: PersistentLayerDesc::new_img( + TenantId::from_array([0; 16]), + TimelineId::from_array([0; 16]), + value.key_range, + value.lsn, + false, + 233, + ), + } } } -} -impl From for LayerDescriptor { - fn from(value: LayerFileName) -> Self { - match value { - LayerFileName::Delta(d) => Self::from(d), - LayerFileName::Image(i) => Self::from(i), + impl From for 
LayerDescriptor { + fn from(value: LayerFileName) -> Self { + match value { + LayerFileName::Delta(d) => Self::from(d), + LayerFileName::Image(i) => Self::from(i), + } } } } diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 387bae5b1f9c..9d423ed81574 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -218,15 +218,12 @@ impl RemoteLayer { } /// Create a Layer struct representing this layer, after it has been downloaded. - pub fn create_downloaded_layer( + pub fn create_downloaded_layer( &self, - layer_map_lock_held_witness: &BatchedUpdates<'_, L>, + layer_map_lock_held_witness: &BatchedUpdates<'_>, conf: &'static PageServerConf, file_size: u64, - ) -> Arc - where - L: ?Sized + Layer, - { + ) -> Arc { if self.desc.is_delta { let fname = self.desc.delta_file_name(); Arc::new(DeltaLayer::new( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 192aa6e22d97..39c72a7e477a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3,7 +3,7 @@ mod eviction_task; mod walreceiver; -use anyhow::{anyhow, bail, ensure, Context}; +use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; use fail::fail_point; use futures::StreamExt; @@ -85,7 +85,9 @@ use super::config::TenantConf; use super::layer_map::BatchedUpdates; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; -use super::storage_layer::{DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset}; +use super::storage_layer::{ + DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc, PersistentLayerKey, +}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(super) enum FlushLoopState { @@ -118,6 +120,92 @@ impl PartialOrd for Hole { } } +pub struct LayerFileManager(HashMap>); + +impl LayerFileManager { + fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc { + // The assumption for the `expect()` is that all code maintains the following invariant: + // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. 
+ self.0 + .get(&desc.key()) + .with_context(|| format!("get layer from desc: {}", desc.filename().file_name())) + .expect("not found") + .clone() + } + + pub(crate) fn insert(&mut self, layer: Arc) { + let present = self.0.insert(layer.layer_desc().key(), layer.clone()); + if present.is_some() && cfg!(debug_assertions) { + panic!("overwriting a layer: {:?}", layer.layer_desc()) + } + } + + pub(crate) fn new() -> Self { + Self(HashMap::new()) + } + + pub(crate) fn remove(&mut self, layer: Arc) { + let present = self.0.remove(&layer.layer_desc().key()); + if present.is_none() && cfg!(debug_assertions) { + panic!( + "removing layer that is not present in layer mapping: {:?}", + layer.layer_desc() + ) + } + } + + pub(crate) fn replace_and_verify( + &mut self, + expected: Arc, + new: Arc, + ) -> Result<()> { + let key = expected.layer_desc().key(); + let other = new.layer_desc().key(); + + let expected_l0 = LayerMap::is_l0(expected.layer_desc()); + let new_l0 = LayerMap::is_l0(new.layer_desc()); + + fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!( + "layermap-replace-notfound" + )); + + anyhow::ensure!( + key == other, + "expected and new layer have different keys: {key:?} != {other:?}" + ); + + anyhow::ensure!( + expected_l0 == new_l0, + "one layer is l0 while the other is not: {expected_l0} != {new_l0}" + ); + + if let Some(layer) = self.0.get_mut(&expected.layer_desc().key()) { + anyhow::ensure!( + compare_arced_layers(&expected, layer), + "another layer was found instead of expected, expected={expected:?}, new={new:?}", + expected = Arc::as_ptr(&expected), + new = Arc::as_ptr(layer), + ); + *layer = new; + Ok(()) + } else { + anyhow::bail!("layer was not found"); + } + } +} + +/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. +/// Can be removed after all refactors are done. +fn drop_rlock(rlock: tokio::sync::OwnedRwLockReadGuard) { + drop(rlock) +} + +/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things. +/// Can be removed after all refactors are done. +fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { + drop(rlock) +} + pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -129,7 +217,24 @@ pub struct Timeline { pub pg_version: u32, - pub(crate) layers: Arc>>, + /// The tuple has two elements. + /// 1. `LayerFileManager` keeps track of the various physical representations of the layer files (inmem, local, remote). + /// 2. `LayerMap`, the acceleration data structure for `get_reconstruct_data`. + /// + /// `LayerMap` maps out the `(PAGE,LSN) / (KEY,LSN)` space, which is composed of `(KeyRange, LsnRange)` rectangles. + /// We describe these rectangles through the `PersistentLayerDesc` struct. + /// + /// When we want to reconstruct a page, we first find the `PersistentLayerDesc`'s that we need for page reconstruction, + /// using `LayerMap`. Then, we use `LayerFileManager` to get the `PersistentLayer`'s that correspond to the + /// `PersistentLayerDesc`'s. + /// + /// Hence, it's important to keep things coherent. The `LayerFileManager` must always have an entry for all + /// `PersistentLayerDesc`'s in the `LayerMap`. If it doesn't, `LayerFileManager::get_from_desc` will panic at + /// runtime, e.g., during page reconstruction. + /// + /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`, + /// so that e.g. 
on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. + pub(crate) layers: Arc>, /// Set of key ranges which should be covered by image layers to /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. @@ -599,7 +704,8 @@ impl Timeline { /// This method makes no distinction between local and remote layers. /// Hence, the result **does not represent local filesystem usage**. pub async fn layer_size_sum(&self) -> u64 { - let layer_map = self.layers.read().await; + let guard = self.layers.read().await; + let (layer_map, _) = &*guard; let mut size = 0; for l in layer_map.iter_historic_layers() { size += l.file_size(); @@ -909,7 +1015,8 @@ impl Timeline { pub async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { let last_lsn = self.get_last_record_lsn(); let open_layer_size = { - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, _) = &*guard; let Some(open_layer) = layers.open_layer.as_ref() else { return Ok(()); }; @@ -1040,7 +1147,8 @@ impl Timeline { } pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { - let layer_map = self.layers.read().await; + let guard = self.layers.read().await; + let (layer_map, mapping) = &*guard; let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { in_memory_layers.push(open_layer.info()); @@ -1051,6 +1159,7 @@ impl Timeline { let mut historic_layers = Vec::new(); for historic_layer in layer_map.iter_historic_layers() { + let historic_layer = mapping.get_from_desc(&historic_layer); historic_layers.push(historic_layer.info(reset)); } @@ -1160,7 +1269,8 @@ impl Timeline { } // start the batch update - let mut layer_map = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layer_map, mapping) = &mut *guard; let mut batch_updates = layer_map.batch_update(); let mut results = Vec::with_capacity(layers_to_evict.len()); @@ -1169,14 +1279,19 @@ impl Timeline { let res = if cancel.is_cancelled() { None } else { - Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut batch_updates)) + Some(self.evict_layer_batch_impl( + &layer_removal_guard, + l, + &mut batch_updates, + mapping, + )) }; results.push(res); } // commit the updates & release locks batch_updates.flush(); - drop(layer_map); + drop_wlock(guard); drop(layer_removal_guard); assert_eq!(results.len(), layers_to_evict.len()); @@ -1187,10 +1302,9 @@ impl Timeline { &self, _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, local_layer: &Arc, - batch_updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, + batch_updates: &mut BatchedUpdates<'_>, + mapping: &mut LayerFileManager, ) -> anyhow::Result { - use super::layer_map::Replacement; - if local_layer.is_remote_layer() { // TODO(issue #3851): consider returning an err here instead of false, // which is the same out the match later @@ -1238,13 +1352,10 @@ impl Timeline { ), }); - let replaced = match batch_updates.replace_historic( - local_layer.layer_desc().clone(), - local_layer, - new_remote_layer.layer_desc().clone(), - new_remote_layer, - )? { - Replacement::Replaced { .. 
} => { + assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc()); + + let succeed = match mapping.replace_and_verify(local_layer.clone(), new_remote_layer) { + Ok(()) => { if let Err(e) = local_layer.delete_resident_layer_file() { error!("failed to remove layer file on evict after replacement: {e:#?}"); } @@ -1277,24 +1388,17 @@ impl Timeline { true } - Replacement::NotFound => { - debug!(evicted=?local_layer, "layer was no longer in layer map"); - false - } - Replacement::RemovalBuffered => { - unreachable!("not doing anything else in this batch") - } - Replacement::Unexpected(other) => { - error!( - local_layer.ptr=?Arc::as_ptr(local_layer), - other.ptr=?Arc::as_ptr(&other), - ?other, - "failed to replace"); + Err(err) => { + if cfg!(debug_assertions) { + panic!("failed to replace: {err}, evicted: {local_layer:?}"); + } else { + error!(evicted=?local_layer, "failed to replace: {err}"); + } false } }; - Ok(replaced) + Ok(succeed) } } @@ -1421,7 +1525,10 @@ impl Timeline { timeline_id, tenant_id, pg_version, - layers: Arc::new(tokio::sync::RwLock::new(LayerMap::default())), + layers: Arc::new(tokio::sync::RwLock::new(( + LayerMap::default(), + LayerFileManager::new(), + ))), wanted_image_layers: Mutex::new(None), walredo_mgr, @@ -1606,14 +1713,15 @@ impl Timeline { let mut layers = self.layers.try_write().expect( "in the context where we call this function, no other task has access to the object", ); - layers.next_open_layer_at = Some(Lsn(start_lsn.0)); + layers.0.next_open_layer_at = Some(Lsn(start_lsn.0)); } /// /// Scan the timeline directory to populate the layer map. /// pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, mapping) = &mut *guard; let mut updates = layers.batch_update(); let mut num_layers = 0; @@ -1656,7 +1764,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer)); + self.insert_historic_layer(Arc::new(layer), &mut updates, mapping); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1688,7 +1796,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer)); + self.insert_historic_layer(Arc::new(layer), &mut updates, mapping); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1742,7 +1850,8 @@ impl Timeline { // We're holding a layer map lock for a while but this // method is only called during init so it's fine. 
- let mut layer_map = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layer_map, mapping) = &mut *guard; let mut updates = layer_map.batch_update(); for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1787,7 +1896,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); - updates.remove_historic(local_layer.layer_desc().clone(), local_layer); + self.remove_historic_layer(local_layer, &mut updates, mapping); // fall-through to adding the remote layer } } else { @@ -1826,7 +1935,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer); + self.insert_historic_layer(remote_layer, &mut updates, mapping); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1853,7 +1962,7 @@ impl Timeline { ), ); let remote_layer = Arc::new(remote_layer); - updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer); + self.insert_historic_layer(remote_layer, &mut updates, mapping); } } } @@ -1892,13 +2001,14 @@ impl Timeline { let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn(); - let local_layers = self - .layers - .read() - .await - .iter_historic_layers() - .map(|l| (l.filename(), l)) - .collect::>(); + let local_layers = { + let guard = self.layers.read().await; + let (layers, mapping) = &*guard; + layers + .iter_historic_layers() + .map(|l| (l.filename(), mapping.get_from_desc(&l))) + .collect::>() + }; // If no writes happen, new branches do not have any layers, only the metadata file. let has_local_layers = !local_layers.is_empty(); @@ -2269,25 +2379,53 @@ impl Timeline { } async fn find_layer(&self, layer_file_name: &str) -> Option> { - for historic_layer in self.layers.read().await.iter_historic_layers() { + let guard = self.layers.read().await; + let (layers, mapping) = &*guard; + for historic_layer in layers.iter_historic_layers() { let historic_layer_name = historic_layer.filename().file_name(); if layer_file_name == historic_layer_name { - return Some(historic_layer); + return Some(mapping.get_from_desc(&historic_layer)); } } None } + /// Helper function to insert a layer from both layer map and layer file manager. Will be removed in the future + /// after we introduce `LayerMapManager`. + fn insert_historic_layer( + &self, + layer: Arc, + updates: &mut BatchedUpdates<'_>, + mapping: &mut LayerFileManager, + ) { + updates.insert_historic(layer.layer_desc().clone()); + mapping.insert(layer); + } + + /// Helper function to remove a layer from both layer map and layer file manager. Will be removed in the future + /// after we introduce `LayerMapManager`. + fn remove_historic_layer( + &self, + layer: Arc, + updates: &mut BatchedUpdates<'_>, + mapping: &mut LayerFileManager, + ) { + updates.remove_historic(layer.layer_desc().clone()); + mapping.remove(layer); + } + /// Removes the layer from local FS (if present) and from memory. /// Remote storage is not affected by this operation. 
fn delete_historic_layer( &self, // we cannot remove layers otherwise, since gc and compaction will race _layer_removal_cs: Arc>, - layer: Arc, - updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, + layer: Arc, + updates: &mut BatchedUpdates<'_>, + mapping: &mut LayerFileManager, ) -> anyhow::Result<()> { + let layer = mapping.get_from_desc(&layer); if !layer.is_remote_layer() { layer.delete_resident_layer_file()?; let layer_file_size = layer.file_size(); @@ -2301,7 +2439,8 @@ impl Timeline { // won't be needed for page reconstruction for this timeline, // and mark what we can't delete yet as deleted from the layer // map index without actually rebuilding the index. - updates.remove_historic(layer.layer_desc().clone(), layer); + updates.remove_historic(layer.layer_desc().clone()); + mapping.remove(layer); Ok(()) } @@ -2486,7 +2625,8 @@ impl Timeline { #[allow(clippy::never_loop)] // see comment at bottom of this loop 'layer_map_search: loop { let remote_layer = { - let layers = timeline.layers.read().await; + let guard = timeline.layers.read().await; + let (layers, mapping) = &*guard; // Check the open and frozen in-memory layers first, in order from newest // to oldest. @@ -2548,6 +2688,7 @@ impl Timeline { } if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { + let layer = mapping.get_from_desc(&layer); // If it's a remote layer, download it and retry. if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) @@ -2669,7 +2810,8 @@ impl Timeline { /// Get a handle to the latest layer for appending. /// async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, _) = &mut *guard; ensure!(lsn.is_aligned()); @@ -2746,7 +2888,8 @@ impl Timeline { } else { Some(self.write_lock.lock().await) }; - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, _) = &mut *guard; if let Some(open_layer) = &layers.open_layer { let open_layer_rc = Arc::clone(open_layer); // Does this layer need freezing? @@ -2760,7 +2903,7 @@ impl Timeline { layers.next_open_layer_at = Some(end_lsn); self.last_freeze_at.store(end_lsn); } - drop(layers); + drop_wlock(guard); } /// Layer flusher task's main loop. @@ -2784,7 +2927,8 @@ impl Timeline { let flush_counter = *layer_flush_start_rx.borrow(); let result = loop { let layer_to_flush = { - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, _) = &*guard; layers.frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; @@ -2907,15 +3051,17 @@ impl Timeline { fail_point!("flush-frozen-before-sync"); // The new on-disk layers are now in the layer map. We can remove the - // in-memory layer from the map now. + // in-memory layer from the map now. We do not modify `LayerFileManager` because + // it only contains persistent layers. The flushed layer is stored in + // the mapping in `create_delta_layer`. { let mut layers = self.layers.write().await; - let l = layers.frozen_layers.pop_front(); + let l = layers.0.frozen_layers.pop_front(); // Only one thread may call this function at a time (for this // timeline). If two threads tried to flush the same frozen // layer to disk at the same time, that would not work. 
- assert!(LayerMap::compare_arced_layers(&l.unwrap(), &frozen_layer)); + assert!(compare_arced_layers(&l.unwrap(), &frozen_layer)); // release lock on 'layers' } @@ -3048,14 +3194,15 @@ impl Timeline { // Add it to the layer map let l = Arc::new(new_delta); - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, mapping) = &mut *guard; let mut batch_updates = layers.batch_update(); l.access_stats().record_residence_event( &batch_updates, LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - batch_updates.insert_historic(l.layer_desc().clone(), l); + self.insert_historic_layer(l, &mut batch_updates, mapping); batch_updates.flush(); // update metrics @@ -3104,7 +3251,8 @@ impl Timeline { ) -> anyhow::Result { let threshold = self.get_image_creation_threshold(); - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, _) = &*guard; let mut max_deltas = 0; { @@ -3282,7 +3430,8 @@ impl Timeline { let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, mapping) = &mut *guard; let mut updates = layers.batch_update(); let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); @@ -3304,10 +3453,10 @@ impl Timeline { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - updates.insert_historic(l.layer_desc().clone(), l); + self.insert_historic_layer(l, &mut updates, mapping); } updates.flush(); - drop(layers); + drop_wlock(guard); timer.stop_and_record(); Ok(layer_paths_to_upload) @@ -3317,7 +3466,7 @@ impl Timeline { #[derive(Default)] struct CompactLevel0Phase1Result { new_layers: Vec, - deltas_to_compact: Vec>, + deltas_to_compact: Vec>, } /// Top-level failure to compact. @@ -3468,14 +3617,19 @@ impl Timeline { fn compact_level0_phase1( self: Arc, _layer_removal_cs: Arc>, - layers: tokio::sync::OwnedRwLockReadGuard>, + guard: tokio::sync::OwnedRwLockReadGuard<(LayerMap, LayerFileManager)>, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, ctx: &RequestContext, ) -> Result { stats.read_lock_held_spawn_blocking_startup_micros = stats.read_lock_acquisition_micros.till_now(); // set by caller - let mut level0_deltas = layers.get_level0_deltas()?; + let (layers, mapping) = &*guard; + let level0_deltas = layers.get_level0_deltas()?; + let mut level0_deltas = level0_deltas + .into_iter() + .map(|x| mapping.get_from_desc(&x)) + .collect_vec(); stats.level0_deltas_count = Some(level0_deltas.len()); // Only compact if enough layers have accumulated. 
let threshold = self.get_compaction_threshold(); @@ -3595,7 +3749,7 @@ impl Timeline { } stats.read_lock_held_compute_holes_micros = stats.read_lock_held_prerequisites_micros.till_now(); - drop(layers); + drop_rlock(guard); stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); let mut holes = heap.into_vec(); holes.sort_unstable_by_key(|hole| hole.key_range.start); @@ -3822,7 +3976,10 @@ impl Timeline { Ok(CompactLevel0Phase1Result { new_layers, - deltas_to_compact, + deltas_to_compact: deltas_to_compact + .into_iter() + .map(|x| Arc::new(x.layer_desc().clone())) + .collect(), }) } @@ -3886,7 +4043,8 @@ impl Timeline { .context("wait for layer upload ops to complete")?; } - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, mapping) = &mut *guard; let mut updates = layers.batch_update(); let mut new_layer_paths = HashMap::with_capacity(new_layers.len()); for l in new_layers { @@ -3918,7 +4076,7 @@ impl Timeline { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); - updates.insert_historic(x.layer_desc().clone(), x); + self.insert_historic_layer(x, &mut updates, mapping); } // Now that we have reshuffled the data to set of new delta layers, we can @@ -3926,10 +4084,13 @@ impl Timeline { let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len()); for l in deltas_to_compact { layer_names_to_delete.push(l.filename()); - self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?; + // NB: the layer file identified by descriptor `l` is guaranteed to be present + // in the LayerFileManager because we kept holding `layer_removal_cs` the entire + // time, even though we dropped `Timeline::layers` inbetween. + self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates, mapping)?; } updates.flush(); - drop(layers); + drop_wlock(guard); // Also schedule the deletions in remote storage if let Some(remote_client) = &self.remote_client { @@ -4146,7 +4307,8 @@ impl Timeline { // 4. newer on-disk image layers cover the layer's whole key range // // TODO holding a write lock is too agressive and avoidable - let mut layers = self.layers.write().await; + let mut guard = self.layers.write().await; + let (layers, mapping) = &mut *guard; 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -4225,7 +4387,7 @@ impl Timeline { // delta layers. Image layers can form "stairs" preventing old image from been deleted. // But image layers are in any case less sparse than delta layers. Also we need some // protection from replacing recent image layers with new one after each GC iteration. - if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&*l) { + if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&l) { wanted_image_layers.add_range(l.get_key_range()); } result.layers_not_updated += 1; @@ -4262,6 +4424,7 @@ impl Timeline { layer_removal_cs.clone(), doomed_layer, &mut updates, + mapping, )?; // FIXME: schedule succeeded deletions before returning? result.layers_removed += 1; } @@ -4446,42 +4609,15 @@ impl Timeline { // Download complete. Replace the RemoteLayer with the corresponding // Delta- or ImageLayer in the layer map. 
- let mut layers = self_clone.layers.write().await; - let mut updates = layers.batch_update(); - let new_layer = remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size); + let mut guard = self_clone.layers.write().await; + let (layers, mapping) = &mut *guard; + let updates = layers.batch_update(); + let new_layer = + remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size); { - use crate::tenant::layer_map::Replacement; let l: Arc = remote_layer.clone(); - let failure = match updates.replace_historic(l.layer_desc().clone(), &l, new_layer.layer_desc().clone(), new_layer) { - Ok(Replacement::Replaced { .. }) => false, - Ok(Replacement::NotFound) => { - // TODO: the downloaded file should probably be removed, otherwise - // it will be added to the layermap on next load? we should - // probably restart any get_reconstruct_data search as well. - // - // See: https://github.com/neondatabase/neon/issues/3533 - error!("replacing downloaded layer into layermap failed because layer was not found"); - true - } - Ok(Replacement::RemovalBuffered) => { - unreachable!("current implementation does not remove anything") - } - Ok(Replacement::Unexpected(other)) => { - // if the other layer would have the same pointer value as - // expected, it means they differ only on vtables. - // - // otherwise there's no known reason for this to happen as - // compacted layers should have different covering rectangle - // leading to produce Replacement::NotFound. - - error!( - expected.ptr = ?Arc::as_ptr(&l), - other.ptr = ?Arc::as_ptr(&other), - ?other, - "replacing downloaded layer into layermap failed because another layer was found instead of expected" - ); - true - } + let failure = match mapping.replace_and_verify(l, new_layer) { + Ok(()) => false, Err(e) => { // this is a precondition failure, the layer filename derived // attributes didn't match up, which doesn't seem likely. @@ -4509,7 +4645,7 @@ impl Timeline { } } updates.flush(); - drop(layers); + drop_wlock(guard); info!("on-demand download successful"); @@ -4520,7 +4656,10 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. - error!("layer file download failed: {:?}", result.as_ref().unwrap_err()); + error!( + "layer file download failed: {:?}", + result.as_ref().unwrap_err() + ); } // Don't treat it as an error if the task that triggered the download @@ -4534,7 +4673,8 @@ impl Timeline { drop(permit); Ok(()) - }.in_current_span(), + } + .in_current_span(), ); receiver.await.context("download task cancelled")? 
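An illustrative sketch (not from the patch) of the verify-then-swap behaviour that `replace_and_verify` provides in place of the old `Replacement` enum: the swap happens only when the caller's expected object is still the one installed under the key, and every other outcome surfaces as an error. The `Mapping` type and `String` payload below are hypothetical simplifications; the real method also checks that the two layer descriptor keys and their L0-ness agree, as shown earlier in the timeline.rs diff.

```rust
use std::collections::HashMap;
use std::sync::Arc;

/// Hypothetical simplified mapping: layer key -> currently installed object.
struct Mapping(HashMap<String, Arc<String>>);

impl Mapping {
    /// Swap `expected` for `new` only if `expected` is still installed.
    fn replace_and_verify(
        &mut self,
        key: &str,
        expected: &Arc<String>,
        new: Arc<String>,
    ) -> Result<(), String> {
        match self.0.get_mut(key) {
            None => Err("layer was not found".into()),
            Some(slot) => {
                if Arc::ptr_eq(slot, expected) {
                    *slot = new;
                    Ok(())
                } else {
                    Err("another layer was found instead of expected".into())
                }
            }
        }
    }
}

fn main() {
    let remote = Arc::new("remote layer".to_string());
    let downloaded = Arc::new("downloaded layer".to_string());

    let mut mapping = Mapping(HashMap::new());
    mapping.0.insert("000__lsn".to_string(), remote.clone());

    // The first replacement succeeds; retrying with the now-stale handle fails.
    assert!(mapping
        .replace_and_verify("000__lsn", &remote, downloaded.clone())
        .is_ok());
    assert!(mapping
        .replace_and_verify("000__lsn", &remote, downloaded)
        .is_err());
    println!("verify-then-swap behaved as expected");
}
```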
@@ -4604,9 +4744,11 @@ impl Timeline { ) { let mut downloads = Vec::new(); { - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, mapping) = &*guard; layers .iter_historic_layers() + .map(|l| mapping.get_from_desc(&l)) .filter_map(|l| l.downcast_remote_layer()) .map(|l| self.download_remote_layer(l)) .for_each(|dl| downloads.push(dl)) @@ -4707,7 +4849,8 @@ impl LocalLayerInfoForDiskUsageEviction { impl Timeline { pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, mapping) = &*guard; let mut max_layer_size: Option = None; let mut resident_layers = Vec::new(); @@ -4716,6 +4859,8 @@ impl Timeline { let file_size = l.file_size(); max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + let l = mapping.get_from_desc(&l); + if l.is_remote_layer() { continue; } @@ -4874,3 +5019,31 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { ), } } + +/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables. +/// +/// Returns `true` if the two `Arc` point to the same layer, false otherwise. +/// +/// If comparing persistent layers, ALWAYS compare the layer descriptor key. +#[inline(always)] +pub fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { + // "dyn Trait" objects are "fat pointers" in that they have two components: + // - pointer to the object + // - pointer to the vtable + // + // rust does not provide a guarantee that these vtables are unique, but however + // `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the + // pointer and the vtable need to be equal. + // + // See: https://github.com/rust-lang/rust/issues/103763 + // + // A future version of rust will most likely use this form below, where we cast each + // pointer into a pointer to unit, which drops the inaccessible vtable pointer, making it + // not affect the comparison. + // + // See: https://github.com/rust-lang/rust/pull/106450 + let left = Arc::as_ptr(left) as *const (); + let right = Arc::as_ptr(right) as *const (); + + left == right +} diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 80c521021162..03cf2d89ad9f 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -197,9 +197,11 @@ impl Timeline { // We don't want to hold the layer map lock during eviction. // So, we just need to deal with this. 
let candidates: Vec> = { - let layers = self.layers.read().await; + let guard = self.layers.read().await; + let (layers, mapping) = &*guard; let mut candidates = Vec::new(); for hist_layer in layers.iter_historic_layers() { + let hist_layer = mapping.get_from_desc(&hist_layer); if hist_layer.is_remote_layer() { continue; } diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index c26ec76172bc..a30f0f02e151 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -713,9 +713,7 @@ def test_ondemand_download_failure_to_replace( # error message is not useful pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2) - actual_message = ( - ".* ERROR .*replacing downloaded layer into layermap failed because layer was not found" - ) + actual_message = ".* ERROR .*layermap-replace-notfound" assert env.pageserver.log_contains(actual_message) is not None env.pageserver.allowed_errors.append(actual_message) From b9902004965c717810d19302bc9f613a480a66bf Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 30 Jun 2023 15:01:06 +0300 Subject: [PATCH 15/37] tests: use shortcut everywhere to get timeline path (#4586) --- test_runner/regress/test_disk_usage_eviction.py | 15 +++++++-------- test_runner/regress/test_layer_eviction.py | 2 +- test_runner/regress/test_remote_storage.py | 2 +- test_runner/regress/test_tenant_detach.py | 12 ++++++------ test_runner/regress/test_tenant_relocation.py | 4 +--- .../regress/test_tenants_with_remote_storage.py | 2 +- 6 files changed, 17 insertions(+), 20 deletions(-) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 0ec023b9e115..fc2d29c2c102 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -1,7 +1,6 @@ import shutil import time from dataclasses import dataclass -from pathlib import Path from typing import Dict, Tuple import pytest @@ -428,14 +427,14 @@ def poor_mans_du( largest_layer = 0 smallest_layer = None for tenant_id, timeline_id in timelines: - dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) - assert dir.exists(), f"timeline dir does not exist: {dir}" - sum = 0 - for file in dir.iterdir(): + timeline_dir = env.timeline_dir(tenant_id, timeline_id) + assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}" + total = 0 + for file in timeline_dir.iterdir(): if "__" not in file.name: continue size = file.stat().st_size - sum += size + total += size largest_layer = max(largest_layer, size) if smallest_layer: smallest_layer = min(smallest_layer, size) @@ -443,8 +442,8 @@ def poor_mans_du( smallest_layer = size log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}") - log.info(f"{tenant_id}/{timeline_id}: sum {sum}") - total_on_disk += sum + log.info(f"{tenant_id}/{timeline_id}: sum {total}") + total_on_disk += total assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0 return (total_on_disk, largest_layer, smallest_layer or 0) diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index b22e545f20e3..d4f0c94a29c2 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -58,7 +58,7 @@ def test_basic_eviction( for sk in env.safekeepers: sk.stop() - timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / 
str(timeline_id) + timeline_path = env.timeline_dir(tenant_id, timeline_id) initial_local_layers = sorted( list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) ) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f2b954a82293..9a7026631462 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -535,7 +535,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( "pitr_interval": "0s", } ) - timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + timeline_path = env.timeline_dir(tenant_id, timeline_id) client = env.pageserver.http_client() diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 2a015d5d177e..2ded79954e8e 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -632,14 +632,14 @@ def test_ignored_tenant_download_missing_layers( # ignore the tenant and remove its layers pageserver_http.tenant_ignore(tenant_id) - tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + timeline_dir = env.timeline_dir(tenant_id, timeline_id) layers_removed = False - for dir_entry in tenant_timeline_dir.iterdir(): + for dir_entry in timeline_dir.iterdir(): if dir_entry.name.startswith("00000"): # Looks like a layer file. Remove it dir_entry.unlink() layers_removed = True - assert layers_removed, f"Found no layers for tenant {tenant_timeline_dir}" + assert layers_removed, f"Found no layers for tenant {timeline_dir}" # now, load it from the local files and expect it to work due to remote storage restoration pageserver_http.tenant_load(tenant_id=tenant_id) @@ -688,14 +688,14 @@ def test_ignored_tenant_stays_broken_without_metadata( # ignore the tenant and remove its metadata pageserver_http.tenant_ignore(tenant_id) - tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + timeline_dir = env.timeline_dir(tenant_id, timeline_id) metadata_removed = False - for dir_entry in tenant_timeline_dir.iterdir(): + for dir_entry in timeline_dir.iterdir(): if dir_entry.name == "metadata": # Looks like a layer file. 
Remove it dir_entry.unlink() metadata_removed = True - assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}" + assert metadata_removed, f"Failed to find metadata file in {timeline_dir}" env.pageserver.allowed_errors.append( f".*{tenant_id}.*: load failed.*: failed to load metadata.*" diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 70b931d7f941..9043c29060d0 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -214,9 +214,7 @@ def switch_pg_to_new_pageserver( endpoint.start() - timeline_to_detach_local_path = ( - env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) - ) + timeline_to_detach_local_path = env.timeline_dir(tenant_id, timeline_id) files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( "metadata" in files_before_detach diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index dca2cd3d286b..98f9e942769a 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -257,7 +257,7 @@ def test_tenant_redownloads_truncated_file_on_startup( env.endpoints.stop_all() env.pageserver.stop() - timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + timeline_dir = env.timeline_dir(tenant_id, timeline_id) local_layer_truncated = None for path in Path.iterdir(timeline_dir): if path.name.startswith("00000"): From f558f88a0857ead1132cab3fd63ab382716b4097 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 30 Jun 2023 14:53:21 +0200 Subject: [PATCH 16/37] refactor: distinguished error type for timeline creation failure (#4597) refs https://github.com/neondatabase/neon/issues/4595 --- pageserver/src/http/routes.rs | 10 ++++++---- pageserver/src/tenant.rs | 30 +++++++++++++++++++----------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0e12cc2b1e0f..08d3fcf8df54 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -23,7 +23,6 @@ use super::models::{ TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; -use crate::disk_usage_eviction_task; use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; @@ -35,6 +34,7 @@ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; +use crate::{disk_usage_eviction_task, tenant}; use utils::{ auth::JwtAuth, http::{ @@ -328,15 +328,17 @@ async fn timeline_create_handler( &ctx, ) .await { - Ok(Some(new_timeline)) => { + Ok(new_timeline) => { // Created. Construct a TimelineInfo for it. 
let timeline_info = build_timeline_info_common(&new_timeline, &ctx) .await .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } - Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists - Err(err) => Err(ApiError::InternalServerError(err)), + Err(tenant::CreateTimelineError::AlreadyExists) => { + json_response(StatusCode::CONFLICT, ()) + } + Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)), } } .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 47e9b5b4ec95..339291149ff1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -501,6 +501,14 @@ impl DeletionGuard { } } +#[derive(thiserror::Error, Debug)] +pub enum CreateTimelineError { + #[error("a timeline with the given ID already exists")] + AlreadyExists, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + impl Tenant { /// Yet another helper for timeline initialization. /// Contains the common part of `load_local_timeline` and `load_remote_timeline`. @@ -1375,8 +1383,7 @@ impl Tenant { /// Returns the new timeline ID and reference to its Timeline object. /// /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with - /// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, - /// a new unique ID is generated. + /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists. pub async fn create_timeline( &self, new_timeline_id: TimelineId, @@ -1385,11 +1392,12 @@ impl Tenant { pg_version: u32, broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, - ) -> anyhow::Result>> { - anyhow::ensure!( - self.is_active(), - "Cannot create timelines on inactive tenant" - ); + ) -> Result, CreateTimelineError> { + if !self.is_active() { + return Err(CreateTimelineError::Other(anyhow::anyhow!( + "Cannot create timelines on inactive tenant" + ))); + } if let Ok(existing) = self.get_timeline(new_timeline_id, false) { debug!("timeline {new_timeline_id} already exists"); @@ -1409,7 +1417,7 @@ impl Tenant { .context("wait for timeline uploads to complete")?; } - return Ok(None); + return Err(CreateTimelineError::AlreadyExists); } let loaded_timeline = match ancestor_timeline_id { @@ -1424,12 +1432,12 @@ impl Tenant { let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); if ancestor_ancestor_lsn > *lsn { // can we safely just branch from the ancestor instead? - bail!( + return Err(CreateTimelineError::Other(anyhow::anyhow!( "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", lsn, ancestor_timeline_id, ancestor_ancestor_lsn, - ); + ))); } // Wait for the WAL to arrive and be processed on the parent branch up @@ -1463,7 +1471,7 @@ impl Tenant { })?; } - Ok(Some(loaded_timeline)) + Ok(loaded_timeline) } /// perform one garbage collection iteration, removing old data files from disk. From 3e55d9dec637631f547293331a33ddb9f1370f20 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 30 Jun 2023 14:19:24 +0300 Subject: [PATCH 17/37] Bump vendor/postgres Point to REL_14_STABLE_neon and REL_15_STABLE_neon. 
This change updates PostgreSQL versions to 14.8 and 15.3 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a2daebc6b445..1144aee1661c 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a2daebc6b445dcbcca9c18e1711f47c1db7ffb04 +Subproject commit 1144aee1661c79eec65e784a8dad8bd450d9df79 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 2df2ce374464..1984832c740a 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 2df2ce374464a7449e15dfa46c956b73b4f4098b +Subproject commit 1984832c740a7fa0e468bb720f40c525b652835d From fbd37740c553fe797d0ff651d222d5863fc20f91 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 30 Jun 2023 21:55:56 +0300 Subject: [PATCH 18/37] Make file_cache logging less verbose (#4601) ## Problem Message "set local file cache limit to..." polutes compute logs. ## Summary of changes ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 1ab2ae668a7d..02795bc8b8c7 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -190,7 +190,7 @@ lfc_change_limit_hook(int newval, void *extra) hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL); lfc_ctl->used -= 1; } - elog(LOG, "set local file cache limit to %d", new_size); + elog(DEBUG1, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); } From 9de1a6fb146a92034976b9cea8d40a985ac0ca3d Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Fri, 30 Jun 2023 16:29:47 -0400 Subject: [PATCH 19/37] cold starts: Run sync_safekeepers on compute_ctl shutdown (#4588) --- compute_tools/src/bin/compute_ctl.rs | 10 ++++++++++ compute_tools/src/compute.rs | 2 +- control_plane/src/background_process.rs | 5 +++++ control_plane/src/endpoint.rs | 18 +++++++++++++++++- test_runner/regress/test_timeline_size.py | 1 + test_runner/regress/test_wal_acceptor.py | 4 ++++ 6 files changed, 38 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 90b39e9dd9c2..68f6bf3844d9 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -256,6 +256,16 @@ fn main() -> Result<()> { exit_code = ecode.code() } + // Maybe sync safekeepers again, to speed up next startup + let compute_state = compute.state.lock().unwrap().clone(); + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) { + info!("syncing safekeepers on shutdown"); + let storage_auth_token = pspec.storage_auth_token.clone(); + let lsn = compute.sync_safekeepers(storage_auth_token)?; + info!("synced safekeepers at lsn {lsn}"); + } + if let Err(err) = compute.check_for_core_dumps() { error!("error while checking for core dumps: {err:?}"); } diff --git 
a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 87acefc1bbd8..70d83a7b477a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -278,7 +278,7 @@ impl ComputeNode { // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. #[instrument(skip_all)] - fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { + pub fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); let sync_handle = Command::new(&self.pgbin) diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 1f3f8f45ea6f..00af1a1d534e 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -180,6 +180,11 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any } // Wait until process is gone + wait_until_stopped(process_name, pid)?; + Ok(()) +} + +pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> { for retries in 0..RETRIES { match process_has_stopped(pid) { Ok(true) => { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 52683ff1c326..ab921d096f25 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -405,6 +405,16 @@ impl Endpoint { String::from_utf8_lossy(&pg_ctl.stderr), ); } + + // Also wait for the compute_ctl process to die. It might have some cleanup + // work to do after postgres stops, like syncing safekeepers, etc. + // + // TODO use background_process::stop_process instead + let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); + let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; + let pid = nix::unistd::Pid::from_raw(pid as i32); + crate::background_process::wait_until_stopped("compute_ctl", pid)?; + Ok(()) } @@ -507,7 +517,13 @@ impl Endpoint { .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) 
.stdout(logfile); - let _child = cmd.spawn()?; + let child = cmd.spawn()?; + + // Write down the pid so we can wait for it when we want to stop + // TODO use background_process::start_process instead + let pid = child.id(); + let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); + std::fs::write(pidfile_path, pid.to_string())?; // Wait for it to start let mut attempt = 0; diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 5bdbc1892707..6338f4ca77a6 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -416,6 +416,7 @@ def test_timeline_physical_size_post_compaction( wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id) # shutdown safekeepers to prevent new data from coming in + endpoint.stop() # We can't gracefully stop after safekeepers die for sk in env.safekeepers: sk.stop() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 994858edf7a7..5828d4306c32 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1210,6 +1210,10 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): with conn.cursor() as cur: cur.execute("INSERT INTO t (key) VALUES (1)") + # Stop all computes gracefully before safekeepers stop responding to them + endpoint_1.stop_and_destroy() + endpoint_3.stop_and_destroy() + # Remove initial tenant's br1 (active) assert sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"] assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() From c9f05d418d5d68e432ca069a66363e1ba2da2d72 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Fri, 30 Jun 2023 13:49:06 -0700 Subject: [PATCH 20/37] Bump vm-builder v0.11.0 -> v0.11.1 (#4605) This applies the fix from https://github.com/neondatabase/autoscaling/pull/367, which should resolve the "leaking cloud_admin connections" issue that has been observed for some customers. 
--- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 46dd89445196..068e34420ff5 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -767,7 +767,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.11.0 + VM_BUILDER_VERSION: v0.11.1 steps: - name: Checkout From ff1a1aea86b0245ad039039ed4c86591270ca567 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 30 Jun 2023 14:17:44 -0700 Subject: [PATCH 21/37] Make control plane connector send encrypted password (#4607) Control plane needs the encrypted hash that postgres itself generates --- pgxn/neon/control_plane_connector.c | 16 ++++++++++++++++ test_runner/regress/test_ddl_forwarding.py | 1 + 2 files changed, 17 insertions(+) diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 82e4af4b4a79..debbbce117f2 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -32,6 +32,7 @@ #include "port.h" #include #include "utils/jsonb.h" +#include "libpq/crypt.h" static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; @@ -161,7 +162,22 @@ ConstructDeltaMessage() PushKeyValue(&state, "name", entry->name); if (entry->password) { +#if PG_MAJORVERSION_NUM == 14 + char *logdetail; +#else + const char *logdetail; +#endif PushKeyValue(&state, "password", (char *) entry->password); + char *encrypted_password = get_role_password(entry->name, &logdetail); + + if (encrypted_password) + { + PushKeyValue(&state, "encrypted_password", encrypted_password); + } + else + { + elog(ERROR, "Failed to get encrypted password: %s", logdetail); + } } if (entry->old_name[0] != '\0') { diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 6bfa8fdbe7d2..ebd836ecbc53 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -33,6 +33,7 @@ def handle_role(dbs, roles, operation): dbs[db] = operation["name"] if "password" in operation: roles[operation["name"]] = operation["password"] + assert "encrypted_password" in operation elif operation["op"] == "del": if "old_name" in operation: roles.pop(operation["old_name"]) From 4957bb2d4879f44c80b24dad1df3fd0457acb670 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 3 Jul 2023 13:53:57 +0300 Subject: [PATCH 22/37] fix(proxy): stray span enter globbers up logs (#4612) Prod logs have deep accidential span nesting. Introduced in #3759 and has been untouched since, maybe no one watches proxy logs? :) I found it by accident when looking to see if proxy logs have ansi colors with `{neon_service="proxy"}`. The solution is to mostly stop using `Span::enter` or `Span::entered` in async code. Kept on `Span::entered` in cancel on shutdown related path. 
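For readers unfamiliar with the failure mode described above, here is a small illustrative sketch (not part of this patch) of why holding the guard returned by `Span::enter` across an `.await` produces the accidental nesting, and of the `.instrument(span)` form this patch switches to. It assumes the `tokio`, `tracing` and `tracing-subscriber` crates; the `handle_connection` function and the peer values are stand-ins for illustration, not proxy code.

```rust
use tracing::{info, info_span, Instrument};

// Stand-in for per-connection work with at least one await point.
async fn handle_connection(peer: u16) {
    info!("serving connection from {}", peer);
    tokio::task::yield_now().await; // the task can be suspended here
    info!("serving completed");
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt().init();

    // Problematic shape (what the patch removes):
    //
    //     let span = info_span!("mgmt", %peer);
    //     let _enter = span.enter();      // guard held across the await below
    //     handle_connection(peer).await;  // while this task is suspended, the span
    //                                     // can stay entered on the worker thread,
    //                                     // so events from unrelated tasks polled
    //                                     // there get nested under `mgmt{...}`.
    //
    // Fix: attach the span to the future itself. The span is then entered only
    // while this particular future is being polled and exited at every yield point.
    let mut tasks = Vec::new();
    for peer in [8080u16, 8081] {
        let span = info_span!("mgmt", %peer);
        tasks.push(tokio::spawn(handle_connection(peer).instrument(span)));
    }
    for task in tasks {
        task.await.expect("task panicked");
    }
}
```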
--- proxy/src/console/mgmt.rs | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 30364be6f45b..35d1ff59b76a 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -8,7 +8,7 @@ use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, Quer use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use std::future; use tokio::net::{TcpListener, TcpStream}; -use tracing::{error, info, info_span}; +use tracing::{error, info, info_span, Instrument}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -44,19 +44,30 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<()> { .set_nodelay(true) .context("failed to set client socket option")?; - tokio::task::spawn(async move { - let span = info_span!("mgmt", peer = %peer_addr); - let _enter = span.enter(); + let span = info_span!("mgmt", peer = %peer_addr); - info!("started a new console management API thread"); - scopeguard::defer! { - info!("console management API thread is about to finish"); - } + tokio::task::spawn( + async move { + info!("serving a new console management API connection"); + + // these might be long running connections, have a separate logging for cancelling + // on shutdown and other ways of stopping. + let cancelled = scopeguard::guard(tracing::Span::current(), |span| { + let _e = span.entered(); + info!("console management API task cancelled"); + }); + + if let Err(e) = handle_connection(socket).await { + error!("serving failed with an error: {e}"); + } else { + info!("serving completed"); + } - if let Err(e) = handle_connection(socket).await { - error!("thread failed with an error: {e}"); + // we can no longer get dropped + scopeguard::ScopeGuard::into_inner(cancelled); } - }); + .instrument(span), + ); } } @@ -77,14 +88,14 @@ impl postgres_backend::Handler for MgmtHandler { pgb: &mut PostgresBackendTCP, query: &str, ) -> Result<(), QueryError> { - try_process_query(pgb, query).await.map_err(|e| { + try_process_query(pgb, query).map_err(|e| { error!("failed to process response: {e:?}"); e }) } } -async fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> { +fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> { let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?; let span = info_span!("event", session_id = resp.session_id); From 86604b3b7dd4a3f51bee4a8b07bd44b74682c0fb Mon Sep 17 00:00:00 2001 From: Alek Westover Date: Mon, 3 Jul 2023 07:37:30 -0400 Subject: [PATCH 23/37] Delete Unnecessary files in Extension Bucket (#4606) Co-authored-by: Alexander Bayandin --- .github/workflows/build_and_test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 068e34420ff5..cd4906579e18 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -967,6 +967,10 @@ jobs: rm -rf ./extensions-to-upload/share/extension/neon* rm -rf ./extensions-to-upload/lib/neon* + # Delete leftovers from the extension build step + rm -rf ./extensions-to-upload/lib/pgxs + rm -rf ./extensions-to-upload/lib/pkgconfig + docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions for EXT_NAME in $(ls ./custom-extensions); do mkdir -p ./extensions-to-upload/${EXT_NAME}/share From 26828560a82dadb47585c8aabfd8b569acd44448 Mon Sep 17 00:00:00 2001 
From: Shany Pozin Date: Mon, 3 Jul 2023 15:20:49 +0300 Subject: [PATCH 24/37] Add timeouts and retries to consumption metrics reporting client (#4563) ## Problem #4528 ## Summary of changes Add a 60 seconds default timeout to the reqwest client Add retries for up to 3 times to call into the metric consumption endpoint --------- Co-authored-by: Christian Schwarz --- Cargo.lock | 94 ++++++++++++++++++++++++--- Cargo.toml | 1 + pageserver/src/consumption_metrics.rs | 76 +++++++++++++++------- proxy/Cargo.toml | 1 + proxy/src/http.rs | 21 ++++++ proxy/src/metrics.rs | 58 +++++++++-------- 6 files changed, 191 insertions(+), 60 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4be74614c272..c3dc0243ddee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1140,7 +1140,7 @@ dependencies = [ "crossterm_winapi", "libc", "mio", - "parking_lot", + "parking_lot 0.12.1", "signal-hook", "signal-hook-mio", "winapi", @@ -1210,7 +1210,7 @@ dependencies = [ "hashbrown 0.12.3", "lock_api", "once_cell", - "parking_lot_core", + "parking_lot_core 0.9.7", ] [[package]] @@ -1939,6 +1939,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", ] [[package]] @@ -2629,6 +2632,17 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core 0.8.6", +] + [[package]] name = "parking_lot" version = "0.12.1" @@ -2636,7 +2650,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core", + "parking_lot_core 0.9.7", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall 0.2.16", + "smallvec", + "winapi", ] [[package]] @@ -2957,7 +2985,7 @@ dependencies = [ "lazy_static", "libc", "memchr", - "parking_lot", + "parking_lot 0.12.1", "procfs", "thiserror", ] @@ -3045,7 +3073,7 @@ dependencies = [ "native-tls", "once_cell", "opentelemetry", - "parking_lot", + "parking_lot 0.12.1", "pin-project-lite", "postgres-native-tls", "postgres_backend", @@ -3056,6 +3084,7 @@ dependencies = [ "regex", "reqwest", "reqwest-middleware", + "reqwest-retry", "reqwest-tracing", "routerify", "rstest", @@ -3291,6 +3320,29 @@ dependencies = [ "thiserror", ] +[[package]] +name = "reqwest-retry" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4" +dependencies = [ + "anyhow", + "async-trait", + "chrono", + "futures", + "getrandom", + "http", + "hyper", + "parking_lot 0.11.2", + "reqwest", + "reqwest-middleware", + "retry-policies", + "task-local-extensions", + "tokio", + "tracing", + "wasm-timer", +] + [[package]] name = "reqwest-tracing" version = "0.4.4" @@ -3309,6 +3361,17 @@ dependencies = [ "tracing-opentelemetry", ] +[[package]] +name = "retry-policies" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b" +dependencies = [ + "anyhow", + "chrono", + "rand", +] + [[package]] name = "ring" version = "0.16.20" @@ -3518,7 +3581,7 @@ dependencies = [ "hyper", "metrics", "once_cell", - "parking_lot", + "parking_lot 0.12.1", "postgres", "postgres-protocol", "postgres_backend", @@ -3947,7 +4010,7 @@ dependencies = [ "hyper", "metrics", "once_cell", - "parking_lot", + "parking_lot 0.12.1", "prost", "tokio", "tokio-stream", @@ -4281,7 +4344,7 @@ dependencies = [ "futures-channel", "futures-util", "log", - "parking_lot", + "parking_lot 0.12.1", "percent-encoding", "phf", "pin-project-lite", @@ -4991,6 +5054,21 @@ version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +[[package]] +name = "wasm-timer" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be0ecb0db480561e9a7642b5d3e4187c128914e58aa84330b9493e3eb68c5e7f" +dependencies = [ + "futures", + "js-sys", + "parking_lot 0.11.2", + "pin-utils", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "web-sys" version = "0.3.63" diff --git a/Cargo.toml b/Cargo.toml index 551a9dc78308..6d327f58029b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,6 +95,7 @@ regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] } reqwest-middleware = "0.2.0" +reqwest-retry = "0.2.2" routerify = "3" rpds = "0.13" rustls = "0.20" diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index ca7b9650e89a..f7592afc12b9 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -24,6 +24,8 @@ const RESIDENT_SIZE: &str = "resident_size"; const REMOTE_STORAGE_SIZE: &str = "remote_storage_size"; const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size"; +const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); + #[serde_as] #[derive(Serialize, Debug)] struct Ids { @@ -73,7 +75,10 @@ pub async fn collect_metrics( ); // define client here to reuse it for all requests - let client = reqwest::Client::new(); + let client = reqwest::ClientBuilder::new() + .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT) + .build() + .expect("Failed to create http client with timeout"); let mut cached_metrics: HashMap = HashMap::new(); let mut prev_iteration_time: std::time::Instant = std::time::Instant::now(); @@ -83,7 +88,7 @@ pub async fn collect_metrics( info!("collect_metrics received cancellation request"); return Ok(()); }, - _ = ticker.tick() => { + tick_at = ticker.tick() => { // send cached metrics every cached_metric_collection_interval let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval; @@ -93,6 +98,12 @@ pub async fn collect_metrics( } collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await; + + crate::tenant::tasks::warn_when_period_overrun( + tick_at.elapsed(), + metric_collection_interval, + "consumption_metrics_collect_metrics", + ); } } } @@ -273,31 +284,42 @@ pub async fn collect_metrics_iteration( }) .expect("PageserverConsumptionMetric should not fail serialization"); - let res = client - .post(metric_collection_endpoint.clone()) - .json(&chunk_json) - .send() - .await; - - match res { - Ok(res) => { - if res.status().is_success() { - // update 
cached metrics after they were sent successfully - for (curr_key, curr_val) in chunk.iter() { - cached_metrics.insert(curr_key.clone(), *curr_val); - } - } else { - error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk_to_send.iter() { - // Report if the metric value is suspiciously large - if metric.value > (1u64 << 40) { + const MAX_RETRIES: u32 = 3; + + for attempt in 0..MAX_RETRIES { + let res = client + .post(metric_collection_endpoint.clone()) + .json(&chunk_json) + .send() + .await; + + match res { + Ok(res) => { + if res.status().is_success() { + // update cached metrics after they were sent successfully + for (curr_key, curr_val) in chunk.iter() { + cached_metrics.insert(curr_key.clone(), *curr_val); + } + } else { + error!("metrics endpoint refused the sent metrics: {:?}", res); + for metric in chunk_to_send + .iter() + .filter(|metric| metric.value > (1u64 << 40)) + { + // Report if the metric value is suspiciously large error!("potentially abnormal metric value: {:?}", metric); } } + break; + } + Err(err) if err.is_timeout() => { + error!(attempt, "timeout sending metrics, retrying immediately"); + continue; + } + Err(err) => { + error!(attempt, ?err, "failed to send metrics"); + break; } - } - Err(err) => { - error!("failed to send metrics: {:?}", err); } } } @@ -317,7 +339,7 @@ pub async fn calculate_synthetic_size_worker( _ = task_mgr::shutdown_watcher() => { return Ok(()); }, - _ = ticker.tick() => { + tick_at = ticker.tick() => { let tenants = match mgr::list_tenants().await { Ok(tenants) => tenants, @@ -343,6 +365,12 @@ pub async fn calculate_synthetic_size_worker( } } + + crate::tenant::tasks::warn_when_period_overrun( + tick_at.elapsed(), + synthetic_size_calculation_interval, + "consumption_metrics_synthetic_size_worker", + ); } } } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index e7a4fd236e97..d43ea714b1e8 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -38,6 +38,7 @@ rand.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["json"] } reqwest-middleware.workspace = true +reqwest-retry.workspace = true reqwest-tracing.workspace = true routerify.workspace = true rustls-pemfile.workspace = true diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 5cf49b669ca2..153cdc02e542 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -6,8 +6,11 @@ pub mod server; pub mod sql_over_http; pub mod websocket; +use std::time::Duration; + pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; +pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use crate::url::ApiUrl; use reqwest_middleware::RequestBuilder; @@ -21,6 +24,24 @@ pub fn new_client() -> ClientWithMiddleware { .build() } +pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { + let timeout_client = reqwest::ClientBuilder::new() + .timeout(default_timout) + .build() + .expect("Failed to create http client with timeout"); + + let retry_policy = + ExponentialBackoff::builder().build_with_total_retry_duration(default_timout); + + reqwest_middleware::ClientBuilder::new(timeout_client) + .with(reqwest_tracing::TracingMiddleware::default()) + // As per docs, "This middleware always errors when given requests with streaming bodies". + // That's all right because we only use this client to send `serde_json::RawValue`, which + // is not a stream. 
+ .with(RetryTransientMiddleware::new_with_policy(retry_policy)) + .build() +} + /// Thin convenience wrapper for an API provided by an http endpoint. #[derive(Debug, Clone)] pub struct Endpoint { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 6ae1e3a447a8..00fd7f040588 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,11 +4,13 @@ use crate::{config::MetricCollectionConfig, http}; use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use serde::Serialize; -use std::collections::HashMap; +use std::{collections::HashMap, time::Duration}; use tracing::{error, info, instrument, trace, warn}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; +const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); + /// /// Key that uniquely identifies the object, this metric describes. /// Currently, endpoint_id is enough, but this may change later, @@ -30,7 +32,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> { info!("metrics collector has shut down"); } - let http_client = http::new_client(); + let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT); let mut cached_metrics: HashMap)> = HashMap::new(); let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); @@ -182,36 +184,36 @@ async fn collect_metrics_iteration( } }; - if res.status().is_success() { - // update cached metrics after they were sent successfully - for send_metric in chunk { - let stop_time = match send_metric.kind { - EventType::Incremental { stop_time, .. } => stop_time, - _ => unreachable!(), - }; - - cached_metrics - .entry(Ids { - endpoint_id: send_metric.extra.endpoint_id.clone(), - branch_id: send_metric.extra.branch_id.clone(), - }) - // update cached value (add delta) and time - .and_modify(|e| { - e.0 = e.0.saturating_add(send_metric.value); - e.1 = stop_time - }) - // cache new metric - .or_insert((send_metric.value, stop_time)); - } - } else { + if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.iter() { + for metric in chunk.iter().filter(|metric| metric.value > (1u64 << 40)) { // Report if the metric value is suspiciously large - if metric.value > (1u64 << 40) { - error!("potentially abnormal metric value: {:?}", metric); - } + error!("potentially abnormal metric value: {:?}", metric); } } + // update cached metrics after they were sent + // (to avoid sending the same metrics twice) + // see the relevant discussion on why to do so even if the status is not success: + // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 + for send_metric in chunk { + let stop_time = match send_metric.kind { + EventType::Incremental { stop_time, .. 
} => stop_time, + _ => unreachable!(), + }; + + cached_metrics + .entry(Ids { + endpoint_id: send_metric.extra.endpoint_id.clone(), + branch_id: send_metric.extra.branch_id.clone(), + }) + // update cached value (add delta) and time + .and_modify(|e| { + e.0 = e.0.saturating_add(send_metric.value); + e.1 = stop_time + }) + // cache new metric + .or_insert((send_metric.value, stop_time)); + } } Ok(()) } From 24eaa3b7ca818558c8255343e8fa134d876449fe Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 3 Jul 2023 14:21:10 +0200 Subject: [PATCH 25/37] timeline creation: reflect failures due to ancestor LSN issues in status code (#4600) Before, it was a `500` and control plane would retry, wereas it actually should have stopped retrying. (Stacked on top of https://github.com/neondatabase/neon/pull/4597 ) fixes https://github.com/neondatabase/neon/issues/4595 part of https://github.com/neondatabase/cloud/issues/5626 --------- Co-authored-by: Shany Pozin --- pageserver/src/http/openapi_spec.yml | 6 ++++ pageserver/src/http/routes.rs | 5 +++ pageserver/src/tenant.rs | 23 +++++++++---- test_runner/fixtures/pageserver/http.py | 19 +++++++++-- test_runner/regress/test_branch_and_gc.py | 10 +++++- test_runner/regress/test_branch_behind.py | 41 +++++++++++++++++++---- 6 files changed, 87 insertions(+), 17 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 33a2e321419f..290492203a2b 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -722,6 +722,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "406": + description: Permanently unsatisfiable request, don't retry. + content: + application/json: + schema: + $ref: "#/components/schemas/Error" "409": description: Timeline already exists, creation skipped content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 08d3fcf8df54..58dcbb2aac0a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -338,6 +338,11 @@ async fn timeline_create_handler( Err(tenant::CreateTimelineError::AlreadyExists) => { json_response(StatusCode::CONFLICT, ()) } + Err(tenant::CreateTimelineError::AncestorLsn(err)) => { + json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg( + format!("{err:#}") + )) + } Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)), } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 339291149ff1..3fa2a4bab428 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -506,6 +506,8 @@ pub enum CreateTimelineError { #[error("a timeline with the given ID already exists")] AlreadyExists, #[error(transparent)] + AncestorLsn(anyhow::Error), + #[error(transparent)] Other(#[from] anyhow::Error), } @@ -1432,7 +1434,7 @@ impl Tenant { let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); if ancestor_ancestor_lsn > *lsn { // can we safely just branch from the ancestor instead? 
- return Err(CreateTimelineError::Other(anyhow::anyhow!( + return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", lsn, ancestor_timeline_id, @@ -2715,7 +2717,7 @@ impl Tenant { dst_id: TimelineId, start_lsn: Option, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, CreateTimelineError> { let tl = self .branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) .await?; @@ -2732,7 +2734,7 @@ impl Tenant { dst_id: TimelineId, start_lsn: Option, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, CreateTimelineError> { self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) .await } @@ -2743,7 +2745,7 @@ impl Tenant { dst_id: TimelineId, start_lsn: Option, _ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, CreateTimelineError> { let src_id = src_timeline.timeline_id; // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN @@ -2783,16 +2785,17 @@ impl Tenant { .context(format!( "invalid branch start lsn: less than latest GC cutoff {}", *latest_gc_cutoff_lsn, - ))?; + )) + .map_err(CreateTimelineError::AncestorLsn)?; // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); if start_lsn < cutoff { - bail!(format!( + return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( "invalid branch start lsn: less than planned GC cutoff {cutoff}" - )); + ))); } } @@ -3825,6 +3828,9 @@ mod tests { { Ok(_) => panic!("branching should have failed"), Err(err) => { + let CreateTimelineError::AncestorLsn(err) = err else { + panic!("wrong error type") + }; assert!(err.to_string().contains("invalid branch start lsn")); assert!(err .source() @@ -3854,6 +3860,9 @@ mod tests { { Ok(_) => panic!("branching should have failed"), Err(err) => { + let CreateTimelineError::AncestorLsn(err) = err else { + panic!("wrong error type"); + }; assert!(&err.to_string().contains("invalid branch start lsn")); assert!(&err .source() diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 5c4f5177d066..824d11cb1762 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -21,6 +21,18 @@ def __init__(self, message, status_code: int): self.status_code = status_code +class TimelineCreate406(PageserverApiException): + def __init__(self, res: requests.Response): + assert res.status_code == 406 + super().__init__(res.json()["msg"], res.status_code) + + +class TimelineCreate409(PageserverApiException): + def __init__(self, res: requests.Response): + assert res.status_code == 409 + super().__init__("", res.status_code) + + @dataclass class InMemoryLayerInfo: kind: str @@ -309,9 +321,12 @@ def timeline_create( res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body, **kwargs ) - self.verbose_error(res) if res.status_code == 409: - raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") + raise TimelineCreate409(res) + if res.status_code == 406: + raise TimelineCreate406(res) + + self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index 4a03421fcf87..53e67b1592bd 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -4,7 +4,8 @@ import pytest 
from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.types import Lsn +from fixtures.pageserver.http import TimelineCreate406 +from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar @@ -173,5 +174,12 @@ def do_gc(): # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn) + # retry the same with the HTTP API, so that we can inspect the status code + with pytest.raises(TimelineCreate406): + new_timeline_id = TimelineId.generate() + log.info( + f"Expecting failure for branch behind gc'ing LSN, new_timeline_id={new_timeline_id}" + ) + pageserver_http_client.timeline_create(env.pg_version, tenant, new_timeline_id, b0, lsn) thread.join() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 3f7d49ab0351..a19b2862f8da 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,6 +1,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pageserver.http import TimelineCreate406 from fixtures.types import Lsn, TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -17,7 +18,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*") # Branch at the point where only 100 rows were inserted - env.neon_cli.create_branch("test_branch_behind") + branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") endpoint_main = env.endpoints.create_start("test_branch_behind") log.info("postgres is running on 'test_branch_behind' branch") @@ -89,6 +90,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): assert query_scalar(main_cur, "SELECT count(*) FROM foo") == 400100 # Check bad lsn's for branching + pageserver_http = env.pageserver.http_client() # branch at segment boundary env.neon_cli.create_branch( @@ -97,27 +99,52 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): endpoint = env.endpoints.create_start("test_branch_segment_boundary") assert endpoint.safe_psql("SELECT 1")[0][0] == 1 - # branch at pre-initdb lsn + # branch at pre-initdb lsn (from main branch) with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42")) + # retry the same with the HTTP API, so that we can inspect the status code + with pytest.raises(TimelineCreate406): + new_timeline_id = TimelineId.generate() + log.info(f"Expecting failure for branch pre-initdb LSN, new_timeline_id={new_timeline_id}") + pageserver_http.timeline_create( + env.pg_version, env.initial_tenant, new_timeline_id, env.initial_timeline, Lsn("0/42") + ) # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): env.neon_cli.create_branch( "test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn=Lsn("0/42") ) + # retry the same with the HTTP API, so that we can inspect the status code + with pytest.raises(TimelineCreate406): + new_timeline_id = TimelineId.generate() + log.info( + f"Expecting failure for branch pre-ancestor LSN, new_timeline_id={new_timeline_id}" + ) + pageserver_http.timeline_create( + env.pg_version, + env.initial_tenant, + new_timeline_id, + 
branch_behind_timeline_id, + Lsn("0/42"), + ) # check that we cannot create branch based on garbage collected data - with env.pageserver.http_client() as pageserver_http: - pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) - gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) - print_gc_result(gc_result) - + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) with pytest.raises(Exception, match="invalid branch start lsn: .*"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail env.neon_cli.create_branch( "test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn ) + # retry the same with the HTTP API, so that we can inspect the status code + with pytest.raises(TimelineCreate406): + new_timeline_id = TimelineId.generate() + log.info(f"Expecting failure for branch behind gc'd LSN, new_timeline_id={new_timeline_id}") + pageserver_http.timeline_create( + env.pg_version, env.initial_tenant, new_timeline_id, branch_behind_timeline_id, gced_lsn + ) # check that after gc everything is still there assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100 From 78a7f68902b6abc577759abb53edcc7c7c316fcd Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 3 Jul 2023 13:51:40 +0100 Subject: [PATCH 26/37] Make pg_version and build_type regular parameters (#4311) ## Problem All tests have already been parametrised by Postgres version and build type (to have them distinguishable in the Allure report), but despite it, it's anyway required to have DEFAULT_PG_VERSION and BUILD_TYPE env vars set to corresponding values, for example to run`test_timeline_deletion_with_files_stuck_in_upload_queue[release-pg14-local_fs]` test it's required to set `DEFAULT_PG_VERSION=14` and `BUILD_TYPE=release`. This PR makes the test framework pick up parameters from the test name itself. 
## Summary of changes - Postgres version and build type related fixtures now are function-scoped (instead of being sessions scoped before) - Deprecate `--pg-version` argument in favour of DEFAULT_PG_VERSION env variable (it's easier to parse) - GitHub autocomment now includes only one command with all the failed tests + runs them in parallel --- scripts/comment-test-report.js | 24 ++++++++----- test_runner/conftest.py | 2 +- test_runner/fixtures/allure.py | 25 -------------- test_runner/fixtures/neon_fixtures.py | 27 +++++++++++---- test_runner/fixtures/parametrize.py | 50 +++++++++++++++++++++++++++ test_runner/fixtures/pg_version.py | 22 ++++-------- 6 files changed, 93 insertions(+), 57 deletions(-) delete mode 100644 test_runner/fixtures/allure.py create mode 100644 test_runner/fixtures/parametrize.py diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index 432c78d1afae..dd60d42a3786 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -144,15 +144,22 @@ const reportSummary = async (params) => { } summary += `- \`${testName}\`: ${links.join(", ")}\n` } + } + } - const testsToRerun = Object.values(failedTests[pgVersion]).map(x => x[0].name) - const command = `DEFAULT_PG_VERSION=${pgVersion} scripts/pytest -k "${testsToRerun.join(" or ")}"` - - summary += "```\n" - summary += `# Run failed on Postgres ${pgVersion} tests locally:\n` - summary += `${command}\n` - summary += "```\n" + if (failedTestsCount > 0) { + const testsToRerun = [] + for (const pgVersion of Object.keys(failedTests)) { + for (const testName of Object.keys(failedTests[pgVersion])) { + testsToRerun.push(...failedTests[pgVersion][testName].map(test => test.name)) + } } + const command = `scripts/pytest -vv -n $(nproc) -k "${testsToRerun.join(" or ")}"` + + summary += "```\n" + summary += `# Run all failed tests locally:\n` + summary += `${command}\n` + summary += "```\n" } if (flakyTestsCount > 0) { @@ -164,8 +171,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries` - const status = test.status === "passed" ? ":white_check_mark:" : ":x:" - links.push(`[${status} ${test.buildType}](${allureLink})`) + links.push(`[${test.buildType}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 4e649e111acc..1c36c1ed02c4 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,6 +1,6 @@ pytest_plugins = ( "fixtures.pg_version", - "fixtures.allure", + "fixtures.parametrize", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/allure.py b/test_runner/fixtures/allure.py deleted file mode 100644 index 6f40bd2aa2b2..000000000000 --- a/test_runner/fixtures/allure.py +++ /dev/null @@ -1,25 +0,0 @@ -import os - -import pytest - -from fixtures.pg_version import DEFAULT_VERSION, PgVersion - -""" -Set of utilities to make Allure report more informative. - -- It adds BUILD_TYPE and DEFAULT_PG_VERSION to the test names (only in test_runner/regress) -to make tests distinguishable in Allure report. 
-""" - - -@pytest.fixture(scope="function", autouse=True) -def allure_noop(): - pass - - -def pytest_generate_tests(metafunc): - if "test_runner/regress" in metafunc.definition._nodeid: - build_type = os.environ.get("BUILD_TYPE", "DEBUG").lower() - pg_version = PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) - - metafunc.parametrize("allure_noop", [f"{build_type}-pg{pg_version}"]) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e56bf78019a4..c3e985397893 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -102,8 +102,8 @@ def base_dir() -> Iterator[Path]: yield base_dir -@pytest.fixture(scope="session") -def neon_binpath(base_dir: Path) -> Iterator[Path]: +@pytest.fixture(scope="function") +def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]: if os.getenv("REMOTE_ENV"): # we are in remote env and do not have neon binaries locally # this is the case for benchmarks run on self-hosted runner @@ -113,7 +113,6 @@ def neon_binpath(base_dir: Path) -> Iterator[Path]: if env_neon_bin := os.environ.get("NEON_BIN"): binpath = Path(env_neon_bin) else: - build_type = os.environ.get("BUILD_TYPE", "debug") binpath = base_dir / "target" / build_type log.info(f"neon_binpath is {binpath}") @@ -123,7 +122,7 @@ def neon_binpath(base_dir: Path) -> Iterator[Path]: yield binpath -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"): distrib_dir = Path(env_postgres_bin).resolve() @@ -147,7 +146,7 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]: yield output_dir -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]: versioned_dir = pg_distrib_dir / pg_version.v_prefixed @@ -174,7 +173,23 @@ def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "fu def myfixture(...) ... 
""" - return "function" if os.environ.get("TEST_SHARED_FIXTURES") is None else "session" + scope: Literal["session", "function"] + + if os.environ.get("TEST_SHARED_FIXTURES") is None: + # Create the environment in the per-test output directory + scope = "function" + elif ( + os.environ.get("BUILD_TYPE") is not None + and os.environ.get("DEFAULT_PG_VERSION") is not None + ): + scope = "session" + else: + pytest.fail( + "Shared environment(TEST_SHARED_FIXTURES) requires BUILD_TYPE and DEFAULT_PG_VERSION to be set", + pytrace=False, + ) + + return scope @pytest.fixture(scope="session") diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py new file mode 100644 index 000000000000..53350138ddcd --- /dev/null +++ b/test_runner/fixtures/parametrize.py @@ -0,0 +1,50 @@ +import os +from typing import Optional + +import pytest +from _pytest.fixtures import FixtureRequest +from _pytest.python import Metafunc + +from fixtures.pg_version import PgVersion + +""" +Dynamically parametrize tests by Postgres version and build type (debug/release/remote) +""" + + +@pytest.fixture(scope="function", autouse=True) +def pg_version(request: FixtureRequest) -> Optional[PgVersion]: + # Do not parametrize performance tests yet, we need to prepare grafana charts first + if "test_runner/performance" in str(request.node.path): + v = os.environ.get("DEFAULT_PG_VERSION") + return PgVersion(v) + + return None + + +@pytest.fixture(scope="function", autouse=True) +def build_type(request: FixtureRequest) -> Optional[str]: + # Do not parametrize performance tests yet, we need to prepare grafana charts first + if "test_runner/performance" in str(request.node.path): + return os.environ.get("BUILD_TYPE", "").lower() + + return None + + +def pytest_generate_tests(metafunc: Metafunc): + # Do not parametrize performance tests yet, we need to prepare grafana charts first + if "test_runner/performance" in metafunc.definition._nodeid: + return + + if (v := os.environ.get("DEFAULT_PG_VERSION")) is None: + pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] + else: + pg_versions = [PgVersion(v)] + + if (bt := os.environ.get("BUILD_TYPE")) is None: + build_types = ["debug", "release"] + else: + build_types = [bt.lower()] + + metafunc.parametrize("build_type", build_types) + metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 14ae88cc2cd1..b61f52be3c00 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -1,12 +1,10 @@ import enum import os -from typing import Iterator, Optional +from typing import Optional import pytest +from _pytest.config import Config from _pytest.config.argparsing import Parser -from pytest import FixtureRequest - -from fixtures.log_helper import log """ This fixture is used to determine which version of Postgres to use for tests. 
@@ -75,18 +73,10 @@ def pytest_addoption(parser: Parser): "--pg-version", action="store", type=PgVersion, - help="Postgres version to use for tests", + help="DEPRECATED: Postgres version to use for tests", ) -@pytest.fixture(scope="session") -def pg_version(request: FixtureRequest) -> Iterator[PgVersion]: - if v := request.config.getoption("--pg-version"): - version, source = v, "from --pg-version command-line argument" - elif v := os.environ.get("DEFAULT_PG_VERSION"): - version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable" - else: - version, source = DEFAULT_VERSION, "default version" - - log.info(f"pg_version is {version} ({source})") - yield version +def pytest_configure(config: Config): + if config.getoption("--pg-version"): + raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead") From cff7ae0b0defcdc15200579f85b938c8d3e4976a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 3 Jul 2023 16:37:02 +0300 Subject: [PATCH 27/37] fix: no more ansi colored logs (#4613) Allure does not support ANSI-colored logs, yet `compute_ctl` emits them. Upgrade criterion to get rid of the atty dependency, disable ANSI colors, remove the atty dependency and disable the `ansi` feature of tracing-subscriber. This is a heavy-handed approach: I am not aware of a workflow where you'd want to connect a terminal directly to, say, `compute_ctl`; usually you find the logs in a file. If someone had been using colors, they will now need to: - turn `tracing-subscriber.default-features` back to `true` - enable colors in the project they care about (a sketch of what that looks like follows below) I decided to explicitly disable ANSI colors in case a dependency accidentally enables the feature on `tracing-subscriber` in the future, which would be quite surprising but not unimaginable. By getting rid of `atty` from dependencies we get rid of .
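For anyone who does want colored output in a local build, a rough sketch of the opt-in (this is an assumption about a downstream change, not part of this patch): re-enable the `ansi` feature of `tracing-subscriber` in the consuming crate and turn the flag back on when building the subscriber.

```rust
// Hypothetical local opt-in, not part of this change.
// The consuming crate's Cargo.toml would need something like:
// tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt", "ansi"] }

fn init_colored_logging() {
    tracing_subscriber::fmt()
        .with_ansi(true) // opt back into ANSI escape codes
        .with_writer(std::io::stderr)
        .init();
}
```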
--- Cargo.lock | 104 ++++++------------------------------ Cargo.toml | 5 +- compute_tools/src/logger.rs | 1 + libs/utils/Cargo.toml | 1 - libs/utils/src/logging.rs | 2 +- proxy/Cargo.toml | 1 - proxy/src/logging.rs | 2 +- workspace_hack/Cargo.toml | 2 +- 8 files changed, 22 insertions(+), 96 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c3dc0243ddee..0c0ed530fbbe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -200,17 +200,6 @@ dependencies = [ "critical-section", ] -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi 0.1.19", - "libc", - "winapi", -] - [[package]] name = "autocfg" version = "1.1.0" @@ -805,18 +794,6 @@ dependencies = [ "libloading", ] -[[package]] -name = "clap" -version = "3.2.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" -dependencies = [ - "bitflags", - "clap_lex 0.2.4", - "indexmap", - "textwrap", -] - [[package]] name = "clap" version = "4.3.0" @@ -837,7 +814,7 @@ dependencies = [ "anstream", "anstyle", "bitflags", - "clap_lex 0.5.0", + "clap_lex", "strsim", ] @@ -853,15 +830,6 @@ dependencies = [ "syn 2.0.16", ] -[[package]] -name = "clap_lex" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" -dependencies = [ - "os_str_bytes", -] - [[package]] name = "clap_lex" version = "0.5.0" @@ -915,7 +883,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.3.0", + "clap", "compute_api", "futures", "hyper", @@ -977,7 +945,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.3.0", + "clap", "comfy-table", "compute_api", "git-version", @@ -1047,19 +1015,19 @@ dependencies = [ [[package]] name = "criterion" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" dependencies = [ "anes", - "atty", "cast", "ciborium", - "clap 3.2.25", + "clap", "criterion-plot", + "is-terminal", "itertools", - "lazy_static", "num-traits", + "once_cell", "oorandom", "plotters", "rayon", @@ -1676,15 +1644,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - [[package]] name = "hermit-abi" version = "0.2.6" @@ -2270,16 +2229,6 @@ dependencies = [ "windows-sys 0.45.0", ] -[[package]] -name = "nu-ansi-term" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" -dependencies = [ - "overload", - "winapi", -] - [[package]] name = "num-bigint" version = "0.4.3" @@ -2507,31 +2456,19 @@ dependencies = [ "winapi", ] -[[package]] -name = "os_str_bytes" -version = "6.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" - [[package]] 
name = "outref" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - [[package]] name = "pagectl" version = "0.1.0" dependencies = [ "anyhow", "bytes", - "clap 4.3.0", + "clap", "git-version", "pageserver", "postgres_ffi", @@ -2550,7 +2487,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.3.0", + "clap", "close_fds", "const_format", "consumption_metrics", @@ -3050,12 +2987,11 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", - "atty", "base64 0.13.1", "bstr", "bytes", "chrono", - "clap 4.3.0", + "clap", "consumption_metrics", "futures", "git-version", @@ -3570,7 +3506,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.3.0", + "clap", "const_format", "crc32c", "fs2", @@ -4000,7 +3936,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.3.0", + "clap", "const_format", "futures", "futures-core", @@ -4181,12 +4117,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "textwrap" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" - [[package]] name = "thiserror" version = "1.0.40" @@ -4602,7 +4532,7 @@ name = "trace" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.3.0", + "clap", "pageserver_api", "utils", "workspace_hack", @@ -4704,7 +4634,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", - "nu-ansi-term", "once_cell", "regex", "serde", @@ -4873,7 +4802,6 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", - "atty", "bincode", "byteorder", "bytes", @@ -4950,7 +4878,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.3.0", + "clap", "env_logger", "log", "once_cell", @@ -5330,7 +5258,7 @@ dependencies = [ "anyhow", "bytes", "chrono", - "clap 4.3.0", + "clap", "clap_builder", "crossbeam-utils", "either", diff --git a/Cargo.toml b/Cargo.toml index 6d327f58029b..4b9bd8b961e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,6 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } async-stream = "0.3" async-trait = "0.1" -atty = "0.2.14" aws-config = { version = "0.55", default-features = false, features=["rustls"] } aws-sdk-s3 = "0.27" aws-smithy-http = "0.55" @@ -129,7 +128,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]} tracing = "0.1" tracing-error = "0.2.0" tracing-opentelemetry = "0.18.0" -tracing-subscriber = { version = "0.3", features = ["env-filter"] } +tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] } url = "2.2" uuid = { version = "1.2", features = ["v4", "serde"] } walkdir = "2.3.2" @@ -171,7 +170,7 @@ utils = { version = "0.1", path = "./libs/utils/" } workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies -criterion = "0.4" +criterion = "0.5.1" rcgen = "0.10" rstest = "0.17" tempfile = "3.4" diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index f6fc88296830..3ae68de8efae 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -18,6 +18,7 @@ 
pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); let fmt_layer = tracing_subscriber::fmt::layer() + .with_ansi(false) .with_target(false) .with_writer(std::io::stderr); diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 8239ffff5737..87b0082356ba 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -5,7 +5,6 @@ edition.workspace = true license.workspace = true [dependencies] -atty.workspace = true sentry.workspace = true async-trait.workspace = true anyhow.workspace = true diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 2b8c852d86d7..7b32bf2841b9 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -84,7 +84,7 @@ pub fn init( let r = r.with({ let log_layer = tracing_subscriber::fmt::layer() .with_target(false) - .with_ansi(atty::is(atty::Stream::Stdout)) + .with_ansi(false) .with_writer(std::io::stdout); let log_layer = match log_format { LogFormat::Json => log_layer.json().boxed(), diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d43ea714b1e8..c8b0cc7cf08d 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -7,7 +7,6 @@ license.workspace = true [dependencies] anyhow.workspace = true async-trait.workspace = true -atty.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 0c8c2858b900..3405b8cbc672 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -18,7 +18,7 @@ pub async fn init() -> anyhow::Result { .from_env_lossy(); let fmt_layer = tracing_subscriber::fmt::layer() - .with_ansi(atty::is(atty::Stream::Stderr)) + .with_ansi(false) .with_writer(std::io::stderr) .with_target(false); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 677b59f4535c..63a65d388948 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -54,7 +54,7 @@ toml_edit = { version = "0.19", features = ["serde"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } +tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] } url = { version = "2", features = ["serde"] } [build-dependencies] From 10110bee69d323b234875e1355ce169bf054f0bf Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 3 Jul 2023 16:10:44 +0100 Subject: [PATCH 28/37] fix setup instructions (#4615) ## Problem 1. The local endpoints provision 2 ports (postgres and HTTP) which means the migration_check endpoint has a different port than what the setup implies 2. psycopg2-binary 2.9.3 has a deprecated poetry config and doesn't install. 
## Summary of changes Update psycopg2-binary and update the endpoint ports in the readme --------- Co-authored-by: Alexander Bayandin --- README.md | 16 +++---- poetry.lock | 123 +++++++++++++++++++++++++------------------------ pyproject.toml | 2 +- 3 files changed, 71 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index efa714e5be23..1c2452f4350d 100644 --- a/README.md +++ b/README.md @@ -132,13 +132,13 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r # Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script > cargo neon init -Starting pageserver at '127.0.0.1:64000' in '.neon'. +Initializing pageserver node 1 at '127.0.0.1:64000' in ".neon" # start pageserver, safekeeper, and broker for their intercommunication > cargo neon start -Starting neon broker at 127.0.0.1:50051 +Starting neon broker at 127.0.0.1:50051. storage_broker started, pid: 2918372 -Starting pageserver at '127.0.0.1:64000' in '.neon'. +Starting pageserver node 1 at '127.0.0.1:64000' in ".neon". pageserver started, pid: 2918386 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. safekeeper 1 started, pid: 2918437 @@ -152,8 +152,7 @@ Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one # start postgres compute node > cargo neon endpoint start main Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ... -Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 -Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' +Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres' # check list of running postgres instances > cargo neon endpoint list @@ -189,18 +188,17 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: # start postgres on that branch > cargo neon endpoint start migration_check --branch-name migration_check Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ... 
-Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 -Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' +Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres' # check the new list of running postgres instances > cargo neon endpoint list ENDPOINT ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running - migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running + migration_check 127.0.0.1:55434 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres -> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres +> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- diff --git a/poetry.lock b/poetry.lock index 8dc45f68b844..aa76ac6dbdc6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1654,71 +1654,74 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] [[package]] name = "psycopg2-binary" -version = "2.9.3" +version = "2.9.6" description = "psycopg2 - Python-PostgreSQL Database Adapter" category = "main" optional = false python-versions = ">=3.6" files = [ - {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:0a29729145aaaf1ad8bafe663131890e2111f13416b60e460dae0a96af5905c9"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3a79d622f5206d695d7824cbf609a4f5b88ea6d6dab5f7c147fc6d333a8787e4"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:090f3348c0ab2cceb6dfbe6bf721ef61262ddf518cd6cc6ecc7d334996d64efa"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a9e1f75f96ea388fbcef36c70640c4efbe4650658f3d6a2967b4cc70e907352e"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c3ae8e75eb7160851e59adc77b3a19a976e50622e44fd4fd47b8b18208189d42"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-win32.whl", hash = "sha256:7b1e9b80afca7b7a386ef087db614faebbf8839b7f4db5eb107d0f1a53225029"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:8b344adbb9a862de0c635f4f0425b7958bf5a4b927c8594e6e8d261775796d53"}, - {file = 
"psycopg2_binary-2.9.3-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:e847774f8ffd5b398a75bc1c18fbb56564cda3d629fe68fd81971fece2d3c67e"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68641a34023d306be959101b345732360fc2ea4938982309b786f7be1b43a4a1"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3303f8807f342641851578ee7ed1f3efc9802d00a6f83c101d21c608cb864460"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_aarch64.whl", hash = "sha256:e3699852e22aa68c10de06524a3721ade969abf382da95884e6a10ff798f9281"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_ppc64le.whl", hash = "sha256:526ea0378246d9b080148f2d6681229f4b5964543c170dd10bf4faaab6e0d27f"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:b1c8068513f5b158cf7e29c43a77eb34b407db29aca749d3eb9293ee0d3103ca"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:15803fa813ea05bef089fa78835118b5434204f3a17cb9f1e5dbfd0b9deea5af"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:152f09f57417b831418304c7f30d727dc83a12761627bb826951692cc6491e57"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:404224e5fef3b193f892abdbf8961ce20e0b6642886cfe1fe1923f41aaa75c9d"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-win32.whl", hash = "sha256:1f6b813106a3abdf7b03640d36e24669234120c72e91d5cbaeb87c5f7c36c65b"}, - {file = "psycopg2_binary-2.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:2d872e3c9d5d075a2e104540965a1cf898b52274a5923936e5bfddb58c59c7c2"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:10bb90fb4d523a2aa67773d4ff2b833ec00857f5912bafcfd5f5414e45280fb1"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a52ecab70af13e899f7847b3e074eeb16ebac5615665db33bce8a1009cf33"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a29b3ca4ec9defec6d42bf5feb36bb5817ba3c0230dd83b4edf4bf02684cd0ae"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:12b11322ea00ad8db8c46f18b7dfc47ae215e4df55b46c67a94b4effbaec7094"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_ppc64le.whl", hash = "sha256:53293533fcbb94c202b7c800a12c873cfe24599656b341f56e71dd2b557be063"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c381bda330ddf2fccbafab789d83ebc6c53db126e4383e73794c74eedce855ef"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:9d29409b625a143649d03d0fd7b57e4b92e0ecad9726ba682244b73be91d2fdb"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:183a517a3a63503f70f808b58bfbf962f23d73b6dccddae5aa56152ef2bcb232"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:15c4e4cfa45f5a60599d9cec5f46cd7b1b29d86a6390ec23e8eebaae84e64554"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, - {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = 
"sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_ppc64le.whl", hash = "sha256:63638d875be8c2784cfc952c9ac34e2b50e43f9f0a0660b65e2a87d656b3116c"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ffb7a888a047696e7f8240d649b43fb3644f14f0ee229077e7f6b9f9081635bd"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0c9d5450c566c80c396b7402895c4369a410cab5a82707b11aee1e624da7d004"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:d1c1b569ecafe3a69380a94e6ae09a4789bbb23666f3d3a08d06bbd2451f5ef1"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8fc53f9af09426a61db9ba357865c77f26076d48669f2e1bb24d85a22fb52307"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_ppc64le.whl", hash = "sha256:7af0dd86ddb2f8af5da57a976d27cd2cd15510518d582b478fbb2292428710b4"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:93cd1967a18aa0edd4b95b1dfd554cf15af657cb606280996d393dadc88c3c35"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bda845b664bb6c91446ca9609fc69f7db6c334ec5e4adc87571c34e4f47b7ddb"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:01310cf4cf26db9aea5158c217caa92d291f0500051a6469ac52166e1a16f5b7"}, - {file = 
"psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:99485cab9ba0fa9b84f1f9e1fef106f44a46ef6afdeec8885e0b88d0772b49e8"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-win32.whl", hash = "sha256:46f0e0a6b5fa5851bbd9ab1bc805eef362d3a230fbdfbc209f4a236d0a7a990d"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f"}, + {file = "psycopg2-binary-2.9.6.tar.gz", hash = "sha256:1f64dcfb8f6e0c014c7f55e51c9759f024f70ea572fbdef123f85318c297947c"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d26e0342183c762de3276cca7a530d574d4e25121ca7d6e4a98e4f05cb8e4df7"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c48d8f2db17f27d41fb0e2ecd703ea41984ee19362cbce52c097963b3a1b4365"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffe9dc0a884a8848075e576c1de0290d85a533a9f6e9c4e564f19adf8f6e54a7"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a76e027f87753f9bd1ab5f7c9cb8c7628d1077ef927f5e2446477153a602f2c"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6460c7a99fc939b849431f1e73e013d54aa54293f30f1109019c56a0b2b2ec2f"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae102a98c547ee2288637af07393dd33f440c25e5cd79556b04e3fca13325e5f"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9972aad21f965599ed0106f65334230ce826e5ae69fda7cbd688d24fa922415e"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a40c00dbe17c0af5bdd55aafd6ff6679f94a9be9513a4c7e071baf3d7d22a70"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cacbdc5839bdff804dfebc058fe25684cae322987f7a38b0168bc1b2df703fb1"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7f0438fa20fb6c7e202863e0d5ab02c246d35efb1d164e052f2f3bfe2b152bd0"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-win32.whl", hash = "sha256:b6c8288bb8a84b47e07013bb4850f50538aa913d487579e1921724631d02ea1b"}, + {file = "psycopg2_binary-2.9.6-cp310-cp310-win_amd64.whl", hash = "sha256:61b047a0537bbc3afae10f134dc6393823882eb263088c271331602b672e52e9"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:964b4dfb7c1c1965ac4c1978b0f755cc4bd698e8aa2b7667c575fb5f04ebe06b"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afe64e9b8ea66866a771996f6ff14447e8082ea26e675a295ad3bdbffdd72afb"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e2ee79e7cf29582ef770de7dab3d286431b01c3bb598f8e05e09601b890081"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfa74c903a3c1f0d9b1c7e7b53ed2d929a4910e272add6700c38f365a6002820"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b83456c2d4979e08ff56180a76429263ea254c3f6552cd14ada95cff1dec9bb8"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645376d399bfd64da57148694d78e1f431b1e1ee1054872a5713125681cf1be"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_aarch64.whl", 
hash = "sha256:e99e34c82309dd78959ba3c1590975b5d3c862d6f279f843d47d26ff89d7d7e1"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4ea29fc3ad9d91162c52b578f211ff1c931d8a38e1f58e684c45aa470adf19e2"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:4ac30da8b4f57187dbf449294d23b808f8f53cad6b1fc3623fa8a6c11d176dd0"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e78e6e2a00c223e164c417628572a90093c031ed724492c763721c2e0bc2a8df"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-win32.whl", hash = "sha256:1876843d8e31c89c399e31b97d4b9725a3575bb9c2af92038464231ec40f9edb"}, + {file = "psycopg2_binary-2.9.6-cp311-cp311-win_amd64.whl", hash = "sha256:b4b24f75d16a89cc6b4cdff0eb6a910a966ecd476d1e73f7ce5985ff1328e9a6"}, + {file = "psycopg2_binary-2.9.6-cp36-cp36m-win32.whl", hash = "sha256:498807b927ca2510baea1b05cc91d7da4718a0f53cb766c154c417a39f1820a0"}, + {file = "psycopg2_binary-2.9.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0d236c2825fa656a2d98bbb0e52370a2e852e5a0ec45fc4f402977313329174d"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:34b9ccdf210cbbb1303c7c4db2905fa0319391bd5904d32689e6dd5c963d2ea8"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d2222e61f313c4848ff05353653bf5f5cf6ce34df540e4274516880d9c3763"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30637a20623e2a2eacc420059be11527f4458ef54352d870b8181a4c3020ae6b"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8122cfc7cae0da9a3077216528b8bb3629c43b25053284cc868744bfe71eb141"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38601cbbfe600362c43714482f43b7c110b20cb0f8172422c616b09b85a750c5"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c7e62ab8b332147a7593a385d4f368874d5fe4ad4e341770d4983442d89603e3"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2ab652e729ff4ad76d400df2624d223d6e265ef81bb8aa17fbd63607878ecbee"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c83a74b68270028dc8ee74d38ecfaf9c90eed23c8959fca95bd703d25b82c88e"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d4e6036decf4b72d6425d5b29bbd3e8f0ff1059cda7ac7b96d6ac5ed34ffbacd"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-win32.whl", hash = "sha256:a8c28fd40a4226b4a84bdf2d2b5b37d2c7bd49486b5adcc200e8c7ec991dfa7e"}, + {file = "psycopg2_binary-2.9.6-cp37-cp37m-win_amd64.whl", hash = "sha256:51537e3d299be0db9137b321dfb6a5022caaab275775680e0c3d281feefaca6b"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf4499e0a83b7b7edcb8dabecbd8501d0d3a5ef66457200f77bde3d210d5debb"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e13a5a2c01151f1208d5207e42f33ba86d561b7a89fca67c700b9486a06d0e2"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e0f754d27fddcfd74006455b6e04e6705d6c31a612ec69ddc040a5468e44b4e"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d57c3fd55d9058645d26ae37d76e61156a27722097229d32a9e73ed54819982a"}, + {file = 
"psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71f14375d6f73b62800530b581aed3ada394039877818b2d5f7fc77e3bb6894d"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:441cc2f8869a4f0f4bb408475e5ae0ee1f3b55b33f350406150277f7f35384fc"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:65bee1e49fa6f9cf327ce0e01c4c10f39165ee76d35c846ade7cb0ec6683e303"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:af335bac6b666cc6aea16f11d486c3b794029d9df029967f9938a4bed59b6a19"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:cfec476887aa231b8548ece2e06d28edc87c1397ebd83922299af2e051cf2827"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:65c07febd1936d63bfde78948b76cd4c2a411572a44ac50719ead41947d0f26b"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-win32.whl", hash = "sha256:4dfb4be774c4436a4526d0c554af0cc2e02082c38303852a36f6456ece7b3503"}, + {file = "psycopg2_binary-2.9.6-cp38-cp38-win_amd64.whl", hash = "sha256:02c6e3cf3439e213e4ee930308dc122d6fb4d4bea9aef4a12535fbd605d1a2fe"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e9182eb20f41417ea1dd8e8f7888c4d7c6e805f8a7c98c1081778a3da2bee3e4"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8a6979cf527e2603d349a91060f428bcb135aea2be3201dff794813256c274f1"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8338a271cb71d8da40b023a35d9c1e919eba6cbd8fa20a54b748a332c355d896"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ed340d2b858d6e6fb5083f87c09996506af483227735de6964a6100b4e6a54"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f81e65376e52f03422e1fb475c9514185669943798ed019ac50410fb4c4df232"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfb13af3c5dd3a9588000910178de17010ebcccd37b4f9794b00595e3a8ddad3"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4c727b597c6444a16e9119386b59388f8a424223302d0c06c676ec8b4bc1f963"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d67fbdaf177da06374473ef6f7ed8cc0a9dc640b01abfe9e8a2ccb1b1402c1f"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0892ef645c2fabb0c75ec32d79f4252542d0caec1d5d949630e7d242ca4681a3"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:02c0f3757a4300cf379eb49f543fb7ac527fb00144d39246ee40e1df684ab514"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-win32.whl", hash = "sha256:c3dba7dab16709a33a847e5cd756767271697041fbe3fe97c215b1fc1f5c9848"}, + {file = "psycopg2_binary-2.9.6-cp39-cp39-win_amd64.whl", hash = "sha256:f6a88f384335bb27812293fdb11ac6aee2ca3f51d3c7820fe03de0a304ab6249"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index 2c21af698260..ac4e8fa2dd1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ authors = [] [tool.poetry.dependencies] python = "^3.9" pytest = "^7.3.1" -psycopg2-binary = "^2.9.1" +psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} requests = "^2.31.0" From 
9c8c55e8191fed0c07efebc01cff286d58e6c32d Mon Sep 17 00:00:00 2001 From: arpad-m Date: Mon, 3 Jul 2023 19:34:07 +0200 Subject: [PATCH 29/37] Add _cached and _bytes to pageserver_tenant_synthetic_size metric name (#4616) This renames the `pageserver_tenant_synthetic_size` metric to `pageserver_tenant_synthetic_cached_size_bytes`, as was requested on slack (link in the linked issue). * `_cached` to hint that it is not incrementally calculated * `_bytes` to indicate the unit the size is measured in Fixes #3748 --- pageserver/src/metrics.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 00745143bdc0..8765aa6aff50 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -204,11 +204,11 @@ pub static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( - "pageserver_tenant_synthetic_size", - "Synthetic size of each tenant", + "pageserver_tenant_synthetic_cached_size_bytes", + "Synthetic size of each tenant in bytes", &["tenant_id"] ) - .expect("Failed to register pageserver_tenant_synthetic_size metric") + .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric") }); // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, From ab2ea8cfa53cd32b9075f7c1af32325137cddccf Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 4 Jul 2023 14:54:59 +0100 Subject: [PATCH 30/37] use pbkdf2 crate (#4626) ## Problem While pbkdf2 is a simple algorithm, we should probably use a well tested implementation ## Summary of changes * Use pbkdf2 crate * Use arrays like the hmac comment says ## Checklist before requesting a review - [X] I have performed a self-review of my code. - [X] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
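For reference, the heart of the change is that the hand-rolled PBKDF2 loop in `proxy/src/scram/password.rs` becomes a single call into the `pbkdf2` crate. A minimal sketch of that call (assuming the `sha2` crate already used by the SCRAM code; an illustration, not the exact proxy code):

```rust
use pbkdf2::pbkdf2_hmac_array;
use sha2::Sha256;

// PBKDF2-HMAC-SHA-256 with the 32-byte output a SCRAM-SHA-256 salted password needs.
// The const-generic helper replaces the manual "derive U_i and XOR them together" loop.
fn salted_password(password: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
    pbkdf2_hmac_array::<Sha256, 32>(password, salt, iterations)
}
```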
--- Cargo.lock | 11 ++++++ Cargo.toml | 1 + proxy/Cargo.toml | 1 + proxy/src/scram.rs | 71 +++++++++++++++++++++++++++++++++---- proxy/src/scram/password.rs | 52 ++++++++++++++++++++------- 5 files changed, 116 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0c0ed530fbbe..b163d4fe4654 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2617,6 +2617,16 @@ dependencies = [ "windows-sys 0.45.0", ] +[[package]] +name = "pbkdf2" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31" +dependencies = [ + "digest", + "hmac", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -3010,6 +3020,7 @@ dependencies = [ "once_cell", "opentelemetry", "parking_lot 0.12.1", + "pbkdf2", "pin-project-lite", "postgres-native-tls", "postgres_backend", diff --git a/Cargo.toml b/Cargo.toml index 4b9bd8b961e8..f36e8f6569f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -86,6 +86,7 @@ opentelemetry = "0.18.0" opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions = "0.10.0" parking_lot = "0.12" +pbkdf2 = "0.12.1" pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index c8b0cc7cf08d..438dd62315be 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -29,6 +29,7 @@ metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true parking_lot.workspace = true +pbkdf2.workspace = true pin-project-lite.workspace = true postgres_backend.workspace = true pq_proto.workspace = true diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 7cc41914359a..85854427ed8f 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -45,17 +45,74 @@ fn hmac_sha256<'a>(key: &[u8], parts: impl IntoIterator) -> [u8 let mut mac = Hmac::::new_from_slice(key).expect("bad key size"); parts.into_iter().for_each(|s| mac.update(s)); - // TODO: maybe newer `hmac` et al already migrated to regular arrays? 
- let mut result = [0u8; 32]; - result.copy_from_slice(mac.finalize().into_bytes().as_slice()); - result + mac.finalize().into_bytes().into() } fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { let mut hasher = Sha256::new(); parts.into_iter().for_each(|s| hasher.update(s)); - let mut result = [0u8; 32]; - result.copy_from_slice(hasher.finalize().as_slice()); - result + hasher.finalize().into() +} + +#[cfg(test)] +mod tests { + use crate::sasl::{Mechanism, Step}; + + use super::{password::SaltedPassword, Exchange, ServerSecret}; + + #[test] + fn happy_path() { + let iterations = 4096; + let salt_base64 = "QSXCR+Q6sek8bf92"; + let pw = SaltedPassword::new( + b"pencil", + base64::decode(salt_base64).unwrap().as_slice(), + iterations, + ); + + let secret = ServerSecret { + iterations, + salt_base64: salt_base64.to_owned(), + stored_key: pw.client_key().sha256(), + server_key: pw.server_key(), + doomed: false, + }; + const NONCE: [u8; 18] = [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + ]; + let mut exchange = Exchange::new(&secret, || NONCE, None); + + let client_first = "n,,n=user,r=rOprNGfwEbeRWgbNEkqO"; + let client_final = "c=biws,r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,p=rw1r5Kph5ThxmaUBC2GAQ6MfXbPnNkFiTIvdb/Rear0="; + let server_first = + "r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,s=QSXCR+Q6sek8bf92,i=4096"; + let server_final = "v=qtUDIofVnIhM7tKn93EQUUt5vgMOldcDVu1HC+OH0o0="; + + exchange = match exchange.exchange(client_first).unwrap() { + Step::Continue(exchange, message) => { + assert_eq!(message, server_first); + exchange + } + Step::Success(_, _) => panic!("expected continue, got success"), + Step::Failure(f) => panic!("{f}"), + }; + + let key = match exchange.exchange(client_final).unwrap() { + Step::Success(key, message) => { + assert_eq!(message, server_final); + key + } + Step::Continue(_, _) => panic!("expected success, got continue"), + Step::Failure(f) => panic!("{f}"), + }; + + assert_eq!( + key.as_bytes(), + [ + 74, 103, 1, 132, 12, 31, 200, 48, 28, 54, 82, 232, 207, 12, 138, 189, 40, 32, 134, + 27, 125, 170, 232, 35, 171, 167, 166, 41, 70, 228, 182, 112, + ] + ); + } } diff --git a/proxy/src/scram/password.rs b/proxy/src/scram/password.rs index 656780d853dc..022f2842dd8e 100644 --- a/proxy/src/scram/password.rs +++ b/proxy/src/scram/password.rs @@ -14,19 +14,7 @@ impl SaltedPassword { /// See `scram-common.c : scram_SaltedPassword` for details. /// Further reading: (see `PBKDF2`). pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { - let one = 1_u32.to_be_bytes(); // magic - - let mut current = super::hmac_sha256(password, [salt, &one]); - let mut result = current; - for _ in 1..iterations { - current = super::hmac_sha256(password, [current.as_ref()]); - // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094 - for (i, x) in current.iter().enumerate() { - result[i] ^= x; - } - } - - result.into() + pbkdf2::pbkdf2_hmac_array::(password, salt, iterations).into() } /// Derive `ClientKey` from a salted hashed password. 
@@ -46,3 +34,41 @@ impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword { Self { bytes } } } + +#[cfg(test)] +mod tests { + use super::SaltedPassword; + + fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { + let one = 1_u32.to_be_bytes(); // magic + + let mut current = super::super::hmac_sha256(password, [salt, &one]); + let mut result = current; + for _ in 1..iterations { + current = super::super::hmac_sha256(password, [current.as_ref()]); + // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094 + for (i, x) in current.iter().enumerate() { + result[i] ^= x; + } + } + + result.into() + } + + #[test] + fn pbkdf2() { + let password = "a-very-secure-password"; + let salt = "such-a-random-salt"; + let iterations = 4096; + let output = [ + 203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211, + 101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42, + ]; + + let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations); + let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations); + + assert_eq!(actual.bytes, output); + assert_eq!(actual.bytes, expected.bytes); + } +} From 10aba174c999c795eb3b2170cfca5597530a7eab Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 4 Jul 2023 17:40:51 +0300 Subject: [PATCH 31/37] metrics: Remove comments regarding upgradeable rwlocks (#4622) Closes #4001 by removing the comments alluding towards upgradeable/downgradeable RwLocks. --- pageserver/src/metrics.rs | 5 ----- pageserver/src/virtual_file.rs | 9 --------- 2 files changed, 14 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 8765aa6aff50..8f85488aec92 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -968,7 +968,6 @@ impl RemoteTimelineClientMetrics { op_kind: &RemoteOpKind, status: &'static str, ) -> Histogram { - // XXX would be nice to have an upgradable RwLock let mut guard = self.remote_operation_time.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str(), status); let metric = guard.entry(key).or_insert_with(move || { @@ -990,7 +989,6 @@ impl RemoteTimelineClientMetrics { file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> IntGauge { - // XXX would be nice to have an upgradable RwLock let mut guard = self.calls_unfinished_gauge.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { @@ -1011,7 +1009,6 @@ impl RemoteTimelineClientMetrics { file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> Histogram { - // XXX would be nice to have an upgradable RwLock let mut guard = self.calls_started_hist.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { @@ -1032,7 +1029,6 @@ impl RemoteTimelineClientMetrics { file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> IntCounter { - // XXX would be nice to have an upgradable RwLock let mut guard = self.bytes_started_counter.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { @@ -1053,7 +1049,6 @@ impl RemoteTimelineClientMetrics { file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> IntCounter { - // XXX would be nice to have an upgradable RwLock let mut guard = self.bytes_finished_counter.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { diff --git 
a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index fb216123c109..b86d14f1580e 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -302,15 +302,6 @@ impl VirtualFile { .observe_closure_duration(|| self.open_options.open(&self.path))?; // Perform the requested operation on it - // - // TODO: We could downgrade the locks to read mode before calling - // 'func', to allow a little bit more concurrency, but the standard - // library RwLock doesn't allow downgrading without releasing the lock, - // and that doesn't seem worth the trouble. - // - // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implementation is fair and - // may deadlock on subsequent read calls. - // Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly. let result = STORAGE_IO_TIME .with_label_values(&[op, &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| func(&file)); From cbf9a408898d7c5b598842cb2d9b91dd1de9a0a8 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 4 Jul 2023 14:28:27 +0300 Subject: [PATCH 32/37] Set a shorter timeout for the initial connection attempt in proxy. In case we try to connect to an outdated address that is no longer valid, the default behavior of Kubernetes is to drop the packets, causing us to wait for the entire timeout period. We want to fail fast in such cases. A specific case to consider is when we have cached compute node information with a 5-minute TTL (Time To Live), but the user has executed a `/suspend` API call, resulting in the nonexistence of the compute node. --- proxy/src/compute.rs | 13 +++++++------ proxy/src/config.rs | 2 +- proxy/src/proxy.rs | 30 +++++++++++++++++++++++++++--- 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 480acb88d912..70b29679b9d9 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -136,18 +136,17 @@ impl ConnCfg { /// Establish a raw TCP connection to the compute node. - async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream, &str)> { + async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { use tokio_postgres::config::Host; // wrap TcpStream::connect with timeout let connect_with_timeout = |host, port| { - let connection_timeout = Duration::from_millis(10000); - tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map( + tokio::time::timeout(timeout, TcpStream::connect((host, port))).map( move |res| match res { Ok(tcpstream_connect_res) => tcpstream_connect_res, Err(_) => Err(io::Error::new( io::ErrorKind::TimedOut, - format!("exceeded connection timeout {connection_timeout:?}"), + format!("exceeded connection timeout {timeout:?}"), )), }, ) @@ -223,8 +222,9 @@ impl ConnCfg { async fn do_connect( &self, allow_self_signed_compute: bool, + timeout: Duration, ) -> Result { - let (socket_addr, stream, host) = self.connect_raw().await?; + let (socket_addr, stream, host) = self.connect_raw(timeout).await?; let tls_connector = native_tls::TlsConnector::builder() .danger_accept_invalid_certs(allow_self_signed_compute) @@ -264,8 +264,9 @@ impl ConnCfg { pub async fn connect( &self, allow_self_signed_compute: bool, + timeout: Duration, ) -> Result { - self.do_connect(allow_self_signed_compute) + self.do_connect(allow_self_signed_compute, timeout) .inspect_err(|err| { // Immediately log the error we have at our disposal.
error!("couldn't connect to compute node: {err}"); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 6a26cea78e79..00f561fcf2e7 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -212,7 +212,7 @@ pub struct CacheOptions { impl CacheOptions { /// Default options for [`crate::auth::caches::NodeInfoCache`]. - pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=5m"; + pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=4m"; /// Parse cache options passed via cmdline. /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`]. diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 8efb7005c89e..2433412c4c23 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -16,7 +16,10 @@ use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCou use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio::{ + io::{AsyncRead, AsyncWrite, AsyncWriteExt}, + time, +}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; use utils::measured_stream::MeasuredStream; @@ -305,12 +308,13 @@ pub fn invalidate_cache(node_info: &console::CachedNodeInfo) { #[tracing::instrument(name = "connect_once", skip_all)] async fn connect_to_compute_once( node_info: &console::CachedNodeInfo, + timeout: time::Duration, ) -> Result { let allow_self_signed_compute = node_info.allow_self_signed_compute; node_info .config - .connect(allow_self_signed_compute) + .connect(allow_self_signed_compute, timeout) .inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info)) .await } @@ -328,7 +332,27 @@ async fn connect_to_compute( loop { // Apply startup params to the (possibly, cached) compute node info. node_info.config.set_startup_params(params); - match connect_to_compute_once(node_info).await { + + // Set a shorter timeout for the initial connection attempt. + // + // In case we try to connect to an outdated address that is no longer valid, the + // default behavior of Kubernetes is to drop the packets, causing us to wait for + // the entire timeout period. We want to fail fast in such cases. + // + // A specific case to consider is when we have cached compute node information + // with a 4-minute TTL (Time To Live), but the user has executed a `/suspend` API + // call, resulting in the nonexistence of the compute node. + // + // We only use caching in case of scram proxy backed by the console, so reduce + // the timeout only in that case. + let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _)); + let timeout = if is_scram_proxy && num_retries == NUM_RETRIES_WAKE_COMPUTE { + time::Duration::from_secs(2) + } else { + time::Duration::from_secs(10) + }; + + match connect_to_compute_once(node_info, timeout).await { Err(e) if num_retries > 0 => { info!("compute node's state has changed; requesting a wake-up"); match creds.wake_compute(extra).map_err(io_error).await? 
{ From c7143dbde6db30c14807de0c5f70a0599f268e97 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 4 Jul 2023 19:07:36 -0400 Subject: [PATCH 33/37] compute_ctl: Fix misleading metric (#4608) --- compute_tools/src/compute.rs | 9 +++++++-- libs/compute_api/src/responses.rs | 1 + test_runner/performance/test_startup.py | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 70d83a7b477a..aec4e4972514 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -516,9 +516,9 @@ impl ComputeNode { self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); - let pg = self.start_postgres(pspec.storage_auth_token.clone())?; + let config_time = Utc::now(); if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { self.apply_config(&compute_state)?; } @@ -526,11 +526,16 @@ impl ComputeNode { let startup_end_time = Utc::now(); { let mut state = self.state.lock().unwrap(); - state.metrics.config_ms = startup_end_time + state.metrics.start_postgres_ms = config_time .signed_duration_since(start_time) .to_std() .unwrap() .as_millis() as u64; + state.metrics.config_ms = startup_end_time + .signed_duration_since(config_time) + .to_std() + .unwrap() + .as_millis() as u64; state.metrics.total_startup_ms = startup_end_time .signed_duration_since(compute_state.start_time) .to_std() diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index ce73dda08ad8..80e534121644 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -71,6 +71,7 @@ pub struct ComputeMetrics { pub wait_for_spec_ms: u64, pub sync_safekeepers_ms: u64, pub basebackup_ms: u64, + pub start_postgres_ms: u64, pub config_ms: u64, pub total_startup_ms: u64, } diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index 8babbbe13224..4744c1ed2eec 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -52,6 +52,7 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc "wait_for_spec_ms": f"{i}_wait_for_spec", "sync_safekeepers_ms": f"{i}_sync_safekeepers", "basebackup_ms": f"{i}_basebackup", + "start_postgres_ms": f"{i}_start_postgres", "config_ms": f"{i}_config", "total_startup_ms": f"{i}_total_startup", } From 3f9defbfb4428bb087b2d899e8b609f89bc01b7d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 5 Jul 2023 10:38:32 +0200 Subject: [PATCH 34/37] page cache: add access & hit rate metrics (#4628) Co-authored-by: Dmitry Rodionov --- pageserver/src/metrics.rs | 73 +++++++++++++++++++++++++++++++++ pageserver/src/page_cache.rs | 41 +++++++++++++++++- test_runner/fixtures/metrics.py | 2 + 3 files changed, 114 insertions(+), 2 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 8f85488aec92..db5bccdbbac1 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -130,6 +130,79 @@ pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub struct PageCacheMetrics { + pub read_accesses_materialized_page: IntCounter, + pub read_accesses_ephemeral: IntCounter, + pub read_accesses_immutable: IntCounter, + + pub read_hits_ephemeral: IntCounter, + pub read_hits_immutable: IntCounter, + pub read_hits_materialized_page_exact: IntCounter, + pub read_hits_materialized_page_older_lsn: IntCounter, +} + +static PAGE_CACHE_READ_HITS: Lazy = Lazy::new(|| 
{ + register_int_counter_vec!( + "pageserver_page_cache_read_hits_total", + "Number of read accesses to the page cache that hit", + &["key_kind", "hit_kind"] + ) + .expect("failed to define a metric") +}); + +static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_page_cache_read_accesses_total", + "Number of read accesses to the page cache", + &["key_kind"] + ) + .expect("failed to define a metric") +}); + +pub static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { + read_accesses_materialized_page: { + PAGE_CACHE_READ_ACCESSES + .get_metric_with_label_values(&["materialized_page"]) + .unwrap() + }, + + read_accesses_ephemeral: { + PAGE_CACHE_READ_ACCESSES + .get_metric_with_label_values(&["ephemeral"]) + .unwrap() + }, + + read_accesses_immutable: { + PAGE_CACHE_READ_ACCESSES + .get_metric_with_label_values(&["immutable"]) + .unwrap() + }, + + read_hits_ephemeral: { + PAGE_CACHE_READ_HITS + .get_metric_with_label_values(&["ephemeral", "-"]) + .unwrap() + }, + + read_hits_immutable: { + PAGE_CACHE_READ_HITS + .get_metric_with_label_values(&["immutable", "-"]) + .unwrap() + }, + + read_hits_materialized_page_exact: { + PAGE_CACHE_READ_HITS + .get_metric_with_label_values(&["materialized_page", "exact"]) + .unwrap() + }, + + read_hits_materialized_page_older_lsn: { + PAGE_CACHE_READ_HITS + .get_metric_with_label_values(&["materialized_page", "older_lsn"]) + .unwrap() + }, +}); + static WAIT_LSN_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_wait_lsn_seconds", diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index d2fe06697ef9..ef0e748d102e 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -313,6 +313,10 @@ impl PageCache { key: &Key, lsn: Lsn, ) -> Option<(Lsn, PageReadGuard)> { + crate::metrics::PAGE_CACHE + .read_accesses_materialized_page + .inc(); + let mut cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { tenant_id, @@ -323,8 +327,21 @@ impl PageCache { }; if let Some(guard) = self.try_lock_for_read(&mut cache_key) { - if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key { - Some((lsn, guard)) + if let CacheKey::MaterializedPage { + hash_key: _, + lsn: available_lsn, + } = cache_key + { + if available_lsn == lsn { + crate::metrics::PAGE_CACHE + .read_hits_materialized_page_exact + .inc(); + } else { + crate::metrics::PAGE_CACHE + .read_hits_materialized_page_older_lsn + .inc(); + } + Some((available_lsn, guard)) } else { panic!("unexpected key type in slot"); } @@ -499,11 +516,31 @@ impl PageCache { /// ``` /// fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result { + let (read_access, hit) = match cache_key { + CacheKey::MaterializedPage { .. } => { + unreachable!("Materialized pages use lookup_materialized_page") + } + CacheKey::EphemeralPage { .. } => ( + &crate::metrics::PAGE_CACHE.read_accesses_ephemeral, + &crate::metrics::PAGE_CACHE.read_hits_ephemeral, + ), + CacheKey::ImmutableFilePage { .. } => ( + &crate::metrics::PAGE_CACHE.read_accesses_immutable, + &crate::metrics::PAGE_CACHE.read_hits_immutable, + ), + }; + read_access.inc(); + + let mut is_first_iteration = true; loop { // First check if the key already exists in the cache. if let Some(read_guard) = self.try_lock_for_read(cache_key) { + if is_first_iteration { + hit.inc(); + } return Ok(ReadBufResult::Found(read_guard)); } + is_first_iteration = false; // Not found. 
Find a victim buffer let (slot_idx, mut inner) = diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 7ee3c33f9289..a2bc2e28e509 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -59,6 +59,8 @@ def parse_metrics(text: str, name: str = "") -> Metrics: "libmetrics_tracing_event_count_total", "pageserver_materialized_cache_hits_total", "pageserver_materialized_cache_hits_direct_total", + "pageserver_page_cache_read_hits_total", + "pageserver_page_cache_read_accesses_total", "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", From f1db87ac361a43e4758a25c982ba4008ba290ce4 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 5 Jul 2023 11:40:38 +0300 Subject: [PATCH 35/37] Check if there is enough memory for HNSW index (#4602) ## Problem HNSW index is created in memory. Try to prevent OOM by checking of available RAM. ## Summary of changes ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/hnsw/hnsw.c | 39 +++++++++++++++++++++++++++++++++++++++ pgxn/hnsw/hnsw.control | 2 +- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c index 434f4986f8c4..45bf78ed3bc1 100644 --- a/pgxn/hnsw/hnsw.c +++ b/pgxn/hnsw/hnsw.c @@ -122,6 +122,43 @@ hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel) true, true, hnsw_build_callback, (void *) hnsw, NULL); } +#ifdef __APPLE__ + +#include +#include + +static void +hnsw_check_available_memory(Size requested) +{ + size_t total; + if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0) + elog(ERROR, "Failed to get amount of RAM: %m"); + + if ((Size)NBuffers*BLCKSZ + requested >= total) + elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", + requested, total - (Size)NBuffers*BLCKSZ); +} + +#else + +#include + +static void +hnsw_check_available_memory(Size requested) +{ + struct sysinfo si; + Size total; + if (sysinfo(&si) < 0) + elog(ERROR, "Failed to get amount of RAM: %n"); + + total = si.totalram*si.mem_unit; + if ((Size)NBuffers*BLCKSZ + requested >= total) + elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", + requested, total - (Size)NBuffers*BLCKSZ); +} + +#endif + static HierarchicalNSW* hnsw_get_index(Relation indexRel, Relation heapRel) { @@ -156,6 +193,8 @@ hnsw_get_index(Relation indexRel, Relation heapRel) size_data_per_element = size_links_level0 + data_size + sizeof(label_t); shmem_size = hnsw_sizeof() + maxelements * size_data_per_element; + hnsw_check_available_memory(shmem_size); + /* first try to attach to existed index */ if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, &mapped_address, &mapped_size, DEBUG1)) diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control index b292b960260c..8b75c350a83f 100644 --- a/pgxn/hnsw/hnsw.control +++ b/pgxn/hnsw/hnsw.control @@ -1,4 +1,4 @@ -comment = 'hNsw index' +comment = 'hnsw index' default_version = '0.1.0' module_pathname = '$libdir/hnsw' 
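For illustration only (not applied by this patch): hnsw_check_available_memory above refuses to build the index when the requested size plus the Postgres buffer pool would not fit in physical RAM. A rough, Linux-only Rust sketch of the same guard, reading /proc/meminfo and using invented names, just to show the idea:

    use std::fs;
    use std::io;

    // Total physical memory in bytes, parsed from /proc/meminfo (Linux only).
    fn total_ram_bytes() -> io::Result<u64> {
        let meminfo = fs::read_to_string("/proc/meminfo")?;
        for line in meminfo.lines() {
            if let Some(rest) = line.strip_prefix("MemTotal:") {
                let kib: u64 = rest
                    .trim()
                    .trim_end_matches("kB")
                    .trim()
                    .parse()
                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, format!("{e}")))?;
                return Ok(kib * 1024);
            }
        }
        Err(io::Error::new(io::ErrorKind::NotFound, "MemTotal not found"))
    }

    // Refuse to build an in-memory index that, together with the buffer pool,
    // would exceed physical RAM.
    fn check_available_memory(requested: u64, buffer_pool_bytes: u64) -> io::Result<()> {
        let total = total_ram_bytes()?;
        if buffer_pool_bytes.saturating_add(requested) >= total {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!(
                    "index requires {requested} bytes while only {} are available",
                    total.saturating_sub(buffer_pool_bytes)
                ),
            ));
        }
        Ok(())
    }
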
relocatable = true From dbf88cf2d72c2298ee6606bf89c5b6bf854fc195 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 4 Jul 2023 13:27:08 +0300 Subject: [PATCH 36/37] Minimalistic pool for http endpoint compute connections (under opt-in flag) Cache up to 20 connections per endpoint. Once all pooled connections are used current implementation can open an extra connection, so the maximum number of simultaneous connections is not enforced. There are more things to do here, especially with background clean-up of closed connections, and checks for transaction state. But current implementation allows to check for smaller coonection latencies that this cache should bring. --- proxy/src/http.rs | 1 + proxy/src/http/conn_pool.rs | 278 ++++++++++++++++++++++++++++++++ proxy/src/http/sql_over_http.rs | 131 +++------------ proxy/src/http/websocket.rs | 12 +- 4 files changed, 307 insertions(+), 115 deletions(-) create mode 100644 proxy/src/http/conn_pool.rs diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 153cdc02e542..e2812403806d 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -2,6 +2,7 @@ //! Other modules should use stuff from this module instead of //! directly relying on deps like `reqwest` (think loose coupling). +pub mod conn_pool; pub mod server; pub mod sql_over_http; pub mod websocket; diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs new file mode 100644 index 000000000000..52c1e2f2ce69 --- /dev/null +++ b/proxy/src/http/conn_pool.rs @@ -0,0 +1,278 @@ +use parking_lot::Mutex; +use pq_proto::StartupMessageParams; +use std::fmt; +use std::{collections::HashMap, sync::Arc}; + +use futures::TryFutureExt; + +use crate::config; +use crate::{auth, console}; + +use super::sql_over_http::MAX_RESPONSE_SIZE; + +use crate::proxy::invalidate_cache; +use crate::proxy::NUM_RETRIES_WAKE_COMPUTE; + +use tracing::error; +use tracing::info; + +pub const APP_NAME: &str = "sql_over_http"; +const MAX_CONNS_PER_ENDPOINT: usize = 20; + +#[derive(Debug)] +pub struct ConnInfo { + pub username: String, + pub dbname: String, + pub hostname: String, + pub password: String, +} + +impl ConnInfo { + // hm, change to hasher to avoid cloning? + pub fn db_and_user(&self) -> (String, String) { + (self.dbname.clone(), self.username.clone()) + } +} + +impl fmt::Display for ConnInfo { + // use custom display to avoid logging password + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}@{}/{}", self.username, self.hostname, self.dbname) + } +} + +struct ConnPoolEntry { + conn: tokio_postgres::Client, + _last_access: std::time::Instant, +} + +// Per-endpoint connection pool, (dbname, username) -> Vec +// Number of open connections is limited by the `max_conns_per_endpoint`. +pub struct EndpointConnPool { + pools: HashMap<(String, String), Vec>, + total_conns: usize, +} + +pub struct GlobalConnPool { + // endpoint -> per-endpoint connection pool + // + // That should be a fairly conteded map, so return reference to the per-endpoint + // pool as early as possible and release the lock. + global_pool: Mutex>>>, + + // Maximum number of connections per one endpoint. + // Can mix different (dbname, username) connections. + // When running out of free slots for a particular endpoint, + // falls back to opening a new connection for each request. 
+ max_conns_per_endpoint: usize, + + proxy_config: &'static crate::config::ProxyConfig, +} + +impl GlobalConnPool { + pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { + Arc::new(Self { + global_pool: Mutex::new(HashMap::new()), + max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT, + proxy_config: config, + }) + } + + pub async fn get( + &self, + conn_info: &ConnInfo, + force_new: bool, + ) -> anyhow::Result { + let mut client: Option = None; + + if !force_new { + let pool = self.get_endpoint_pool(&conn_info.hostname).await; + + // find a pool entry by (dbname, username) if exists + let mut pool = pool.lock(); + let pool_entries = pool.pools.get_mut(&conn_info.db_and_user()); + if let Some(pool_entries) = pool_entries { + if let Some(entry) = pool_entries.pop() { + client = Some(entry.conn); + pool.total_conns -= 1; + } + } + } + + // ok return cached connection if found and establish a new one otherwise + if let Some(client) = client { + if client.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + connect_to_compute(self.proxy_config, conn_info).await + } else { + info!("pool: reusing connection '{conn_info}'"); + Ok(client) + } + } else { + info!("pool: opening a new connection '{conn_info}'"); + connect_to_compute(self.proxy_config, conn_info).await + } + } + + pub async fn put( + &self, + conn_info: &ConnInfo, + client: tokio_postgres::Client, + ) -> anyhow::Result<()> { + let pool = self.get_endpoint_pool(&conn_info.hostname).await; + + // return connection to the pool + let mut total_conns; + let mut returned = false; + let mut per_db_size = 0; + { + let mut pool = pool.lock(); + total_conns = pool.total_conns; + + let pool_entries: &mut Vec = pool + .pools + .entry(conn_info.db_and_user()) + .or_insert_with(|| Vec::with_capacity(1)); + if total_conns < self.max_conns_per_endpoint { + pool_entries.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); + + total_conns += 1; + returned = true; + per_db_size = pool_entries.len(); + + pool.total_conns += 1; + } + } + + // do logging outside of the mutex + if returned { + info!("pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + } else { + info!("pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); + } + + Ok(()) + } + + async fn get_endpoint_pool(&self, endpoint: &String) -> Arc> { + // find or create a pool for this endpoint + let mut created = false; + let mut global_pool = self.global_pool.lock(); + let pool = global_pool + .entry(endpoint.clone()) + .or_insert_with(|| { + created = true; + Arc::new(Mutex::new(EndpointConnPool { + pools: HashMap::new(), + total_conns: 0, + })) + }) + .clone(); + let global_pool_size = global_pool.len(); + drop(global_pool); + + // log new global pool size + if created { + info!( + "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" + ); + } + + pool + } +} + +// +// Wake up the destination if needed. Code here is a bit involved because +// we reuse the code from the usual proxy and we need to prepare few structures +// that this code expects. 
+// +async fn connect_to_compute( + config: &config::ProxyConfig, + conn_info: &ConnInfo, +) -> anyhow::Result { + let tls = config.tls_config.as_ref(); + let common_names = tls.and_then(|tls| tls.common_names.clone()); + + let credential_params = StartupMessageParams::new([ + ("user", &conn_info.username), + ("database", &conn_info.dbname), + ("application_name", APP_NAME), + ]); + + let creds = config + .auth_backend + .as_ref() + .map(|_| { + auth::ClientCredentials::parse( + &credential_params, + Some(&conn_info.hostname), + common_names, + ) + }) + .transpose()?; + let extra = console::ConsoleReqExtra { + session_id: uuid::Uuid::new_v4(), + application_name: Some(APP_NAME), + }; + + let node_info = &mut creds.wake_compute(&extra).await?.expect("msg"); + + // This code is a copy of `connect_to_compute` from `src/proxy.rs` with + // the difference that it uses `tokio_postgres` for the connection. + let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE; + loop { + match connect_to_compute_once(node_info, conn_info).await { + Err(e) if num_retries > 0 => { + info!("compute node's state has changed; requesting a wake-up"); + match creds.wake_compute(&extra).await? { + // Update `node_info` and try one more time. + Some(new) => { + *node_info = new; + } + // Link auth doesn't work that way, so we just exit. + None => return Err(e), + } + } + other => return other, + } + + num_retries -= 1; + info!("retrying after wake-up ({num_retries} attempts left)"); + } +} + +async fn connect_to_compute_once( + node_info: &console::CachedNodeInfo, + conn_info: &ConnInfo, +) -> anyhow::Result { + let mut config = (*node_info.config).clone(); + + let (client, connection) = config + .user(&conn_info.username) + .password(&conn_info.password) + .dbname(&conn_info.dbname) + .max_backend_message_size(MAX_RESPONSE_SIZE) + .connect(tokio_postgres::NoTls) + .inspect_err(|e: &tokio_postgres::Error| { + error!( + "failed to connect to compute node hosts={:?} ports={:?}: {}", + node_info.config.get_hosts(), + node_info.config.get_ports(), + e + ); + invalidate_cache(node_info) + }) + .await?; + + tokio::spawn(async move { + if let Err(e) = connection.await { + error!("connection error: {}", e); + } + }); + + Ok(client) +} diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index e8ad2d04f30c..adf7252f7288 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -1,25 +1,21 @@ +use std::sync::Arc; + use futures::pin_mut; use futures::StreamExt; -use futures::TryFutureExt; use hyper::body::HttpBody; use hyper::http::HeaderName; use hyper::http::HeaderValue; use hyper::{Body, HeaderMap, Request}; -use pq_proto::StartupMessageParams; use serde_json::json; use serde_json::Map; use serde_json::Value; use tokio_postgres::types::Kind; use tokio_postgres::types::Type; use tokio_postgres::Row; -use tracing::error; -use tracing::info; -use tracing::instrument; use url::Url; -use crate::proxy::invalidate_cache; -use crate::proxy::NUM_RETRIES_WAKE_COMPUTE; -use crate::{auth, config::ProxyConfig, console}; +use super::conn_pool::ConnInfo; +use super::conn_pool::GlobalConnPool; #[derive(serde::Deserialize)] struct QueryData { @@ -27,12 +23,13 @@ struct QueryData { params: Vec, } -const APP_NAME: &str = "sql_over_http"; -const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB +pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static 
ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); +static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); + static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); // @@ -96,13 +93,6 @@ fn json_array_to_pg_array(value: &Value) -> Result, serde_json::E } } -struct ConnInfo { - username: String, - dbname: String, - hostname: String, - password: String, -} - fn get_conn_info( headers: &HeaderMap, sni_hostname: Option, @@ -169,50 +159,23 @@ fn get_conn_info( // TODO: return different http error codes pub async fn handle( - config: &'static ProxyConfig, request: Request, sni_hostname: Option, + conn_pool: Arc, ) -> anyhow::Result { // // Determine the destination and connection params // let headers = request.headers(); let conn_info = get_conn_info(headers, sni_hostname)?; - let credential_params = StartupMessageParams::new([ - ("user", &conn_info.username), - ("database", &conn_info.dbname), - ("application_name", APP_NAME), - ]); // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); - // - // Wake up the destination if needed. Code here is a bit involved because - // we reuse the code from the usual proxy and we need to prepare few structures - // that this code expects. - // - let tls = config.tls_config.as_ref(); - let common_names = tls.and_then(|tls| tls.common_names.clone()); - let creds = config - .auth_backend - .as_ref() - .map(|_| { - auth::ClientCredentials::parse( - &credential_params, - Some(&conn_info.hostname), - common_names, - ) - }) - .transpose()?; - let extra = console::ConsoleReqExtra { - session_id: uuid::Uuid::new_v4(), - application_name: Some(APP_NAME), - }; - - let mut node_info = creds.wake_compute(&extra).await?.expect("msg"); + // Allow connection pooling only if explicitly requested + let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); let request_content_length = match request.body().size_hint().upper() { Some(v) => v, @@ -235,7 +198,8 @@ pub async fn handle( // // Now execute the query and return the result // - let client = connect_to_compute(&mut node_info, &extra, &creds, &conn_info).await?; + let client = conn_pool.get(&conn_info, !allow_pool).await?; + let row_stream = client.query_raw_txt(query, query_params).await?; // Manually drain the stream into a vector to leave row_stream hanging @@ -292,6 +256,13 @@ pub async fn handle( .map(|row| pg_text_row_to_json(row, raw_output, array_mode)) .collect::, _>>()?; + if allow_pool { + // return connection to the pool + tokio::task::spawn(async move { + let _ = conn_pool.put(&conn_info, client).await; + }); + } + // resulting JSON format is based on the format of node-postgres result Ok(json!({ "command": command_tag_name, @@ -302,70 +273,6 @@ pub async fn handle( })) } -/// This function is a copy of `connect_to_compute` from `src/proxy.rs` with -/// the difference that it uses `tokio_postgres` for the connection. 
-#[instrument(skip_all)] -async fn connect_to_compute( - node_info: &mut console::CachedNodeInfo, - extra: &console::ConsoleReqExtra<'_>, - creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, - conn_info: &ConnInfo, -) -> anyhow::Result { - let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE; - - loop { - match connect_to_compute_once(node_info, conn_info).await { - Err(e) if num_retries > 0 => { - info!("compute node's state has changed; requesting a wake-up"); - match creds.wake_compute(extra).await? { - // Update `node_info` and try one more time. - Some(new) => { - *node_info = new; - } - // Link auth doesn't work that way, so we just exit. - None => return Err(e), - } - } - other => return other, - } - - num_retries -= 1; - info!("retrying after wake-up ({num_retries} attempts left)"); - } -} - -async fn connect_to_compute_once( - node_info: &console::CachedNodeInfo, - conn_info: &ConnInfo, -) -> anyhow::Result { - let mut config = (*node_info.config).clone(); - - let (client, connection) = config - .user(&conn_info.username) - .password(&conn_info.password) - .dbname(&conn_info.dbname) - .max_backend_message_size(MAX_RESPONSE_SIZE) - .connect(tokio_postgres::NoTls) - .inspect_err(|e: &tokio_postgres::Error| { - error!( - "failed to connect to compute node hosts={:?} ports={:?}: {}", - node_info.config.get_hosts(), - node_info.config.get_ports(), - e - ); - invalidate_cache(node_info) - }) - .await?; - - tokio::spawn(async move { - if let Err(e) = connection.await { - error!("connection error: {}", e); - } - }); - - Ok(client) -} - // // Convert postgres row with text-encoded values to JSON object // diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index 9f467aceb768..83ba034e577f 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -35,7 +35,7 @@ use utils::http::{error::ApiError, json::json_response}; // Tracking issue: https://github.com/rust-lang/rust/issues/98407. use sync_wrapper::SyncWrapper; -use super::sql_over_http; +use super::{conn_pool::GlobalConnPool, sql_over_http}; pin_project! { /// This is a wrapper around a [`WebSocketStream`] that @@ -164,6 +164,7 @@ async fn serve_websocket( async fn ws_handler( mut request: Request, config: &'static ProxyConfig, + conn_pool: Arc, cancel_map: Arc, session_id: uuid::Uuid, sni_hostname: Option, @@ -192,7 +193,7 @@ async fn ws_handler( // TODO: that deserves a refactor as now this function also handles http json client besides websockets. // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead. 
} else if request.uri().path() == "/sql" && request.method() == Method::POST { - let result = sql_over_http::handle(config, request, sni_hostname) + let result = sql_over_http::handle(request, sni_hostname, conn_pool) .instrument(info_span!("sql-over-http")) .await; let status_code = match result { @@ -234,6 +235,8 @@ pub async fn task_main( info!("websocket server has shut down"); } + let conn_pool: Arc = GlobalConnPool::new(config); + let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config()); let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config { Some(config) => config.into(), @@ -258,15 +261,18 @@ pub async fn task_main( let make_svc = hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream| { let sni_name = stream.get_ref().1.sni_hostname().map(|s| s.to_string()); + let conn_pool = conn_pool.clone(); async move { Ok::<_, Infallible>(hyper::service::service_fn(move |req: Request| { let sni_name = sni_name.clone(); + let conn_pool = conn_pool.clone(); + async move { let cancel_map = Arc::new(CancelMap::default()); let session_id = uuid::Uuid::new_v4(); - ws_handler(req, config, cancel_map, session_id, sni_name) + ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name) .instrument(info_span!( "ws-client", session = format_args!("{session_id}") From c92b7543b5fe70b48d6f71cb9d29b16651154e0c Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 5 Jul 2023 12:39:51 +0200 Subject: [PATCH 37/37] Update `pgvector` to 0.4.4 (#4632) After announcing `hnsw`, there is a hypothesis that the community will start comparing it with `pgvector` by themselves. Therefore, let's have an actual version of `pgvector` in Neon. --- Dockerfile.compute-node | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 21877f6f240f..310e4c32a3ca 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -189,8 +189,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \ - echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \ + echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \