diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index caa7ebee053d..820848b4fb1b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -404,7 +404,7 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - needs: [ check-permissions, build-neon ] + needs: [ check-permissions, build-neon, tag ] runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned @@ -436,6 +436,7 @@ jobs: env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty + BUILD_TAG: ${{ needs.tag.outputs.build-tag }} - name: Merge and upload coverage data if: matrix.build_type == 'debug' && matrix.pg_version == 'v14' diff --git a/Cargo.lock b/Cargo.lock index 48c7bf1795b6..56396657586e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -256,19 +256,21 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "0.56.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3d533e0263bf453cc80af4c8bcc4d64e2aca293bd16f81633a36f1bf4a97cb" +checksum = "80c950a809d39bc9480207cb1cfc879ace88ea7e3a4392a8e9999e45d6e5692e" dependencies = [ "aws-credential-types", "aws-http", + "aws-runtime", "aws-sdk-sso", + "aws-sdk-ssooidc", "aws-sdk-sts", "aws-smithy-async", - "aws-smithy-client", "aws-smithy-http", - "aws-smithy-http-tower", "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", @@ -276,52 +278,46 @@ dependencies = [ "hex", "http", "hyper", - "ring", + "ring 0.17.6", "time", "tokio", - "tower", "tracing", "zeroize", ] [[package]] name = "aws-credential-types" -version = "0.56.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4834ba01c5ad1ed9740aa222de62190e3c565d11ab7e72cc68314a258994567" +checksum = "8c1317e1a3514b103cf7d5828bbab3b4d30f56bd22d684f8568bc51b6cfbbb1c" dependencies = [ "aws-smithy-async", + "aws-smithy-runtime-api", "aws-smithy-types", - "fastrand 2.0.0", - "tokio", - "tracing", "zeroize", ] [[package]] name = "aws-http" -version = "0.56.0" +version = "0.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72badf9de83cc7d66b21b004f09241836823b8302afb25a24708769e576a8d8f" +checksum = "361c4310fdce94328cc2d1ca0c8a48c13f43009c61d3367585685a50ca8c66b6" dependencies = [ - "aws-credential-types", - "aws-smithy-http", + "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", "http", "http-body", - "lazy_static", - "percent-encoding", "pin-project-lite", "tracing", ] [[package]] name = "aws-runtime" -version = "0.56.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf832f522111225c02547e1e1c28137e840e4b082399d93a236e4b29193a4667" +checksum = "1ed7ef604a15fd0d4d9e43701295161ea6b504b63c44990ead352afea2bc15e9" dependencies = [ "aws-credential-types", "aws-http", @@ -341,9 +337,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "0.29.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e30370b61599168d38190ad272bb91842cd81870a6ca035c05dd5726d22832c" +checksum = "9dcafc2fe52cc30b2d56685e2fa6a879ba50d79704594852112337a472ddbd24" dependencies = [ "aws-credential-types", "aws-http", @@ -351,7 +347,6 @@ dependencies = [ "aws-sigv4", "aws-smithy-async", 
"aws-smithy-checksums", - "aws-smithy-client", "aws-smithy-eventstream", "aws-smithy-http", "aws-smithy-json", @@ -366,22 +361,42 @@ dependencies = [ "once_cell", "percent-encoding", "regex", - "tokio-stream", "tracing", "url", ] [[package]] name = "aws-sdk-sso" -version = "0.29.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f41bf2c28d32dbb9894a8fcfcb148265d034d3f4a170552a47553a09de890895" +checksum = "0619ab97a5ca8982e7de073cdc66f93e5f6a1b05afc09e696bec1cb3607cd4df" +dependencies = [ + "aws-credential-types", + "aws-http", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http", + "regex", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f04b9f5474cc0f35d829510b2ec8c21e352309b46bf9633c5a81fb9321e9b1c7" dependencies = [ "aws-credential-types", "aws-http", "aws-runtime", "aws-smithy-async", - "aws-smithy-client", "aws-smithy-http", "aws-smithy-json", "aws-smithy-runtime", @@ -391,21 +406,19 @@ dependencies = [ "bytes", "http", "regex", - "tokio-stream", "tracing", ] [[package]] name = "aws-sdk-sts" -version = "0.29.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e21aa1a5b0853969a1ef96ccfaa8ff5d57c761549786a4d5f86c1902b2586a" +checksum = "5700da387716ccfc30b27f44b008f457e1baca5b0f05b6b95455778005e3432a" dependencies = [ "aws-credential-types", "aws-http", "aws-runtime", "aws-smithy-async", - "aws-smithy-client", "aws-smithy-http", "aws-smithy-json", "aws-smithy-query", @@ -421,42 +434,49 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "0.56.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cb40a93429794065f41f0581734fc56a345f6a38d8e2e3c25c7448d930cd132" +checksum = "380adcc8134ad8bbdfeb2ace7626a869914ee266322965276cbc54066186d236" dependencies = [ + "aws-credential-types", "aws-smithy-eventstream", "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", "bytes", + "crypto-bigint 0.5.5", "form_urlencoded", "hex", "hmac", "http", "once_cell", + "p256", "percent-encoding", "regex", + "ring 0.17.6", "sha2", + "subtle", "time", "tracing", + "zeroize", ] [[package]] name = "aws-smithy-async" -version = "0.56.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee6d17d487c8b579423067718b3580c0908d0f01d7461813f94ec4323bad623" +checksum = "3e37ca17d25fe1e210b6d4bdf59b81caebfe99f986201a1228cb5061233b4b13" dependencies = [ "futures-util", "pin-project-lite", "tokio", - "tokio-stream", ] [[package]] name = "aws-smithy-checksums" -version = "0.56.0" +version = "0.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d1849fd5916904513fb0862543b36f8faab43c07984dbc476132b7da1aed056" +checksum = "c5a373ec01aede3dd066ec018c1bc4e8f5dd11b2c11c59c8eef1a5c68101f397" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -473,35 +493,11 @@ dependencies = [ "tracing", ] -[[package]] -name = "aws-smithy-client" -version = "0.56.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdbe0a3ad15283cc5f863a68cb6adc8e256e7c109c43c01bdd09be407219a1e9" -dependencies = [ - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-http-tower", - "aws-smithy-types", - "bytes", - "fastrand 2.0.0", - 
"http", - "http-body", - "hyper", - "hyper-rustls", - "lazy_static", - "pin-project-lite", - "rustls", - "tokio", - "tower", - "tracing", -] - [[package]] name = "aws-smithy-eventstream" -version = "0.56.0" +version = "0.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56afef1aa766f512b4970b4c3150b9bf2df8035939723830df4b30267e2d7cb" +checksum = "1c669e1e5fc0d79561bf7a122b118bd50c898758354fe2c53eb8f2d31507cbc3" dependencies = [ "aws-smithy-types", "bytes", @@ -510,57 +506,39 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.56.0" +version = "0.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34dc313472d727f5ef44fdda93e668ebfe17380c99dee512c403e3ca51863bb9" +checksum = "5b1de8aee22f67de467b2e3d0dd0fb30859dc53f579a63bd5381766b987db644" dependencies = [ "aws-smithy-eventstream", + "aws-smithy-runtime-api", "aws-smithy-types", "bytes", "bytes-utils", "futures-core", "http", "http-body", - "hyper", "once_cell", "percent-encoding", "pin-project-lite", "pin-utils", - "tokio", - "tokio-util", - "tracing", -] - -[[package]] -name = "aws-smithy-http-tower" -version = "0.56.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dd50fca5a4ea4ec3771689ee93bf06b32de02a80af01ed93a8f8a4ed90e8483" -dependencies = [ - "aws-smithy-http", - "aws-smithy-types", - "bytes", - "http", - "http-body", - "pin-project-lite", - "tower", "tracing", ] [[package]] name = "aws-smithy-json" -version = "0.56.0" +version = "0.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3591dd7c2fe01ab8025e4847a0a0f6d0c2b2269714688ffb856f9cf6c6d465cf" +checksum = "6a46dd338dc9576d6a6a5b5a19bd678dcad018ececee11cf28ecd7588bd1a55c" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.56.0" +version = "0.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbabb1145e65dd57ae72d91a2619d3f5fba40b68a5f40ba009c30571dfd60aff" +checksum = "feb5b8c7a86d4b6399169670723b7e6f21a39fc833a30f5c5a2f997608178129" dependencies = [ "aws-smithy-types", "urlencoding", @@ -568,74 +546,86 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "0.56.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3687fb838d4ad1c883b62eb59115bc9fb02c4f308aac49a7df89627067f6eb0d" +checksum = "273479291efc55e7b0bce985b139d86b6031adb8e50f65c1f712f20ba38f6388" dependencies = [ "aws-smithy-async", - "aws-smithy-client", "aws-smithy-http", "aws-smithy-runtime-api", "aws-smithy-types", "bytes", "fastrand 2.0.0", + "h2", "http", "http-body", + "hyper", + "hyper-rustls", "once_cell", "pin-project-lite", "pin-utils", + "rustls", "tokio", "tracing", ] [[package]] name = "aws-smithy-runtime-api" -version = "0.56.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cfbf1e5c2108b41f5ca607cde40dd5109fecc448f5d30c8e614b61f36dce704" +checksum = "c6cebff0d977b6b6feed2fd07db52aac58ba3ccaf26cdd49f1af4add5061bef9" dependencies = [ "aws-smithy-async", - "aws-smithy-http", "aws-smithy-types", "bytes", "http", + "pin-project-lite", "tokio", "tracing", + "zeroize", ] [[package]] name = "aws-smithy-types" -version = "0.56.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eed0a94eefd845a2a78677f1b72f02fa75802d38f7f59be675add140279aa8bf" +checksum = "d7f48b3f27ddb40ab19892a5abda331f403e3cb877965e4e51171447807104af" 
dependencies = [ "base64-simd", + "bytes", + "bytes-utils", + "futures-core", + "http", + "http-body", "itoa", "num-integer", + "pin-project-lite", + "pin-utils", "ryu", "serde", "time", + "tokio", + "tokio-util", ] [[package]] name = "aws-smithy-xml" -version = "0.56.0" +version = "0.60.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c88052c812f696143ad7ba729c63535209ff0e0f49e31a6d2b1205208ea6ea79" +checksum = "0ec40d74a67fd395bc3f6b4ccbdf1543672622d905ef3f979689aea5b730cb95" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "0.56.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bceb8cf724ad057ad7f327d0d256d7147b3eac777b39849a26189e003dc9782" +checksum = "8403fc56b1f3761e8efe45771ddc1165e47ec3417c68e68a4519b5cb030159ca" dependencies = [ "aws-credential-types", "aws-smithy-async", - "aws-smithy-client", - "aws-smithy-http", + "aws-smithy-runtime-api", "aws-smithy-types", "http", "rustc_version", @@ -651,7 +641,7 @@ dependencies = [ "async-trait", "axum-core", "base64 0.21.1", - "bitflags", + "bitflags 1.3.2", "bytes", "futures-util", "http", @@ -705,7 +695,7 @@ dependencies = [ "bytes", "dyn-clone", "futures", - "getrandom 0.2.9", + "getrandom 0.2.11", "http-types", "log", "paste", @@ -799,6 +789,12 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "base16ct" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" + [[package]] name = "base64" version = "0.13.1" @@ -848,7 +844,7 @@ version = "0.65.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" dependencies = [ - "bitflags", + "bitflags 1.3.2", "cexpr", "clang-sys", "lazy_static", @@ -871,6 +867,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" + [[package]] name = "block-buffer" version = "0.10.4" @@ -947,11 +949,12 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.79" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ "jobserver", + "libc", ] [[package]] @@ -1054,7 +1057,7 @@ checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990" dependencies = [ "anstream", "anstyle", - "bitflags", + "bitflags 1.3.2", "clap_lex", "strsim", ] @@ -1126,6 +1129,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-compression", + "bytes", "cfg-if", "chrono", "clap", @@ -1167,6 +1171,12 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "const-oid" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f" + [[package]] name = "const_fn" version = "0.4.9" @@ -1375,7 +1385,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67" dependencies = [ - "bitflags", + "bitflags 1.3.2", "crossterm_winapi", "libc", "mio", @@ -1394,6 +1404,28 @@ dependencies = [ "winapi", ] +[[package]] +name = "crypto-bigint" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "crypto-common" version = "0.1.6" @@ -1468,6 +1500,16 @@ dependencies = [ "uuid", ] +[[package]] +name = "der" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +dependencies = [ + "const-oid", + "zeroize", +] + [[package]] name = "der-parser" version = "8.2.0" @@ -1510,12 +1552,44 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23d2f3407d9a573d666de4b5bdf10569d73ca9478087346697dcbae6244bfbcd" +[[package]] +name = "ecdsa" +version = "0.14.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" +dependencies = [ + "der", + "elliptic-curve", + "rfc6979", + "signature", +] + [[package]] name = "either" version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +[[package]] +name = "elliptic-curve" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" +dependencies = [ + "base16ct", + "crypto-bigint 0.4.9", + "der", + "digest", + "ff", + "generic-array", + "group", + "pkcs8", + "rand_core 0.6.4", + "sec1", + "subtle", + "zeroize", +] + [[package]] name = "encoding_rs" version = "0.8.32" @@ -1638,6 +1712,16 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +[[package]] +name = "ff" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "filetime" version = "0.2.21" @@ -1848,9 +1932,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.9" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" +checksum = "fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f" dependencies = [ "cfg-if", "js-sys", @@ -1893,6 +1977,17 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "group" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" +dependencies = [ + "ff", + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "h2" 
version = "0.3.19" @@ -2235,7 +2330,7 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" dependencies = [ - "bitflags", + "bitflags 1.3.2", "inotify-sys", "libc", ] @@ -2246,7 +2341,7 @@ version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc" dependencies = [ - "bitflags", + "bitflags 1.3.2", "futures-core", "inotify-sys", "libc", @@ -2287,9 +2382,9 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.7.2" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "is-terminal" @@ -2344,7 +2439,7 @@ checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" dependencies = [ "base64 0.21.1", "pem 1.1.1", - "ring", + "ring 0.16.20", "serde", "serde_json", "simple_asn1", @@ -2366,7 +2461,7 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587" dependencies = [ - "bitflags", + "bitflags 1.3.2", "libc", ] @@ -2384,9 +2479,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.144" +version = "0.2.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" +checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" [[package]] name = "libloading" @@ -2580,7 +2675,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4" dependencies = [ "autocfg", - "bitflags", + "bitflags 1.3.2", "cfg-if", "libc", ] @@ -2591,7 +2686,7 @@ version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ - "bitflags", + "bitflags 1.3.2", "cfg-if", "libc", "memoffset 0.7.1", @@ -2615,7 +2710,7 @@ version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "729f63e1ca555a43fe3efa4f3efdf4801c479da85b432242a7b726f353c88486" dependencies = [ - "bitflags", + "bitflags 1.3.2", "crossbeam-channel", "filetime", "fsevent-sys", @@ -2693,7 +2788,7 @@ checksum = "c38841cdd844847e3e7c8d29cef9dcfed8877f8f56f9071f77843ecf3baf937f" dependencies = [ "base64 0.13.1", "chrono", - "getrandom 0.2.9", + "getrandom 0.2.11", "http", "rand 0.8.5", "serde", @@ -2736,11 +2831,11 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "openssl" -version = "0.10.55" +version = "0.10.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" +checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" dependencies = [ - "bitflags", + "bitflags 2.4.1", "cfg-if", "foreign-types", "libc", @@ -2768,9 +2863,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.90" +version = "0.9.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6" +checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" dependencies = [ "cc", "libc", @@ -2896,6 +2991,17 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "p256" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" +dependencies = [ + "ecdsa", + "elliptic-curve", + "sha2", +] + [[package]] name = "pagectl" version = "0.1.0" @@ -2970,6 +3076,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "serde_path_to_error", "serde_with", "signal-hook", "smallvec", @@ -3010,6 +3117,7 @@ dependencies = [ "serde_with", "strum", "strum_macros", + "thiserror", "utils", "workspace_hack", ] @@ -3188,6 +3296,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs8" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" +dependencies = [ + "der", + "spki", +] + [[package]] name = "pkg-config" version = "0.3.27" @@ -3394,7 +3512,7 @@ version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" dependencies = [ - "bitflags", + "bitflags 1.3.2", "byteorder", "hex", "lazy_static", @@ -3494,6 +3612,7 @@ dependencies = [ "humantime", "hyper", "hyper-tungstenite", + "ipnet", "itertools", "md5", "metrics", @@ -3504,6 +3623,7 @@ dependencies = [ "pbkdf2", "pin-project-lite", "postgres-native-tls", + "postgres-protocol", "postgres_backend", "pq_proto", "prometheus", @@ -3523,6 +3643,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "smol_str", "socket2 0.5.3", "sync_wrapper", "task-local-extensions", @@ -3623,7 +3744,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.9", + "getrandom 0.2.11", ] [[package]] @@ -3664,7 +3785,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976" dependencies = [ "pem 2.0.1", - "ring", + "ring 0.16.20", "time", "yasna", ] @@ -3675,7 +3796,7 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -3684,7 +3805,7 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -3735,8 +3856,7 @@ dependencies = [ "aws-credential-types", "aws-sdk-s3", "aws-smithy-async", - "aws-smithy-http", - "aws-types", + "aws-smithy-types", "azure_core", "azure_identity", "azure_storage", @@ -3834,7 +3954,7 @@ dependencies = [ "async-trait", "chrono", "futures", - "getrandom 0.2.9", + "getrandom 0.2.11", "http", "hyper", "parking_lot 0.11.2", @@ -3855,7 +3975,7 @@ checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8" 
dependencies = [ "anyhow", "async-trait", - "getrandom 0.2.9", + "getrandom 0.2.11", "matchit", "opentelemetry", "reqwest", @@ -3876,6 +3996,17 @@ dependencies = [ "rand 0.8.5", ] +[[package]] +name = "rfc6979" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" +dependencies = [ + "crypto-bigint 0.4.9", + "hmac", + "zeroize", +] + [[package]] name = "ring" version = "0.16.20" @@ -3886,11 +4017,25 @@ dependencies = [ "libc", "once_cell", "spin 0.5.2", - "untrusted", + "untrusted 0.7.1", "web-sys", "winapi", ] +[[package]] +name = "ring" +version = "0.17.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "684d5e6e18f669ccebf64a92236bb7db9a34f07be010e3627368182027180866" +dependencies = [ + "cc", + "getrandom 0.2.11", + "libc", + "spin 0.9.8", + "untrusted 0.9.0", + "windows-sys 0.48.0", +] + [[package]] name = "routerify" version = "3.0.0" @@ -3978,7 +4123,7 @@ version = "0.36.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab" dependencies = [ - "bitflags", + "bitflags 1.3.2", "errno", "io-lifetimes", "libc", @@ -3992,7 +4137,7 @@ version = "0.37.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035" dependencies = [ - "bitflags", + "bitflags 1.3.2", "errno", "io-lifetimes", "libc", @@ -4002,13 +4147,13 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.6" +version = "0.21.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d1feddffcfcc0b33f5c6ce9a29e341e4cd59c3f78e7ee45f4a40c038b1d6cbb" +checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9" dependencies = [ "log", - "ring", - "rustls-webpki 0.101.4", + "ring 0.17.6", + "rustls-webpki 0.101.7", "sct", ] @@ -4039,18 +4184,18 @@ version = "0.100.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab" dependencies = [ - "ring", - "untrusted", + "ring 0.16.20", + "untrusted 0.7.1", ] [[package]] name = "rustls-webpki" -version = "0.101.4" +version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d93931baf2d282fff8d3a532bbfd7653f734643161b87e3e01e59a04439bf0d" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "ring", - "untrusted", + "ring 0.17.6", + "untrusted 0.9.0", ] [[package]] @@ -4073,8 +4218,6 @@ dependencies = [ "async-stream", "aws-config", "aws-sdk-s3", - "aws-smithy-http", - "aws-types", "bincode", "bytes", "chrono", @@ -4196,8 +4339,8 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" dependencies = [ - "ring", - "untrusted", + "ring 0.16.20", + "untrusted 0.7.1", ] [[package]] @@ -4206,13 +4349,27 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32" +[[package]] +name = "sec1" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" +dependencies = [ + "base16ct", + "der", + "generic-array", + "pkcs8", + "subtle", + "zeroize", +] + 
[[package]] name = "security-framework" version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc758eb7bffce5b308734e9b0c1468893cae9ff70ebf13e7090be8dcbcc83a8" dependencies = [ - "bitflags", + "bitflags 1.3.2", "core-foundation", "core-foundation-sys", "libc", @@ -4322,7 +4479,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd" dependencies = [ "debugid", - "getrandom 0.2.9", + "getrandom 0.2.11", "hex", "serde", "serde_json", @@ -4510,6 +4667,16 @@ dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "1.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + [[package]] name = "simple_asn1" version = "0.6.2" @@ -4543,6 +4710,15 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +[[package]] +name = "smol_str" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c" +dependencies = [ + "serde", +] + [[package]] name = "socket2" version = "0.4.9" @@ -4578,6 +4754,16 @@ dependencies = [ "lock_api", ] +[[package]] +name = "spki" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" +dependencies = [ + "base64ct", + "der", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -4971,7 +5157,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd5831152cb0d3f79ef5523b357319ba154795d64c7078b2daa95a803b54057f" dependencies = [ "futures", - "ring", + "ring 0.16.20", "rustls", "tokio", "tokio-postgres", @@ -5427,6 +5613,12 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "ureq" version = "2.7.1" @@ -5528,7 +5720,7 @@ version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2" dependencies = [ - "getrandom 0.2.9", + "getrandom 0.2.11", "serde", ] @@ -5987,9 +6179,13 @@ dependencies = [ "aws-config", "aws-runtime", "aws-sigv4", + "aws-smithy-async", "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", "axum", "base64 0.21.1", + "base64ct", "bytes", "cc", "chrono", @@ -6007,6 +6203,7 @@ dependencies = [ "futures-sink", "futures-util", "hex", + "hmac", "hyper", "itertools", "libc", @@ -6021,12 +6218,13 @@ dependencies = [ "regex", "regex-syntax 0.7.2", "reqwest", - "ring", + "ring 0.16.20", "rustls", "scopeguard", "serde", "serde_json", "smallvec", + "subtle", "syn 1.0.109", "syn 2.0.28", "time", diff --git a/Cargo.toml b/Cargo.toml index 6df48ffc55f8..ba8b49c0e010 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,12 +45,11 @@ azure_storage_blobs = "0.16" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" -aws-config = { version = "0.56", default-features = false, features=["rustls"] } 
-aws-sdk-s3 = "0.29" -aws-smithy-http = "0.56" -aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] } -aws-credential-types = "0.56" -aws-types = "0.56" +aws-config = { version = "1.0", default-features = false, features=["rustls"] } +aws-sdk-s3 = "1.0" +aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] } +aws-smithy-types = "1.0" +aws-credential-types = "1.0" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" @@ -89,6 +88,7 @@ humantime-serde = "1.1.1" hyper = "0.14" hyper-tungstenite = "0.11" inotify = "0.10.2" +ipnet = "2.9.0" itertools = "0.10" jsonwebtoken = "8" libc = "0.2" @@ -126,11 +126,13 @@ sd-notify = "0.4.1" sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" +serde_path_to_error = "0.1" serde_with = "2.0" serde_assert = "0.5.0" sha2 = "0.10.2" signal-hook = "0.3" smallvec = "1.11" +smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" strum = "0.24" strum_macros = "0.24" diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 36c3f874d481..a3772265c060 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -387,18 +387,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH "/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export TIMESCALEDB_VERSION=2.10.1 \ - export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ - ;; \ - *) \ - echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \ - esac && \ - apt-get update && \ +RUN apt-get update && \ apt-get install -y cmake && \ - wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ - echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ + wget https://github.com/timescale/timescaledb/archive/refs/tags/2.13.0.tar.gz -O timescaledb.tar.gz && \ + echo "584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d timescaledb.tar.gz" | sha256sum --check && \ mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ @@ -714,6 +706,23 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz - cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control +######################################################################################### +# +# Layer "wal2json-build" +# Compile "wal2json" extension +# +######################################################################################### + +FROM build-deps AS wal2json-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ + echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ + mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -750,6 +759,7 @@ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ diff --git a/README.md b/README.md index 75fad605c59b..3e3123f5eeb4 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,9 @@ tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one +# create postgres compute node +> cargo neon endpoint create main + # start postgres compute node > cargo neon endpoint start main Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ... @@ -185,8 +188,11 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: (L) main [de200bd42b49cc1814412c7e592dd6e9] (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601] +# create postgres on that branch +> cargo neon endpoint create migration_check --branch-name migration_check + # start postgres on that branch -> cargo neon endpoint start migration_check --branch-name migration_check +> cargo neon endpoint start migration_check Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ... Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres' diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 6c93befaa319..47378f1910ab 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -38,3 +38,4 @@ toml_edit.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } zstd = "0.12.4" +bytes = "1.0" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 7f22bda13ea5..36e9ca0731ae 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -31,7 +31,7 @@ //! -C 'postgresql://cloud_admin@localhost/postgres' \ //! -S /var/db/postgres/specs/current.json \ //! -b /usr/local/bin/postgres \ -//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"} +//! -r http://pg-ext-s3-gateway //! ``` //! use std::collections::HashMap; @@ -51,7 +51,7 @@ use compute_api::responses::ComputeStatus; use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec}; use compute_tools::configurator::launch_configurator; -use compute_tools::extension_server::{get_pg_version, init_remote_storage}; +use compute_tools::extension_server::get_pg_version; use compute_tools::http::api::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; @@ -60,7 +60,7 @@ use compute_tools::spec::*; // this is an arbitrary build tag. 
Fine as a default / for testing purposes // in-case of not-set environment var -const BUILD_TAG_DEFAULT: &str = "5670669815"; +const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; @@ -74,10 +74,18 @@ fn main() -> Result<()> { let pgbin_default = String::from("postgres"); let pgbin = matches.get_one::("pgbin").unwrap_or(&pgbin_default); - let remote_ext_config = matches.get_one::("remote-ext-config"); - let ext_remote_storage = remote_ext_config.map(|x| { - init_remote_storage(x).expect("cannot initialize remote extension storage from config") - }); + let ext_remote_storage = matches + .get_one::("remote-ext-config") + // Compatibility hack: if the control plane specified any remote-ext-config + // use the default value for extension storage proxy gateway. + // Remove this once the control plane is updated to pass the gateway URL + .map(|conf| { + if conf.starts_with("http") { + conf.trim_end_matches('/') + } else { + "http://pg-ext-s3-gateway" + } + }); let http_port = *matches .get_one::("http-port") @@ -198,7 +206,7 @@ fn main() -> Result<()> { live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), - ext_remote_storage, + ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), ext_download_progress: RwLock::new(HashMap::new()), build_tag, }; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 5ace8ca1d24c..28770acdcdcf 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -25,7 +25,7 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeMode, ComputeSpec}; use utils::measured_stream::MeasuredReader; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; use crate::pg_helpers::*; @@ -59,8 +59,8 @@ pub struct ComputeNode { pub state: Mutex, /// `Condvar` to allow notifying waiters about state changes. pub state_changed: Condvar, - /// the S3 bucket that we search for extensions in - pub ext_remote_storage: Option, + /// the address of extension storage proxy gateway + pub ext_remote_storage: Option, // key: ext_archive_name, value: started download time, download_completed? pub ext_download_progress: RwLock, bool)>>, pub build_tag: String, @@ -728,7 +728,12 @@ impl ComputeNode { // Write new config let pgdata_path = Path::new(&self.pgdata); - config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?; + let postgresql_conf_path = pgdata_path.join("postgresql.conf"); + config::write_postgres_conf(&postgresql_conf_path, &spec, None)?; + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are reconfiguring: + // creating new extensions, roles, etc... 
+ config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; self.pg_reload_conf()?; let mut client = Client::connect(self.connstr.as_str(), NoTls)?; @@ -749,6 +754,10 @@ impl ComputeNode { // 'Close' connection drop(client); + // reset max_cluster_size in config back to original value and reload config + config::compute_ctl_temp_override_remove(pgdata_path)?; + self.pg_reload_conf()?; + let unknown_op = "unknown".to_string(); let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op); info!( @@ -809,7 +818,17 @@ impl ComputeNode { let config_time = Utc::now(); if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... + config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; + self.pg_reload_conf()?; + self.apply_config(&compute_state)?; + + config::compute_ctl_temp_override_remove(pgdata_path)?; + self.pg_reload_conf()?; } let startup_end_time = Utc::now(); @@ -957,12 +976,12 @@ LIMIT 100", real_ext_name: String, ext_path: RemotePath, ) -> Result { - let remote_storage = self - .ext_remote_storage - .as_ref() - .ok_or(DownloadError::BadInput(anyhow::anyhow!( - "Remote extensions storage is not configured", - )))?; + let ext_remote_storage = + self.ext_remote_storage + .as_ref() + .ok_or(DownloadError::BadInput(anyhow::anyhow!( + "Remote extensions storage is not configured", + )))?; let ext_archive_name = ext_path.object_name().expect("bad path"); @@ -1018,7 +1037,7 @@ LIMIT 100", let download_size = extension_server::download_extension( &real_ext_name, &ext_path, - remote_storage, + ext_remote_storage, &self.pgbin, ) .await diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index bc48a2110dfd..a7ef8cea9289 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -93,5 +93,25 @@ pub fn write_postgres_conf( writeln!(file, "neon.extension_server_port={}", port)?; } + // This is essential to keep this line at the end of the file, + // because it is intended to override any settings above. 
+ writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?; + + Ok(()) +} + +/// create file compute_ctl_temp_override.conf in pgdata_dir +/// add provided options to this file +pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> { + let path = pgdata_path.join("compute_ctl_temp_override.conf"); + let mut file = File::create(path)?; + write!(file, "{}", options)?; + Ok(()) +} + +/// remove file compute_ctl_temp_override.conf in pgdata_dir +pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> { + let path = pgdata_path.join("compute_ctl_temp_override.conf"); + std::fs::remove_file(path)?; Ok(()) } diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 9732d8adeace..2cec12119f79 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -71,18 +71,16 @@ More specifically, here is an example ext_index.json } } */ -use anyhow::Context; use anyhow::{self, Result}; +use anyhow::{bail, Context}; +use bytes::Bytes; use compute_api::spec::RemoteExtSpec; use regex::Regex; use remote_storage::*; -use serde_json; -use std::io::Read; -use std::num::NonZeroUsize; +use reqwest::StatusCode; use std::path::Path; use std::str; use tar::Archive; -use tokio::io::AsyncReadExt; use tracing::info; use tracing::log::warn; use zstd::stream::read::Decoder; @@ -138,23 +136,31 @@ fn parse_pg_version(human_version: &str) -> &str { pub async fn download_extension( ext_name: &str, ext_path: &RemotePath, - remote_storage: &GenericRemoteStorage, + ext_remote_storage: &str, pgbin: &str, ) -> Result { info!("Download extension {:?} from {:?}", ext_name, ext_path); - let mut download = remote_storage.download(ext_path).await?; - let mut download_buffer = Vec::new(); - download - .download_stream - .read_to_end(&mut download_buffer) - .await?; + + // TODO add retry logic + let download_buffer = + match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await { + Ok(buffer) => buffer, + Err(error_message) => { + return Err(anyhow::anyhow!( + "error downloading extension {:?}: {:?}", + ext_name, + error_message + )); + } + }; + let download_size = download_buffer.len() as u64; + info!("Download size {:?}", download_size); // it's unclear whether it is more performant to decompress into memory or not // TODO: decompressing into memory can be avoided - let mut decoder = Decoder::new(download_buffer.as_slice())?; - let mut decompress_buffer = Vec::new(); - decoder.read_to_end(&mut decompress_buffer)?; - let mut archive = Archive::new(decompress_buffer.as_slice()); + let decoder = Decoder::new(download_buffer.as_ref())?; + let mut archive = Archive::new(decoder); + let unzip_dest = pgbin .strip_suffix("/bin/postgres") .expect("bad pgbin") @@ -222,29 +228,32 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { } } -// This function initializes the necessary structs to use remote storage -pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result { - #[derive(Debug, serde::Deserialize)] - struct RemoteExtJson { - bucket: String, - region: String, - endpoint: Option, - prefix: Option, - } - let remote_ext_json = serde_json::from_str::(remote_ext_config)?; +// Do request to extension storage proxy, i.e. 
+// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst +// using HHTP GET +// and return the response body as bytes +// +async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { + let uri = format!("{}/{}", ext_remote_storage, ext_path); - let config = S3Config { - bucket_name: remote_ext_json.bucket, - bucket_region: remote_ext_json.region, - prefix_in_bucket: remote_ext_json.prefix, - endpoint: remote_ext_json.endpoint, - concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"), - max_keys_per_list_response: None, - }; - let config = RemoteStorageConfig { - storage: RemoteStorageKind::AwsS3(config), - }; - GenericRemoteStorage::from_config(&config) + info!("Download extension {:?} from uri {:?}", ext_path, uri); + + let resp = reqwest::get(uri).await?; + + match resp.status() { + StatusCode::OK => match resp.bytes().await { + Ok(resp) => { + info!("Download extension {:?} completed successfully", ext_path); + Ok(resp) + } + Err(e) => bail!("could not deserialize remote extension response: {}", e), + }, + StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"), + _ => bail!( + "unexpected remote extension response status code: {}", + resp.status() + ), + } } #[cfg(test)] diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 8851be1ec109..fa2c4cff28d9 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -123,7 +123,7 @@ async fn routes(req: Request, compute: &Arc) -> Response { info!("serving {:?} POST request", route); info!("req.uri {:?}", req.uri()); @@ -227,7 +227,7 @@ async fn handle_configure_request( let parsed_spec = match ParsedSpec::try_from(spec) { Ok(ps) => ps, - Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)), + Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)), }; // XXX: wrap state update under lock in code blocks. Otherwise, diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index dc26cc63eb8d..cedc6ece8f5e 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -156,17 +156,17 @@ paths: description: Error text or 'OK' if download succeeded. example: "OK" 400: - description: Request is invalid. - content: - application/json: - schema: - $ref: "#/components/schemas/GenericError" + description: Request is invalid. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" 500: - description: Extension download request failed. - content: - application/json: - schema: - $ref: "#/components/schemas/GenericError" + description: Extension download request failed. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" components: securitySchemes: diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 8c44c6d519f4..f98333d8bf0e 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -118,19 +118,6 @@ pub fn get_spec_from_control_plane( spec } -/// It takes cluster specification and does the following: -/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file. -/// - Update `pg_hba.conf` to allow external connections. -pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> { - // File `postgresql.conf` is no longer included into `basebackup`, so just - // always write all config into it creating new file. 
- config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?; - - update_pg_hba(pgdata_path)?; - - Ok(()) -} - /// Check `pg_hba.conf` and update if needed to allow external connections. pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { // XXX: consider making it a part of spec.json diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs index 16577e27d699..be7cff352ca9 100644 --- a/control_plane/src/bin/attachment_service.rs +++ b/control_plane/src/bin/attachment_service.rs @@ -9,6 +9,7 @@ use clap::Parser; use hex::FromHex; use hyper::StatusCode; use hyper::{Body, Request, Response}; +use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; use std::path::{Path, PathBuf}; use std::{collections::HashMap, sync::Arc}; @@ -173,7 +174,8 @@ async fn handle_re_attach(mut req: Request) -> Result, ApiE if state.pageserver == Some(reattach_req.node_id) { state.generation += 1; response.tenants.push(ReAttachResponseTenant { - id: *t, + // TODO(sharding): make this shard-aware + id: TenantShardId::unsharded(*t), gen: state.generation, }); } @@ -196,7 +198,8 @@ async fn handle_validate(mut req: Request) -> Result, ApiEr }; for req_tenant in validate_req.tenants { - if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { + // TODO(sharding): make this shard-aware + if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) { let valid = tenant_state.generation == req_tenant.gen; response.tenants.push(ValidateResponseTenant { id: req_tenant.id, diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 384c4ee56d4b..8d53a6a65867 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -415,6 +415,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an None, None, Some(pg_version), + None, )?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; @@ -495,6 +496,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - None, None, Some(pg_version), + None, )?; let new_timeline_id = timeline_info.timeline_id; @@ -582,6 +584,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - start_lsn, Some(ancestor_timeline_id), None, + None, )?; let new_timeline_id = timeline_info.timeline_id; @@ -608,11 +611,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( }; let mut cplane = ComputeControlPlane::load(env.clone())?; - // All subcommands take an optional --tenant-id option - let tenant_id = get_tenant_id(sub_args, env)?; - match sub_name { "list" => { + let tenant_id = get_tenant_id(sub_args, env)?; let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| { eprintln!("Failed to load timeline info: {}", e); HashMap::new() @@ -672,6 +673,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( println!("{table}"); } "create" => { + let tenant_id = get_tenant_id(sub_args, env)?; let branch_name = sub_args .get_one::("branch-name") .map(|s| s.as_str()) @@ -716,6 +718,18 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; + match (mode, hot_standby) { + (ComputeMode::Static(_), true) => { + bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") + } + 
(ComputeMode::Primary, true) => { + bail!("Cannot start a node as a hot standby replica, it is already configured as primary node") + } + _ => {} + } + + cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + cplane.new_endpoint( &endpoint_id, tenant_id, @@ -728,8 +742,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( )?; } "start" => { - let pg_port: Option = sub_args.get_one::("pg-port").copied(); - let http_port: Option = sub_args.get_one::("http-port").copied(); let endpoint_id = sub_args .get_one::("endpoint_id") .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?; @@ -758,80 +770,28 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( env.safekeepers.iter().map(|sk| sk.id).collect() }; - let endpoint = cplane.endpoints.get(endpoint_id.as_str()); + let endpoint = cplane + .endpoints + .get(endpoint_id.as_str()) + .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?; + + cplane.check_conflicting_endpoints( + endpoint.mode, + endpoint.tenant_id, + endpoint.timeline_id, + )?; let ps_conf = env.get_pageserver_conf(pageserver_id)?; let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) { - let claims = Claims::new(Some(tenant_id), Scope::Tenant); + let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant); Some(env.generate_auth_token(&claims)?) } else { None }; - let hot_standby = sub_args - .get_one::("hot-standby") - .copied() - .unwrap_or(false); - - if let Some(endpoint) = endpoint { - match (&endpoint.mode, hot_standby) { - (ComputeMode::Static(_), true) => { - bail!("Cannot start a node in hot standby mode when it is already configured as a static replica") - } - (ComputeMode::Primary, true) => { - bail!("Cannot start a node as a hot standby replica, it is already configured as primary node") - } - _ => {} - } - println!("Starting existing endpoint {endpoint_id}..."); - endpoint.start(&auth_token, safekeepers, remote_ext_config)?; - } else { - let branch_name = sub_args - .get_one::("branch-name") - .map(|s| s.as_str()) - .unwrap_or(DEFAULT_BRANCH_NAME); - let timeline_id = env - .get_branch_timeline_id(branch_name, tenant_id) - .ok_or_else(|| { - anyhow!("Found no timeline id for branch name '{branch_name}'") - })?; - let lsn = sub_args - .get_one::("lsn") - .map(|lsn_str| Lsn::from_str(lsn_str)) - .transpose() - .context("Failed to parse Lsn from the request")?; - let pg_version = sub_args - .get_one::("pg-version") - .copied() - .context("Failed to `pg-version` from the argument string")?; - - let mode = match (lsn, hot_standby) { - (Some(lsn), false) => ComputeMode::Static(lsn), - (None, true) => ComputeMode::Replica, - (None, false) => ComputeMode::Primary, - (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), - }; - - // when used with custom port this results in non obvious behaviour - // port is remembered from first start command, i e - // start --port X - // stop - // start <-- will also use port X even without explicit port argument - println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ..."); - - let ep = cplane.new_endpoint( - endpoint_id, - tenant_id, - timeline_id, - pg_port, - http_port, - pg_version, - mode, - pageserver_id, - )?; - ep.start(&auth_token, safekeepers, remote_ext_config)?; - } + println!("Starting existing endpoint {endpoint_id}..."); + endpoint.start(&auth_token, safekeepers, remote_ext_config)?; } "reconfigure" => { let endpoint_id = sub_args @@ 
-1252,7 +1212,7 @@ fn cli() -> Command { let remote_ext_config_args = Arg::new("remote-ext-config") .long("remote-ext-config") .num_args(1) - .help("Configure the S3 bucket that we search for extensions in.") + .help("Configure the remote extensions storage proxy gateway to request for extensions.") .required(false); let lsn_arg = Arg::new("lsn") @@ -1437,15 +1397,7 @@ fn cli() -> Command { .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") .arg(endpoint_id_arg.clone()) - .arg(tenant_id_arg.clone()) - .arg(branch_name_arg.clone()) - .arg(timeline_id_arg.clone()) - .arg(lsn_arg) - .arg(pg_port_arg) - .arg(http_port_arg) .arg(endpoint_pageserver_id_arg.clone()) - .arg(pg_version_arg) - .arg(hot_standby_arg) .arg(safekeepers_arg) .arg(remote_ext_config_args) ) @@ -1458,7 +1410,6 @@ fn cli() -> Command { .subcommand( Command::new("stop") .arg(endpoint_id_arg) - .arg(tenant_id_arg.clone()) .arg( Arg::new("destroy") .help("Also delete data directory (now optional, should be default in future)") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 4443fd870432..12b12507647b 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -45,6 +45,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; +use compute_api::spec::RemoteExtSpec; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId, TimelineId}; @@ -124,6 +125,7 @@ impl ComputeControlPlane { let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); let pageserver = PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?); + let ep = Arc::new(Endpoint { endpoint_id: endpoint_id.to_owned(), pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port), @@ -168,6 +170,30 @@ impl ComputeControlPlane { Ok(ep) } + + pub fn check_conflicting_endpoints( + &self, + mode: ComputeMode, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result<()> { + if matches!(mode, ComputeMode::Primary) { + // this check is not complete, as you could have a concurrent attempt at + // creating another primary, both reading the state before checking it here, + // but it's better than nothing. + let mut duplicates = self.endpoints.iter().filter(|(_k, v)| { + v.tenant_id == tenant_id + && v.timeline_id == timeline_id + && v.mode == mode + && v.status() != "stopped" + }); + + if let Some((key, _)) = duplicates.next() { + bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. 
please don't do this, it is not supported."); + } + } + Ok(()) + } } /////////////////////////////////////////////////////////////////////////////// @@ -476,6 +502,18 @@ impl Endpoint { } } + // check for file remote_extensions_spec.json + // if it is present, read it and pass to compute_ctl + let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json"); + let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path); + let remote_extensions: Option; + + if let Ok(spec_file) = remote_extensions_spec { + remote_extensions = serde_json::from_reader(spec_file).ok(); + } else { + remote_extensions = None; + }; + // Create spec file let spec = ComputeSpec { skip_pg_catalog_updates: self.skip_pg_catalog_updates, @@ -497,7 +535,7 @@ impl Endpoint { pageserver_connstring: Some(pageserver_connstring), safekeeper_connstrings, storage_auth_token: auth_token.clone(), - remote_extensions: None, + remote_extensions, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 237df485432c..96a41874fdf0 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -11,6 +11,7 @@ use std::io::{BufReader, Write}; use std::num::NonZeroU64; use std::path::PathBuf; use std::process::{Child, Command}; +use std::time::Duration; use std::{io, result}; use anyhow::{bail, Context}; @@ -522,19 +523,24 @@ impl PageServerNode { &self, tenant_id: TenantId, config: LocationConfig, + flush_ms: Option, ) -> anyhow::Result<()> { let req_body = TenantLocationConfigRequest { tenant_id, config }; - self.http_request( - Method::PUT, - format!( - "{}/tenant/{}/location_config", - self.http_base_url, tenant_id - ), - )? - .json(&req_body) - .send()? - .error_from_body()?; + let path = format!( + "{}/tenant/{}/location_config", + self.http_base_url, tenant_id + ); + let path = if let Some(flush_ms) = flush_ms { + format!("{}?flush_ms={}", path, flush_ms.as_millis()) + } else { + path + }; + + self.http_request(Method::PUT, path)? + .json(&req_body) + .send()? + .error_from_body()?; Ok(()) } @@ -559,6 +565,7 @@ impl PageServerNode { ancestor_start_lsn: Option, ancestor_timeline_id: Option, pg_version: Option, + existing_initdb_timeline_id: Option, ) -> anyhow::Result { // If timeline ID was not specified, generate one let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate()); @@ -572,6 +579,7 @@ impl PageServerNode { ancestor_start_lsn, ancestor_timeline_id, pg_version, + existing_initdb_timeline_id, }) .send()? .error_from_body()? 
diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs index d28d1f9fe846..c0c44e279f32 100644 --- a/control_plane/src/tenant_migration.rs +++ b/control_plane/src/tenant_migration.rs @@ -14,7 +14,6 @@ use pageserver_api::models::{ use std::collections::HashMap; use std::time::Duration; use utils::{ - generation::Generation, id::{TenantId, TimelineId}, lsn::Lsn, }; @@ -93,6 +92,22 @@ pub fn migrate_tenant( // Get a new generation let attachment_service = AttachmentService::from_env(env); + fn build_location_config( + mode: LocationConfigMode, + generation: Option, + secondary_conf: Option, + ) -> LocationConfig { + LocationConfig { + mode, + generation, + secondary_conf, + tenant_conf: TenantConfig::default(), + shard_number: 0, + shard_count: 0, + shard_stripe_size: 0, + } + } + let previous = attachment_service.inspect(tenant_id)?; let mut baseline_lsns = None; if let Some((generation, origin_ps_id)) = &previous { @@ -101,40 +116,26 @@ pub fn migrate_tenant( if origin_ps_id == &dest_ps.conf.id { println!("🔁 Already attached to {origin_ps_id}, freshening..."); let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?; - let dest_conf = LocationConfig { - mode: LocationConfigMode::AttachedSingle, - generation: gen.map(Generation::new), - secondary_conf: None, - tenant_conf: TenantConfig::default(), - }; - dest_ps.location_config(tenant_id, dest_conf)?; + let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None); + dest_ps.location_config(tenant_id, dest_conf, None)?; println!("✅ Migration complete"); return Ok(()); } println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode"); - let stale_conf = LocationConfig { - mode: LocationConfigMode::AttachedStale, - generation: Some(Generation::new(*generation)), - secondary_conf: None, - tenant_conf: TenantConfig::default(), - }; - origin_ps.location_config(tenant_id, stale_conf)?; + let stale_conf = + build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None); + origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?; baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?); } let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?; - let dest_conf = LocationConfig { - mode: LocationConfigMode::AttachedMulti, - generation: gen.map(Generation::new), - secondary_conf: None, - tenant_conf: TenantConfig::default(), - }; + let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None); println!("🔁 Attaching to pageserver {}", dest_ps.conf.id); - dest_ps.location_config(tenant_id, dest_conf)?; + dest_ps.location_config(tenant_id, dest_conf, None)?; if let Some(baseline) = baseline_lsns { println!("🕑 Waiting for LSN to catch up..."); @@ -170,31 +171,25 @@ pub fn migrate_tenant( } // Downgrade to a secondary location - let secondary_conf = LocationConfig { - mode: LocationConfigMode::Secondary, - generation: None, - secondary_conf: Some(LocationConfigSecondary { warm: true }), - tenant_conf: TenantConfig::default(), - }; + let secondary_conf = build_location_config( + LocationConfigMode::Secondary, + None, + Some(LocationConfigSecondary { warm: true }), + ); println!( "💤 Switching to secondary mode on pageserver {}", other_ps.conf.id ); - other_ps.location_config(tenant_id, secondary_conf)?; + other_ps.location_config(tenant_id, secondary_conf, None)?; } println!( "🔁 Switching to AttachedSingle mode on pageserver {}", dest_ps.conf.id ); - let dest_conf = LocationConfig { - mode: 
LocationConfigMode::AttachedSingle, - generation: gen.map(Generation::new), - secondary_conf: None, - tenant_conf: TenantConfig::default(), - }; - dest_ps.location_config(tenant_id, dest_conf)?; + let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None); + dest_ps.location_config(tenant_id, dest_conf, None)?; println!("✅ Migration complete"); diff --git a/docs/rfcs/029-pageserver-wal-disaster-recovery.md b/docs/rfcs/029-pageserver-wal-disaster-recovery.md new file mode 100644 index 000000000000..15ebd72bfe77 --- /dev/null +++ b/docs/rfcs/029-pageserver-wal-disaster-recovery.md @@ -0,0 +1,205 @@ +# Pageserver WAL disaster recovery + +Created on: 2023-09-08 +Author: Arpad Müller + +## Summary + +Enable the pageserver to recover from data corruption events by implementing +a feature to re-apply historic WAL records in parallel to the already occurring +WAL replay. + +The feature is outside of the user-visible backup and history story, and only +serves as a second-level backup for the case that there is a bug in the +pageservers that corrupted the served pages. + +The RFC proposes the addition of two new features: +* recover a broken branch from WAL (downtime is allowed) +* a test recovery system to recover random branches to make sure recovery works + +## Motivation + +The historic WAL is currently stored in S3 even after it has been replayed by +the pageserver and thus been integrated into the pageserver's storage system. +This is done to defend against data corruption failures inside the pageservers. + +However, application of this WAL in the disaster recovery setting is currently +very manual and we want to automate this to make it easier. + +### Use cases + +There are various use cases for this feature, such as: + +* The main motivation is replaying in the event of pageservers corrupting + data. +* We might want to, beyond the user-visible history features, through our + support channels and upon customer request, in select instances, recover + historic versions beyond the range of history that we officially support. +* Running the recovery process in the background for random tenant timelines + to figure out if there was a corruption of data (we would compare with what + the pageserver stores for the "official" timeline). +* Using the WAL to arrive at historic pages we can then back up to S3 so that + WAL itself can be discarded, or at least not used for future replays. + Again, this sounds a lot like what the pageserver is already doing, but the + point is to provide a fallback to the service provided by the pageserver. + +## Design + +### Design constraints + +The main design constraint is that the feature needs to be *simple* enough that +the number of bugs is as low, and reliability as high as possible: the main +goal of this endeavour is to achieve higher correctness than the pageserver. + +For the background process, we cannot afford a downtime of the timeline that is +being cloned, as we don't want to restrict ourselves to offline tenants only. +In the scenario where we want to recover from disasters or roll back to a +historic LSN through support staff, downtimes are more affordable, and +inevitable if the original had been subject to the corruption. Ideally, the +two code paths would share code, so the solution should be designed to not +require downtimes. + +### API endpoint changes + +This RFC proposes two API endpoint changes in the safekeeper and the +pageserver.
+ +Remember, the pageserver timeline API creation endpoint is to this URL: + +``` +/v1/tenant/{tenant_id}/timeline/ +``` + +Where `{tenant_id}` is the ID of the tenant the timeline is created for, +and is specified as part of the URL. The timeline ID is passed via the POST +request body as the only required parameter `new_timeline_id`. + +This proposal adds one optional parameter called +`existing_initdb_timeline_id` to the request's JSON body. If the parameter +is not specified, the behaviour stays as it is today, so the pageserver runs +initdb. +If the parameter is specified, it is expected to point to a timeline ID. +In fact, that ID might match `new_timeline_id`; what's important is that +S3 storage contains a matching initdb under the URL matching the given +tenant and timeline. + +Having both `ancestor_timeline_id` and `existing_initdb_timeline_id` +specified is illegal and will yield an HTTP error. This feature is +only meant for the "main" branch that doesn't have any ancestors +of its own, as initdb is only relevant there. + +For the safekeeper, we propose the addition of the following copy endpoint: + +``` +/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy +``` +It is meant for POST requests with a JSON body and takes the two URL parameters +`tenant_id` and `source_timeline_id`. The JSON request body contains +the two required parameters `target_timeline_id` and `until_lsn`. + +Once invoked, the copy endpoint starts a copy process of the WAL from +the source ID to the target ID. The LSN is updated according to the +progress of the API call. + +### Higher level features + +We want the API changes to support the following higher level features: + +* recovery-after-corruption DR of the main timeline of a tenant. This + feature allows for downtime. +* test DR of the main timeline into a special copy timeline. This feature + is meant to run against selected production tenants in the background, + without the user noticing, so it does not allow for downtime. + +The recovery-after-corruption DR only needs the pageserver changes. +It works as follows: + +* delete the timeline from the pageservers via timeline deletion API +* re-create it via timeline creation API (same ID as before) and set + `existing_initdb_timeline_id` to the same timeline ID + +The test DR also requires the copy primitive and works as follows: + +* copy the WAL of the timeline to a new place +* create a new timeline for the tenant + +## Non Goals + +At the danger of being repetitive, the main goal of this feature is to be a +backup method, so reliability is very important. This implies that other +aspects like performance or space reduction are less important. + +### Corrupt WAL + +The process suggested by this RFC assumes that the WAL is free of corruption. +In some instances, corruption can make it into WAL, for example when +higher level components like postgres or the application first read corrupt +data, and then execute a write with data derived from that earlier read. That +written data might then contain the corruption. + +Common use cases can hit this quite easily. For example, an application reads +some counter, increments it, and then writes the new counter value to the +database. +On a lower level, the compute might put FPIs (Full Page Images) into the WAL, +which have corrupt data for rows unrelated to the write operation at hand.
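To make the proposed endpoints and the two recovery flows above more concrete, here is a rough client-side sketch. Only the URL paths and JSON field names come from the sections above; the `reqwest`-based helper functions, the base-URL parameters, and the string-typed IDs and LSN are assumptions made for illustration, not part of the RFC.

```rust
// Hypothetical client-side sketch of the two proposed calls.
// Assumes the `reqwest` crate (blocking + json features) and `serde_json`;
// base URLs, IDs and the LSN value are placeholders.
use serde_json::json;

/// Recovery-after-corruption flow: re-create a previously deleted main
/// timeline from its uploaded initdb snapshot plus the WAL kept in S3.
fn recreate_timeline_from_wal(
    pageserver_http: &str,
    tenant_id: &str,
    timeline_id: &str,
) -> Result<(), reqwest::Error> {
    let client = reqwest::blocking::Client::new();
    client
        .post(format!("{pageserver_http}/v1/tenant/{tenant_id}/timeline/"))
        .json(&json!({
            "new_timeline_id": timeline_id,
            // Reuse the initdb that was uploaded for this very timeline.
            "existing_initdb_timeline_id": timeline_id,
        }))
        .send()?
        .error_for_status()?;
    Ok(())
}

/// Test-DR flow: ask the safekeeper to copy a timeline's WAL into a shadow
/// timeline, up to (and not beyond) `until_lsn`.
fn copy_wal(
    safekeeper_http: &str,
    tenant_id: &str,
    source_timeline_id: &str,
    target_timeline_id: &str,
    until_lsn: &str,
) -> Result<(), reqwest::Error> {
    let client = reqwest::blocking::Client::new();
    client
        .post(format!(
            "{safekeeper_http}/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy"
        ))
        .json(&json!({
            "target_timeline_id": target_timeline_id,
            "until_lsn": until_lsn,
        }))
        .send()?
        .error_for_status()?;
    Ok(())
}
```

In the test-DR flow, the copy call would be followed by a timeline-creation call like the one above, issued for the shadow timeline ID, so that the pageserver replays the copied WAL in the background.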
+ +Separating corrupt writes from non-corrupt ones is a hard problem in general, +and if the application was involved in making the corrupt write, a recovery +would also involve the application. Therefore, corruption that has made it into +the WAL is outside of the scope of this feature. However, the WAL replay can be +issued up to right before the point in time where the corruption occurred. Then the +data loss is isolated to post-corruption writes only. + +## Impacted components (e.g. pageserver, safekeeper, console, etc.) + +Most changes would happen to the pageservers. +For the higher level features, other components like the console might also +be involved. + +We need to make sure that the shadow timelines are not subject to the usual +limits and billing we apply to existing timelines. + +## Proposed implementation + +The first problem to keep in mind is the reproducibility of `initdb`. +So an initial step would be to upload `initdb` snapshots to S3. + +After that, we'd have the endpoint spawn a background process which +performs the replay of the WAL to that new timeline. This process should +follow the existing workflows as closely as possible, just using the +WAL records of a different timeline. + +The timeline created will be in a special state that solely looks for WAL +entries of the timeline it is trying to copy. Once the target LSN is reached, +it turns into a normal timeline that also accepts writes to its own +timeline ID. + +### Scalability + +For now we want to run this entire process on a single node, and as +it is by nature linear, it's hard to parallelize. However, for the +verification workloads, we can easily start the WAL replay in parallel +for different points in time. This is especially valuable for tenants +with large WAL records. + +Compare this with the tricks to make addition circuits execute with +lower latency by making them perform the addition for both possible +values of the carry bit, and then, in a second step, taking the +result for the carry bit that was actually obtained. + +The other scalability dimension to consider is the WAL length, which +is a growing concern as tenants accumulate changes. There are +possible approaches to this, including creating snapshots of the +page files and uploading them to S3, but if we do this for every single +branch, we lose the cheap branching property. + +### Implementation by component + +The proposed changes for the various components of the neon architecture +are written up in this Notion page: + +https://www.notion.so/neondatabase/Pageserver-disaster-recovery-one-pager-4ecfb5df16ce4f6bbfc3817ed1a6cbb2 + +### Unresolved questions + +None known (outside of the ones mentioned above). diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index df9796b03901..4d08d78e8741 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -18,6 +18,7 @@ enum-map.workspace = true strum.workspace = true strum_macros.workspace = true hex.workspace = true +thiserror.workspace = true workspace_hack.workspace = true diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/control_api.rs index 8232e81b9887..0acc3a7bb0ae 100644 --- a/libs/pageserver_api/src/control_api.rs +++ b/libs/pageserver_api/src/control_api.rs @@ -4,7 +4,9 @@ //!
See docs/rfcs/025-generation-numbers.md use serde::{Deserialize, Serialize}; -use utils::id::{NodeId, TenantId}; +use utils::id::NodeId; + +use crate::shard::TenantShardId; #[derive(Serialize, Deserialize)] pub struct ReAttachRequest { @@ -13,7 +15,7 @@ pub struct ReAttachRequest { #[derive(Serialize, Deserialize)] pub struct ReAttachResponseTenant { - pub id: TenantId, + pub id: TenantShardId, pub gen: u32, } @@ -24,7 +26,7 @@ pub struct ReAttachResponse { #[derive(Serialize, Deserialize)] pub struct ValidateRequestTenant { - pub id: TenantId, + pub id: TenantShardId, pub gen: u32, } @@ -40,6 +42,6 @@ pub struct ValidateResponse { #[derive(Serialize, Deserialize)] pub struct ValidateResponseTenant { - pub id: TenantId, + pub id: TenantShardId, pub valid: bool, } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 71e32e479f20..2234a06501a9 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -10,7 +10,6 @@ use serde_with::serde_as; use strum_macros; use utils::{ completion, - generation::Generation, history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, @@ -180,6 +179,8 @@ pub struct TimelineCreateRequest { #[serde(default)] pub ancestor_timeline_id: Option, #[serde(default)] + pub existing_initdb_timeline_id: Option, + #[serde(default)] pub ancestor_start_lsn: Option, pub pg_version: Option, } @@ -262,10 +263,19 @@ pub struct LocationConfig { pub mode: LocationConfigMode, /// If attaching, in what generation? #[serde(default)] - pub generation: Option, + pub generation: Option, #[serde(default)] pub secondary_conf: Option, + // Shard parameters: if shard_count is nonzero, then other shard_* fields + // must be set accurately. + #[serde(default)] + pub shard_number: u8, + #[serde(default)] + pub shard_count: u8, + #[serde(default)] + pub shard_stripe_size: u32, + // If requesting mode `Secondary`, configuration for that. // Custom storage configuration for the tenant, if any pub tenant_conf: TenantConfig, @@ -306,25 +316,7 @@ impl std::ops::Deref for TenantConfigRequest { impl TenantConfigRequest { pub fn new(tenant_id: TenantId) -> TenantConfigRequest { - let config = TenantConfig { - checkpoint_distance: None, - checkpoint_timeout: None, - compaction_target_size: None, - compaction_period: None, - compaction_threshold: None, - gc_horizon: None, - gc_period: None, - image_creation_threshold: None, - pitr_interval: None, - walreceiver_connect_timeout: None, - lagging_wal_timeout: None, - max_lsn_wal_lag: None, - trace_read_requests: None, - eviction_policy: None, - min_resident_size_override: None, - evictions_low_residence_duration_metric_threshold: None, - gc_feedback: None, - }; + let config = TenantConfig::default(); TenantConfigRequest { tenant_id, config } } } @@ -392,7 +384,9 @@ pub struct TimelineInfo { /// The LSN that we are advertizing to safekeepers pub remote_consistent_lsn_visible: Lsn, - pub current_logical_size: Option, // is None when timeline is Unloaded + pub current_logical_size: u64, + pub current_logical_size_is_accurate: bool, + /// Sum of the size of all layer files. /// If a layer is present in both local FS and S3, it counts only once. 
pub current_physical_size: Option, // is None when timeline is Unloaded diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 32a834a26a5e..3510b4dbcadd 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -2,12 +2,13 @@ use std::{ops::RangeInclusive, str::FromStr}; use hex::FromHex; use serde::{Deserialize, Serialize}; +use thiserror; use utils::id::TenantId; -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)] +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)] +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardCount(pub u8); impl ShardCount { @@ -38,7 +39,7 @@ impl ShardNumber { /// Note that the binary encoding is _not_ backward compatible, because /// at the time sharding is introduced, there are no existing binary structures /// containing TenantId that we need to handle. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)] +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct TenantShardId { pub tenant_id: TenantId, pub shard_number: ShardNumber, @@ -139,6 +140,89 @@ impl From<[u8; 18]> for TenantShardId { } } +/// For use within the context of a particular tenant, when we need to know which +/// shard we're dealing with, but do not need to know the full ShardIdentity (because +/// we won't be doing any page->shard mapping), and do not need to know the fully qualified +/// TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +impl ShardIndex { + pub fn new(number: ShardNumber, count: ShardCount) -> Self { + Self { + shard_number: number, + shard_count: count, + } + } + pub fn unsharded() -> Self { + Self { + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } + + /// For use in constructing remote storage paths: concatenate this with a TenantId + /// to get a fully qualified TenantShardId. + /// + /// Backward compat: this function returns an empty string if Self::is_unsharded, such + /// that the legacy pre-sharding remote key format is preserved. 
+ pub fn get_suffix(&self) -> String { + if self.is_unsharded() { + "".to_string() + } else { + format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } + } +} + +impl std::fmt::Display for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } +} + +impl std::fmt::Debug for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for ShardIndex { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 1 byte shard number, 1 byte shard count + if s.len() == 4 { + let bytes = s.as_bytes(); + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(bytes, &mut shard_parts)?; + Ok(Self { + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 2]> for ShardIndex { + fn from(b: [u8; 2]) -> Self { + Self { + shard_number: ShardNumber(b[0]), + shard_count: ShardCount(b[1]), + } + } +} + impl Serialize for TenantShardId { fn serialize(&self, serializer: S) -> Result where @@ -209,6 +293,151 @@ impl<'de> Deserialize<'de> for TenantShardId { } } +/// Stripe size in number of pages +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardStripeSize(pub u32); + +/// Layout version: for future upgrades where we might change how the key->shard mapping works +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardLayout(u8); + +const LAYOUT_V1: ShardLayout = ShardLayout(1); + +/// Default stripe size in pages: 256MiB divided by 8kiB page size. +const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); + +/// The ShardIdentity contains the information needed for one member of map +/// to resolve a key to a shard, and then check whether that shard is ==self. +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardIdentity { + pub layout: ShardLayout, + pub number: ShardNumber, + pub count: ShardCount, + pub stripe_size: ShardStripeSize, +} + +#[derive(thiserror::Error, Debug, PartialEq, Eq)] +pub enum ShardConfigError { + #[error("Invalid shard count")] + InvalidCount, + #[error("Invalid shard number")] + InvalidNumber, + #[error("Invalid stripe size")] + InvalidStripeSize, +} + +impl ShardIdentity { + /// An identity with number=0 count=0 is a "none" identity, which represents legacy + /// tenants. Modern single-shard tenants should not use this: they should + /// have number=0 count=1. + pub fn unsharded() -> Self { + Self { + number: ShardNumber(0), + count: ShardCount(0), + layout: LAYOUT_V1, + stripe_size: DEFAULT_STRIPE_SIZE, + } + } + + pub fn is_unsharded(&self) -> bool { + self.number == ShardNumber(0) && self.count == ShardCount(0) + } + + /// Count must be nonzero, and number must be < count. To construct + /// the legacy case (count==0), use Self::unsharded instead. 
+ pub fn new( + number: ShardNumber, + count: ShardCount, + stripe_size: ShardStripeSize, + ) -> Result { + if count.0 == 0 { + Err(ShardConfigError::InvalidCount) + } else if number.0 > count.0 - 1 { + Err(ShardConfigError::InvalidNumber) + } else if stripe_size.0 == 0 { + Err(ShardConfigError::InvalidStripeSize) + } else { + Ok(Self { + number, + count, + layout: LAYOUT_V1, + stripe_size, + }) + } + } +} + +impl Serialize for ShardIndex { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Binary encoding is not used in index_part.json, but is included in anticipation of + // switching various structures (e.g. inter-process communication, remote metadata) to more + // compact binary encodings in future. + let mut packed: [u8; 2] = [0; 2]; + packed[0] = self.shard_number.0; + packed[1] = self.shard_count.0; + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for ShardIndex { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = ShardIndex; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 2])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 2] = Deserialize::deserialize(s)?; + Ok(ShardIndex::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + ShardIndex::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 2, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} + #[cfg(test)] mod tests { use std::str::FromStr; @@ -318,4 +547,66 @@ mod tests { Ok(()) } + + #[test] + fn shard_identity_validation() -> Result<(), ShardConfigError> { + // Happy cases + ShardIdentity::new(ShardNumber(0), ShardCount(1), DEFAULT_STRIPE_SIZE)?; + ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(1))?; + ShardIdentity::new(ShardNumber(254), ShardCount(255), ShardStripeSize(1))?; + + assert_eq!( + ShardIdentity::new(ShardNumber(0), ShardCount(0), DEFAULT_STRIPE_SIZE), + Err(ShardConfigError::InvalidCount) + ); + assert_eq!( + ShardIdentity::new(ShardNumber(10), ShardCount(10), DEFAULT_STRIPE_SIZE), + Err(ShardConfigError::InvalidNumber) + ); + assert_eq!( + ShardIdentity::new(ShardNumber(11), ShardCount(10), DEFAULT_STRIPE_SIZE), + Err(ShardConfigError::InvalidNumber) + ); + assert_eq!( + ShardIdentity::new(ShardNumber(255), ShardCount(255), DEFAULT_STRIPE_SIZE), + Err(ShardConfigError::InvalidNumber) + ); + assert_eq!( + ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(0)), + Err(ShardConfigError::InvalidStripeSize) + ); + + Ok(()) + } + + #[test] + fn shard_index_human_encoding() -> Result<(), hex::FromHexError> { + let example = ShardIndex { + shard_number: ShardNumber(13), + shard_count: ShardCount(17), + }; + let expected: String = "0d11".to_string(); + let encoded = format!("{example}"); + assert_eq!(&encoded, &expected); + + let decoded = 
ShardIndex::from_str(&encoded)?; + assert_eq!(example, decoded); + Ok(()) + } + + #[test] + fn shard_index_binary_encoding() -> Result<(), hex::FromHexError> { + let example = ShardIndex { + shard_number: ShardNumber(13), + shard_count: ShardCount(17), + }; + let expected: [u8; 2] = [0x0d, 0x11]; + + let encoded = bincode::serialize(&example).unwrap(); + assert_eq!(Hex(&encoded), Hex(&expected)); + let decoded = bincode::deserialize(&encoded).unwrap(); + assert_eq!(example, decoded); + + Ok(()) + } } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index d7bcce28cb48..e8bfc005d32d 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -9,8 +9,7 @@ anyhow.workspace = true async-trait.workspace = true once_cell.workspace = true aws-smithy-async.workspace = true -aws-smithy-http.workspace = true -aws-types.workspace = true +aws-smithy-types.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true aws-credential-types.workspace = true diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index ab3fd3fe629f..0cb73f73b77d 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -14,18 +14,20 @@ use aws_config::{ provider_config::ProviderConfig, retry::{RetryConfigBuilder, RetryMode}, web_identity_token::WebIdentityTokenCredentialsProvider, + BehaviorVersion, }; -use aws_credential_types::cache::CredentialsCache; +use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ - config::{AsyncSleep, Config, Region, SharedAsyncSleep}, + config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, - primitives::ByteStream, types::{Delete, ObjectIdentifier}, Client, }; use aws_smithy_async::rt::sleep::TokioSleep; -use aws_smithy_http::body::SdkBody; + +use aws_smithy_types::body::SdkBody; +use aws_smithy_types::byte_stream::ByteStream; use hyper::Body; use scopeguard::ScopeGuard; use tokio::io::{self, AsyncRead}; @@ -78,7 +80,6 @@ impl S3Bucket { // needed to access remote extensions bucket .or_else("token", { let provider_conf = ProviderConfig::without_region().with_region(region.clone()); - WebIdentityTokenCredentialsProvider::builder() .configure(&provider_conf) .build() @@ -98,18 +99,20 @@ impl S3Bucket { .set_max_attempts(Some(1)) .set_mode(Some(RetryMode::Adaptive)); - let mut config_builder = Config::builder() + let mut config_builder = Builder::default() + .behavior_version(BehaviorVersion::v2023_11_09()) .region(region) - .credentials_cache(CredentialsCache::lazy()) - .credentials_provider(credentials_provider) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)) - .retry_config(retry_config.build()); + .identity_cache(IdentityCache::lazy().build()) + .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .retry_config(retry_config.build()) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)); if let Some(custom_endpoint) = aws_config.endpoint.clone() { config_builder = config_builder .endpoint_url(custom_endpoint) .force_path_style(true); } + let client = Client::from_conf(config_builder.build()); let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { @@ -371,7 +374,7 @@ impl RemoteStorage for S3Bucket { let response = response?; - let keys = response.contents().unwrap_or_default(); + let keys = response.contents(); let empty = Vec::new(); let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty); @@ -411,7 +414,7 
@@ impl RemoteStorage for S3Bucket { let started_at = start_measuring_requests(kind); let body = Body::wrap_stream(ReaderStream::new(from)); - let bytes_stream = ByteStream::new(SdkBody::from(body)); + let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); let res = self .client @@ -474,7 +477,7 @@ impl RemoteStorage for S3Bucket { for path in paths { let obj_id = ObjectIdentifier::builder() .set_key(Some(self.relative_path_to_s3_object(path))) - .build(); + .build()?; delete_objects.push(obj_id); } @@ -485,7 +488,11 @@ impl RemoteStorage for S3Bucket { .client .delete_objects() .bucket(self.bucket_name.clone()) - .delete(Delete::builder().set_objects(Some(chunk.to_vec())).build()) + .delete( + Delete::builder() + .set_objects(Some(chunk.to_vec())) + .build()?, + ) .send() .await; diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 3eb01003dfd2..35c260740c9e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -51,6 +51,7 @@ regex.workspace = true scopeguard.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } +serde_path_to_error.workspace = true serde_with.workspace = true signal-hook.workspace = true smallvec = { workspace = true, features = ["write"] } diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 735f358d8b9d..5d05af0c0023 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -3,6 +3,7 @@ use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::storage_layer::LayerFileName; use pageserver::tenant::storage_layer::PersistentLayerDesc; +use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; @@ -211,7 +212,7 @@ fn bench_sequential(c: &mut Criterion) { let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); let layer = PersistentLayerDesc::new_img( - TenantId::generate(), + TenantShardId::unsharded(TenantId::generate()), TimelineId::generate(), zero.add(10 * i32)..zero.add(10 * i32 + 1), Lsn(i), diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 22ebe70b1669..ebf4a4bec3f2 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -1,13 +1,15 @@ use std::path::{Path, PathBuf}; use anyhow::Result; -use camino::Utf8Path; +use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::block_io::BlockCursor; use pageserver::tenant::disk_btree::DiskBtreeReader; use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary}; +use pageserver::tenant::storage_layer::{delta_layer, image_layer}; +use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer}; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::{page_cache, virtual_file}; use pageserver::{ @@ -20,6 +22,7 @@ use pageserver::{ }; use std::fs; use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::parse_filename; @@ -45,6 +48,13 @@ pub(crate) enum LayerCmd { /// The id from list-layer command id: usize, }, + RewriteSummary { + layer_file_path: Utf8PathBuf, + #[clap(long)] + new_tenant_id: Option, + #[clap(long)] + new_timeline_id: Option, + }, } async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { @@ -100,6 
+110,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { println!("- timeline {}", timeline.file_name().to_string_lossy()); } } + Ok(()) } LayerCmd::ListLayer { path, @@ -128,6 +139,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { idx += 1; } } + Ok(()) } LayerCmd::DumpLayer { path, @@ -168,7 +180,63 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { idx += 1; } } + Ok(()) + } + LayerCmd::RewriteSummary { + layer_file_path, + new_tenant_id, + new_timeline_id, + } => { + pageserver::virtual_file::init(10); + pageserver::page_cache::init(100); + + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + + macro_rules! rewrite_closure { + ($($summary_ty:tt)*) => {{ + |summary| $($summary_ty)* { + tenant_id: new_tenant_id.unwrap_or(summary.tenant_id), + timeline_id: new_timeline_id.unwrap_or(summary.timeline_id), + ..summary + } + }}; + } + + let res = ImageLayer::rewrite_summary( + layer_file_path, + rewrite_closure!(image_layer::Summary), + &ctx, + ) + .await; + match res { + Ok(()) => { + println!("Successfully rewrote summary of image layer {layer_file_path}"); + return Ok(()); + } + Err(image_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough + Err(image_layer::RewriteSummaryError::Other(e)) => { + return Err(e); + } + } + + let res = DeltaLayer::rewrite_summary( + layer_file_path, + rewrite_closure!(delta_layer::Summary), + &ctx, + ) + .await; + match res { + Ok(()) => { + println!("Successfully rewrote summary of delta layer {layer_file_path}"); + return Ok(()); + } + Err(delta_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough + Err(delta_layer::RewriteSummaryError::Other(e)) => { + return Err(e); + } + } + + anyhow::bail!("not an image or delta layer: {layer_file_path}"); } } - Ok(()) } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 87d9cc522e81..13d1fc775bb1 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,6 +5,7 @@ //! See also `settings.md` for better description on every parameter. use anyhow::{anyhow, bail, ensure, Context, Result}; +use pageserver_api::shard::TenantShardId; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde::de::IntoDeserializer; use std::env; @@ -25,7 +26,7 @@ use toml_edit::{Document, Item}; use camino::{Utf8Path, Utf8PathBuf}; use postgres_backend::AuthType; use utils::{ - id::{NodeId, TenantId, TimelineId}, + id::{NodeId, TimelineId}, logging::LogFormat, }; @@ -628,12 +629,13 @@ impl PageServerConf { self.deletion_prefix().join(format!("header-{VERSION:02x}")) } - pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenants_path().join(tenant_id.to_string()) + pub fn tenant_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenants_path().join(tenant_shard_id.to_string()) } - pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME) + pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) + .join(IGNORED_TENANT_FILE_NAME) } /// Points to a place in pageserver's local directory, @@ -641,47 +643,53 @@ impl PageServerConf { /// /// Legacy: superseded by tenant_location_config_path. Eventually /// remove this function. 
- pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME) + pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME) } - pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenant_path(tenant_id) + pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) .join(TENANT_LOCATION_CONFIG_NAME) } - pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) + pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) + .join(TIMELINES_SEGMENT_NAME) } - pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf { - self.timelines_path(tenant_id).join(timeline_id.to_string()) + pub fn timeline_path( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Utf8PathBuf { + self.timelines_path(tenant_shard_id) + .join(timeline_id.to_string()) } pub fn timeline_uninit_mark_file_path( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Utf8PathBuf { path_with_suffix_extension( - self.timeline_path(&tenant_id, &timeline_id), + self.timeline_path(&tenant_shard_id, &timeline_id), TIMELINE_UNINIT_MARK_SUFFIX, ) } pub fn timeline_delete_mark_file_path( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Utf8PathBuf { path_with_suffix_extension( - self.timeline_path(&tenant_id, &timeline_id), + self.timeline_path(&tenant_shard_id, &timeline_id), TIMELINE_DELETE_MARK_SUFFIX, ) } - pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.tenant_path(tenant_id) + pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) .join(TENANT_DELETED_MARKER_FILE_NAME) } @@ -691,20 +699,24 @@ impl PageServerConf { pub fn trace_path( &self, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, connection_id: &ConnectionId, ) -> Utf8PathBuf { self.traces_path() - .join(tenant_id.to_string()) + .join(tenant_shard_id.to_string()) .join(timeline_id.to_string()) .join(connection_id.to_string()) } /// Points to a place in pageserver's local directory, /// where certain timeline's metadata file should be located. - pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf { - self.timeline_path(tenant_id, timeline_id) + pub fn metadata_path( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + ) -> Utf8PathBuf { + self.timeline_path(tenant_shard_id, timeline_id) .join(METADATA_FILE_NAME) } @@ -767,7 +779,7 @@ impl PageServerConf { builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) 
} "tenant_config" => { - t_conf = Self::parse_toml_tenant_conf(item)?; + t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; } "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), @@ -841,114 +853,10 @@ impl PageServerConf { Ok(conf) } - // subroutine of parse_and_validate to parse `[tenant_conf]` section - - pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result { - let mut t_conf: TenantConfOpt = Default::default(); - if let Some(checkpoint_distance) = item.get("checkpoint_distance") { - t_conf.checkpoint_distance = - Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?); - } - - if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") { - t_conf.checkpoint_timeout = Some(parse_toml_duration( - "checkpoint_timeout", - checkpoint_timeout, - )?); - } - - if let Some(compaction_target_size) = item.get("compaction_target_size") { - t_conf.compaction_target_size = Some(parse_toml_u64( - "compaction_target_size", - compaction_target_size, - )?); - } - - if let Some(compaction_period) = item.get("compaction_period") { - t_conf.compaction_period = - Some(parse_toml_duration("compaction_period", compaction_period)?); - } - - if let Some(compaction_threshold) = item.get("compaction_threshold") { - t_conf.compaction_threshold = - Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?); - } - - if let Some(image_creation_threshold) = item.get("image_creation_threshold") { - t_conf.image_creation_threshold = Some( - parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?, - ); - } - - if let Some(gc_horizon) = item.get("gc_horizon") { - t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?); - } - - if let Some(gc_period) = item.get("gc_period") { - t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?); - } - - if let Some(pitr_interval) = item.get("pitr_interval") { - t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?); - } - if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") { - t_conf.walreceiver_connect_timeout = Some(parse_toml_duration( - "walreceiver_connect_timeout", - walreceiver_connect_timeout, - )?); - } - if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") { - t_conf.lagging_wal_timeout = Some(parse_toml_duration( - "lagging_wal_timeout", - lagging_wal_timeout, - )?); - } - if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") { - t_conf.max_lsn_wal_lag = - Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?); - } - if let Some(trace_read_requests) = item.get("trace_read_requests") { - t_conf.trace_read_requests = - Some(trace_read_requests.as_bool().with_context(|| { - "configure option trace_read_requests is not a bool".to_string() - })?); - } - - if let Some(eviction_policy) = item.get("eviction_policy") { - t_conf.eviction_policy = Some( - deserialize_from_item("eviction_policy", eviction_policy) - .context("parse eviction_policy")?, - ); - } - - if let Some(item) = item.get("min_resident_size_override") { - t_conf.min_resident_size_override = Some( - deserialize_from_item("min_resident_size_override", item) - .context("parse min_resident_size_override")?, - ); - } - - if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") { - t_conf.evictions_low_residence_duration_metric_threshold = 
Some(parse_toml_duration( - "evictions_low_residence_duration_metric_threshold", - item, - )?); - } - - if let Some(gc_feedback) = item.get("gc_feedback") { - t_conf.gc_feedback = Some( - gc_feedback - .as_bool() - .with_context(|| "configure option gc_feedback is not a bool".to_string())?, - ); - } - - Ok(t_conf) - } - #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf { - Utf8PathBuf::from(format!("../tmp_check/test_{test_name}")) + let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into()); + Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}")) } pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { @@ -1417,6 +1325,37 @@ trace_read_requests = {trace_read_requests}"#, Ok(()) } + #[test] + fn parse_incorrect_tenant_config() -> anyhow::Result<()> { + let config_string = r#" + [tenant_config] + checkpoint_distance = -1 # supposed to be an u64 + "# + .to_string(); + + let toml: Document = config_string.parse()?; + let item = toml.get("tenant_config").unwrap(); + let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err(); + + let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64"; + assert_eq!(error.to_string(), expected_error_str); + + Ok(()) + } + + #[test] + fn parse_override_tenant_config() -> anyhow::Result<()> { + let config_string = r#"tenant_config={ min_resident_size_override = 400 }"#.to_string(); + + let toml: Document = config_string.parse()?; + let item = toml.get("tenant_config").unwrap(); + let conf = TenantConfOpt::try_from(item.to_owned()).unwrap(); + + assert_eq!(conf.min_resident_size_override, Some(400)); + + Ok(()) + } + #[test] fn eviction_pageserver_config_parse() -> anyhow::Result<()> { let tempdir = tempdir()?; diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 2989e15e8eaa..c6ff91e560e3 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -1,5 +1,4 @@ -use crate::context::RequestContext; -use anyhow::Context; +use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize}; use chrono::{DateTime, Utc}; use consumption_metrics::EventType; use futures::stream::StreamExt; @@ -351,14 +350,12 @@ impl TimelineSnapshot { let last_record_lsn = t.get_last_record_lsn(); let current_exact_logical_size = { - let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id); - let res = span - .in_scope(|| t.get_current_logical_size(ctx)) - .context("get_current_logical_size"); - match res? { + let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id); + let size = span.in_scope(|| t.get_current_logical_size(ctx)); + match size { // Only send timeline logical size when it is fully calculated. 
- (size, is_exact) if is_exact => Some(size), - (_, _) => None, + CurrentLogicalSize::Exact(ref size) => Some(size.into()), + CurrentLogicalSize::Approximate(_) => None, } }; diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index f50c19a6295a..25ae3d1b0168 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -1,16 +1,15 @@ use std::collections::HashMap; -use pageserver_api::control_api::{ - ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, +use pageserver_api::{ + control_api::{ + ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, + }, + shard::TenantShardId, }; use serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{ - backoff, - generation::Generation, - id::{NodeId, TenantId}, -}; +use utils::{backoff, generation::Generation, id::NodeId}; use crate::config::PageServerConf; @@ -31,11 +30,11 @@ pub enum RetryForeverError { #[async_trait::async_trait] pub trait ControlPlaneGenerationsApi { - async fn re_attach(&self) -> Result, RetryForeverError>; + async fn re_attach(&self) -> Result, RetryForeverError>; async fn validate( &self, - tenants: Vec<(TenantId, Generation)>, - ) -> Result, RetryForeverError>; + tenants: Vec<(TenantShardId, Generation)>, + ) -> Result, RetryForeverError>; } impl ControlPlaneClient { @@ -127,7 +126,7 @@ impl ControlPlaneClient { #[async_trait::async_trait] impl ControlPlaneGenerationsApi for ControlPlaneClient { /// Block until we get a successful response, or error out if we are shut down - async fn re_attach(&self) -> Result, RetryForeverError> { + async fn re_attach(&self) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("re-attach") @@ -154,8 +153,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { /// Block until we get a successful response, or error out if we are shut down async fn validate( &self, - tenants: Vec<(TenantId, Generation)>, - ) -> Result, RetryForeverError> { + tenants: Vec<(TenantShardId, Generation)>, + ) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("validate") diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 86be1b7094f0..7b0574548395 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -10,11 +10,12 @@ use crate::control_plane_client::ControlPlaneGenerationsApi; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_timeline_path; +use crate::tenant::remote_timeline_client::LayerFileMetadata; use crate::virtual_file::MaybeFatalIo; use crate::virtual_file::VirtualFile; use anyhow::Context; use camino::Utf8PathBuf; -use hex::FromHex; +use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, RemotePath}; use serde::Deserialize; use serde::Serialize; @@ -25,7 +26,7 @@ use tracing::Instrument; use tracing::{self, debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; -use utils::id::{TenantId, TimelineId}; +use utils::id::TimelineId; use utils::lsn::AtomicLsn; use utils::lsn::Lsn; @@ -159,11 +160,10 @@ pub struct DeletionQueueClient { lsn_table: Arc>, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] struct TenantDeletionList { /// For each Timeline, a list of key fragments to 
append to the timeline remote path /// when reconstructing a full key - #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")] timelines: HashMap>, /// The generation in which this deletion was emitted: note that this may not be the @@ -178,43 +178,11 @@ impl TenantDeletionList { } } -/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string -fn to_hex_map(input: &HashMap, serializer: S) -> Result -where - S: serde::Serializer, - V: Serialize, - I: AsRef<[u8]>, -{ - let transformed = input.iter().map(|(k, v)| (hex::encode(k), v)); - - transformed - .collect::>() - .serialize(serializer) -} - -/// For HashMaps using a FromHex key, where we would like to decode the key -fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result, D::Error> -where - D: serde::de::Deserializer<'de>, - V: Deserialize<'de>, - I: FromHex + std::hash::Hash + Eq, -{ - let hex_map = HashMap::::deserialize(deserializer)?; - hex_map - .into_iter() - .map(|(k, v)| { - I::from_hex(k) - .map(|k| (k, v)) - .map_err(|_| serde::de::Error::custom("Invalid hex ID")) - }) - .collect() -} - /// Files ending with this suffix will be ignored and erased /// during recovery as startup. const TEMP_SUFFIX: &str = "tmp"; -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] struct DeletionList { /// Serialization version, for future use version: u8, @@ -226,8 +194,7 @@ struct DeletionList { /// nested HashMaps by TenantTimelineID. Each Tenant only appears once /// with one unique generation ID: if someone tries to push a second generation /// ID for the same tenant, we will start a new DeletionList. - #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")] - tenants: HashMap, + tenants: HashMap, /// Avoid having to walk `tenants` to calculate the number of keys in /// the nested deletion lists @@ -299,7 +266,7 @@ impl DeletionList { /// deletion list. fn push( &mut self, - tenant: &TenantId, + tenant: &TenantShardId, timeline: &TimelineId, generation: Generation, objects: &mut Vec, @@ -391,7 +358,7 @@ struct TenantLsnState { #[derive(Default)] struct VisibleLsnUpdates { - tenants: HashMap, + tenants: HashMap, } impl VisibleLsnUpdates { @@ -448,7 +415,7 @@ impl DeletionQueueClient { pub(crate) fn recover( &self, - attached_tenants: HashMap, + attached_tenants: HashMap, ) -> Result<(), DeletionQueueError> { self.do_push( &self.tx, @@ -465,7 +432,7 @@ impl DeletionQueueClient { /// backend will later wake up and notice that the tenant's generation requires validation. pub(crate) async fn update_remote_consistent_lsn( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, lsn: Lsn, @@ -476,10 +443,13 @@ impl DeletionQueueClient { .write() .expect("Lock should never be poisoned"); - let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState { - timelines: HashMap::new(), - generation: current_generation, - }); + let tenant_entry = locked + .tenants + .entry(tenant_shard_id) + .or_insert(TenantLsnState { + timelines: HashMap::new(), + generation: current_generation, + }); if tenant_entry.generation != current_generation { // Generation might have changed if we were detached and then re-attached: in this case, @@ -506,28 +476,29 @@ impl DeletionQueueClient { /// generations in `layers` are the generations in which those layers were written. 
pub(crate) async fn push_layers( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, Generation)>, + layers: Vec<(LayerFileName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { if current_generation.is_none() { debug!("Enqueuing deletions in legacy mode, skipping queue"); let mut layer_paths = Vec::new(); - for (layer, generation) in layers { + for (layer, meta) in layers { layer_paths.push(remote_layer_path( - &tenant_id, + &tenant_shard_id.tenant_id, &timeline_id, + meta.shard, &layer, - generation, + meta.generation, )); } self.push_immediate(layer_paths).await?; return self.flush_immediate().await; } - self.push_layers_sync(tenant_id, timeline_id, current_generation, layers) + self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers) } /// When a Tenant has a generation, push_layers is always synchronous because @@ -537,10 +508,10 @@ impl DeletionQueueClient { /// support (``) pub(crate) fn push_layers_sync( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, Generation)>, + layers: Vec<(LayerFileName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted @@ -548,7 +519,7 @@ impl DeletionQueueClient { self.do_push( &self.tx, ListWriterQueueMessage::Delete(DeletionOp { - tenant_id, + tenant_shard_id, timeline_id, layers, generation: current_generation, @@ -751,6 +722,7 @@ impl DeletionQueue { mod test { use camino::Utf8Path; use hex_literal::hex; + use pageserver_api::shard::ShardIndex; use std::{io::ErrorKind, time::Duration}; use tracing::info; @@ -815,12 +787,12 @@ mod test { } fn set_latest_generation(&self, gen: Generation) { - let tenant_id = self.harness.tenant_id; + let tenant_shard_id = self.harness.tenant_shard_id; self.mock_control_plane .latest_generation .lock() .unwrap() - .insert(tenant_id, gen); + .insert(tenant_shard_id, gen); } /// Returns remote layer file name, suitable for use in assert_remote_files @@ -829,8 +801,8 @@ mod test { file_name: LayerFileName, gen: Generation, ) -> anyhow::Result { - let tenant_id = self.harness.tenant_id; - let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let tenant_shard_id = self.harness.tenant_shard_id; + let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path()); std::fs::create_dir_all(&remote_timeline_path)?; let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix()); @@ -848,7 +820,7 @@ mod test { #[derive(Debug, Clone)] struct MockControlPlane { - pub latest_generation: std::sync::Arc>>, + pub latest_generation: std::sync::Arc>>, } impl MockControlPlane { @@ -862,20 +834,20 @@ mod test { #[async_trait::async_trait] impl ControlPlaneGenerationsApi for MockControlPlane { #[allow(clippy::diverging_sub_expression)] // False positive via async_trait - async fn re_attach(&self) -> Result, RetryForeverError> { + async fn re_attach(&self) -> Result, RetryForeverError> { unimplemented!() } async fn validate( &self, - tenants: Vec<(TenantId, Generation)>, - ) -> Result, RetryForeverError> { + tenants: Vec<(TenantShardId, Generation)>, + ) -> Result, RetryForeverError> { let mut result = HashMap::new(); let latest_generation = self.latest_generation.lock().unwrap(); - for (tenant_id, generation) in tenants { - if let 
Some(latest) = latest_generation.get(&tenant_id) { - result.insert(tenant_id, *latest == generation); + for (tenant_shard_id, generation) in tenants { + if let Some(latest) = latest_generation.get(&tenant_shard_id) { + result.insert(tenant_shard_id, *latest == generation); } } @@ -979,10 +951,10 @@ mod test { client.recover(HashMap::new())?; let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); - let tenant_id = ctx.harness.tenant_id; + let tenant_shard_id = ctx.harness.tenant_shard_id; let content: Vec = "victim1 contents".into(); - let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); let deletion_prefix = ctx.harness.conf.deletion_prefix(); @@ -990,6 +962,8 @@ mod test { // we delete, and the generation of the running Tenant. let layer_generation = Generation::new(0xdeadbeef); let now_generation = Generation::new(0xfeedbeef); + let layer_metadata = + LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded()); let remote_layer_file_name_1 = format!("{}{}", layer_file_name_1, layer_generation.get_suffix()); @@ -1010,10 +984,10 @@ mod test { info!("Pushing"); client .push_layers( - tenant_id, + tenant_shard_id, TIMELINE_ID, now_generation, - [(layer_file_name_1.clone(), layer_generation)].to_vec(), + [(layer_file_name_1.clone(), layer_metadata)].to_vec(), ) .await?; assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); @@ -1052,11 +1026,13 @@ mod test { let stale_generation = latest_generation.previous(); // Generation that our example layer file was written with let layer_generation = stale_generation.previous(); + let layer_metadata = + LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded()); ctx.set_latest_generation(latest_generation); - let tenant_id = ctx.harness.tenant_id; - let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let tenant_shard_id = ctx.harness.tenant_shard_id; + let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); // Initial state: a remote layer exists @@ -1066,10 +1042,10 @@ mod test { tracing::debug!("Pushing..."); client .push_layers( - tenant_id, + tenant_shard_id, TIMELINE_ID, stale_generation, - [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), ) .await?; @@ -1081,10 +1057,10 @@ mod test { tracing::debug!("Pushing..."); client .push_layers( - tenant_id, + tenant_shard_id, TIMELINE_ID, latest_generation, - [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), ) .await?; @@ -1103,14 +1079,16 @@ mod test { let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; - let tenant_id = ctx.harness.tenant_id; + let tenant_shard_id = ctx.harness.tenant_shard_id; - let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID); + let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path()); let deletion_prefix = ctx.harness.conf.deletion_prefix(); let layer_generation = Generation::new(0xdeadbeef); let 
now_generation = Generation::new(0xfeedbeef); + let layer_metadata = + LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded()); // Inject a deletion in the generation before generation_now: after restart, // this deletion should _not_ get executed (only the immediately previous @@ -1119,10 +1097,10 @@ mod test { ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; client .push_layers( - tenant_id, + tenant_shard_id, TIMELINE_ID, now_generation.previous(), - [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(), + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), ) .await?; @@ -1133,10 +1111,10 @@ mod test { ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?; client .push_layers( - tenant_id, + tenant_shard_id, TIMELINE_ID, now_generation, - [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(), + [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(), ) .await?; @@ -1164,7 +1142,7 @@ mod test { drop(client); ctx.restart().await; let client = ctx.deletion_queue.new_client(); - client.recover(HashMap::from([(tenant_id, now_generation)]))?; + client.recover(HashMap::from([(tenant_shard_id, now_generation)]))?; info!("Flush-executing"); client.flush_execute().await?; @@ -1226,12 +1204,13 @@ pub(crate) mod mock { match msg { ListWriterQueueMessage::Delete(op) => { let mut objects = op.objects; - for (layer, generation) in op.layers { + for (layer, meta) in op.layers { objects.push(remote_layer_path( - &op.tenant_id, + &op.tenant_shard_id.tenant_id, &op.timeline_id, + meta.shard, &layer, - generation, + meta.generation, )); } @@ -1311,4 +1290,34 @@ pub(crate) mod mock { } } } + + /// Test round-trip serialization/deserialization, and test stability of the format + /// vs. a static expected string for the serialized version. 
+ #[test] + fn deletion_list_serialization() -> anyhow::Result<()> { + let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c" + .to_string() + .parse::()?; + let timeline_id = "be322c834ed9e709e63b5c9698691910" + .to_string() + .parse::()?; + let generation = Generation::new(123); + + let object = + RemotePath::from_string(&format!("tenants/{tenant_id}/timelines/{timeline_id}/foo"))?; + let mut objects = [object].to_vec(); + + let mut example = DeletionList::new(1); + example.push(&tenant_id, &timeline_id, generation, &mut objects); + + let encoded = serde_json::to_string(&example)?; + + let expected = "{\"version\":1,\"sequence\":1,\"tenants\":{\"ad6c1a56f5680419d3a16ff55d97ec3c\":{\"timelines\":{\"be322c834ed9e709e63b5c9698691910\":[\"foo\"]},\"generation\":123}},\"size\":1}".to_string(); + assert_eq!(encoded, expected); + + let decoded = serde_json::from_str::(&encoded)?; + assert_eq!(example, decoded); + + Ok(()) + } } diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 28daae2da573..7ff27ceb4413 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -19,6 +19,7 @@ use std::collections::HashMap; use std::fs::create_dir_all; use std::time::Duration; +use pageserver_api::shard::TenantShardId; use regex::Regex; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; @@ -26,13 +27,13 @@ use tracing::debug; use tracing::info; use tracing::warn; use utils::generation::Generation; -use utils::id::TenantId; use utils::id::TimelineId; use crate::config::PageServerConf; use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; +use crate::tenant::remote_timeline_client::LayerFileMetadata; use crate::tenant::storage_layer::LayerFileName; use crate::virtual_file::on_fatal_io_error; use crate::virtual_file::MaybeFatalIo; @@ -53,22 +54,22 @@ const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100); #[derive(Debug)] pub(super) struct DeletionOp { - pub(super) tenant_id: TenantId, + pub(super) tenant_shard_id: TenantShardId, pub(super) timeline_id: TimelineId, // `layers` and `objects` are both just lists of objects. `layers` is used if you do not // have a config object handy to project it to a remote key, and need the consuming worker // to do it for you. - pub(super) layers: Vec<(LayerFileName, Generation)>, + pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>, pub(super) objects: Vec, - /// The _current_ generation of the Tenant attachment in which we are enqueuing + /// The _current_ generation of the Tenant shard attachment in which we are enqueuing /// this deletion. pub(super) generation: Generation, } #[derive(Debug)] pub(super) struct RecoverOp { - pub(super) attached_tenants: HashMap, + pub(super) attached_tenants: HashMap, } #[derive(Debug)] @@ -205,7 +206,7 @@ impl ListWriter { async fn recover( &mut self, - attached_tenants: HashMap, + attached_tenants: HashMap, ) -> Result<(), anyhow::Error> { debug!( "recovering with {} attached tenants", @@ -308,8 +309,8 @@ impl ListWriter { // generation was issued to another node in the interval while we restarted, // then we may treat deletion lists from the previous generation as if they // belong to our currently attached generation, and proceed to validate & execute. 
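The comment above states the recovery rule: a deletion list written in the generation immediately before the currently attached one is adopted into the attached generation, while anything older is left for validation. A toy sketch of just that rule, with a stand-in `Gen` type rather than the repo's `utils::generation::Generation`:

    /// Toy stand-in for the real Generation type, just enough for the rule below.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    struct Gen(u32);

    impl Gen {
        fn previous(self) -> Gen {
            Gen(self.0.saturating_sub(1))
        }
    }

    /// If the list was written in the generation immediately before the currently
    /// attached one, treat it as belonging to the attached generation.
    fn adopt_if_previous(attached: Gen, list_generation: &mut Gen) -> bool {
        if attached.previous() == *list_generation {
            *list_generation = attached;
            true
        } else {
            false
        }
    }

    fn main() {
        let attached = Gen(8);

        let mut recent = Gen(7);
        assert!(adopt_if_previous(attached, &mut recent));
        assert_eq!(recent, attached);

        let mut stale = Gen(5);
        assert!(!adopt_if_previous(attached, &mut stale));
    }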
- for (tenant_id, tenant_list) in &mut deletion_list.tenants { - if let Some(attached_gen) = attached_tenants.get(tenant_id) { + for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants { + if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) { if attached_gen.previous() == tenant_list.generation { tenant_list.generation = *attached_gen; } @@ -387,25 +388,26 @@ impl ListWriter { ); let mut layer_paths = Vec::new(); - for (layer, generation) in op.layers { + for (layer, meta) in op.layers { layer_paths.push(remote_layer_path( - &op.tenant_id, + &op.tenant_shard_id.tenant_id, &op.timeline_id, + meta.shard, &layer, - generation, + meta.generation, )); } layer_paths.extend(op.objects); if !self.pending.push( - &op.tenant_id, + &op.tenant_shard_id, &op.timeline_id, op.generation, &mut layer_paths, ) { self.flush().await; let retry_succeeded = self.pending.push( - &op.tenant_id, + &op.tenant_shard_id, &op.timeline_id, op.generation, &mut layer_paths, diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index 72bdbdefd6a3..bf06c78e673f 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -178,7 +178,14 @@ where .unwrap_or(false); if valid && *validated_generation == tenant_lsn_state.generation { - for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines { + for (timeline_id, pending_lsn) in tenant_lsn_state.timelines { + tracing::debug!( + %tenant_id, + %timeline_id, + current = %pending_lsn.result_slot.load(), + projected = %pending_lsn.projected, + "advancing validated remote_consistent_lsn", + ); pending_lsn.result_slot.store(pending_lsn.projected); } } else { diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 642cafad285b..f01cd1cf8c9a 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -310,7 +310,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( .unwrap() .as_micros(), partition, - desc.tenant_id, + desc.tenant_shard_id, desc.timeline_id, candidate.layer, ); @@ -380,7 +380,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size))); for (timeline, batch) in batched { - let tenant_id = timeline.tenant_id; + let tenant_shard_id = timeline.tenant_shard_id; let timeline_id = timeline.timeline_id; let batch_size = u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning"); @@ -431,7 +431,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( (evicted_bytes, evictions_failed) } } - .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size)); + .instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size)); js.spawn(evict); @@ -572,7 +572,7 @@ async fn collect_eviction_candidates( continue; } let info = tl.get_local_layers_for_disk_usage_eviction().await; - debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); tenant_candidates.extend( info.resident_layers .into_iter() diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 
4d455243f0eb..237109abfec8 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -624,6 +624,99 @@ paths: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_id}/location_config: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: flush_ms + in: query + required: false + schema: + type: integer + put: + description: | + Configures a _tenant location_, that is how a particular pageserver handles + a particular tenant. This includes _attached_ tenants, i.e. those ingesting WAL + and page service requests, and _secondary_ tenants, i.e. those which are just keeping + a warm cache in anticipation of transitioning to attached state in the future. + + This is a declarative, idempotent API: there are not separate endpoints + for different tenant location configurations. Rather, this single endpoint accepts + a description of the desired location configuration, and makes whatever changes + are required to reach that state. + + In imperative terms, this API is used to attach and detach tenants, and + to transition tenants to and from secondary mode. + + This is a synchronous API: there is no 202 response. State transitions should always + be fast (milliseconds), with the exception of requests setting `flush_ms`, in which case + the caller controls the runtime of the request. + + In some state transitions, it makes sense to flush dirty data to remote storage: this includes transitions + to AttachedStale and Detached. Flushing is never necessary for correctness, but is an + important optimization when doing migrations. The `flush_ms` parameter controls whether + flushing should be attempted, and how much time is allowed for flushing. If the time limit expires, + the requested transition will continue without waiting for any outstanding data to flush. Callers + should use a duration which is substantially less than their HTTP client's request + timeout. It is safe to supply flush_ms irrespective of the request body: in state transitions + where flushing doesn't make sense, the server will ignore it. + + It is safe to retry requests, but if one receives a 409 or 503 response, it is not + useful to retry aggressively: there is probably an existing request still ongoing. + requestBody: + required: false + content: + application/json: + schema: + $ref: "#/components/schemas/TenantLocationConfigRequest" + responses: + "200": + description: Tenant is now in requested state + "503": + description: Tenant's state cannot be changed right now. Wait a few seconds and retry. + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "409": + description: | + The tenant is already known to Pageserver in some way, + and hence this `/attach` call has been rejected. + + Some examples of how this can happen: + - tenant was created on this pageserver + - tenant attachment was started by an earlier call to `/attach`. + + Callers should poll the tenant status's `attachment_status` field, + like for status 202. See the longer description for `POST /attach` + for details. 
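As an illustration only, a client-side sketch of driving this endpoint: it PUTs a body shaped like the `TenantLocationConfigRequest` schema defined further down, with `flush_ms` bounding the flush time. Host, port, tenant id and generation are placeholders, and the example assumes the reqwest crate (with its `json` feature), serde_json and tokio rather than any client that ships with the repo:

    use serde_json::json;

    #[tokio::main]
    async fn main() -> Result<(), reqwest::Error> {
        // Placeholder endpoint and tenant id; flush_ms bounds how long the server
        // may spend flushing before completing the state transition.
        let url = "http://127.0.0.1:9898/v1/tenant/ad6c1a56f5680419d3a16ff55d97ec3c/location_config?flush_ms=5000";

        // Body shaped after TenantLocationConfigRequest: mode is one of the enum
        // values from the schema; generation is required for attached modes.
        let body = json!({
            "tenant_id": "ad6c1a56f5680419d3a16ff55d97ec3c",
            "mode": "AttachedSingle",
            "generation": 42,
            "tenant_conf": {}
        });

        let resp = reqwest::Client::new().put(url).json(&body).send().await?;
        // 200 means the tenant is in the requested state; on 409/503 another
        // transition is likely in flight, so aggressive retries are not useful.
        println!("status: {}", resp.status());
        Ok(())
    }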
+ content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/detach: parameters: - name: tenant_id @@ -935,6 +1028,9 @@ paths: format: hex pg_version: type: integer + existing_initdb_timeline_id: + type: string + format: hex responses: "201": description: TimelineInfo @@ -1274,6 +1370,31 @@ components: tenant_id: type: string format: hex + TenantLocationConfigRequest: + type: object + required: + - tenant_id + properties: + tenant_id: + type: string + format: hex + mode: + type: string + enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] + description: Mode of functionality that this pageserver will run in for this tenant. + generation: + type: integer + description: Attachment generation number, mandatory when `mode` is an attached state + secondary_conf: + $ref: '#/components/schemas/SecondaryConfig' + tenant_conf: + $ref: '#/components/schemas/TenantConfig' + SecondaryConfig: + type: object + properties: + warm: + type: boolean + description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything. TenantConfig: type: object properties: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5ce09500ee81..29a1ff52e818 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -4,6 +4,7 @@ use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; +use std::time::Duration; use anyhow::{anyhow, Context, Result}; use enumset::EnumSet; @@ -337,13 +338,7 @@ async fn build_timeline_info_common( Lsn(0) => None, lsn @ Lsn(_) => Some(lsn), }; - let current_logical_size = match timeline.get_current_logical_size(ctx) { - Ok((size, _)) => Some(size), - Err(err) => { - error!("Timeline info creation failed to get current logical size: {err:?}"); - None - } - }; + let current_logical_size = timeline.get_current_logical_size(ctx); let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); let remote_consistent_lsn_projected = timeline @@ -356,7 +351,8 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); let info = TimelineInfo { - tenant_id: timeline.tenant_id, + // TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id + tenant_id: timeline.tenant_shard_id.tenant_id, timeline_id: timeline.timeline_id, ancestor_timeline_id, ancestor_lsn, @@ -366,7 +362,11 @@ async fn build_timeline_info_common( last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), - current_logical_size, + current_logical_size: current_logical_size.size_dont_care_about_accuracy(), + current_logical_size_is_accurate: match current_logical_size.accuracy() { + tenant::timeline::logical_size::Accuracy::Approximate => false, + tenant::timeline::logical_size::Accuracy::Exact => true, + }, current_physical_size, current_logical_size_non_incremental: None, timeline_dir_layer_file_size_sum: None, @@ -439,6 +439,7 @@ async fn timeline_create_handler( request_data.ancestor_timeline_id.map(TimelineId::from), request_data.ancestor_start_lsn, request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), + request_data.existing_initdb_timeline_id, state.broker_client.clone(), &ctx, ) @@ -1157,6 +1158,7 @@ async fn 
put_tenant_location_config_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let request_data: TenantLocationConfigRequest = json_request(&mut request).await?; + let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis); check_permission(&request, Some(tenant_shard_id.tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); @@ -1189,7 +1191,7 @@ async fn put_tenant_location_config_handler( state .tenant_manager - .upsert_location(tenant_shard_id, location_conf, &ctx) + .upsert_location(tenant_shard_id, location_conf, flush, &ctx) .await // TODO: badrequest assumes the caller was asking for something unreasonable, but in // principle we might have hit something like concurrent API calls to the same tenant, diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 770458e02e8d..452cd73f76b9 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -7,12 +7,13 @@ use std::pin::Pin; use std::task::{self, Poll}; use anyhow::{bail, ensure, Context, Result}; +use async_compression::tokio::bufread::ZstdDecoder; use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; use nix::NixPath; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio_tar::Archive; use tokio_tar::Builder; use tokio_tar::HeaderMode; @@ -732,3 +733,13 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result> { } Ok(compressed.buf) } + +pub async fn extract_tar_zst( + pgdata_path: &Utf8Path, + tar_zst: impl AsyncBufRead + Unpin, +) -> Result<()> { + let tar = Box::pin(ZstdDecoder::new(tar_zst)); + let mut archive = Archive::new(tar); + archive.unpack(pgdata_path).await?; + Ok(()) +} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d5915f4c9839..d2684691e01a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -7,6 +7,7 @@ use metrics::{ HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; +use pageserver_api::shard::TenantShardId; use strum::{EnumCount, IntoEnumIterator, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; use utils::id::{TenantId, TimelineId}; @@ -402,6 +403,126 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +pub(crate) mod initial_logical_size { + use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; + use once_cell::sync::Lazy; + + use crate::task_mgr::TaskKind; + + pub(crate) struct StartCalculation(IntCounterVec); + pub(crate) static START_CALCULATION: Lazy = Lazy::new(|| { + StartCalculation( + register_int_counter_vec!( + "pageserver_initial_logical_size_start_calculation", + "Incremented each time we start an initial logical size calculation attempt. 
\ + The `task_kind` label is for the task kind that caused this attempt.", + &["attempt", "task_kind"] + ) + .unwrap(), + ) + }); + + struct DropCalculation { + first: IntCounter, + retry: IntCounter, + } + + static DROP_CALCULATION: Lazy = Lazy::new(|| { + let vec = register_int_counter_vec!( + "pageserver_initial_logical_size_drop_calculation", + "Incremented each time we abort a started size calculation attmpt.", + &["attempt"] + ) + .unwrap(); + DropCalculation { + first: vec.with_label_values(&["first"]), + retry: vec.with_label_values(&["retry"]), + } + }); + + pub(crate) struct Calculated { + pub(crate) births: IntCounter, + pub(crate) deaths: IntCounter, + } + + pub(crate) static CALCULATED: Lazy = Lazy::new(|| Calculated { + births: register_int_counter!( + "pageserver_initial_logical_size_finish_calculation", + "Incremented every time we finish calculation of initial logical size.\ + If everything is working well, this should happen at most once per Timeline object." + ) + .unwrap(), + deaths: register_int_counter!( + "pageserver_initial_logical_size_drop_finished_calculation", + "Incremented when we drop a finished initial logical size calculation result.\ + Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge." + ) + .unwrap(), + }); + + pub(crate) struct OngoingCalculationGuard { + inc_drop_calculation: Option, + } + + impl StartCalculation { + pub(crate) fn first(&self, causing_task_kind: Option) -> OngoingCalculationGuard { + let task_kind_label: &'static str = + causing_task_kind.map(|k| k.into()).unwrap_or_default(); + self.0.with_label_values(&["first", task_kind_label]); + OngoingCalculationGuard { + inc_drop_calculation: Some(DROP_CALCULATION.first.clone()), + } + } + pub(crate) fn retry(&self, causing_task_kind: Option) -> OngoingCalculationGuard { + let task_kind_label: &'static str = + causing_task_kind.map(|k| k.into()).unwrap_or_default(); + self.0.with_label_values(&["retry", task_kind_label]); + OngoingCalculationGuard { + inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()), + } + } + } + + impl Drop for OngoingCalculationGuard { + fn drop(&mut self) { + if let Some(counter) = self.inc_drop_calculation.take() { + counter.inc(); + } + } + } + + impl OngoingCalculationGuard { + pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard { + drop(self.inc_drop_calculation.take()); + CALCULATED.births.inc(); + FinishedCalculationGuard { + inc_on_drop: CALCULATED.deaths.clone(), + } + } + } + + pub(crate) struct FinishedCalculationGuard { + inc_on_drop: IntCounter, + } + + impl Drop for FinishedCalculationGuard { + fn drop(&mut self) { + self.inc_on_drop.inc(); + } + } + + // context: https://github.com/neondatabase/neon/issues/5963 + pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy = + Lazy::new(|| { + register_int_counter!( + "pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size", + "Counter for the following event: walreceiver calls\ + Timeline::get_current_logical_size() and it returns `Approximate` for the first time." 
+ ) + .unwrap() + }); +} + pub(crate) static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_states_count", @@ -1252,6 +1373,15 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); +pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_process_launch_duration", + "Histogram of the duration of successful WalRedoProcess::launch calls", + redo_histogram_time_buckets!(), + ) + .expect("failed to define a metric") +}); + pub(crate) struct WalRedoProcessCounters { pub(crate) started: IntCounter, pub(crate) killed_by_cause: enum_map::EnumMap, @@ -1571,9 +1701,9 @@ pub struct RemoteTimelineClientMetrics { } impl RemoteTimelineClientMetrics { - pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { + pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { RemoteTimelineClientMetrics { - tenant_id: tenant_id.to_string(), + tenant_id: tenant_shard_id.tenant_id.to_string(), timeline_id: timeline_id.to_string(), calls_unfinished_gauge: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), @@ -1961,6 +2091,7 @@ pub fn preinitialize_metrics() { &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, &WAL_REDO_BYTES_HISTOGRAM, + &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, ] .into_iter() .for_each(|h| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index ee5f1732e4ac..82c16eb9bd09 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -399,6 +399,9 @@ impl PageServerHandler { { debug_assert_current_span_has_tenant_and_timeline_id(); + // TODO(sharding): enumerate local tenant shards for this tenant, and select the one + // that should serve this request. + // Make request tracer if needed let tenant = mgr::get_active_tenant_with_timeout( tenant_id, @@ -408,9 +411,10 @@ impl PageServerHandler { .await?; let mut tracer = if tenant.get_trace_read_requests() { let connection_id = ConnectionId::generate(); - let path = tenant - .conf - .trace_path(&tenant_id, &timeline_id, &connection_id); + let path = + tenant + .conf + .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id); Some(Tracer::new(path)) } else { None diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 24f47df92ef6..97d731bf4909 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -138,6 +138,14 @@ pub struct GcResult { #[serde(serialize_with = "serialize_duration_as_millis")] pub elapsed: Duration, + + /// The layers which were garbage collected. + /// + /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be + /// dropped in tests. 
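The `OngoingCalculationGuard`/`FinishedCalculationGuard` pair above is a drop-guard pattern: the start is counted up front, and the `Drop` impl records an abort unless the guard was consumed by a successful finish. A self-contained sketch of the same pattern with plain atomics standing in for the Prometheus counters:

    use std::sync::atomic::{AtomicU64, Ordering};

    static DROPPED_UNFINISHED: AtomicU64 = AtomicU64::new(0);
    static FINISHED: AtomicU64 = AtomicU64::new(0);

    /// Held while a calculation is in flight; counts an abort if dropped
    /// before `finished()` is called.
    struct OngoingGuard {
        armed: bool,
    }

    impl OngoingGuard {
        fn start() -> Self {
            OngoingGuard { armed: true }
        }

        /// Consume the guard on success so Drop does not count an abort.
        fn finished(mut self) {
            self.armed = false;
            FINISHED.fetch_add(1, Ordering::Relaxed);
        }
    }

    impl Drop for OngoingGuard {
        fn drop(&mut self) {
            if self.armed {
                DROPPED_UNFINISHED.fetch_add(1, Ordering::Relaxed);
            }
        }
    }

    fn main() {
        {
            let g = OngoingGuard::start();
            g.finished(); // success path: no abort counted
        }
        {
            let _g = OngoingGuard::start();
            // dropped without finishing: counted as an abort
        }
        assert_eq!(FINISHED.load(Ordering::Relaxed), 1);
        assert_eq!(DROPPED_UNFINISHED.load(Ordering::Relaxed), 1);
    }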
+ #[cfg(feature = "testing")] + #[serde(skip)] + pub(crate) doomed_layers: Vec, } // helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds @@ -158,5 +166,11 @@ impl AddAssign for GcResult { self.layers_removed += other.layers_removed; self.elapsed += other.elapsed; + + #[cfg(feature = "testing")] + { + let mut other = other; + self.doomed_layers.append(&mut other.doomed_layers); + } } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7384459ab523..422cb671fe82 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -15,11 +15,16 @@ use anyhow::{bail, Context}; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use enumset::EnumSet; +use futures::stream::FuturesUnordered; use futures::FutureExt; +use futures::StreamExt; use pageserver_api::models::TimelineState; +use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; +use std::fmt; use storage_broker::BrokerClientChannel; +use tokio::io::BufReader; use tokio::runtime::Handle; use tokio::sync::watch; use tokio::task::JoinSet; @@ -30,26 +35,7 @@ use utils::completion; use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext; use utils::sync::gate::Gate; - -use std::cmp::min; -use std::collections::hash_map::Entry; -use std::collections::BTreeSet; -use std::collections::HashMap; -use std::collections::HashSet; -use std::fmt::Debug; -use std::fmt::Display; -use std::fs; -use std::fs::File; -use std::io; -use std::ops::Bound::Included; -use std::process::Command; -use std::process::Stdio; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; -use std::sync::Arc; -use std::sync::MutexGuard; -use std::sync::{Mutex, RwLock}; -use std::time::{Duration, Instant}; +use utils::sync::gate::GateGuard; use self::config::AttachedLocationConfig; use self::config::AttachmentMode; @@ -69,6 +55,7 @@ use self::timeline::TimelineResources; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; +use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::TENANT_ACTIVATION; @@ -84,14 +71,35 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::InitializationOrder; +use std::cmp::min; +use std::collections::hash_map::Entry; +use std::collections::BTreeSet; +use std::collections::HashMap; +use std::collections::HashSet; +use std::fmt::Debug; +use std::fmt::Display; +use std::fs; +use std::fs::File; +use std::io; +use std::ops::Bound::Included; +use std::process::Stdio; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::sync::MutexGuard; +use std::sync::{Mutex, RwLock}; +use std::time::{Duration, Instant}; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; use crate::walredo::PostgresRedoManager; use crate::TEMP_FILE_SUFFIX; +use once_cell::sync::Lazy; pub use pageserver_api::models::TenantState; +use tokio::sync::Semaphore; +static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); use toml_edit; use utils::{ crashsafe, @@ -226,7 +234,7 @@ pub struct Tenant { // This is necessary to allow global config updates. 
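The new `INIT_DB_SEMAPHORE` static caps concurrency at 8, but this hunk does not show where the permit is taken, so the following is only an assumed usage sketch of throttling initdb-style work with `tokio::sync::Semaphore`; `run_initdb_stub` is hypothetical, and the tokio and once_cell crates are assumed:

    use once_cell::sync::Lazy;
    use std::time::Duration;
    use tokio::sync::Semaphore;

    // Same shape as the static added in tenant.rs: 8 concurrent slots.
    static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));

    // Hypothetical stand-in for the expensive per-timeline initdb work.
    async fn run_initdb_stub(id: u32) {
        tokio::time::sleep(Duration::from_millis(10)).await;
        println!("initdb {id} done");
    }

    #[tokio::main]
    async fn main() {
        let mut tasks = Vec::new();
        for id in 0..32 {
            tasks.push(tokio::spawn(async move {
                // At most 8 permits exist, so at most 8 runs execute at once.
                let _permit = INIT_DB_SEMAPHORE
                    .acquire()
                    .await
                    .expect("semaphore is never closed");
                run_initdb_stub(id).await;
            }));
        }
        for t in tasks {
            t.await.unwrap();
        }
    }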
tenant_conf: Arc>, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, /// The remote storage generation, used to protect S3 objects from split-brain. /// Does not change over the lifetime of the [`Tenant`] object. @@ -270,7 +278,7 @@ pub struct Tenant { impl std::fmt::Debug for Tenant { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{} ({})", self.tenant_id, self.current_state()) + write!(f, "{} ({})", self.tenant_shard_id, self.current_state()) } } @@ -403,6 +411,36 @@ pub enum CreateTimelineError { Other(#[from] anyhow::Error), } +#[derive(thiserror::Error, Debug)] +enum InitdbError { + Other(anyhow::Error), + Cancelled, + Spawn(std::io::Result<()>), + Failed(std::process::ExitStatus, Vec), +} + +impl fmt::Display for InitdbError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + InitdbError::Cancelled => write!(f, "Operation was cancelled"), + InitdbError::Spawn(e) => write!(f, "Spawn error: {:?}", e), + InitdbError::Failed(status, stderr) => write!( + f, + "Command failed with status {:?}: {}", + status, + String::from_utf8_lossy(stderr) + ), + InitdbError::Other(e) => write!(f, "Error: {:?}", e), + } + } +} + +impl From for InitdbError { + fn from(error: std::io::Error) -> Self { + InitdbError::Spawn(Err(error)) + } +} + struct TenantDirectoryScan { sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>, timelines_to_resume_deletion: Vec<(TimelineId, Option)>, @@ -434,7 +472,7 @@ impl Tenant { init_order: Option<&InitializationOrder>, _ctx: &RequestContext, ) -> anyhow::Result<()> { - let tenant_id = self.tenant_id; + let tenant_id = self.tenant_shard_id; let timeline = self.create_timeline_struct( timeline_id, @@ -526,7 +564,7 @@ impl Tenant { #[allow(clippy::too_many_arguments)] pub(crate) fn spawn( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, resources: TenantSharedResources, attached_conf: AttachedTenantConf, init_order: Option, @@ -534,8 +572,10 @@ impl Tenant { mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result> { + // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, tenant_id, + conf, + tenant_shard_id.tenant_id, ))); let TenantSharedResources { @@ -549,7 +589,7 @@ impl Tenant { conf, attached_conf, wal_redo_manager, - tenant_id, + tenant_shard_id, remote_storage.clone(), deletion_queue_client, )); @@ -561,7 +601,7 @@ impl Tenant { task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, - Some(tenant_id), + Some(tenant_shard_id.tenant_id), None, "attach tenant", false, @@ -600,7 +640,7 @@ impl Tenant { match tenant_clone .preload(remote_storage, task_mgr::shutdown_token()) .instrument( - tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_id), + tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()), ) .await { Ok(p) => p, @@ -682,7 +722,7 @@ impl Tenant { Ok(()) } .instrument({ - let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_id); + let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); span.follows_from(Span::current()); span }), @@ -700,7 +740,7 @@ impl Tenant { info!("listing remote timelines"); let (remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines( remote_storage, - self.tenant_id, + self.tenant_shard_id, cancel.clone(), ) 
.await?; @@ -733,7 +773,7 @@ impl Tenant { /// async fn attach( self: &Arc, - mut init_order: Option, + init_order: Option, preload: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -750,31 +790,37 @@ impl Tenant { } }; - // Signal that we have completed remote phase - init_order - .as_mut() - .and_then(|x| x.initial_tenant_load_remote.take()); - let mut timelines_to_resume_deletions = vec![]; let mut remote_index_and_client = HashMap::new(); let mut timeline_ancestors = HashMap::new(); let mut existent_timelines = HashSet::new(); for (timeline_id, preload) in preload.timelines { - // In this context a timeline "exists" if it has any content in remote storage: this will - // be our cue to not delete any corresponding local directory - existent_timelines.insert(timeline_id); - let index_part = match preload.index_part { Ok(i) => { debug!("remote index part exists for timeline {timeline_id}"); + // We found index_part on the remote, this is the standard case. + existent_timelines.insert(timeline_id); i } + Err(DownloadError::NotFound) => { + // There is no index_part on the remote. We only get here + // if there is some prefix for the timeline in the remote storage. + // This can e.g. be the initdb.tar.zst archive, maybe a + // remnant from a prior incomplete creation or deletion attempt. + // Delete the local directory as the deciding criterion for a + // timeline's existence is presence of index_part. + info!(%timeline_id, "index_part not found on remote"); + continue; + } Err(e) => { - // Timeline creation is not atomic: we might upload a layer but no index_part. We expect - // that the creation will be retried by the control plane and eventually result in - // a valid loadable state. + // Some (possibly ephemeral) error happened during index_part download. + // Pretend the timeline exists to not delete the timeline directory, + // as it might be a temporary issue and we don't want to re-download + // everything after it resolves. warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); + + existent_timelines.insert(timeline_id); continue; } }; @@ -817,7 +863,7 @@ impl Tenant { .with_context(|| { format!( "failed to load remote timeline {} for tenant {}", - timeline_id, self.tenant_id + timeline_id, self.tenant_shard_id ) })?; } @@ -857,7 +903,7 @@ impl Tenant { /// timeline that still exists: this can happen if we crashed during a deletion/creation, or /// if a timeline was deleted while the tenant was attached to a different pageserver. 
fn clean_up_timelines(&self, existent_timelines: &HashSet) -> anyhow::Result<()> { - let timelines_dir = self.conf.timelines_path(&self.tenant_id); + let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id); let entries = match timelines_dir.read_dir_utf8() { Ok(d) => d, @@ -943,7 +989,7 @@ impl Tenant { span::debug_assert_current_span_has_tenant_id(); info!("downloading index file for timeline {}", timeline_id); - tokio::fs::create_dir_all(self.conf.timeline_path(&self.tenant_id, &timeline_id)) + tokio::fs::create_dir_all(self.conf.timeline_path(&self.tenant_shard_id, &timeline_id)) .await .context("Failed to create new timeline directory")?; @@ -965,10 +1011,15 @@ impl Tenant { let init_order = None; // timeline loading after attach expects to find metadata file for each metadata - save_metadata(self.conf, &self.tenant_id, &timeline_id, &remote_metadata) - .await - .context("save_metadata") - .map_err(LoadLocalTimelineError::Load)?; + save_metadata( + self.conf, + &self.tenant_shard_id, + &timeline_id, + &remote_metadata, + ) + .await + .context("save_metadata") + .map_err(LoadLocalTimelineError::Load)?; self.timeline_init_and_sync( timeline_id, @@ -985,11 +1036,13 @@ impl Tenant { /// Create a placeholder Tenant object for a broken tenant pub fn create_broken_tenant( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, reason: String, ) -> Arc { + // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, tenant_id, + conf, + tenant_shard_id.tenant_id, ))); Arc::new(Tenant::new( TenantState::Broken { @@ -999,7 +1052,7 @@ impl Tenant { conf, AttachedTenantConf::try_from(LocationConf::default()).unwrap(), wal_redo_manager, - tenant_id, + tenant_shard_id, None, DeletionQueueClient::broken(), )) @@ -1012,7 +1065,7 @@ impl Tenant { // completed in non topological order (for example because parent has smaller number of layer files in it) let mut timelines_to_resume_deletion: Vec<(TimelineId, Option)> = vec![]; - let timelines_dir = self.conf.timelines_path(&self.tenant_id); + let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id); for entry in timelines_dir .read_dir_utf8() @@ -1043,7 +1096,7 @@ impl Tenant { "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}", ) })?; - let timeline_dir = self.conf.timeline_path(&self.tenant_id, &timeline_id); + let timeline_dir = self.conf.timeline_path(&self.tenant_shard_id, &timeline_id); if let Err(e) = remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) { @@ -1060,7 +1113,7 @@ impl Tenant { info!("Found deletion mark for timeline {}", timeline_id); - match load_metadata(self.conf, &self.tenant_id, &timeline_id) { + match load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) { Ok(metadata) => { timelines_to_resume_deletion.push((timeline_id, Some(metadata))) } @@ -1104,7 +1157,7 @@ impl Tenant { })?; let timeline_uninit_mark_file = self .conf - .timeline_uninit_mark_file_path(self.tenant_id, timeline_id); + .timeline_uninit_mark_file_path(self.tenant_shard_id, timeline_id); if timeline_uninit_mark_file.exists() { info!( %timeline_id, @@ -1120,7 +1173,7 @@ impl Tenant { let timeline_delete_mark_file = self .conf - .timeline_delete_mark_file_path(self.tenant_id, timeline_id); + .timeline_delete_mark_file_path(self.tenant_shard_id, timeline_id); if timeline_delete_mark_file.exists() { // Cleanup should be done in `is_delete_mark` branch above 
continue; @@ -1128,7 +1181,7 @@ impl Tenant { let file_name = entry.file_name(); if let Ok(timeline_id) = file_name.parse::() { - let metadata = load_metadata(self.conf, &self.tenant_id, &timeline_id) + let metadata = load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) .context("failed to load metadata")?; timelines_to_load.insert(timeline_id, metadata); } else { @@ -1160,7 +1213,7 @@ impl Tenant { remote_storage.clone(), self.deletion_queue_client.clone(), self.conf, - self.tenant_id, + self.tenant_shard_id, timeline_id, self.generation, ); @@ -1366,8 +1419,12 @@ impl Tenant { .map_err(LoadLocalTimelineError::Load) } - pub fn tenant_id(&self) -> TenantId { - self.tenant_id + pub(crate) fn tenant_id(&self) -> TenantId { + self.tenant_shard_id.tenant_id + } + + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { + self.tenant_shard_id } /// Get Timeline handle for given Neon timeline ID. @@ -1381,13 +1438,13 @@ impl Tenant { let timeline = timelines_accessor .get(&timeline_id) .ok_or(GetTimelineError::NotFound { - tenant_id: self.tenant_id, + tenant_id: self.tenant_shard_id.tenant_id, timeline_id, })?; if active_only && !timeline.is_active() { Err(GetTimelineError::NotActive { - tenant_id: self.tenant_id, + tenant_id: self.tenant_shard_id.tenant_id, timeline_id, state: timeline.current_state(), }) @@ -1513,12 +1570,14 @@ impl Tenant { /// /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists. + #[allow(clippy::too_many_arguments)] pub async fn create_timeline( &self, new_timeline_id: TimelineId, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, + load_existing_initdb: Option, broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> Result, CreateTimelineError> { @@ -1593,7 +1652,7 @@ impl Tenant { .await? } None => { - self.bootstrap_timeline(new_timeline_id, pg_version, ctx) + self.bootstrap_timeline(new_timeline_id, pg_version, load_existing_initdb, ctx) .await? } }; @@ -1745,7 +1804,7 @@ impl Tenant { *current_state = TenantState::Activating(ActivatingFrom::Attaching); } } - debug!(tenant_id = %self.tenant_id, "Activating tenant"); + debug!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), "Activating tenant"); activating = true; // Continue outside the closure. We need to grab timelines.lock() // and we plan to turn it into a tokio::sync::Mutex in a future patch. @@ -1782,7 +1841,8 @@ impl Tenant { // times to activate. see https://github.com/neondatabase/neon/issues/4025 info!( since_creation_millis = elapsed.as_millis(), - tenant_id = %self.tenant_id, + tenant_id = %self.tenant_shard_id.tenant_id, + shard_id = %self.tenant_shard_id.shard_slug(), activated_timelines, total_timelines, post_state = <&'static str>::from(&*current_state), @@ -1879,7 +1939,7 @@ impl Tenant { // // this will additionally shutdown and await all timeline tasks. 
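Many call sites in this change replace a single `tenant_id` span field with the `tenant_id` plus `shard_id` pair derived from `TenantShardId::shard_slug()`. A small sketch of attaching such fields to a tracing span, with placeholder string values standing in for the real id types (assumes the tracing and tracing-subscriber crates):

    use tracing::{info, info_span};

    fn main() {
        tracing_subscriber::fmt().init();

        // Placeholder values; in the pageserver these come from TenantShardId.
        let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c";
        let shard_id = "0001";
        let timeline_id = "be322c834ed9e709e63b5c9698691910";

        let span = info_span!(
            "attach",
            tenant_id = %tenant_id,
            shard_id = %shard_id,
            %timeline_id,
        );
        let _guard = span.enter();

        // Every event inside the span carries the tenant/shard/timeline fields.
        info!("attach starting");
    }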
tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await; + task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id.tenant_id), None).await; // Wait for any in-flight operations to complete self.gate.close().await; @@ -2054,7 +2114,7 @@ impl Tenant { receiver.changed().await.map_err( |_e: tokio::sync::watch::error::RecvError| // Tenant existed but was dropped: report it as non-existent - GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_id)) + GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) )?; } TenantState::Active { .. } => { @@ -2128,9 +2188,6 @@ where } impl Tenant { - pub fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } pub fn tenant_specific_overrides(&self) -> TenantConfOpt { self.tenant_conf.read().unwrap().tenant_conf } @@ -2280,7 +2337,7 @@ impl Tenant { new_metadata, ancestor, new_timeline_id, - self.tenant_id, + self.tenant_shard_id, self.generation, Arc::clone(&self.walredo_mgr), resources, @@ -2302,14 +2359,14 @@ impl Tenant { conf: &'static PageServerConf, attached_conf: AttachedTenantConf, walredo_mgr: Arc, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, remote_storage: Option, deletion_queue_client: DeletionQueueClient, ) -> Tenant { let (state, mut rx) = watch::channel(state); tokio::spawn(async move { - let tid = tenant_id.to_string(); + let tid = tenant_shard_id.to_string(); fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) { ([state.into()], matches!(state, TenantState::Broken { .. })) @@ -2361,7 +2418,7 @@ impl Tenant { }); Tenant { - tenant_id, + tenant_shard_id, generation: attached_conf.location.generation, conf, // using now here is good enough approximation to catch tenants with really long @@ -2379,17 +2436,17 @@ impl Tenant { eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), cancel: CancellationToken::default(), - gate: Gate::new(format!("Tenant<{tenant_id}>")), + gate: Gate::new(format!("Tenant<{tenant_shard_id}>")), } } /// Locate and load config pub(super) fn load_tenant_config( conf: &'static PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> anyhow::Result { - let legacy_config_path = conf.tenant_config_path(tenant_id); - let config_path = conf.tenant_location_config_path(tenant_id); + let legacy_config_path = conf.tenant_config_path(tenant_shard_id); + let config_path = conf.tenant_location_config_path(tenant_shard_id); if config_path.exists() { // New-style config takes precedence @@ -2403,9 +2460,7 @@ impl Tenant { for (key, item) in deserialized.iter() { match key { "tenant_config" => { - tenant_conf = PageServerConf::parse_toml_tenant_conf(item).with_context(|| { - format!("Failed to parse config from file '{legacy_config_path}' as pageserver config") - })?; + tenant_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("Failed to parse config from file '{legacy_config_path}' as pageserver config"))?; } _ => bail!( "config file {legacy_config_path} has unrecognized pageserver option '{key}'" @@ -2443,29 +2498,34 @@ impl Tenant { .with_context(|| format!("Failed to parse config from file '{path}' as toml file")) } - #[tracing::instrument(skip_all, fields(%tenant_id))] + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(super) async fn persist_tenant_config( conf: &'static 
PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, location_conf: &LocationConf, ) -> anyhow::Result<()> { - let legacy_config_path = conf.tenant_config_path(tenant_id); - let config_path = conf.tenant_location_config_path(tenant_id); - - Self::persist_tenant_config_at(tenant_id, &config_path, &legacy_config_path, location_conf) - .await + let legacy_config_path = conf.tenant_config_path(tenant_shard_id); + let config_path = conf.tenant_location_config_path(tenant_shard_id); + + Self::persist_tenant_config_at( + tenant_shard_id, + &config_path, + &legacy_config_path, + location_conf, + ) + .await } - #[tracing::instrument(skip_all, fields(%tenant_id))] + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(super) async fn persist_tenant_config_at( - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, config_path: &Utf8Path, legacy_config_path: &Utf8Path, location_conf: &LocationConf, ) -> anyhow::Result<()> { // Forward compat: write out an old-style configuration that old versions can read, in case we roll back Self::persist_tenant_config_legacy( - tenant_id, + tenant_shard_id, legacy_config_path, &location_conf.tenant_conf, ) @@ -2492,14 +2552,16 @@ impl Tenant { let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX); - let tenant_id = *tenant_id; + let tenant_shard_id = *tenant_shard_id; let config_path = config_path.to_owned(); tokio::task::spawn_blocking(move || { Handle::current().block_on(async move { let conf_content = conf_content.as_bytes(); VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content) .await - .with_context(|| format!("write tenant {tenant_id} config to {config_path}")) + .with_context(|| { + format!("write tenant {tenant_shard_id} config to {config_path}") + }) }) }) .await??; @@ -2507,9 +2569,9 @@ impl Tenant { Ok(()) } - #[tracing::instrument(skip_all, fields(%tenant_id))] + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] async fn persist_tenant_config_legacy( - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, target_config_path: &Utf8Path, tenant_conf: &TenantConfOpt, ) -> anyhow::Result<()> { @@ -2527,7 +2589,7 @@ impl Tenant { let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX); - let tenant_id = *tenant_id; + let tenant_shard_id = *tenant_shard_id; let target_config_path = target_config_path.to_owned(); tokio::task::spawn_blocking(move || { Handle::current().block_on(async move { @@ -2535,7 +2597,7 @@ impl Tenant { VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content) .await .with_context(|| { - format!("write tenant {tenant_id} config to {target_config_path}") + format!("write tenant {tenant_shard_id} config to {target_config_path}") }) }) }) @@ -2611,14 +2673,12 @@ impl Tenant { // Perform GC for each timeline. // - // Note that we don't hold the GC lock here because we don't want - // to delay the branch creation task, which requires the GC lock. - // A timeline GC iteration can be slow because it may need to wait for - // compaction (both require `layer_removal_cs` lock), - // but the GC iteration can run concurrently with branch creation. + // Note that we don't hold the `Tenant::gc_cs` lock here because we don't want to delay the + // branch creation task, which requires the GC lock. A GC iteration can run concurrently + // with branch creation. 
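`persist_tenant_config_at` above writes the new config through a temporary path (`TEMP_FILE_SUFFIX`) and `VirtualFile::crashsafe_overwrite`. A generic sketch of that write-temp-then-rename idea with plain tokio file I/O, not the pageserver's `VirtualFile`; a fully durable version would also fsync the parent directory, which is omitted here:

    use std::path::Path;
    use tokio::fs;
    use tokio::io::AsyncWriteExt;

    /// Write `content` to `path` so that a crash leaves either the old or the
    /// new file, never a torn one: write a temp file, flush it, then rename.
    async fn crashsafe_overwrite_sketch(path: &Path, content: &[u8]) -> std::io::Result<()> {
        let tmp = path.with_extension("___temp");

        let mut f = fs::File::create(&tmp).await?;
        f.write_all(content).await?;
        f.sync_all().await?; // make sure the bytes hit disk before the rename
        drop(f);

        // Atomic on POSIX filesystems: readers see either the old or the new file.
        fs::rename(&tmp, path).await?;
        Ok(())
    }

    #[tokio::main]
    async fn main() -> std::io::Result<()> {
        crashsafe_overwrite_sketch(
            Path::new("/tmp/tenant-config-example"),
            b"[tenant_config]\n",
        )
        .await
    }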
// - // See comments in [`Tenant::branch_timeline`] for more information - // about why branch creation task can run concurrently with timeline's GC iteration. + // See comments in [`Tenant::branch_timeline`] for more information about why branch + // creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { if task_mgr::is_shutdown_requested() || cancel.is_cancelled() { // We were requested to shut down. Stop and return with the progress we @@ -2901,10 +2961,11 @@ impl Tenant { /// - after initialization completes, tar up the temp dir and upload it to S3. /// /// The caller is responsible for activating the returned timeline. - async fn bootstrap_timeline( + pub(crate) async fn bootstrap_timeline( &self, timeline_id: TimelineId, pg_version: u32, + load_existing_initdb: Option, ctx: &RequestContext, ) -> anyhow::Result> { let timeline_uninit_mark = { @@ -2913,55 +2974,79 @@ impl Tenant { }; // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. - let initdb_path = path_with_suffix_extension( + let pgdata_path = path_with_suffix_extension( self.conf - .timelines_path(&self.tenant_id) + .timelines_path(&self.tenant_shard_id) .join(format!("basebackup-{timeline_id}")), TEMP_FILE_SUFFIX, ); // an uninit mark was placed before, nothing else can access this timeline files // current initdb was not run yet, so remove whatever was left from the previous runs - if initdb_path.exists() { - fs::remove_dir_all(&initdb_path).with_context(|| { - format!("Failed to remove already existing initdb directory: {initdb_path}") + if pgdata_path.exists() { + fs::remove_dir_all(&pgdata_path).with_context(|| { + format!("Failed to remove already existing initdb directory: {pgdata_path}") })?; } - // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path - run_initdb(self.conf, &initdb_path, pg_version)?; // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it scopeguard::defer! 
{ - if let Err(e) = fs::remove_dir_all(&initdb_path) { + if let Err(e) = fs::remove_dir_all(&pgdata_path) { // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call - error!("Failed to remove temporary initdb directory '{initdb_path}': {e}"); + error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}"); } } - let pgdata_path = &initdb_path; - let pgdata_lsn = import_datadir::get_lsn_from_controlfile(pgdata_path)?.align(); - - // Upload the created data dir to S3 - if let Some(storage) = &self.remote_storage { - let pgdata_zstd = import_datadir::create_tar_zst(pgdata_path).await?; - let pgdata_zstd = Bytes::from(pgdata_zstd); - backoff::retry( - || async { - self::remote_timeline_client::upload_initdb_dir( - storage, - &self.tenant_id, - &timeline_id, - pgdata_zstd.clone(), - ) + if let Some(existing_initdb_timeline_id) = load_existing_initdb { + let Some(storage) = &self.remote_storage else { + bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}"); + }; + let (initdb_tar_zst_path, initdb_tar_zst) = + self::remote_timeline_client::download_initdb_tar_zst( + self.conf, + storage, + &self.tenant_shard_id, + &existing_initdb_timeline_id, + ) + .await + .context("download initdb tar")?; + let buf_read = Box::pin(BufReader::new(initdb_tar_zst)); + import_datadir::extract_tar_zst(&pgdata_path, buf_read) + .await + .context("extract initdb tar")?; + + if initdb_tar_zst_path.exists() { + tokio::fs::remove_file(&initdb_tar_zst_path) .await - }, - |_| false, - 3, - u32::MAX, - "persist_initdb_tar_zst", - // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) - backoff::Cancel::new(CancellationToken::new(), || unreachable!()), - ) - .await?; + .context("tempfile removal")?; + } + } else { + // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path + run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; + + // Upload the created data dir to S3 + if let Some(storage) = &self.remote_storage { + let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?; + let pgdata_zstd = Bytes::from(pgdata_zstd); + backoff::retry( + || async { + self::remote_timeline_client::upload_initdb_dir( + storage, + &self.tenant_shard_id.tenant_id, + &timeline_id, + pgdata_zstd.clone(), + ) + .await + }, + |_| false, + 3, + u32::MAX, + "persist_initdb_tar_zst", + // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) + backoff::Cancel::new(CancellationToken::new(), || unreachable!()), + ) + .await?; + } } + let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); // Import the contents of the data directory at the initial checkpoint // LSN, and any WAL after that. 
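In the bootstrap path above, when `load_existing_initdb` is unset the freshly built data directory is compressed and uploaded through the crate-internal `backoff::retry` helper (cancellation is still a TODO there). Below is a minimal, self-contained sketch of the same bounded-retry-with-backoff shape; `retry_with_backoff`, the attempt cap, and the fake upload closure are illustrative stand-ins, not Neon's actual API.

```rust
use std::sync::Arc;
use std::time::Duration;

/// Retry an async, fallible operation with capped exponential backoff.
/// This mimics the shape of the crate-internal `utils::backoff::retry`
/// helper used above, minus the cancellation hook.
async fn retry_with_backoff<T, E, F, Fut>(mut op: F, max_attempts: u32) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
    E: std::fmt::Display,
{
    let mut delay = Duration::from_millis(100);
    let mut attempt = 0u32;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) if attempt + 1 < max_attempts => {
                attempt += 1;
                eprintln!("attempt {attempt} failed: {e}, retrying in {delay:?}");
                tokio::time::sleep(delay).await;
                delay = (delay * 2).min(Duration::from_secs(10));
            }
            Err(e) => return Err(e),
        }
    }
}

#[tokio::main]
async fn main() {
    // Hypothetical stand-in for the compressed initdb archive; the real code
    // holds it as `Bytes` and clones the cheap handle on every attempt.
    let payload: Arc<Vec<u8>> = Arc::new(b"pgdata.tar.zst bytes".to_vec());

    let result = retry_with_backoff(
        || {
            let body = payload.clone(); // cheap per-attempt handle, moved into the future
            async move {
                // A real implementation would call the remote storage client here.
                let _ = body.len();
                Ok::<_, std::io::Error>(())
            }
        },
        3,
    )
    .await;
    assert!(result.is_ok());
}
```

Cloning a cheap handle (`Arc` here, `Bytes` in the diff) inside the closure and moving it into an `async move` block keeps each attempt's future owned, which is what an `FnMut() -> impl Future` style retry helper needs.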
@@ -2986,18 +3071,18 @@ impl Tenant { ) .await?; - let tenant_id = raw_timeline.owning_tenant.tenant_id; + let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id; let unfinished_timeline = raw_timeline.raw_timeline()?; import_datadir::import_timeline_from_postgres_datadir( unfinished_timeline, - pgdata_path, + &pgdata_path, pgdata_lsn, ctx, ) .await .with_context(|| { - format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}") + format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}") })?; // Flush the new layer files to disk, before we make the timeline as available to @@ -3015,7 +3100,7 @@ impl Tenant { .await .with_context(|| { format!( - "Failed to flush after pgdatadir import for timeline {tenant_id}/{timeline_id}" + "Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}" ) })?; @@ -3038,7 +3123,7 @@ impl Tenant { remote_storage.clone(), self.deletion_queue_client.clone(), self.conf, - self.tenant_id, + self.tenant_shard_id, timeline_id, self.generation, ); @@ -3067,7 +3152,7 @@ impl Tenant { start_lsn: Lsn, ancestor: Option>, ) -> anyhow::Result { - let tenant_id = self.tenant_id; + let tenant_shard_id = self.tenant_shard_id; let resources = self.build_timeline_resources(new_timeline_id); if let Some(remote_client) = &resources.remote_client { @@ -3091,12 +3176,14 @@ impl Tenant { .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata) .await { - error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}"); + error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); cleanup_timeline_directory(uninit_mark); return Err(e); } - debug!("Successfully created initial files for timeline {tenant_id}/{new_timeline_id}"); + debug!( + "Successfully created initial files for timeline {tenant_shard_id}/{new_timeline_id}" + ); Ok(UninitializedTimeline::new( self, @@ -3117,9 +3204,14 @@ impl Tenant { anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); }); - save_metadata(self.conf, &self.tenant_id, new_timeline_id, new_metadata) - .await - .context("Failed to create timeline metadata")?; + save_metadata( + self.conf, + &self.tenant_shard_id, + new_timeline_id, + new_metadata, + ) + .await + .context("Failed to create timeline metadata")?; Ok(()) } @@ -3132,13 +3224,13 @@ impl Tenant { timeline_id: TimelineId, timelines: &MutexGuard>>, ) -> anyhow::Result { - let tenant_id = self.tenant_id; + let tenant_shard_id = self.tenant_shard_id; anyhow::ensure!( timelines.get(&timeline_id).is_none(), - "Timeline {tenant_id}/{timeline_id} already exists in pageserver's memory" + "Timeline {tenant_shard_id}/{timeline_id} already exists in pageserver's memory" ); - let timeline_path = self.conf.timeline_path(&tenant_id, &timeline_id); + let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); anyhow::ensure!( !timeline_path.exists(), "Timeline {timeline_path} already exists, cannot create its uninit mark file", @@ -3146,7 +3238,7 @@ impl Tenant { let uninit_mark_path = self .conf - .timeline_uninit_mark_file_path(tenant_id, timeline_id); + .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id); fs::OpenOptions::new() .write(true) .create_new(true) @@ -3157,7 +3249,7 @@ impl Tenant { .context("Failed to fsync uninit mark file") }) .with_context(|| { - format!("Failed to crate uninit mark for timeline {tenant_id}/{timeline_id}") + format!("Failed to crate uninit mark for 
timeline {tenant_shard_id}/{timeline_id}") })?; let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path); @@ -3168,7 +3260,7 @@ impl Tenant { /// Gathers inputs from all of the timelines to produce a sizing model input. /// /// Future is cancellation safe. Only one calculation can be running at once per tenant. - #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub async fn gather_size_inputs( &self, // `max_retention_period` overrides the cutoff that is used to calculate the size @@ -3207,7 +3299,7 @@ impl Tenant { /// Calculate synthetic tenant size and cache the result. /// This is periodically called by background worker. /// result is cached in tenant struct - #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub async fn calculate_synthetic_size( &self, cause: LogicalSizeCalculationCause, @@ -3229,7 +3321,7 @@ impl Tenant { .store(size, Ordering::Relaxed); TENANT_SYNTHETIC_SIZE_METRIC - .get_metric_with_label_values(&[&self.tenant_id.to_string()]) + .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) .unwrap() .set(size); } @@ -3237,6 +3329,66 @@ impl Tenant { pub fn cached_synthetic_size(&self) -> u64 { self.cached_synthetic_tenant_size.load(Ordering::Relaxed) } + + /// Flush any in-progress layers, schedule uploads, and wait for uploads to complete. + /// + /// This function can take a long time: callers should wrap it in a timeout if calling + /// from an external API handler. + /// + /// Cancel-safety: cancelling this function may leave I/O running, but such I/O is + /// still bounded by tenant/timeline shutdown. + #[tracing::instrument(skip_all)] + pub(crate) async fn flush_remote(&self) -> anyhow::Result<()> { + let timelines = self.timelines.lock().unwrap().clone(); + + async fn flush_timeline(_gate: GateGuard, timeline: Arc) -> anyhow::Result<()> { + tracing::info!(timeline_id=%timeline.timeline_id, "Flushing..."); + timeline.freeze_and_flush().await?; + tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads..."); + if let Some(client) = &timeline.remote_client { + client.wait_completion().await?; + } + + Ok(()) + } + + // We do not use a JoinSet for these tasks, because we don't want them to be + // aborted when this function's future is cancelled: they should stay alive + // holding their GateGuard until they complete, to ensure their I/Os complete + // before Timeline shutdown completes. + let mut results = FuturesUnordered::new(); + + for (_timeline_id, timeline) in timelines { + // Run each timeline's flush in a task holding the timeline's gate: this + // means that if this function's future is cancelled, the Timeline shutdown + // will still wait for any I/O in here to complete. + let gate = match timeline.gate.enter() { + Ok(g) => g, + Err(_) => continue, + }; + let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await }); + results.push(jh); + } + + while let Some(r) = results.next().await { + if let Err(e) = r { + if !e.is_cancelled() && !e.is_panic() { + tracing::error!("unexpected join error: {e:?}"); + } + } + } + + // The flushes we did above were just writes, but the Tenant might have had + // pending deletions as well from recent compaction/gc: we want to flush those + // as well. 
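The new `Tenant::flush_remote` above spawns one task per timeline and drains the `JoinHandle`s through a `FuturesUnordered` rather than a `JoinSet`, precisely so that cancelling the caller does not abort the in-flight flushes; each task holds a gate guard so timeline shutdown still waits for its I/O. A much-simplified sketch of that shape, with a `Semaphore` permit standing in for the timeline `Gate` guard and a sleep standing in for freeze/flush/upload (all names here are illustrative):

```rust
use std::sync::Arc;
use std::time::Duration;

use futures::stream::{FuturesUnordered, StreamExt};
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

// Stand-in for one timeline's freeze/flush plus wait-for-uploads; the permit
// plays the role of the Timeline gate guard and is held until we are done.
async fn flush_one(_gate_guard: OwnedSemaphorePermit, id: u32) -> anyhow::Result<()> {
    tokio::time::sleep(Duration::from_millis(50)).await;
    println!("timeline {id} flushed and uploaded");
    Ok(())
}

async fn flush_all(gate: Arc<Semaphore>, ids: Vec<u32>) -> anyhow::Result<()> {
    // Spawned tasks keep running even if this function's future is dropped:
    // we only await the JoinHandles, we never abort them.
    let mut results = FuturesUnordered::new();
    for id in ids {
        let Ok(guard) = gate.clone().acquire_owned().await else {
            continue; // gate closed: the timeline is shutting down, skip it
        };
        results.push(tokio::spawn(flush_one(guard, id)));
    }

    while let Some(joined) = results.next().await {
        match joined {
            Ok(Ok(())) => {}
            Ok(Err(e)) => eprintln!("flush failed: {e:#}"),
            Err(e) if !e.is_cancelled() && !e.is_panic() => {
                eprintln!("unexpected join error: {e:?}")
            }
            Err(_) => {}
        }
    }
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let gate = Arc::new(Semaphore::new(16));
    // Callers bound the wait themselves, like the flush timeout in `upsert_location`.
    tokio::time::timeout(Duration::from_secs(10), flush_all(gate, vec![1, 2, 3])).await??;
    Ok(())
}
```

As the doc comment on `flush_remote` says, callers are expected to bound the wait; `upsert_location` later in this diff does so with `tokio::time::timeout`, mirrored in `main` above.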
This requires flushing the global delete queue. This is cheap + // because it's typically a no-op. + match self.deletion_queue_client.flush_execute().await { + Ok(_) => {} + Err(DeletionQueueError::ShuttingDown) => {} + } + + Ok(()) + } } fn remove_timeline_and_uninit_mark( @@ -3265,9 +3417,9 @@ fn remove_timeline_and_uninit_mark( pub(crate) async fn create_tenant_files( conf: &'static PageServerConf, location_conf: &LocationConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> anyhow::Result { - let target_tenant_directory = conf.tenant_path(tenant_id); + let target_tenant_directory = conf.tenant_path(tenant_shard_id); anyhow::ensure!( !target_tenant_directory .try_exists() @@ -3287,14 +3439,16 @@ pub(crate) async fn create_tenant_files( let creation_result = try_create_target_tenant_dir( conf, location_conf, - tenant_id, + tenant_shard_id, &temporary_tenant_dir, &target_tenant_directory, ) .await; if creation_result.is_err() { - error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data"); + error!( + "Failed to create directory structure for tenant {tenant_shard_id}, cleaning tmp data" + ); if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) { error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}") } else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) { @@ -3312,31 +3466,31 @@ pub(crate) async fn create_tenant_files( async fn try_create_target_tenant_dir( conf: &'static PageServerConf, location_conf: &LocationConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, temporary_tenant_dir: &Utf8Path, target_tenant_directory: &Utf8Path, ) -> Result<(), anyhow::Error> { let temporary_tenant_timelines_dir = rebase_directory( - &conf.timelines_path(tenant_id), + &conf.timelines_path(tenant_shard_id), target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?; + .with_context(|| format!("resolve tenant {tenant_shard_id} temporary timelines dir"))?; let temporary_legacy_tenant_config_path = rebase_directory( - &conf.tenant_config_path(tenant_id), + &conf.tenant_config_path(tenant_shard_id), target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; + .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?; let temporary_tenant_config_path = rebase_directory( - &conf.tenant_location_config_path(tenant_id), + &conf.tenant_location_config_path(tenant_shard_id), target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; + .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?; Tenant::persist_tenant_config_at( - tenant_id, + tenant_shard_id, &temporary_tenant_config_path, &temporary_legacy_tenant_config_path, location_conf, @@ -3346,7 +3500,7 @@ async fn try_create_target_tenant_dir( crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( "create tenant {} temporary timelines directory {}", - tenant_id, temporary_tenant_timelines_dir, + tenant_shard_id, temporary_tenant_timelines_dir, ) })?; fail::fail_point!("tenant-creation-before-tmp-rename", |_| { @@ -3361,19 +3515,19 @@ async fn try_create_target_tenant_dir( fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( "move tenant {} temporary directory {} into the permanent one {}", - tenant_id, temporary_tenant_dir, 
target_tenant_directory + tenant_shard_id, temporary_tenant_dir, target_tenant_directory ) })?; let target_dir_parent = target_tenant_directory.parent().with_context(|| { format!( "get tenant {} dir parent for {}", - tenant_id, target_tenant_directory, + tenant_shard_id, target_tenant_directory, ) })?; crashsafe::fsync(target_dir_parent).with_context(|| { format!( "fsync renamed directory's parent {} for tenant {}", - target_dir_parent, tenant_id, + target_dir_parent, tenant_shard_id, ) })?; @@ -3396,42 +3550,54 @@ fn rebase_directory( /// Create the cluster temporarily in 'initdbpath' directory inside the repository /// to get bootstrap data for timeline initialization. -fn run_initdb( +async fn run_initdb( conf: &'static PageServerConf, initdb_target_dir: &Utf8Path, pg_version: u32, -) -> anyhow::Result<()> { - let initdb_bin_path = conf.pg_bin_dir(pg_version)?.join("initdb"); - let initdb_lib_dir = conf.pg_lib_dir(pg_version)?; + cancel: &CancellationToken, +) -> Result<(), InitdbError> { + let initdb_bin_path = conf + .pg_bin_dir(pg_version) + .map_err(InitdbError::Other)? + .join("initdb"); + let initdb_lib_dir = conf.pg_lib_dir(pg_version).map_err(InitdbError::Other)?; info!( "running {} in {}, libdir: {}", initdb_bin_path, initdb_target_dir, initdb_lib_dir, ); - let initdb_output = Command::new(&initdb_bin_path) + let _permit = INIT_DB_SEMAPHORE.acquire().await; + + let initdb_command = tokio::process::Command::new(&initdb_bin_path) .args(["-D", initdb_target_dir.as_ref()]) .args(["-U", &conf.superuser]) .args(["-E", "utf8"]) .arg("--no-instructions") - // This is only used for a temporary installation that is deleted shortly after, - // so no need to fsync it .arg("--no-sync") .env_clear() .env("LD_LIBRARY_PATH", &initdb_lib_dir) .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) - .stdout(Stdio::null()) - .output() - .with_context(|| { - format!( - "failed to execute {} at target dir {}", - initdb_bin_path, initdb_target_dir, - ) - })?; - if !initdb_output.status.success() { - bail!( - "initdb failed: '{}'", - String::from_utf8_lossy(&initdb_output.stderr) - ); + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + // If the `select!` below doesn't finish the `wait_with_output`, + // let the task get `wait()`ed for asynchronously by tokio. + // This means there is a slim chance we can go over the INIT_DB_SEMAPHORE. + // TODO: fix for this is non-trivial, see + // https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021 + // + .kill_on_drop(true) + .spawn()?; + + tokio::select! { + initdb_output = initdb_command.wait_with_output() => { + let initdb_output = initdb_output?; + if !initdb_output.status.success() { + return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr)); + } + } + _ = cancel.cancelled() => { + return Err(InitdbError::Cancelled); + } } Ok(()) @@ -3439,7 +3605,7 @@ fn run_initdb( impl Drop for Tenant { fn drop(&mut self) { - remove_tenant_metrics(&self.tenant_id); + remove_tenant_metrics(&self.tenant_shard_id.tenant_id); } } /// Dump contents of a layer file to stdout. 
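`run_initdb` is now fully async: the child is spawned via `tokio::process::Command` with `kill_on_drop(true)`, completion is raced against a `CancellationToken` in `tokio::select!`, and a semaphore bounds concurrent runs. A condensed, self-contained sketch of that pattern follows; the command, error enum, and limiter size are placeholders rather than the real `initdb` invocation.

```rust
use std::process::Stdio;

use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;

#[derive(Debug)]
enum RunError {
    Spawn(std::io::Error),
    Failed(std::process::ExitStatus, Vec<u8>),
    Cancelled,
}

async fn run_cancellable(limiter: &Semaphore, cancel: &CancellationToken) -> Result<(), RunError> {
    // Bound how many of these child processes may run concurrently.
    let _permit = limiter.acquire().await.expect("semaphore is never closed");

    // Placeholder command: the real code execs `initdb -D ... --no-sync` with a
    // cleared environment and explicit library paths.
    let child = tokio::process::Command::new("true")
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        // If the select! below is cancelled before the child finishes, dropping
        // the wait future kills the child instead of leaking it.
        .kill_on_drop(true)
        .spawn()
        .map_err(RunError::Spawn)?;

    tokio::select! {
        output = child.wait_with_output() => {
            let output = output.map_err(RunError::Spawn)?;
            if !output.status.success() {
                return Err(RunError::Failed(output.status, output.stderr));
            }
            Ok(())
        }
        _ = cancel.cancelled() => Err(RunError::Cancelled),
    }
}

#[tokio::main]
async fn main() {
    let limiter = Semaphore::new(8);
    let cancel = CancellationToken::new();
    println!("placeholder initdb run: {:?}", run_cancellable(&limiter, &cancel).await);
}
```

Because `wait_with_output` consumes the child, losing the race to the cancellation branch drops that future and `kill_on_drop` reaps the process; as the TODO in the diff notes, the asynchronous reaping leaves a slim chance of briefly exceeding the semaphore.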
@@ -3477,6 +3643,7 @@ pub async fn dump_layerfile_from_path( pub(crate) mod harness { use bytes::{Bytes, BytesMut}; use once_cell::sync::OnceCell; + use pageserver_api::shard::ShardIndex; use std::fs; use std::sync::Arc; use utils::logging; @@ -3541,8 +3708,11 @@ pub(crate) mod harness { pub struct TenantHarness { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, - pub tenant_id: TenantId, + // TODO(sharding): remove duplicative `tenant_id` in favor of access to tenant_shard_id + pub(crate) tenant_id: TenantId, + pub tenant_shard_id: TenantShardId, pub generation: Generation, + pub shard: ShardIndex, pub remote_storage: GenericRemoteStorage, pub remote_fs_dir: Utf8PathBuf, pub deletion_queue: MockDeletionQueue, @@ -3585,8 +3755,9 @@ pub(crate) mod harness { }; let tenant_id = TenantId::generate(); - fs::create_dir_all(conf.tenant_path(&tenant_id))?; - fs::create_dir_all(conf.timelines_path(&tenant_id))?; + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?; + fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; let remote_fs_dir = conf.workdir.join("localfs"); @@ -3601,7 +3772,9 @@ pub(crate) mod harness { conf, tenant_conf, tenant_id, + tenant_shard_id, generation: Generation::new(0xdeadbeef), + shard: ShardIndex::unsharded(), remote_storage, remote_fs_dir, deletion_queue, @@ -3619,7 +3792,7 @@ pub(crate) mod harness { } fn remote_empty(&self) -> bool { - let tenant_path = self.conf.tenant_path(&self.tenant_id); + let tenant_path = self.conf.tenant_path(&self.tenant_shard_id); let remote_tenant_dir = self .remote_fs_dir .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap()); @@ -3659,7 +3832,7 @@ pub(crate) mod harness { )) .unwrap(), walredo_mgr, - self.tenant_id, + self.tenant_shard_id, Some(self.remote_storage.clone()), self.deletion_queue.new_client(), )); @@ -3668,17 +3841,17 @@ pub(crate) mod harness { LoadMode::Local => { tenant .load_local(None, ctx) - .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) + .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; } LoadMode::Remote => { let preload = tenant .preload(&self.remote_storage, CancellationToken::new()) - .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_id)) + .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; tenant .attach(None, Some(preload), ctx) - .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) + .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) .await?; } } @@ -3712,7 +3885,7 @@ pub(crate) mod harness { } pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf { - self.conf.timeline_path(&self.tenant_id, timeline_id) + self.conf.timeline_path(&self.tenant_shard_id, timeline_id) } } @@ -3828,7 +4001,7 @@ mod tests { e.to_string(), format!( "Timeline {}/{} already exists in pageserver's memory", - tenant.tenant_id, TIMELINE_ID + tenant.tenant_shard_id, TIMELINE_ID ) ), } @@ -4212,7 +4385,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) .await .ok() 
.unwrap(); @@ -4253,7 +4426,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) .await .ok() .unwrap(); @@ -4315,7 +4488,7 @@ mod tests { // so that all uploads finish & we can call harness.try_load() below again tenant .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) .await .ok() .unwrap(); @@ -4848,7 +5021,7 @@ mod tests { let raw_tline = tline.raw_timeline().unwrap(); raw_tline .shutdown() - .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id)) + .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id)) .await; std::mem::forget(tline); } @@ -4860,7 +5033,7 @@ mod tests { assert_eq!( e, GetTimelineError::NotFound { - tenant_id: tenant.tenant_id, + tenant_id: tenant.tenant_shard_id.tenant_id, timeline_id: TIMELINE_ID, } ) @@ -4869,12 +5042,12 @@ mod tests { assert!(!harness .conf - .timeline_path(&tenant.tenant_id, &TIMELINE_ID) + .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) .exists()); assert!(!harness .conf - .timeline_uninit_mark_file_path(tenant.tenant_id, TIMELINE_ID) + .timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID) .exists()); Ok(()) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 5f8c7f6c591c..7a454b53d247 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -8,9 +8,12 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! -use anyhow::Context; +use anyhow::bail; use pageserver_api::models; +use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; +use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; +use serde_json::Value; use std::num::NonZeroU64; use std::time::Duration; use utils::generation::Generation; @@ -88,6 +91,14 @@ pub(crate) struct LocationConf { /// The location-specific part of the configuration, describes the operating /// mode of this pageserver for this tenant. pub(crate) mode: LocationMode, + + /// The detailed shard identity. This structure is already scoped within + /// a TenantShardId, but we need the full ShardIdentity to enable calculating + /// key->shard mappings. + #[serde(default = "ShardIdentity::unsharded")] + #[serde(skip_serializing_if = "ShardIdentity::is_unsharded")] + pub(crate) shard: ShardIdentity, + /// The pan-cluster tenant configuration, the same on all locations pub(crate) tenant_conf: TenantConfOpt, } @@ -160,6 +171,8 @@ impl LocationConf { generation, attach_mode: AttachmentMode::Single, }), + // Legacy configuration loads are always from tenants created before sharding existed. + shard: ShardIdentity::unsharded(), tenant_conf, } } @@ -187,6 +200,7 @@ impl LocationConf { fn get_generation(conf: &'_ models::LocationConfig) -> Result { conf.generation + .map(Generation::new) .ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching")) } @@ -226,7 +240,21 @@ impl LocationConf { } }; - Ok(Self { mode, tenant_conf }) + let shard = if conf.shard_count == 0 { + ShardIdentity::unsharded() + } else { + ShardIdentity::new( + ShardNumber(conf.shard_number), + ShardCount(conf.shard_count), + ShardStripeSize(conf.shard_stripe_size), + )? 
+ }; + + Ok(Self { + shard, + mode, + tenant_conf, + }) } } @@ -241,6 +269,7 @@ impl Default for LocationConf { attach_mode: AttachmentMode::Single, }), tenant_conf: TenantConfOpt::default(), + shard: ShardIdentity::unsharded(), } } } @@ -494,105 +523,49 @@ impl Default for TenantConf { } } -// Helper function to standardize the error messages we produce on bad durations -// -// Intended to be used with anyhow's `with_context`, e.g.: -// -// let value = result.with_context(bad_duration("name", &value))?; -// -fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { - move || format!("Cannot parse `{field_name}` duration {value:?}") -} - impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { type Error = anyhow::Error; fn try_from(request_data: &'_ models::TenantConfig) -> Result { - let mut tenant_conf = TenantConfOpt::default(); - - if let Some(gc_period) = &request_data.gc_period { - tenant_conf.gc_period = Some( - humantime::parse_duration(gc_period) - .with_context(bad_duration("gc_period", gc_period))?, - ); - } - tenant_conf.gc_horizon = request_data.gc_horizon; - tenant_conf.image_creation_threshold = request_data.image_creation_threshold; - - if let Some(pitr_interval) = &request_data.pitr_interval { - tenant_conf.pitr_interval = Some( - humantime::parse_duration(pitr_interval) - .with_context(bad_duration("pitr_interval", pitr_interval))?, - ); - } + // Convert the request_data to a JSON Value + let json_value: Value = serde_json::to_value(request_data)?; - if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout { - tenant_conf.walreceiver_connect_timeout = Some( - humantime::parse_duration(walreceiver_connect_timeout).with_context( - bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout), - )?, - ); - } - if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout { - tenant_conf.lagging_wal_timeout = Some( - humantime::parse_duration(lagging_wal_timeout) - .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?, - ); - } - if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag { - tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag); - } - if let Some(trace_read_requests) = request_data.trace_read_requests { - tenant_conf.trace_read_requests = Some(trace_read_requests); - } - - tenant_conf.checkpoint_distance = request_data.checkpoint_distance; - if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout { - tenant_conf.checkpoint_timeout = Some( - humantime::parse_duration(checkpoint_timeout) - .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?, - ); - } + // Create a Deserializer from the JSON Value + let deserializer = json_value.into_deserializer(); - tenant_conf.compaction_target_size = request_data.compaction_target_size; - tenant_conf.compaction_threshold = request_data.compaction_threshold; + // Use serde_path_to_error to deserialize the JSON Value into TenantConfOpt + let tenant_conf: TenantConfOpt = serde_path_to_error::deserialize(deserializer)?; - if let Some(compaction_period) = &request_data.compaction_period { - tenant_conf.compaction_period = Some( - humantime::parse_duration(compaction_period) - .with_context(bad_duration("compaction_period", compaction_period))?, - ); - } + Ok(tenant_conf) + } +} - if let Some(eviction_policy) = &request_data.eviction_policy { - tenant_conf.eviction_policy = Some( - serde::Deserialize::deserialize(eviction_policy) - .context("parse field `eviction_policy`")?, - ); - } +impl TryFrom for 
TenantConfOpt { + type Error = anyhow::Error; - tenant_conf.min_resident_size_override = request_data.min_resident_size_override; - - if let Some(evictions_low_residence_duration_metric_threshold) = - &request_data.evictions_low_residence_duration_metric_threshold - { - tenant_conf.evictions_low_residence_duration_metric_threshold = Some( - humantime::parse_duration(evictions_low_residence_duration_metric_threshold) - .with_context(bad_duration( - "evictions_low_residence_duration_metric_threshold", - evictions_low_residence_duration_metric_threshold, - ))?, - ); + fn try_from(item: toml_edit::Item) -> Result { + match item { + toml_edit::Item::Value(value) => { + let d = value.into_deserializer(); + return serde_path_to_error::deserialize(d) + .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); + } + toml_edit::Item::Table(table) => { + let deserializer = toml_edit::de::Deserializer::new(table.into()); + return serde_path_to_error::deserialize(deserializer) + .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); + } + _ => { + bail!("expected non-inline table but found {item}") + } } - tenant_conf.gc_feedback = request_data.gc_feedback; - - Ok(tenant_conf) } } #[cfg(test)] mod tests { use super::*; + use models::TenantConfig; #[test] fn de_serializing_pageserver_config_omits_empty_values() { @@ -609,4 +582,38 @@ mod tests { assert_eq!(json_form, "{\"gc_horizon\":42}"); assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap()); } + + #[test] + fn test_try_from_models_tenant_config_err() { + let tenant_config = models::TenantConfig { + lagging_wal_timeout: Some("5a".to_string()), + ..TenantConfig::default() + }; + + let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config); + + assert!( + tenant_conf_opt.is_err(), + "Suceeded to convert TenantConfig to TenantConfOpt" + ); + + let expected_error_str = + "lagging_wal_timeout: invalid value: string \"5a\", expected a duration"; + assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str); + } + + #[test] + fn test_try_from_models_tenant_config_success() { + let tenant_config = models::TenantConfig { + lagging_wal_timeout: Some("5s".to_string()), + ..TenantConfig::default() + }; + + let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config).unwrap(); + + assert_eq!( + tenant_conf_opt.lagging_wal_timeout, + Some(Duration::from_secs(5)) + ); + } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 066f239ff0b2..b7b2ef9c79cb 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -2,21 +2,19 @@ use std::sync::Arc; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use pageserver_api::models::TenantState; +use pageserver_api::{models::TenantState, shard::TenantShardId}; use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::sync::OwnedMutexGuard; use tokio_util::sync::CancellationToken; -use tracing::{error, instrument, warn, Instrument, Span}; +use tracing::{error, instrument, Instrument, Span}; -use utils::{ - backoff, completion, crashsafe, fs_ext, - id::{TenantId, TimelineId}, -}; +use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId}; use crate::{ config::PageServerConf, context::RequestContext, task_mgr::{self, TaskKind}, + tenant::mgr::{TenantSlot, TenantsMapRemoveResult}, InitializationOrder, }; @@ -59,10 +57,10 @@ type DeletionGuard = tokio::sync::OwnedMutexGuard; fn remote_tenant_delete_mark_path( conf: &PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: 
&TenantShardId, ) -> anyhow::Result { let tenant_remote_path = conf - .tenant_path(tenant_id) + .tenant_path(tenant_shard_id) .strip_prefix(&conf.workdir) .context("Failed to strip workdir prefix") .and_then(RemotePath::new) @@ -73,9 +71,9 @@ fn remote_tenant_delete_mark_path( async fn create_remote_delete_mark( conf: &PageServerConf, remote_storage: &GenericRemoteStorage, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> Result<(), DeleteTenantError> { - let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?; + let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; let data: &[u8] = &[]; backoff::retry( @@ -99,9 +97,9 @@ async fn create_remote_delete_mark( async fn create_local_delete_mark( conf: &PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> Result<(), DeleteTenantError> { - let marker_path = conf.tenant_deleted_mark_file_path(tenant_id); + let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id); // Note: we're ok to replace existing file. let _ = std::fs::OpenOptions::new() @@ -170,10 +168,10 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del async fn remove_tenant_remote_delete_mark( conf: &PageServerConf, remote_storage: Option<&GenericRemoteStorage>, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> Result<(), DeleteTenantError> { if let Some(remote_storage) = remote_storage { - let path = remote_tenant_delete_mark_path(conf, tenant_id)?; + let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; backoff::retry( || async { remote_storage.delete(&path).await }, |_e| false, @@ -192,7 +190,7 @@ async fn remove_tenant_remote_delete_mark( // Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir async fn cleanup_remaining_fs_traces( conf: &PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> Result<(), DeleteTenantError> { let rm = |p: Utf8PathBuf, is_dir: bool| async move { if is_dir { @@ -204,8 +202,8 @@ async fn cleanup_remaining_fs_traces( .with_context(|| format!("failed to delete {p}")) }; - rm(conf.tenant_config_path(tenant_id), false).await?; - rm(conf.tenant_location_config_path(tenant_id), false).await?; + rm(conf.tenant_config_path(tenant_shard_id), false).await?; + rm(conf.tenant_location_config_path(tenant_shard_id), false).await?; fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| { Err(anyhow::anyhow!( @@ -213,7 +211,7 @@ async fn cleanup_remaining_fs_traces( ))? }); - rm(conf.timelines_path(tenant_id), true).await?; + rm(conf.timelines_path(tenant_shard_id), true).await?; fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| { Err(anyhow::anyhow!( @@ -227,14 +225,14 @@ async fn cleanup_remaining_fs_traces( // to be reordered later and thus missed if a crash occurs. // Note that we dont need to sync after mark file is removed // because we can tolerate the case when mark file reappears on startup. 
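The ordering in `cleanup_remaining_fs_traces` is deliberate: tenant contents are removed first, the tenant directory is fsynced so those removals cannot be reordered past what follows, and only then is the deleted-mark removed (a mark that reappears after a crash is tolerated, so no fsync is needed afterwards). A minimal sketch of that fsync-before-dropping-the-marker idea using plain `std::fs`; the file names are illustrative and the directory fsync is Unix-specific.

```rust
use std::fs;
use std::path::Path;

/// On Unix a directory can be fsynced by opening it read-only; this makes the
/// unlinks already performed inside it durable before we continue.
fn fsync_dir(dir: &Path) -> std::io::Result<()> {
    fs::File::open(dir)?.sync_all()
}

fn cleanup(tenant_dir: &Path) -> std::io::Result<()> {
    // 1. Remove local traces except the marker (illustrative subset of the real list).
    let _ = fs::remove_file(tenant_dir.join("config-v1"));
    let _ = fs::remove_dir_all(tenant_dir.join("timelines"));

    // 2. Make those removals durable *before* touching the marker, so a crash cannot
    //    leave the marker gone while stale contents survive.
    fsync_dir(tenant_dir)?;

    // 3. Drop the marker last; if it reappears after a crash, the cleanup is simply redone.
    let _ = fs::remove_file(tenant_dir.join("deleted"));

    // 4. Finally remove the now-empty tenant directory itself.
    fs::remove_dir_all(tenant_dir)
}

fn main() -> std::io::Result<()> {
    let dir = Path::new("/tmp/example-tenant");
    fs::create_dir_all(dir.join("timelines"))?;
    fs::write(dir.join("config-v1"), b"")?;
    fs::write(dir.join("deleted"), b"")?;
    cleanup(dir)
}
```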
- let tenant_path = &conf.tenant_path(tenant_id); + let tenant_path = &conf.tenant_path(tenant_shard_id); if tenant_path.exists() { - crashsafe::fsync_async(&conf.tenant_path(tenant_id)) + crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id)) .await .context("fsync_pre_mark_remove")?; } - rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?; + rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?; fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { Err(anyhow::anyhow!( @@ -242,7 +240,7 @@ async fn cleanup_remaining_fs_traces( ))? }); - rm(conf.tenant_path(tenant_id), true).await?; + rm(conf.tenant_path(tenant_shard_id), true).await?; Ok(()) } @@ -287,6 +285,8 @@ impl DeleteTenantFlow { ) -> Result<(), DeleteTenantError> { span::debug_assert_current_span_has_tenant_id(); + pausable_failpoint!("tenant-delete-before-run"); + let mut guard = Self::prepare(&tenant).await?; if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { @@ -321,7 +321,7 @@ impl DeleteTenantFlow { // Though sounds scary, different mark name? // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id) + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id) .await .context("remote_mark")? } @@ -332,7 +332,7 @@ impl DeleteTenantFlow { ))? }); - create_local_delete_mark(conf, &tenant.tenant_id) + create_local_delete_mark(conf, &tenant.tenant_shard_id) .await .context("local delete mark")?; @@ -374,9 +374,11 @@ impl DeleteTenantFlow { return Ok(acquire(tenant)); } - let tenant_id = tenant.tenant_id; // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists. - if conf.tenant_deleted_mark_file_path(&tenant_id).exists() { + if conf + .tenant_deleted_mark_file_path(&tenant.tenant_shard_id) + .exists() + { Ok(acquire(tenant)) } else { Ok(None) @@ -459,12 +461,12 @@ impl DeleteTenantFlow { tenants: &'static std::sync::RwLock, tenant: Arc, ) { - let tenant_id = tenant.tenant_id; + let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_id), + Some(tenant_shard_id.tenant_id), None, "tenant_delete", false, @@ -478,7 +480,7 @@ impl DeleteTenantFlow { Ok(()) } .instrument({ - let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id); + let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); span.follows_from(Span::current()); span }), @@ -516,7 +518,7 @@ impl DeleteTenantFlow { } } - let timelines_path = conf.timelines_path(&tenant.tenant_id); + let timelines_path = conf.timelines_path(&tenant.tenant_shard_id); // May not exist if we fail in cleanup_remaining_fs_traces after removing it if timelines_path.exists() { // sanity check to guard against layout changes @@ -525,7 +527,8 @@ impl DeleteTenantFlow { .context("timelines dir not empty")?; } - remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?; + remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id) + .await?; fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { Err(anyhow::anyhow!( @@ -533,21 +536,73 @@ impl DeleteTenantFlow { ))? 
}); - cleanup_remaining_fs_traces(conf, &tenant.tenant_id) + cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id) .await .context("cleanup_remaining_fs_traces")?; { - let mut locked = tenants.write().unwrap(); - if locked.remove(&tenant.tenant_id).is_none() { - warn!("Tenant got removed from tenants map during deletion"); - }; - - // FIXME: we should not be modifying this from outside of mgr.rs. - // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) - crate::metrics::TENANT_MANAGER - .tenant_slots - .set(locked.len() as u64); + pausable_failpoint!("tenant-delete-before-map-remove"); + + // This block is simply removing the TenantSlot for this tenant. It requires a loop because + // we might conflict with a TenantSlot::InProgress marker and need to wait for it. + // + // This complexity will go away when we simplify how deletion works: + // https://github.com/neondatabase/neon/issues/5080 + loop { + // Under the TenantMap lock, try to remove the tenant. We usually succeed, but if + // we encounter an InProgress marker, yield the barrier it contains and wait on it. + let barrier = { + let mut locked = tenants.write().unwrap(); + let removed = locked.remove(&tenant.tenant_shard_id.tenant_id); + + // FIXME: we should not be modifying this from outside of mgr.rs. + // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) + crate::metrics::TENANT_MANAGER + .tenant_slots + .set(locked.len() as u64); + + match removed { + TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => { + match tenant.current_state() { + TenantState::Stopping { .. } | TenantState::Broken { .. } => { + // Expected: we put the tenant into stopping state before we start deleting it + } + state => { + // Unexpected state + tracing::warn!( + "Tenant in unexpected state {state} after deletion" + ); + } + } + break; + } + TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => { + // This is unexpected: this secondary tenants should not have been created, and we + // are not in a position to shut it down from here. 
+ tracing::warn!("Tenant transitioned to secondary mode while deleting!"); + break; + } + TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => { + unreachable!("TenantsMap::remove handles InProgress separately, should never return it here"); + } + TenantsMapRemoveResult::Vacant => { + tracing::warn!( + "Tenant removed from TenantsMap before deletion completed" + ); + break; + } + TenantsMapRemoveResult::InProgress(barrier) => { + // An InProgress entry was found, we must wait on its barrier + barrier + } + } + }; + + tracing::info!( + "Waiting for competing operation to complete before deleting state for tenant" + ); + barrier.wait().await; + } } *guard = Self::Finished; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 9a06d9df611d..591eacd1046e 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -7,18 +7,19 @@ use crate::page_cache::{self, PAGE_SZ}; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; use crate::virtual_file::VirtualFile; use camino::Utf8PathBuf; +use pageserver_api::shard::TenantShardId; use std::cmp::min; use std::fs::OpenOptions; use std::io::{self, ErrorKind}; use std::ops::DerefMut; use std::sync::atomic::AtomicU64; use tracing::*; -use utils::id::{TenantId, TimelineId}; +use utils::id::TimelineId; pub struct EphemeralFile { page_cache_file_id: page_cache::FileId, - _tenant_id: TenantId, + _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, file: VirtualFile, len: u64, @@ -31,7 +32,7 @@ pub struct EphemeralFile { impl EphemeralFile { pub async fn create( conf: &PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); @@ -39,7 +40,7 @@ impl EphemeralFile { NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed); let filename = conf - .timeline_path(&tenant_id, &timeline_id) + .timeline_path(&tenant_shard_id, &timeline_id) .join(Utf8PathBuf::from(format!( "ephemeral-{filename_disambiguator}" ))); @@ -52,7 +53,7 @@ impl EphemeralFile { Ok(EphemeralFile { page_cache_file_id: page_cache::next_file_id(), - _tenant_id: tenant_id, + _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, file, len: 0, @@ -282,7 +283,7 @@ mod tests { ) -> Result< ( &'static PageServerConf, - TenantId, + TenantShardId, TimelineId, RequestContext, ), @@ -295,13 +296,13 @@ mod tests { // OK in a test. 
let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + let tenant_shard_id = TenantShardId::from_str("11000000000000000000000000000000").unwrap(); let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?; + fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?; let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); - Ok((conf, tenant_id, timeline_id, ctx)) + Ok((conf, tenant_shard_id, timeline_id, ctx)) } #[tokio::test] diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 38fd42674605..6fb86c65e27f 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -11,15 +11,12 @@ use std::io::{self}; use anyhow::{ensure, Context}; +use pageserver_api::shard::TenantShardId; use serde::{de::Error, Deserialize, Serialize, Serializer}; use thiserror::Error; use utils::bin_ser::SerializeError; use utils::crashsafe::path_with_suffix_extension; -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; use crate::config::PageServerConf; use crate::virtual_file::VirtualFile; @@ -272,14 +269,14 @@ impl Serialize for TimelineMetadata { } /// Save timeline metadata to file -#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))] +#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))] pub async fn save_metadata( conf: &'static PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, data: &TimelineMetadata, ) -> anyhow::Result<()> { - let path = conf.metadata_path(tenant_id, timeline_id); + let path = conf.metadata_path(tenant_shard_id, timeline_id); let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX); let metadata_bytes = data.to_bytes().context("serialize metadata")?; VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes) @@ -299,10 +296,10 @@ pub enum LoadMetadataError { pub fn load_metadata( conf: &'static PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, ) -> Result { - let metadata_path = conf.metadata_path(tenant_id, timeline_id); + let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id); let metadata_bytes = std::fs::read(metadata_path)?; Ok(TimelineMetadata::from_bytes(&metadata_bytes)?) 
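`save_metadata` and the tenant-config writers all funnel through `VirtualFile::crashsafe_overwrite`, i.e. the usual write-to-a-temporary-file-then-rename dance, so a crash leaves either the old file or the new one, never a torn mix. A self-contained sketch of that pattern with `std::fs`; the helper name and temp suffix are illustrative, not Neon's API (the real call also goes through `spawn_blocking` and the `VirtualFile` layer).

```rust
use std::fs;
use std::io::Write;
use std::path::Path;

/// Atomically replace `path` with `contents`: after a crash the file is either
/// the complete old version or the complete new one, never a partial write.
fn crashsafe_overwrite(path: &Path, contents: &[u8]) -> std::io::Result<()> {
    // Illustrative temp-file naming; the real code appends a dedicated TEMP_FILE_SUFFIX.
    let temp_path = path.with_extension("temp");

    // 1. Write everything to the temporary file and fsync it.
    let mut tmp = fs::File::create(&temp_path)?;
    tmp.write_all(contents)?;
    tmp.sync_all()?;
    drop(tmp);

    // 2. Atomically rename the temporary file over the destination.
    fs::rename(&temp_path, path)?;

    // 3. fsync the parent directory so the rename itself survives a crash (Unix-specific).
    if let Some(parent) = path.parent() {
        fs::File::open(parent)?.sync_all()?;
    }
    Ok(())
}

fn main() -> std::io::Result<()> {
    let path = Path::new("/tmp/metadata-example");
    crashsafe_overwrite(path, b"serialized timeline metadata")?;
    assert_eq!(fs::read(path)?, b"serialized timeline metadata");
    Ok(())
}
```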
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 3ff7425bc24c..f34d62ba5348 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -29,7 +29,9 @@ use crate::control_plane_client::{ use crate::deletion_queue::DeletionQueueClient; use crate::metrics::TENANT_MANAGER as METRICS; use crate::task_mgr::{self, TaskKind}; -use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt}; +use crate::tenant::config::{ + AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt, +}; use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState}; @@ -122,6 +124,12 @@ fn exactly_one_or_none<'a>( } } +pub(crate) enum TenantsMapRemoveResult { + Occupied(TenantSlot), + Vacant, + InProgress(utils::completion::Barrier), +} + impl TenantsMap { /// Convenience function for typical usage, where we want to get a `Tenant` object, for /// working with attached tenants. If the TenantId is in the map but in Secondary state, @@ -136,12 +144,28 @@ impl TenantsMap { } } - pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option { + /// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map. + /// + /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded + /// slot if the enclosed tenant is shutdown. + pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult { + use std::collections::btree_map::Entry; match self { - TenantsMap::Initializing => None, + TenantsMap::Initializing => TenantsMapRemoveResult::Vacant, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k); - key.and_then(|key| m.remove(&key)) + match key { + Some(key) => match m.entry(key) { + Entry::Occupied(entry) => match entry.get() { + TenantSlot::InProgress(barrier) => { + TenantsMapRemoveResult::InProgress(barrier.clone()) + } + _ => TenantsMapRemoveResult::Occupied(entry.remove()), + }, + Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant, + }, + None => TenantsMapRemoveResult::Vacant, + } } } } @@ -250,8 +274,8 @@ pub struct TenantManager { } fn emergency_generations( - tenant_confs: &HashMap>, -) -> HashMap { + tenant_confs: &HashMap>, +) -> HashMap { tenant_confs .iter() .filter_map(|(tid, lc)| { @@ -271,10 +295,10 @@ fn emergency_generations( async fn init_load_generations( conf: &'static PageServerConf, - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, -) -> anyhow::Result>> { +) -> anyhow::Result>> { let generations = if conf.control_plane_emergency_mode { error!( "Emergency mode! 
Tenants will be attached unsafely using their last known generation" @@ -317,7 +341,7 @@ async fn init_load_generations( fn load_tenant_config( conf: &'static PageServerConf, dentry: Utf8DirEntry, -) -> anyhow::Result)>> { +) -> anyhow::Result)>> { let tenant_dir_path = dentry.path().to_path_buf(); if crate::is_temporary(&tenant_dir_path) { info!("Found temporary tenant directory, removing: {tenant_dir_path}"); @@ -353,10 +377,10 @@ fn load_tenant_config( return Ok(None); } - let tenant_id = match tenant_dir_path + let tenant_shard_id = match tenant_dir_path .file_name() .unwrap_or_default() - .parse::() + .parse::() { Ok(id) => id, Err(_) => { @@ -366,8 +390,8 @@ fn load_tenant_config( }; Ok(Some(( - tenant_id, - Tenant::load_tenant_config(conf, &tenant_id), + tenant_shard_id, + Tenant::load_tenant_config(conf, &tenant_shard_id), ))) } @@ -378,7 +402,7 @@ fn load_tenant_config( /// seconds even on reasonably fast drives. async fn init_load_tenant_configs( conf: &'static PageServerConf, -) -> anyhow::Result>> { +) -> anyhow::Result>> { let tenants_dir = conf.tenants_path(); let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result> { @@ -428,19 +452,19 @@ pub async fn init_tenant_mgr( init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; // Construct `Tenant` objects and start them running - for (tenant_id, location_conf) in tenant_configs { - let tenant_dir_path = conf.tenant_path(&tenant_id); + for (tenant_shard_id, location_conf) in tenant_configs { + let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let mut location_conf = match location_conf { Ok(l) => l, Err(e) => { - warn!(%tenant_id, "Marking tenant broken, failed to {e:#}"); + warn!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Marking tenant broken, failed to {e:#}"); tenants.insert( - TenantShardId::unsharded(tenant_id), + tenant_shard_id, TenantSlot::Attached(Tenant::create_broken_tenant( conf, - tenant_id, + tenant_shard_id, format!("{}", e), )), ); @@ -451,7 +475,7 @@ pub async fn init_tenant_mgr( let generation = if let Some(generations) = &tenant_generations { // We have a generation map: treat it as the authority for whether // this tenant is really attached. - if let Some(gen) = generations.get(&tenant_id) { + if let Some(gen) = generations.get(&tenant_shard_id) { *gen } else { match &location_conf.mode { @@ -459,8 +483,8 @@ pub async fn init_tenant_mgr( // We do not require the control plane's permission for secondary mode // tenants, because they do no remote writes and hence require no // generation number - info!(%tenant_id, "Loaded tenant in secondary mode"); - tenants.insert(TenantShardId::unsharded(tenant_id), TenantSlot::Secondary); + info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode"); + tenants.insert(tenant_shard_id, TenantSlot::Secondary); } LocationMode::Attached(_) => { // TODO: augment re-attach API to enable the control plane to @@ -468,9 +492,9 @@ pub async fn init_tenant_mgr( // away local state, we can gracefully fall back to secondary here, if the control // plane tells us so. 
// (https://github.com/neondatabase/neon/issues/5377) - info!(%tenant_id, "Detaching tenant, control plane omitted it in re-attach response"); + info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!(%tenant_id, + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", ); } @@ -482,18 +506,18 @@ pub async fn init_tenant_mgr( } else { // Legacy mode: no generation information, any tenant present // on local disk may activate - info!(%tenant_id, "Starting tenant in legacy mode, no generation",); + info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",); Generation::none() }; // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. location_conf.attach_in_generation(generation); - Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?; + Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; match tenant_spawn( conf, - tenant_id, + tenant_shard_id, &tenant_dir_path, resources.clone(), AttachedTenantConf::try_from(location_conf)?, @@ -509,7 +533,7 @@ pub async fn init_tenant_mgr( ); } Err(e) => { - error!(%tenant_id, "Failed to start tenant: {e:#}"); + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); } } } @@ -533,7 +557,7 @@ pub async fn init_tenant_mgr( #[allow(clippy::too_many_arguments)] pub(crate) fn tenant_spawn( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, tenant_path: &Utf8Path, resources: TenantSharedResources, location_conf: AttachedTenantConf, @@ -557,16 +581,16 @@ pub(crate) fn tenant_spawn( "Cannot load tenant from empty directory {tenant_path:?}" ); - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id); + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); anyhow::ensure!( - !conf.tenant_ignore_mark_file_path(&tenant_id).exists(), + !conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(), "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); - info!("Attaching tenant {tenant_id}"); + info!("Attaching tenant {tenant_shard_id}"); let tenant = match Tenant::spawn( conf, - tenant_id, + tenant_shard_id, resources, location_conf, init_order, @@ -576,8 +600,8 @@ pub(crate) fn tenant_spawn( ) { Ok(tenant) => tenant, Err(e) => { - error!("Failed to spawn tenant {tenant_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}")) + error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}"); + Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}")) } }; @@ -732,16 +756,15 @@ pub(crate) async fn create_tenant( ctx: &RequestContext, ) -> Result, TenantMapInsertError> { let location_conf = LocationConf::attached_single(tenant_conf, generation); + info!("Creating tenant at location {location_conf:?}"); let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; - // TODO(sharding): make local paths shard-aware - let tenant_path = - super::create_tenant_files(conf, &location_conf, &tenant_shard_id.tenant_id).await?; + let tenant_path = super::create_tenant_files(conf, 
&location_conf, &tenant_shard_id).await?; let created_tenant = tenant_spawn( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, &tenant_path, resources, AttachedTenantConf::try_from(location_conf)?, @@ -781,8 +804,9 @@ pub(crate) async fn set_new_tenant_config( // API to use is the location_config/ endpoint, which lets the caller provide // the full LocationConf. let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); - Tenant::persist_tenant_config(conf, &tenant_id, &location_conf) + Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf) .await .map_err(SetNewTenantConfigError::Persist)?; tenant.set_new_tenant_config(new_tenant_conf); @@ -792,8 +816,6 @@ pub(crate) async fn set_new_tenant_config( impl TenantManager { /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. - /// - /// This method is cancel-safe. pub(crate) fn get_attached_tenant_shard( &self, tenant_shard_id: TenantShardId, @@ -842,6 +864,7 @@ impl TenantManager { &self, tenant_shard_id: TenantShardId, new_location_config: LocationConf, + flush: Option, ctx: &RequestContext, ) -> Result<(), anyhow::Error> { debug_assert_current_span_has_tenant_id(); @@ -850,7 +873,7 @@ impl TenantManager { // Special case fast-path for updates to Tenant: if our upsert is only updating configuration, // then we do not need to set the slot to InProgress, we can just call into the // existng tenant. - { + let modify_tenant = { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?; @@ -861,22 +884,50 @@ impl TenantManager { // take our fast path and just provide the updated configuration // to the tenant. tenant.set_new_location_config(AttachedTenantConf::try_from( - new_location_config, + new_location_config.clone(), )?); - // Persist the new config in the background, to avoid holding up any - // locks while we do so. - // TODO - - return Ok(()); + Some(tenant.clone()) } else { // Different generations, fall through to general case + None } } _ => { // Not an Attached->Attached transition, fall through to general case + None } } + }; + + // Fast-path continued: having dropped out of the self.tenants lock, do the async + // phase of waiting for flush, before returning. + if let Some(tenant) = modify_tenant { + // Transition to AttachedStale means we may well hold a valid generation + // still, and have been requested to go stale as part of a migration. If + // the caller set `flush`, then flush to remote storage. + if let LocationMode::Attached(AttachedLocationConfig { + generation: _, + attach_mode: AttachmentMode::Stale, + }) = &new_location_config.mode + { + if let Some(flush_timeout) = flush { + match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await { + Ok(Err(e)) => { + return Err(e); + } + Ok(Ok(_)) => return Ok(()), + Err(_) => { + tracing::warn!( + timeout_ms = flush_timeout.as_millis(), + "Timed out waiting for flush to remote storage, proceeding anyway." 
+ ) + } + } + } + } + + return Ok(()); } // General case for upserts to TenantsMap, excluding the case above: we will substitute an @@ -915,8 +966,7 @@ impl TenantManager { slot_guard.drop_old_value().expect("We just shut it down"); } - // TODO(sharding): make local paths sharding-aware - let tenant_path = self.conf.tenant_path(&tenant_shard_id.tenant_id); + let tenant_path = self.conf.tenant_path(&tenant_shard_id); let new_slot = match &new_location_config.mode { LocationMode::Secondary(_) => { @@ -926,20 +976,14 @@ impl TenantManager { .await .with_context(|| format!("Creating {tenant_path}"))?; - // TODO(sharding): make local paths sharding-aware - Tenant::persist_tenant_config( - self.conf, - &tenant_shard_id.tenant_id, - &new_location_config, - ) - .await - .map_err(SetNewTenantConfigError::Persist)?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; TenantSlot::Secondary } LocationMode::Attached(_attach_config) => { - // TODO(sharding): make local paths sharding-aware - let timelines_path = self.conf.timelines_path(&tenant_shard_id.tenant_id); + let timelines_path = self.conf.timelines_path(&tenant_shard_id); // Directory doesn't need to be fsync'd because we do not depend on // it to exist after crashes: it may be recreated when tenant is @@ -948,19 +992,13 @@ impl TenantManager { .await .with_context(|| format!("Creating {timelines_path}"))?; - // TODO(sharding): make local paths sharding-aware - Tenant::persist_tenant_config( - self.conf, - &tenant_shard_id.tenant_id, - &new_location_config, - ) - .await - .map_err(SetNewTenantConfigError::Persist)?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; - // TODO(sharding): make spawn sharding-aware let tenant = tenant_spawn( self.conf, - tenant_shard_id.tenant_id, + tenant_shard_id, &tenant_path, self.resources.clone(), AttachedTenantConf::try_from(new_location_config)?, @@ -1262,8 +1300,7 @@ async fn detach_tenant0( deletion_queue_client: &DeletionQueueClient, ) -> Result { let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { - // TODO(sharding): make local path helpers shard-aware - let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean.tenant_id); + let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); safe_rename_tenant_dir(&local_tenant_directory) .await .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename")) @@ -1288,8 +1325,7 @@ async fn detach_tenant0( Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) ) { - // TODO(sharding): make local paths sharding-aware - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id.tenant_id); + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); if tenant_ignore_mark.exists() { info!("Detaching an ignored tenant"); let tmp_path = tenant_dir_rename_operation(tenant_shard_id) @@ -1318,9 +1354,9 @@ pub(crate) async fn load_tenant( let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; - let tenant_path = conf.tenant_path(&tenant_id); + let tenant_path = conf.tenant_path(&tenant_shard_id); - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id); + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); if tenant_ignore_mark.exists() { std::fs::remove_file(&tenant_ignore_mark).with_context(|| { 
format!( @@ -1336,14 +1372,14 @@ pub(crate) async fn load_tenant( }; let mut location_conf = - Tenant::load_tenant_config(conf, &tenant_id).map_err(TenantMapInsertError::Other)?; + Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?; location_conf.attach_in_generation(generation); - Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?; + Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; let new_tenant = tenant_spawn( conf, - tenant_id, + tenant_shard_id, &tenant_path, resources, AttachedTenantConf::try_from(location_conf)?, @@ -1374,7 +1410,7 @@ async fn ignore_tenant0( let tenant_shard_id = TenantShardId::unsharded(tenant_id); remove_tenant_from_memory(tenants, tenant_shard_id, async { - let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id); + let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id); fs::File::create(&ignore_mark_file) .await .context("Failed to create ignore mark file") @@ -1432,13 +1468,13 @@ pub(crate) async fn attach_tenant( let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; let location_conf = LocationConf::attached_single(tenant_conf, generation); - let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?; + let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?; // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 let attached_tenant = tenant_spawn( conf, - tenant_id, + tenant_shard_id, &tenant_dir, resources, AttachedTenantConf::try_from(location_conf)?, @@ -1954,6 +1990,9 @@ pub(crate) async fn immediate_gc( .with_context(|| format!("tenant {tenant_id}")) .map_err(|e| ApiError::NotFound(e.into()))?; + // TODO(sharding): make callers of this function shard-aware + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); @@ -1961,6 +2000,7 @@ pub(crate) async fn immediate_gc( // Run in task_mgr to avoid race with tenant_detach operation let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); + // TODO: spawning is redundant now, need to hold the gate task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::GarbageCollector, @@ -1970,12 +2010,40 @@ pub(crate) async fn immediate_gc( false, async move { fail::fail_point!("immediate_gc_task_pre"); - let result = tenant + + #[allow(unused_mut)] + let mut result = tenant .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .instrument(info_span!("manual_gc", %tenant_id, %timeline_id)) + .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await; // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it // better once the types support it. + + #[cfg(feature = "testing")] + { + if let Ok(result) = result.as_mut() { + // why not futures unordered? it seems it needs very much the same task structure + // but would only run on single task. 
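// Aside, not part of the patch: a minimal sketch of the FuturesUnordered alternative that the
// comment above weighs against tokio::task::JoinSet. It polls every wait_drop-style future on
// the current task instead of spawning; the `futures` crate and the stand-in `wait_all` helper
// are assumptions of this sketch, not code from the change.
use futures::stream::{FuturesUnordered, StreamExt};

async fn wait_all<F>(futs: Vec<F>)
where
    F: std::future::Future<Output = ()>,
{
    let mut pending: FuturesUnordered<F> = futs.into_iter().collect();
    while let Some(()) = pending.next().await {
        // each completion corresponds to one dropped layer
    }
}

#[tokio::main]
async fn main() {
    wait_all(vec![std::future::ready(()), std::future::ready(())]).await;
}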
+ let mut js = tokio::task::JoinSet::new(); + for layer in std::mem::take(&mut result.doomed_layers) { + js.spawn(layer.wait_drop()); + } + tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped"); + while let Some(res) = js.join_next().await { + res.expect("wait_drop should not panic"); + } + } + + let timeline = tenant.get_timeline(timeline_id, false).ok(); + let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref()); + + if let Some(rtc) = rtc { + // layer drops schedule actions on remote timeline client to actually do the + // deletions; don't care just exit fast about the shutdown error + drop(rtc.wait_completion().await); + } + } + match task_done.send(result) { Ok(_) => (), Err(result) => error!("failed to send gc result: {result:?}"), diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 99d9783f7356..5b649a420cd2 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -188,6 +188,8 @@ use anyhow::Context; use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; +pub(crate) use download::download_initdb_tar_zst; +use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; pub(crate) use upload::upload_initdb_dir; @@ -300,7 +302,7 @@ pub struct RemoteTimelineClient { runtime: tokio::runtime::Handle, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, @@ -324,7 +326,7 @@ impl RemoteTimelineClient { remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, ) -> RemoteTimelineClient { @@ -336,13 +338,16 @@ impl RemoteTimelineClient { } else { BACKGROUND_RUNTIME.handle().clone() }, - tenant_id, + tenant_shard_id, timeline_id, generation, storage_impl: remote_storage, deletion_queue_client, upload_queue: Mutex::new(UploadQueue::Uninitialized), - metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)), + metrics: Arc::new(RemoteTimelineClientMetrics::new( + &tenant_shard_id, + &timeline_id, + )), } } @@ -463,13 +468,13 @@ impl RemoteTimelineClient { let index_part = download::download_index_part( &self.storage_impl, - &self.tenant_id, + &self.tenant_shard_id, &self.timeline_id, self.generation, cancel, ) .measure_remote_op( - self.tenant_id, + self.tenant_shard_id.tenant_id, self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Download, @@ -505,13 +510,13 @@ impl RemoteTimelineClient { download::download_layer_file( self.conf, &self.storage_impl, - self.tenant_id, + self.tenant_shard_id, self.timeline_id, layer_file_name, layer_metadata, ) .measure_remote_op( - self.tenant_id, + self.tenant_shard_id.tenant_id, self.timeline_id, RemoteOpFileKind::Layer, RemoteOpKind::Download, @@ -657,10 +662,10 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - let with_generations = + let with_metadata = self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned()); - self.schedule_deletion_of_unlinked0(upload_queue, with_generations); + self.schedule_deletion_of_unlinked0(upload_queue, with_metadata); // Launch the tasks immediately, if possible self.launch_queued_tasks(upload_queue); @@ -695,7 +700,7 @@ impl RemoteTimelineClient { self: 
&Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Vec<(LayerFileName, Generation)> + ) -> Vec<(LayerFileName, LayerFileMetadata)> where I: IntoIterator, { @@ -703,16 +708,17 @@ impl RemoteTimelineClient { // so we don't need update it. Just serialize it. let metadata = upload_queue.latest_metadata.clone(); - // Decorate our list of names with each name's generation, dropping - // names that are unexpectedly missing from our metadata. - let with_generations: Vec<_> = names + // Decorate our list of names with each name's metadata, dropping + // names that are unexpectedly missing from our metadata. This metadata + // is later used when physically deleting layers, to construct key paths. + let with_metadata: Vec<_> = names .into_iter() .filter_map(|name| { let meta = upload_queue.latest_files.remove(&name); if let Some(meta) = meta { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; - Some((name, meta.generation)) + Some((name, meta)) } else { // This can only happen if we forgot to to schedule the file upload // before scheduling the delete. Log it because it is a rare/strange @@ -725,9 +731,10 @@ impl RemoteTimelineClient { .collect(); #[cfg(feature = "testing")] - for (name, gen) in &with_generations { - if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) { - if &unexpected == gen { + for (name, metadata) in &with_metadata { + let gen = metadata.generation; + if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) { + if unexpected == gen { tracing::error!("{name} was unlinked twice with same generation"); } else { tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}"); @@ -742,14 +749,14 @@ impl RemoteTimelineClient { self.schedule_index_upload(upload_queue, metadata); } - with_generations + with_metadata } /// Schedules deletion for layer files which have previously been unlinked from the /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`]. 
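// Not part of the patch: a rough sketch of what "construct key paths" above amounts to once
// both generation and shard ride along in LayerFileMetadata. Each is rendered as an optional
// suffix, so a legacy unsharded, generation-less layer keeps its old key. The segment name and
// suffix formats below are illustrative assumptions, not the exact production encoding.
fn remote_layer_key(
    tenant_id: &str,
    timeline_id: &str,
    layer_name: &str,
    shard_suffix: &str,      // "" when unsharded
    generation_suffix: &str, // "" for Generation::none()
) -> String {
    format!("tenants/{tenant_id}{shard_suffix}/timelines/{timeline_id}/{layer_name}{generation_suffix}")
}

fn main() {
    // Legacy layer: no shard, no generation.
    assert_eq!(
        remote_layer_key("t1", "tl1", "layer_A", "", ""),
        "tenants/t1/timelines/tl1/layer_A"
    );
    // Sharded layer written in generation 5 (hex-encoded suffixes assumed).
    assert_eq!(
        remote_layer_key("t1", "tl1", "layer_A", "-0102", "-00000005"),
        "tenants/t1-0102/timelines/tl1/layer_A-00000005"
    );
}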
pub(crate) fn schedule_deletion_of_unlinked( self: &Arc, - layers: Vec<(LayerFileName, Generation)>, + layers: Vec<(LayerFileName, LayerFileMetadata)>, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -762,16 +769,22 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - with_generations: Vec<(LayerFileName, Generation)>, + with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, ) { - for (name, gen) in &with_generations { - info!("scheduling deletion of layer {}{}", name, gen.get_suffix()); + for (name, meta) in &with_metadata { + info!( + "scheduling deletion of layer {}{} (shard {})", + name, + meta.generation.get_suffix(), + meta.shard + ); } #[cfg(feature = "testing")] - for (name, gen) in &with_generations { + for (name, meta) in &with_metadata { + let gen = meta.generation; match upload_queue.dangling_files.remove(name) { - Some(same) if &same == gen => { /* expected */ } + Some(same) if same == gen => { /* expected */ } Some(other) => { tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}"); } @@ -783,7 +796,7 @@ impl RemoteTimelineClient { // schedule the actual deletions let op = UploadOp::Delete(Delete { - layers: with_generations, + layers: with_metadata, }); self.calls_unfinished_metric_begin(&op); upload_queue.queued_operations.push_back(op); @@ -812,10 +825,8 @@ impl RemoteTimelineClient { Ok(()) } - /// /// Wait for all previously scheduled uploads/deletions to complete - /// - pub async fn wait_completion(self: &Arc) -> anyhow::Result<()> { + pub(crate) async fn wait_completion(self: &Arc) -> anyhow::Result<()> { let mut receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -825,6 +836,7 @@ impl RemoteTimelineClient { if receiver.changed().await.is_err() { anyhow::bail!("wait_completion aborted because upload queue was stopped"); } + Ok(()) } @@ -851,6 +863,56 @@ impl RemoteTimelineClient { receiver } + /// Wait for all previously scheduled operations to complete, and then stop. + /// + /// Not cancellation safe + pub(crate) async fn shutdown(self: &Arc) -> Result<(), StopError> { + // On cancellation the queue is left in ackward state of refusing new operations but + // proper stop is yet to be called. On cancel the original or some later task must call + // `stop` or `shutdown`. + let sg = scopeguard::guard((), |_| { + tracing::error!("RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error") + }); + + let fut = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = match &mut *guard { + UploadQueue::Stopped(_) => return Ok(()), + UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized), + UploadQueue::Initialized(ref mut init) => init, + }; + + // if the queue is already stuck due to a shutdown operation which was cancelled, then + // just don't add more of these as they would never complete. + // + // TODO: if launch_queued_tasks were to be refactored to accept a &mut UploadQueue + // in every place we would not have to jump through this hoop, and this method could be + // made cancellable. 
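// Not part of the patch: a minimal, self-contained sketch of the scopeguard pattern used by
// shutdown() above to detect cancellation. The guard is armed before any await point and only
// defused after the work completed, so dropping the future early runs the guard's closure.
// Function and message names here are placeholders.
async fn work_that_logs_if_cancelled() {
    let armed = scopeguard::guard((), |_| {
        eprintln!("future was dropped before it completed");
    });

    // ... await points that might be cancelled would go here ...
    tokio::task::yield_now().await;

    // Reached only on normal completion: defuse the guard so its closure never runs.
    scopeguard::ScopeGuard::into_inner(armed);
}

#[tokio::main]
async fn main() {
    work_that_logs_if_cancelled().await;
}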
+ if !upload_queue.shutting_down { + upload_queue.shutting_down = true; + upload_queue.queued_operations.push_back(UploadOp::Shutdown); + // this operation is not counted similar to Barrier + + self.launch_queued_tasks(upload_queue); + } + + upload_queue.shutdown_ready.clone().acquire_owned() + }; + + let res = fut.await; + + scopeguard::ScopeGuard::into_inner(sg); + + match res { + Ok(_permit) => unreachable!("shutdown_ready should not have been added permits"), + Err(_closed) => { + // expected + } + } + + self.stop() + } + /// Set the deleted_at field in the remote index file. /// /// This fails if the upload queue has not been `stop()`ed. @@ -902,7 +964,7 @@ impl RemoteTimelineClient { || { upload::upload_index_part( &self.storage_impl, - &self.tenant_id, + &self.tenant_shard_id, &self.timeline_id, self.generation, &index_part_with_deleted_at, @@ -960,8 +1022,9 @@ impl RemoteTimelineClient { .drain() .map(|(file_name, meta)| { remote_layer_path( - &self.tenant_id, + &self.tenant_shard_id.tenant_id, &self.timeline_id, + meta.shard, &file_name, meta.generation, ) @@ -974,7 +1037,7 @@ impl RemoteTimelineClient { // Do not delete index part yet, it is needed for possible retry. If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage - let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id); + let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id); // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't // taking the burden of listing all the layers that we already know we should delete. @@ -1010,12 +1073,22 @@ impl RemoteTimelineClient { .unwrap_or( // No generation-suffixed indices, assume we are dealing with // a legacy index. - remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()), + remote_index_path(&self.tenant_shard_id, &self.timeline_id, Generation::none()), ); let remaining_layers: Vec = remaining .into_iter() - .filter(|p| p!= &latest_index) + .filter(|p| { + if p == &latest_index { + return false; + } + if let Some(name) = p.object_name() { + if name == INITDB_PATH { + return false; + } + } + true + }) .inspect(|path| { if let Some(name) = path.object_name() { info!(%name, "deleting a file not referenced from index_part.json"); @@ -1081,7 +1154,9 @@ impl RemoteTimelineClient { upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len() } - UploadOp::Barrier(_) => upload_queue.inprogress_tasks.is_empty(), + UploadOp::Barrier(_) | UploadOp::Shutdown => { + upload_queue.inprogress_tasks.is_empty() + } }; // If we cannot launch this task, don't look any further. @@ -1094,6 +1169,13 @@ impl RemoteTimelineClient { break; } + if let UploadOp::Shutdown = next_op { + // leave the op in the queue but do not start more tasks; it will be dropped when + // the stop is called. + upload_queue.shutdown_ready.close(); + break; + } + // We can launch this task. Remove it from the queue first. 
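// Not part of the patch: the shutdown_ready signalling just above boils down to this pattern —
// a zero-permit semaphore whose close() is the "queue drained" signal, so waiters treat
// Err(AcquireError) as success and an Ok(permit) as impossible. Names are placeholders.
use std::sync::Arc;
use tokio::sync::Semaphore;

async fn wait_until_drained(drained: Arc<Semaphore>) {
    match drained.acquire_owned().await {
        Ok(_permit) => unreachable!("no permits are ever added"),
        Err(_closed) => { /* close() was called: the queue has drained */ }
    }
}

#[tokio::main]
async fn main() {
    let drained = Arc::new(Semaphore::new(0));
    let waiter = tokio::spawn(wait_until_drained(drained.clone()));
    drained.close(); // signal readiness by closing, never by adding permits
    waiter.await.unwrap();
}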
let next_op = upload_queue.queued_operations.pop_front().unwrap(); @@ -1114,6 +1196,7 @@ impl RemoteTimelineClient { sender.send_replace(()); continue; } + UploadOp::Shutdown => unreachable!("shutdown is intentionally never popped off"), }; // Assign unique ID to this task @@ -1132,12 +1215,12 @@ impl RemoteTimelineClient { // Spawn task to perform the task let self_rc = Arc::clone(self); - let tenant_id = self.tenant_id; + let tenant_shard_id = self.tenant_shard_id; let timeline_id = self.timeline_id; task_mgr::spawn( &self.runtime, TaskKind::RemoteUploadTask, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), "remote upload", false, @@ -1145,7 +1228,7 @@ impl RemoteTimelineClient { self_rc.perform_upload_task(task).await; Ok(()) } - .instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)), + .instrument(info_span!(parent: None, "remote_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, %upload_task_id)), ); // Loop back to process next task @@ -1197,7 +1280,7 @@ impl RemoteTimelineClient { self.generation, ) .measure_remote_op( - self.tenant_id, + self.tenant_shard_id.tenant_id, self.timeline_id, RemoteOpFileKind::Layer, RemoteOpKind::Upload, @@ -1217,13 +1300,13 @@ impl RemoteTimelineClient { let res = upload::upload_index_part( &self.storage_impl, - &self.tenant_id, + &self.tenant_shard_id, &self.timeline_id, self.generation, index_part, ) .measure_remote_op( - self.tenant_id, + self.tenant_shard_id.tenant_id, self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Upload, @@ -1243,7 +1326,7 @@ impl RemoteTimelineClient { pausable_failpoint!("before-delete-layer-pausable"); self.deletion_queue_client .push_layers( - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.generation, delete.layers.clone(), @@ -1251,10 +1334,10 @@ impl RemoteTimelineClient { .await .map_err(|e| anyhow::anyhow!(e)) } - UploadOp::Barrier(_) => { + unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => { // unreachable. Barrier operations are handled synchronously in // launch_queued_tasks - warn!("unexpected Barrier operation in perform_upload_task"); + warn!("unexpected {unexpected:?} operation in perform_upload_task"); break; } }; @@ -1348,7 +1431,7 @@ impl RemoteTimelineClient { upload_queue.num_inprogress_deletions -= 1; None } - UploadOp::Barrier(_) => unreachable!(), + UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(), }; // Launch any queued tasks that were unblocked by this one. @@ -1362,7 +1445,7 @@ impl RemoteTimelineClient { // data safety guarantees (see docs/rfcs/025-generation-numbers.md) self.deletion_queue_client .update_remote_consistent_lsn( - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.generation, lsn, @@ -1403,7 +1486,7 @@ impl RemoteTimelineClient { reason: "should we track deletes? positive or negative sign?", }, ), - UploadOp::Barrier(_) => { + UploadOp::Barrier(..) | UploadOp::Shutdown => { // we do not account these return None; } @@ -1429,10 +1512,13 @@ impl RemoteTimelineClient { } /// Close the upload queue for new operations and cancel queued operations. + /// + /// Use [`RemoteTimelineClient::shutdown`] for graceful stop. + /// /// In-progress operations will still be running after this function returns. /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))` /// to wait for them to complete, after calling this function. 
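// Not part of the patch: a compact sketch of the span shape now used when spawning these
// tasks — a fresh root span (parent: None) carrying tenant_id, shard_id and timeline_id, so
// every event inside the instrumented future inherits those fields. The tracing-subscriber
// setup and the field values are assumptions of this sketch.
use tracing::{info, info_span, Instrument};

async fn upload_task(task_id: u64) {
    info!(%task_id, "starting upload");
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();

    let (tenant_id, shard_id, timeline_id) = ("tenant-a", "0102", "timeline-b");
    upload_task(42)
        .instrument(info_span!(parent: None, "remote_upload", %tenant_id, %shard_id, %timeline_id))
        .await;
}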
- pub fn stop(&self) -> Result<(), StopError> { + pub(crate) fn stop(&self) -> Result<(), StopError> { // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet. // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business. @@ -1470,6 +1556,8 @@ impl RemoteTimelineClient { queued_operations: VecDeque::default(), #[cfg(feature = "testing")] dangling_files: HashMap::default(), + shutting_down: false, + shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), }; let upload_queue = std::mem::replace( @@ -1515,24 +1603,32 @@ impl RemoteTimelineClient { } } -pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath { - let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}"); +pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { + let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") } -pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath { - remote_timelines_path(tenant_id).join(Utf8Path::new(&timeline_id.to_string())) +pub fn remote_timeline_path( + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, +) -> RemotePath { + remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string())) } +/// Note that the shard component of a remote layer path is _not_ always the same +/// as in the TenantShardId of the caller: tenants may reference layers from a different +/// ShardIndex. Use the ShardIndex from the layer's metadata. pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, + shard: ShardIndex, layer_file_name: &LayerFileName, generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( - "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}", + "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}", + shard.get_suffix(), layer_file_name.file_name(), generation.get_suffix() ); @@ -1548,12 +1644,12 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId } pub fn remote_index_path( - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, generation: Generation, ) -> RemotePath { RemotePath::from_string(&format!( - "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}", + "tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}", IndexPart::FILE_NAME, generation.get_suffix() )) @@ -1695,14 +1791,14 @@ mod tests { Arc::new(RemoteTimelineClient { conf: self.harness.conf, runtime: tokio::runtime::Handle::current(), - tenant_id: self.harness.tenant_id, + tenant_shard_id: self.harness.tenant_shard_id, timeline_id: TIMELINE_ID, generation, storage_impl: self.harness.remote_storage.clone(), deletion_queue_client: self.harness.deletion_queue.new_client(), upload_queue: Mutex::new(UploadQueue::Uninitialized), metrics: Arc::new(RemoteTimelineClientMetrics::new( - &self.harness.tenant_id, + &self.harness.tenant_shard_id, &TIMELINE_ID, )), }) @@ -1778,6 +1874,7 @@ mod tests { println!("remote_timeline_dir: {remote_timeline_dir}"); let generation = harness.generation; + let shard = harness.shard; // Create a couple of dummy files, schedule upload for them @@ -1794,7 +1891,7 @@ mod tests { harness.conf, &timeline, name, - LayerFileMetadata::new(contents.len() as u64, 
generation), + LayerFileMetadata::new(contents.len() as u64, generation, shard), ) }).collect::>(); @@ -1943,7 +2040,7 @@ mod tests { harness.conf, &timeline, layer_file_name_1.clone(), - LayerFileMetadata::new(content_1.len() as u64, harness.generation), + LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard), ); #[derive(Debug, PartialEq, Clone, Copy)] @@ -2029,7 +2126,12 @@ mod tests { std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work"); let index_path = test_state.harness.remote_fs_dir.join( - remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(), + remote_index_path( + &test_state.harness.tenant_shard_id, + &TIMELINE_ID, + generation, + ) + .get_path(), ); eprintln!("Writing {index_path}"); std::fs::write(&index_path, index_part_bytes).unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 6039b01ab82a..3b5fe4b207b6 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -8,10 +8,12 @@ use std::future::Future; use std::time::Duration; use anyhow::{anyhow, Context}; -use camino::Utf8Path; -use tokio::fs; -use tokio::io::AsyncWriteExt; +use camino::{Utf8Path, Utf8PathBuf}; +use pageserver_api::shard::TenantShardId; +use tokio::fs::{self, File, OpenOptions}; +use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::sync::CancellationToken; +use tracing::warn; use utils::{backoff, crashsafe}; use crate::config::PageServerConf; @@ -19,14 +21,15 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_ use crate::tenant::storage_layer::LayerFileName; use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::Generation; +use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use utils::crashsafe::path_with_suffix_extension; -use utils::id::{TenantId, TimelineId}; +use utils::id::TimelineId; use super::index::{IndexPart, LayerFileMetadata}; use super::{ - parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, + parse_remote_index_path, remote_index_path, remote_initdb_archive_path, + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); @@ -39,7 +42,7 @@ static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, @@ -47,12 +50,13 @@ pub async fn download_layer_file<'a>( debug_assert_current_span_has_tenant_and_timeline_id(); let local_path = conf - .timeline_path(&tenant_id, &timeline_id) + .timeline_path(&tenant_shard_id, &timeline_id) .join(layer_file_name.file_name()); let remote_path = remote_layer_path( - &tenant_id, + &tenant_shard_id.tenant_id, &timeline_id, + layer_metadata.shard, layer_file_name, layer_metadata.generation, ); @@ -169,10 +173,10 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool { /// List timelines of given tenant in remote storage pub async fn list_remote_timelines( storage: &GenericRemoteStorage, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, cancel: CancellationToken, ) 
-> anyhow::Result<(HashSet, HashSet)> { - let remote_path = remote_timelines_path(&tenant_id); + let remote_path = remote_timelines_path(&tenant_shard_id); fail::fail_point!("storage-sync-list-remote-timelines", |_| { anyhow::bail!("storage-sync-list-remote-timelines"); @@ -180,7 +184,7 @@ pub async fn list_remote_timelines( let listing = download_retry_forever( || storage.list(Some(&remote_path), ListingMode::WithDelimiter), - &format!("list timelines for {tenant_id}"), + &format!("list timelines for {tenant_shard_id}"), cancel, ) .await?; @@ -190,7 +194,7 @@ pub async fn list_remote_timelines( for timeline_remote_storage_key in listing.prefixes { let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") + anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}") })?; match object_name.parse::() { @@ -211,12 +215,12 @@ pub async fn list_remote_timelines( async fn do_download_index_part( storage: &GenericRemoteStorage, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, index_generation: Generation, cancel: CancellationToken, ) -> Result { - let remote_path = remote_index_path(tenant_id, timeline_id, index_generation); + let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); let index_part_bytes = download_retry_forever( || async { @@ -252,7 +256,7 @@ async fn do_download_index_part( #[tracing::instrument(skip_all, fields(generation=?my_generation))] pub(super) async fn download_index_part( storage: &GenericRemoteStorage, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, my_generation: Generation, cancel: CancellationToken, @@ -261,8 +265,14 @@ pub(super) async fn download_index_part( if my_generation.is_none() { // Operating without generations: just fetch the generation-less path - return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel) - .await; + return do_download_index_part( + storage, + tenant_shard_id, + timeline_id, + my_generation, + cancel, + ) + .await; } // Stale case: If we were intentionally attached in a stale generation, there may already be a remote @@ -271,7 +281,7 @@ pub(super) async fn download_index_part( // This is an optimization to avoid doing the listing for the general case below. let res = do_download_index_part( storage, - tenant_id, + tenant_shard_id, timeline_id, my_generation, cancel.clone(), @@ -298,7 +308,7 @@ pub(super) async fn download_index_part( // This is an optimization to avoid doing the listing for the general case below. let res = do_download_index_part( storage, - tenant_id, + tenant_shard_id, timeline_id, my_generation.previous(), cancel.clone(), @@ -320,8 +330,9 @@ pub(super) async fn download_index_part( } // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json - // objects, and select the highest one with a generation <= my_generation. - let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none()); + // objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent + // to constructing a full index path with no generation, because the generation is a suffix. 
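// Not part of the patch: a self-contained sketch of the fallback selection described above —
// list the generation-suffixed index_part.json objects and keep the newest one whose
// generation is not ahead of ours. The 8-hex-digit "-XXXXXXXX" suffix format is an assumption
// of this sketch; the suffix-less name stands for the legacy, pre-generation index.
fn pick_index_generation(object_names: &[&str], my_generation: u32) -> Option<u32> {
    object_names
        .iter()
        .filter_map(|name| {
            let suffix = name.strip_prefix("index_part.json-")?;
            u32::from_str_radix(suffix, 16).ok()
        })
        .filter(|generation| *generation <= my_generation)
        .max()
}

fn main() {
    let listing = [
        "index_part.json",          // legacy, no generation
        "index_part.json-00000002",
        "index_part.json-00000005", // written by a newer attachment; must be ignored
    ];
    // Attached in generation 4: generation 2 is the highest usable index.
    assert_eq!(pick_index_generation(&listing, 4), Some(2));
}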
+ let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); let indices = backoff::retry( || async { storage.list_files(Some(&index_prefix)).await }, |_| false, @@ -347,18 +358,87 @@ pub(super) async fn download_index_part( match max_previous_generation { Some(g) => { tracing::debug!("Found index_part in generation {g:?}"); - do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await + do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await } None => { // Migration from legacy pre-generation state: we have a generation but no prior // attached pageservers did. Try to load from a no-generation path. tracing::info!("No index_part.json* found"); - do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel) - .await + do_download_index_part( + storage, + tenant_shard_id, + timeline_id, + Generation::none(), + cancel, + ) + .await } } } +pub(crate) async fn download_initdb_tar_zst( + conf: &'static PageServerConf, + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, +) -> Result<(Utf8PathBuf, File), DownloadError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + + let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id); + + let timeline_path = conf.timelines_path(tenant_shard_id); + + if !timeline_path.exists() { + tokio::fs::create_dir_all(&timeline_path) + .await + .with_context(|| format!("timeline dir creation {timeline_path}")) + .map_err(DownloadError::Other)?; + } + let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}")); + + let file = download_retry( + || async { + let mut file = OpenOptions::new() + .create(true) + .truncate(true) + .read(true) + .write(true) + .open(&temp_path) + .await + .with_context(|| format!("tempfile creation {temp_path}")) + .map_err(DownloadError::Other)?; + + let mut download = storage.download(&remote_path).await?; + + tokio::io::copy(&mut download.download_stream, &mut file) + .await + .with_context(|| format!("download initdb.tar.zst at {remote_path:?}")) + .map_err(DownloadError::Other)?; + + file.seek(std::io::SeekFrom::Start(0)) + .await + .with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}")) + .map_err(DownloadError::Other)?; + + Ok(file) + }, + &format!("download {remote_path}"), + ) + .await + .map_err(|e| { + if temp_path.exists() { + // Do a best-effort attempt at deleting the temporary file upon encountering an error. + // We don't have async here nor do we want to pile on any extra errors. + if let Err(e) = std::fs::remove_file(&temp_path) { + warn!("error deleting temporary file {temp_path}: {e}"); + } + } + e + })?; + + Ok((temp_path, file)) +} + /// Helper function to handle retries for a download operation. 
/// /// Remote operations can fail due to rate limits (IAM, S3), spurious network diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 0d0b34365c9a..0abfdeef023e 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -12,6 +12,7 @@ use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::upload_queue::UploadQueueInitialized; use crate::tenant::Generation; +use pageserver_api::shard::ShardIndex; use utils::lsn::Lsn; @@ -25,6 +26,8 @@ pub struct LayerFileMetadata { file_size: u64, pub(crate) generation: Generation, + + pub(crate) shard: ShardIndex, } impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { @@ -32,15 +35,17 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { LayerFileMetadata { file_size: other.file_size, generation: other.generation, + shard: other.shard, } } } impl LayerFileMetadata { - pub fn new(file_size: u64, generation: Generation) -> Self { + pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self { LayerFileMetadata { file_size, generation, + shard, } } @@ -161,6 +166,10 @@ pub struct IndexLayerMetadata { #[serde(default = "Generation::none")] #[serde(skip_serializing_if = "Generation::is_none")] pub generation: Generation, + + #[serde(default = "ShardIndex::unsharded")] + #[serde(skip_serializing_if = "ShardIndex::is_unsharded")] + pub shard: ShardIndex, } impl From for IndexLayerMetadata { @@ -168,6 +177,7 @@ impl From for IndexLayerMetadata { IndexLayerMetadata { file_size: other.file_size, generation: other.generation, + shard: other.shard, } } } @@ -195,13 +205,15 @@ mod tests { layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: 25600000, - generation: Generation::none() + generation: Generation::none(), + shard: ShardIndex::unsharded() }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, - generation: Generation::none() + generation: Generation::none(), + shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), @@ -233,13 +245,15 @@ mod tests { layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: 25600000, - generation: Generation::none() + generation: Generation::none(), + shard: ShardIndex::unsharded() }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: 9007199254741001, - generation: Generation::none() + generation: Generation::none(), + shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), @@ -272,13 +286,15 @@ mod tests { layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: 25600000, - generation: Generation::none() + generation: Generation::none(), + shard: ShardIndex::unsharded() }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, - generation: Generation::none() + generation: Generation::none(), + shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), @@ -354,19 +370,21 @@ mod tests { layer_metadata: HashMap::from([ ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: 25600000, - generation: Generation::none() + generation: Generation::none(), + shard: ShardIndex::unsharded() }), ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, - generation: Generation::none() + generation: Generation::none(), + shard: ShardIndex::unsharded() }) ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()) + "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 4d3e1731dc0f..4ca4438003a1 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -4,6 +4,7 @@ use anyhow::{bail, Context}; use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; +use pageserver_api::shard::TenantShardId; use 
std::io::ErrorKind; use tokio::fs; @@ -24,7 +25,7 @@ use tracing::info; /// Serializes and uploads the given index part data to the remote storage. pub(super) async fn upload_index_part<'a>( storage: &'a GenericRemoteStorage, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, generation: Generation, index_part: &'a IndexPart, @@ -42,11 +43,11 @@ pub(super) async fn upload_index_part<'a>( let index_part_size = index_part_bytes.len(); let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); - let remote_path = remote_index_path(tenant_id, timeline_id, generation); + let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); storage .upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path) .await - .with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'")) + .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } /// Attempts to upload given layer files. diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 3b2a61dcbaa1..944e05883f5f 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -2,7 +2,7 @@ pub mod delta_layer; mod filename; -mod image_layer; +pub mod image_layer; mod inmemory_layer; mod layer; mod layer_desc; @@ -24,10 +24,7 @@ use tracing::warn; use utils::history_buffer::HistoryBufferWithDropCounter; use utils::rate_limit::RateLimit; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::{id::TimelineId, lsn::Lsn}; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; @@ -304,12 +301,14 @@ pub trait AsLayerDesc { } pub mod tests { + use pageserver_api::shard::TenantShardId; + use super::*; impl From for PersistentLayerDesc { fn from(value: DeltaFileName) -> Self { PersistentLayerDesc::new_delta( - TenantId::from_array([0; 16]), + TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), value.key_range, value.lsn_range, @@ -321,7 +320,7 @@ pub mod tests { impl From for PersistentLayerDesc { fn from(value: ImageFileName) -> Self { PersistentLayerDesc::new_img( - TenantId::from_array([0; 16]), + TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), value.key_range, value.lsn, diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 79f37dcb2d50..d33920412763 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -42,6 +42,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::models::LayerAccessKind; +use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs::File; @@ -69,13 +70,13 @@ use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer}; #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] pub struct Summary { /// Magic value to identify this as a neon delta file. Always DELTA_FILE_MAGIC. 
- magic: u16, - format_version: u16, + pub magic: u16, + pub format_version: u16, - tenant_id: TenantId, - timeline_id: TimelineId, - key_range: Range, - lsn_range: Range, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub key_range: Range, + pub lsn_range: Range, /// Block number where the 'index' part of the file begins. pub index_start_blk: u32, @@ -86,7 +87,7 @@ pub struct Summary { impl From<&DeltaLayer> for Summary { fn from(layer: &DeltaLayer) -> Self { Self::expected( - layer.desc.tenant_id, + layer.desc.tenant_shard_id.tenant_id, layer.desc.timeline_id, layer.desc.key_range.clone(), layer.desc.lsn_range.clone(), @@ -248,7 +249,7 @@ impl DeltaLayer { fn temp_path_for( conf: &PageServerConf, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, key_start: Key, lsn_range: &Range, @@ -259,14 +260,15 @@ impl DeltaLayer { .map(char::from) .collect(); - conf.timeline_path(tenant_id, timeline_id).join(format!( - "{}-XXX__{:016X}-{:016X}.{}.{}", - key_start, - u64::from(lsn_range.start), - u64::from(lsn_range.end), - rand_string, - TEMP_FILE_SUFFIX, - )) + conf.timeline_path(tenant_shard_id, timeline_id) + .join(format!( + "{}-XXX__{:016X}-{:016X}.{}.{}", + key_start, + u64::from(lsn_range.start), + u64::from(lsn_range.end), + rand_string, + TEMP_FILE_SUFFIX, + )) } /// @@ -318,10 +320,14 @@ impl DeltaLayer { .metadata() .context("get file metadata to determine size")?; + // TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary. + // we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn. + let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id); + Ok(DeltaLayer { path: path.to_path_buf(), desc: PersistentLayerDesc::new_delta( - summary.tenant_id, + tenant_shard_id, summary.timeline_id, summary.key_range, summary.lsn_range, @@ -353,7 +359,7 @@ struct DeltaLayerWriterInner { conf: &'static PageServerConf, pub path: Utf8PathBuf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, @@ -370,7 +376,7 @@ impl DeltaLayerWriterInner { async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, ) -> anyhow::Result { @@ -380,7 +386,8 @@ impl DeltaLayerWriterInner { // // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? - let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range); + let path = + DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range); let mut file = VirtualFile::create(&path).await?; // make room for the header block @@ -395,7 +402,7 @@ impl DeltaLayerWriterInner { conf, path, timeline_id, - tenant_id, + tenant_shard_id, key_start, lsn_range, tree: tree_builder, @@ -457,7 +464,7 @@ impl DeltaLayerWriterInner { let summary = Summary { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: self.tenant_id, + tenant_id: self.tenant_shard_id.tenant_id, timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), @@ -498,7 +505,7 @@ impl DeltaLayerWriterInner { // set inner.file here. The first read will have to re-open it. 
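// Not part of the patch: the temp-name scheme used by temp_path_for above, in isolation — a
// random alphanumeric tag plus a well-known suffix, so half-written files left behind by a
// crash are easy to recognise and sweep up. The suffix value and tag length are assumptions.
use rand::{distributions::Alphanumeric, Rng};

const TEMP_FILE_SUFFIX: &str = "temp";

fn temp_file_name(final_name: &str) -> String {
    let tag: String = rand::thread_rng()
        .sample_iter(&Alphanumeric)
        .take(8)
        .map(char::from)
        .collect();
    format!("{final_name}.{tag}.{TEMP_FILE_SUFFIX}")
}

fn main() {
    // e.g. "000...-XXX__0000000001696070-00000000016960E9.p3Zk9QbL.temp"
    println!("{}", temp_file_name("some-delta-layer-name"));
}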
let desc = PersistentLayerDesc::new_delta( - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.key_start..key_end, self.lsn_range.clone(), @@ -549,14 +556,20 @@ impl DeltaLayerWriter { pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, ) -> anyhow::Result { Ok(Self { inner: Some( - DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range) - .await?, + DeltaLayerWriterInner::new( + conf, + timeline_id, + tenant_shard_id, + key_start, + lsn_range, + ) + .await?, ), }) } @@ -611,6 +624,61 @@ impl Drop for DeltaLayerWriter { } } +#[derive(thiserror::Error, Debug)] +pub enum RewriteSummaryError { + #[error("magic mismatch")] + MagicMismatch, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for RewriteSummaryError { + fn from(e: std::io::Error) -> Self { + Self::Other(anyhow::anyhow!(e)) + } +} + +impl DeltaLayer { + pub async fn rewrite_summary( + path: &Utf8Path, + rewrite: F, + ctx: &RequestContext, + ) -> Result<(), RewriteSummaryError> + where + F: Fn(Summary) -> Summary, + { + let file = VirtualFile::open_with_options( + path, + &*std::fs::OpenOptions::new().read(true).write(true), + ) + .await + .with_context(|| format!("Failed to open file '{}'", path))?; + let file = FileBlockReader::new(file); + let summary_blk = file.read_blk(0, ctx).await?; + let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; + let mut file = file.file; + if actual_summary.magic != DELTA_FILE_MAGIC { + return Err(RewriteSummaryError::MagicMismatch); + } + + let new_summary = rewrite(actual_summary); + + let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + Summary::ser_into(&new_summary, &mut buf).context("serialize")?; + if buf.spilled() { + // The code in DeltaLayerWriterInner just warn!()s for this. + // It should probably error out as well. + return Err(RewriteSummaryError::Other(anyhow::anyhow!( + "Used more than one page size for summary buffer: {}", + buf.len() + ))); + } + file.seek(SeekFrom::Start(0)).await?; + file.write_all(&buf).await?; + Ok(()) + } +} + impl DeltaLayerInner { /// Returns nested result following Result, Critical>: /// - inner has the success or transient failure diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index c38a9f6883a8..023122c0b1b4 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -41,6 +41,7 @@ use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use hex; use pageserver_api::models::LayerAccessKind; +use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs::File; @@ -67,27 +68,27 @@ use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer}; /// the 'index' starts at the block indicated by 'index_start_blk' /// #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] -pub(super) struct Summary { +pub struct Summary { /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC. - magic: u16, - format_version: u16, + pub magic: u16, + pub format_version: u16, - tenant_id: TenantId, - timeline_id: TimelineId, - key_range: Range, - lsn: Lsn, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub key_range: Range, + pub lsn: Lsn, /// Block number where the 'index' part of the file begins. 
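// Not part of the patch: the read-modify-write shape of rewrite_summary above, reduced to
// plain std::fs so it stands alone — read the first block, let a closure produce the new
// header, refuse to write anything that no longer fits in that block, then seek back and
// overwrite in place. BLOCK_SZ and the demo path are assumptions of this sketch.
use std::fs::OpenOptions;
use std::io::{Read, Seek, SeekFrom, Write};

const BLOCK_SZ: usize = 8192;

fn rewrite_header<F>(path: &str, rewrite: F) -> std::io::Result<()>
where
    F: Fn(Vec<u8>) -> Vec<u8>,
{
    let mut file = OpenOptions::new().read(true).write(true).open(path)?;

    let mut block = vec![0u8; BLOCK_SZ];
    file.read_exact(&mut block)?;

    let new_header = rewrite(block);
    if new_header.len() > BLOCK_SZ {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "rewritten header does not fit in one block",
        ));
    }

    // Only the header block is touched; the rest of the file stays as it was.
    file.seek(SeekFrom::Start(0))?;
    file.write_all(&new_header)?;
    Ok(())
}

fn main() -> std::io::Result<()> {
    let path = "/tmp/header_rewrite_demo.bin";
    std::fs::write(path, vec![0u8; BLOCK_SZ * 2])?;
    rewrite_header(path, |mut header| {
        header[0] = 0xAB; // e.g. bump a format byte
        header
    })
}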
- index_start_blk: u32, + pub index_start_blk: u32, /// Block within the 'index', where the B-tree root page is stored - index_root_blk: u32, + pub index_root_blk: u32, // the 'values' part starts after the summary header, on block 1. } impl From<&ImageLayer> for Summary { fn from(layer: &ImageLayer) -> Self { Self::expected( - layer.desc.tenant_id, + layer.desc.tenant_shard_id.tenant_id, layer.desc.timeline_id, layer.desc.key_range.clone(), layer.lsn, @@ -217,7 +218,7 @@ impl ImageLayer { fn temp_path_for( conf: &PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, fname: &ImageFileName, ) -> Utf8PathBuf { let rand_string: String = rand::thread_rng() @@ -226,7 +227,7 @@ impl ImageLayer { .map(char::from) .collect(); - conf.timeline_path(&tenant_id, &timeline_id) + conf.timeline_path(&tenant_shard_id, &timeline_id) .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}")) } @@ -276,10 +277,15 @@ impl ImageLayer { let metadata = file .metadata() .context("get file metadata to determine size")?; + + // TODO(sharding): we should get TenantShardId from path. + // OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart. + let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id); + Ok(ImageLayer { path: path.to_path_buf(), desc: PersistentLayerDesc::new_img( - summary.tenant_id, + tenant_shard_id, summary.timeline_id, summary.key_range, summary.lsn, @@ -296,6 +302,61 @@ impl ImageLayer { } } +#[derive(thiserror::Error, Debug)] +pub enum RewriteSummaryError { + #[error("magic mismatch")] + MagicMismatch, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for RewriteSummaryError { + fn from(e: std::io::Error) -> Self { + Self::Other(anyhow::anyhow!(e)) + } +} + +impl ImageLayer { + pub async fn rewrite_summary( + path: &Utf8Path, + rewrite: F, + ctx: &RequestContext, + ) -> Result<(), RewriteSummaryError> + where + F: Fn(Summary) -> Summary, + { + let file = VirtualFile::open_with_options( + path, + &*std::fs::OpenOptions::new().read(true).write(true), + ) + .await + .with_context(|| format!("Failed to open file '{}'", path))?; + let file = FileBlockReader::new(file); + let summary_blk = file.read_blk(0, ctx).await?; + let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; + let mut file = file.file; + if actual_summary.magic != IMAGE_FILE_MAGIC { + return Err(RewriteSummaryError::MagicMismatch); + } + + let new_summary = rewrite(actual_summary); + + let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + Summary::ser_into(&new_summary, &mut buf).context("serialize")?; + if buf.spilled() { + // The code in ImageLayerWriterInner just warn!()s for this. + // It should probably error out as well. 
+ return Err(RewriteSummaryError::Other(anyhow::anyhow!( + "Used more than one page size for summary buffer: {}", + buf.len() + ))); + } + file.seek(SeekFrom::Start(0)).await?; + file.write_all(&buf).await?; + Ok(()) + } +} + impl ImageLayerInner { /// Returns nested result following Result, Critical>: /// - inner has the success or transient failure @@ -400,7 +461,7 @@ struct ImageLayerWriterInner { conf: &'static PageServerConf, path: Utf8PathBuf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_range: Range, lsn: Lsn, @@ -415,7 +476,7 @@ impl ImageLayerWriterInner { async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, ) -> anyhow::Result { @@ -424,7 +485,7 @@ impl ImageLayerWriterInner { let path = ImageLayer::temp_path_for( conf, timeline_id, - tenant_id, + tenant_shard_id, &ImageFileName { key_range: key_range.clone(), lsn, @@ -448,7 +509,7 @@ impl ImageLayerWriterInner { conf, path, timeline_id, - tenant_id, + tenant_shard_id, key_range: key_range.clone(), lsn, tree: tree_builder, @@ -495,7 +556,7 @@ impl ImageLayerWriterInner { let summary = Summary { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: self.tenant_id, + tenant_id: self.tenant_shard_id.tenant_id, timeline_id: self.timeline_id, key_range: self.key_range.clone(), lsn: self.lsn, @@ -521,7 +582,7 @@ impl ImageLayerWriterInner { .context("get metadata to determine file size")?; let desc = PersistentLayerDesc::new_img( - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.key_range.clone(), self.lsn, @@ -577,13 +638,14 @@ impl ImageLayerWriter { pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, ) -> anyhow::Result { Ok(Self { inner: Some( - ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?, + ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn) + .await?, ), }) } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 2cb1e55b2606..003cf0e92b5e 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -14,15 +14,11 @@ use crate::tenant::Timeline; use crate::walrecord; use anyhow::{ensure, Result}; use pageserver_api::models::InMemoryLayerInfo; +use pageserver_api::shard::TenantShardId; use std::collections::HashMap; use std::sync::{Arc, OnceLock}; use tracing::*; -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, - vec_map::VecMap, -}; +use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use std::fmt::Write as _; @@ -33,7 +29,7 @@ use super::{DeltaLayerWriter, ResidentLayer}; pub struct InMemoryLayer { conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, /// This layer contains all the changes from 'start_lsn'. 
The @@ -226,17 +222,17 @@ impl InMemoryLayer { pub async fn create( conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, start_lsn: Lsn, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; Ok(InMemoryLayer { conf, timeline_id, - tenant_id, + tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), inner: RwLock::new(InMemoryLayerInner { @@ -335,7 +331,7 @@ impl InMemoryLayer { let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, - self.tenant_id, + self.tenant_shard_id, Key::MIN, self.start_lsn..end_lsn, ) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index f28f1c9444f5..3ed4e05beaba 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -3,6 +3,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::models::{ HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, }; +use pageserver_api::shard::ShardIndex; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; @@ -81,7 +82,7 @@ impl Layer { metadata: LayerFileMetadata, ) -> Self { let desc = PersistentLayerDesc::from_filename( - timeline.tenant_id, + timeline.tenant_shard_id, timeline.timeline_id, file_name, metadata.file_size(), @@ -96,6 +97,7 @@ impl Layer { desc, None, metadata.generation, + metadata.shard, ))); debug_assert!(owner.0.needs_download_blocking().unwrap().is_some()); @@ -111,7 +113,7 @@ impl Layer { metadata: LayerFileMetadata, ) -> ResidentLayer { let desc = PersistentLayerDesc::from_filename( - timeline.tenant_id, + timeline.tenant_shard_id, timeline.timeline_id, file_name, metadata.file_size(), @@ -136,6 +138,7 @@ impl Layer { desc, Some(inner), metadata.generation, + metadata.shard, ) })); @@ -179,6 +182,7 @@ impl Layer { desc, Some(inner), timeline.generation, + timeline.get_shard_index(), ) })); @@ -322,6 +326,24 @@ impl Layer { Ok(()) } + + /// Waits until this layer has been dropped (and if needed, local garbage collection and remote + /// deletion scheduling has completed). + /// + /// Does not start garbage collection, use [`Self::garbage_collect_on_drop`] for that + /// separatedly. + #[cfg(feature = "testing")] + pub(crate) fn wait_drop(&self) -> impl std::future::Future + 'static { + let mut rx = self.0.status.subscribe(); + + async move { + loop { + if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await { + break; + } + } + } + } } /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted. @@ -426,6 +448,15 @@ struct LayerInner { /// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`], /// for created layers from [`Timeline::generation`]. generation: Generation, + + /// The shard of this Layer. + /// + /// For layers created in this process, this will always be the [`ShardIndex`] of the + /// current `ShardIdentity`` (TODO: add link once it's introduced). + /// + /// For loaded layers, this may be some other value if the tenant has undergone + /// a shard split since the layer was originally written. 
+ shard: ShardIndex, } impl std::fmt::Display for LayerInner { @@ -455,17 +486,21 @@ impl Drop for LayerInner { return; } - let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_id, timeline_id = %self.layer_desc().timeline_id); + let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id); let path = std::mem::take(&mut self.path); let file_name = self.layer_desc().filename(); - let gen = self.generation; let file_size = self.layer_desc().file_size; let timeline = self.timeline.clone(); + let meta = self.metadata(); + let status = self.status.clone(); crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { let _g = span.entered(); + // carry this until we are finished for [`Layer::wait_drop`] support + let _status = status; + let removed = match std::fs::remove_file(path) { Ok(()) => true, Err(e) if e.kind() == std::io::ErrorKind::NotFound => { @@ -489,7 +524,7 @@ impl Drop for LayerInner { timeline.metrics.resident_physical_size_sub(file_size); } if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, gen)]); + let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]); if let Err(e) = res { // test_timeline_deletion_with_files_stuck_in_upload_queue is good at @@ -523,9 +558,10 @@ impl LayerInner { desc: PersistentLayerDesc, downloaded: Option>, generation: Generation, + shard: ShardIndex, ) -> Self { let path = conf - .timeline_path(&timeline.tenant_id, &timeline.timeline_id) + .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id) .join(desc.filename().to_string()); let (inner, version) = if let Some(inner) = downloaded { @@ -550,6 +586,7 @@ impl LayerInner { status: tokio::sync::broadcast::channel(1).0, consecutive_failures: AtomicUsize::new(0), generation, + shard, } } @@ -795,7 +832,7 @@ impl LayerInner { crate::task_mgr::spawn( &tokio::runtime::Handle::current(), crate::task_mgr::TaskKind::RemoteDownloadTask, - Some(self.desc.tenant_id), + Some(self.desc.tenant_shard_id.tenant_id), Some(self.desc.timeline_id), &task_name, false, @@ -960,7 +997,7 @@ impl LayerInner { if gc { // do nothing now, only in LayerInner::drop } else if can_evict && evict { - let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version); + let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version); // downgrade for queueing, in case there's a tear down already ongoing we should not // hold it alive. 
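The `wait_drop` / `Drop` interplay above hinges on a property of `tokio::sync::broadcast`: a receiver observes `RecvError::Closed` only once every `Sender` clone is gone, including the clone carried into the spawn_blocking cleanup as `_status`. Below is a minimal standalone sketch of that pattern, not the pageserver code itself.

// Standalone illustration: the waiter completes only after the cleanup task drops its
// Sender clone, which is exactly how `Layer::wait_drop` learns that garbage collection
// and remote deletion scheduling have finished.
use tokio::sync::broadcast;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = broadcast::channel::<()>(1);

    // Stands in for the spawn_blocking cleanup in `impl Drop for LayerInner`.
    let cleanup = tokio::task::spawn_blocking(move || {
        let _status = tx; // held until the cleanup work is done
        std::thread::sleep(std::time::Duration::from_millis(50));
        // `_status` dropped here: the channel closes.
    });

    // Stands in for `Layer::wait_drop`.
    loop {
        if let Err(broadcast::error::RecvError::Closed) = rx.recv().await {
            break;
        }
    }
    cleanup.await.unwrap();
    println!("layer cleanup finished");
}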
@@ -1077,7 +1114,7 @@ impl LayerInner { } fn metadata(&self) -> LayerFileMetadata { - LayerFileMetadata::new(self.desc.file_size, self.generation) + LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard) } } @@ -1192,7 +1229,7 @@ impl DownloadedLayer { let res = if owner.desc.is_delta { let summary = Some(delta_layer::Summary::expected( - owner.desc.tenant_id, + owner.desc.tenant_shard_id.tenant_id, owner.desc.timeline_id, owner.desc.key_range.clone(), owner.desc.lsn_range.clone(), @@ -1203,7 +1240,7 @@ impl DownloadedLayer { } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( - owner.desc.tenant_id, + owner.desc.tenant_shard_id.tenant_id, owner.desc.timeline_id, owner.desc.key_range.clone(), lsn, @@ -1401,6 +1438,7 @@ impl Default for LayerImplMetrics { ) .unwrap(); + // reminder: this will be pageserver_layer_gcs_count_total with "_total" suffix let gcs = metrics::register_int_counter_vec!( "pageserver_layer_gcs_count", "Garbage collections started and completed in the Layer implementation", diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index 2e0b0b3e645c..bf24407fc582 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -1,9 +1,7 @@ use core::fmt::Display; +use pageserver_api::shard::TenantShardId; use std::ops::Range; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::{id::TimelineId, lsn::Lsn}; use crate::repository::Key; @@ -11,12 +9,15 @@ use super::{DeltaFileName, ImageFileName, LayerFileName}; use serde::{Deserialize, Serialize}; +#[cfg(test)] +use utils::id::TenantId; + /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides /// a unified way to generate layer information like file name. 
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub struct PersistentLayerDesc { - pub tenant_id: TenantId, + pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, /// Range of keys that this layer covers pub key_range: Range, @@ -56,7 +57,7 @@ impl PersistentLayerDesc { #[cfg(test)] pub fn new_test(key_range: Range) -> Self { Self { - tenant_id: TenantId::generate(), + tenant_shard_id: TenantShardId::unsharded(TenantId::generate()), timeline_id: TimelineId::generate(), key_range, lsn_range: Lsn(0)..Lsn(1), @@ -66,14 +67,14 @@ impl PersistentLayerDesc { } pub fn new_img( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key_range: Range, lsn: Lsn, file_size: u64, ) -> Self { Self { - tenant_id, + tenant_shard_id, timeline_id, key_range, lsn_range: Self::image_layer_lsn_range(lsn), @@ -83,14 +84,14 @@ impl PersistentLayerDesc { } pub fn new_delta( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key_range: Range, lsn_range: Range, file_size: u64, ) -> Self { Self { - tenant_id, + tenant_shard_id, timeline_id, key_range, lsn_range, @@ -100,18 +101,22 @@ impl PersistentLayerDesc { } pub fn from_filename( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, filename: LayerFileName, file_size: u64, ) -> Self { match filename { LayerFileName::Image(i) => { - Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size) - } - LayerFileName::Delta(d) => { - Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size) + Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size) } + LayerFileName::Delta(d) => Self::new_delta( + tenant_shard_id, + timeline_id, + d.key_range, + d.lsn_range, + file_size, + ), } } @@ -172,10 +177,6 @@ impl PersistentLayerDesc { self.timeline_id } - pub fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - /// Does this layer only contain some data for the key-range (incremental), /// or does it contain a version of every page? 
This is important to know /// for garbage collecting old layers: an incremental layer depends on @@ -192,7 +193,7 @@ impl PersistentLayerDesc { if self.is_delta { println!( "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----", - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.key_range.start, self.key_range.end, @@ -204,7 +205,7 @@ impl PersistentLayerDesc { } else { println!( "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----", - self.tenant_id, + self.tenant_shard_id, self.timeline_id, self.key_range.start, self.key_range.end, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 860bb255ca43..138578ec8ae1 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -86,7 +86,7 @@ pub fn start_background_loops( tenant: &Arc, background_jobs_can_start: Option<&completion::Barrier>, ) { - let tenant_id = tenant.tenant_id; + let tenant_id = tenant.tenant_shard_id.tenant_id; task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9493ed1c9a33..bf4e19e5fbaa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2,7 +2,7 @@ pub mod delete; mod eviction_task; mod init; pub mod layer_manager; -mod logical_size; +pub(crate) mod logical_size; pub mod span; pub mod uninit; mod walreceiver; @@ -13,8 +13,12 @@ use camino::{Utf8Path, Utf8PathBuf}; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::models::{ - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo, TimelineState, +use pageserver_api::{ + models::{ + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo, + TimelineState, + }, + shard::TenantShardId, }; use serde_with::serde_as; use storage_broker::BrokerClientChannel; @@ -62,6 +66,7 @@ use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; use crate::tenant::config::{EvictionPolicy, TenantConfOpt}; use pageserver_api::reltag::RelTag; +use pageserver_api::shard::ShardIndex; use postgres_connection::PgConnectionConfig; use postgres_ffi::to_pg_timestamp; @@ -148,7 +153,7 @@ pub struct Timeline { myself: Weak, - pub tenant_id: TenantId, + pub(crate) tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, /// The generation of the tenant that instantiated us: this is used for safety when writing remote objects. @@ -250,14 +255,6 @@ pub struct Timeline { /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, - /// Layer removal lock. - /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. - /// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`]. - /// This is an `Arc` lock because we need an owned - /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`). - /// Note that [`DeleteTimelineFlow`] uses `delete_progress` field. 
- pub(super) layer_removal_cs: Arc>, - // Needed to ensure that we can't create a branch at a point that was already garbage collected pub latest_gc_cutoff_lsn: Rcu, @@ -318,6 +315,24 @@ pub struct Timeline { /// Cancellation token scoped to this timeline: anything doing long-running work relating /// to the timeline should drop out when this token fires. pub(crate) cancel: CancellationToken, + + /// Make sure we only have one running compaction at a time in tests. + /// + /// Must only be taken in two places: + /// - [`Timeline::compact`] (this file) + /// - [`delete::delete_local_layer_files`] + /// + /// Timeline deletion will acquire both compaction and gc locks in whatever order. + compaction_lock: tokio::sync::Mutex<()>, + + /// Make sure we only have one running gc at a time. + /// + /// Must only be taken in two places: + /// - [`Timeline::gc`] (this file) + /// - [`delete::delete_local_layer_files`] + /// + /// Timeline deletion will acquire both compaction and gc locks in whatever order. + gc_lock: tokio::sync::Mutex<()>, } pub struct WalReceiverInfo { @@ -690,7 +705,7 @@ impl Timeline { } /// Flush to disk all data that was written with the put_* functions - #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))] + #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { self.freeze_inmem_layer(false).await; self.flush_frozen_layers_and_wait().await @@ -703,6 +718,8 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> Result<(), CompactionError> { + let _g = self.compaction_lock.lock().await; + // this wait probably never needs any "long time spent" logging, because we already nag if // compaction task goes over it's period (20s) which is quite often in production. let _permit = match super::tasks::concurrent_background_tasks_rate_limit( @@ -757,7 +774,7 @@ impl Timeline { // Below are functions compact_level0() and create_image_layers() // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. - let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await); + // Is the timeline being deleted? if self.is_stopping() { trace!("Dropping out of compaction on timeline shutdown"); @@ -798,8 +815,7 @@ impl Timeline { // 3. Compact let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(layer_removal_cs.clone(), target_file_size, ctx) - .await?; + self.compact_level0(target_file_size, ctx).await?; timer.stop_and_record(); if let Some(remote_client) = &self.remote_client { @@ -839,23 +855,38 @@ impl Timeline { /// the initial size calculation has not been run (gets triggered on the first size access). 
/// /// return size and boolean flag that shows if the size is exact - pub fn get_current_logical_size( + pub(crate) fn get_current_logical_size( self: &Arc, ctx: &RequestContext, - ) -> anyhow::Result<(u64, bool)> { - let current_size = self.current_logical_size.current_size()?; + ) -> logical_size::CurrentLogicalSize { + let current_size = self.current_logical_size.current_size(); debug!("Current size: {current_size:?}"); - let mut is_exact = true; - let size = current_size.size(); if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) = (current_size, self.current_logical_size.initial_part_end) { - is_exact = false; self.try_spawn_size_init_task(initial_part_end, ctx); } - Ok((size, is_exact)) + if let CurrentLogicalSize::Approximate(_) = ¤t_size { + if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler { + let first = self + .current_logical_size + .did_return_approximate_to_walreceiver + .compare_exchange( + false, + true, + AtomicOrdering::Relaxed, + AtomicOrdering::Relaxed, + ) + .is_ok(); + if first { + crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc(); + } + } + } + + current_size } /// Check if more than 'checkpoint_distance' of WAL has been accumulated in @@ -925,7 +956,7 @@ impl Timeline { tracing::debug!("Waiting for WalReceiverManager..."); task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), ) .await; @@ -945,7 +976,7 @@ impl Timeline { // what is problematic is the shutting down of RemoteTimelineClient, because // obviously it does not make sense to stop while we wait for it, but what // about corner cases like s3 suddenly hanging up? - if let Err(e) = client.wait_completion().await { + if let Err(e) = client.shutdown().await { // Non-fatal. Shutdown is infallible. Failures to flush just mean that // we have some extra WAL replay to do next time the timeline starts. 
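The `did_return_approximate_to_walreceiver` check above is the usual "count only the first occurrence" idiom: `compare_exchange(false, true, ..)` succeeds for exactly one caller, so the metric is bumped at most once per timeline. A standalone sketch, with a static `AtomicBool` standing in for the per-timeline field and a plain counter standing in for the prometheus counter:

use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};

static EVENT_SEEN: AtomicBool = AtomicBool::new(false);
static COUNTER: AtomicU64 = AtomicU64::new(0); // stand-in for the prometheus counter

fn record_first_occurrence() {
    // Only the caller that flips false -> true gets Ok(..) back.
    let first = EVENT_SEEN
        .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed)
        .is_ok();
    if first {
        COUNTER.fetch_add(1, Ordering::Relaxed);
    }
}

fn main() {
    record_first_occurrence();
    record_first_occurrence();
    assert_eq!(COUNTER.load(Ordering::Relaxed), 1);
}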
warn!("failed to flush to remote storage: {e:#}"); @@ -976,7 +1007,7 @@ impl Timeline { // Shut down the layer flush task before the remote client, as one depends on the other task_mgr::shutdown_tasks( Some(TaskKind::LayerFlushTask), - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), ) .await; @@ -994,7 +1025,12 @@ impl Timeline { tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await; + task_mgr::shutdown_tasks( + None, + Some(self.tenant_shard_id.tenant_id), + Some(self.timeline_id), + ) + .await; // Finally wait until any gate-holders are complete self.gate.close().await; @@ -1113,7 +1149,7 @@ impl Timeline { } } - #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] + #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); @@ -1200,16 +1236,6 @@ impl Timeline { remote_client: &Arc, layers_to_evict: &[Layer], ) -> anyhow::Result>>> { - // ensure that the layers have finished uploading - // (don't hold the layer_removal_cs while we do it, we're not removing anything yet) - remote_client - .wait_completion() - .await - .context("wait for layer upload ops to complete")?; - - // now lock out layer removal (compaction, gc, timeline deletion) - let _layer_removal_guard = self.layer_removal_cs.lock().await; - { // to avoid racing with detach and delete_timeline let state = self.current_state(); @@ -1328,7 +1354,11 @@ impl Timeline { &self.tenant_conf.read().unwrap().tenant_conf, &self.conf.default_tenant_conf, ); - let tenant_id_str = self.tenant_id.to_string(); + + // TODO(sharding): make evictions state shard aware + // (https://github.com/neondatabase/neon/issues/5953) + let tenant_id_str = self.tenant_shard_id.tenant_id.to_string(); + let timeline_id_str = self.timeline_id.to_string(); self.metrics .evictions_with_low_residence_duration @@ -1348,7 +1378,7 @@ impl Timeline { metadata: &TimelineMetadata, ancestor: Option>, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, generation: Generation, walredo_mgr: Arc, resources: TimelineResources, @@ -1379,7 +1409,7 @@ impl Timeline { tenant_conf, myself: myself.clone(), timeline_id, - tenant_id, + tenant_shard_id, generation, pg_version, layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())), @@ -1406,7 +1436,7 @@ impl Timeline { ancestor_lsn: metadata.ancestor_lsn(), metrics: TimelineMetrics::new( - &tenant_id, + &tenant_shard_id.tenant_id, &timeline_id, crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( "mtime", @@ -1420,7 +1450,6 @@ impl Timeline { layer_flush_done_tx, write_lock: tokio::sync::Mutex::new(()), - layer_removal_cs: Default::default(), gc_info: std::sync::RwLock::new(GcInfo { retain_lsns: Vec::new(), @@ -1458,7 +1487,10 @@ impl Timeline { initial_logical_size_can_start, initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt), cancel, - gate: Gate::new(format!("Timeline<{tenant_id}/{timeline_id}>")), + gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")), + + compaction_lock: tokio::sync::Mutex::default(), + gc_lock: tokio::sync::Mutex::default(), }; result.repartition_threshold = result.get_checkpoint_distance() / 
REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -1471,20 +1503,24 @@ impl Timeline { } pub(super) fn maybe_spawn_flush_loop(self: &Arc) { + let Ok(guard) = self.gate.enter() else { + info!("cannot start flush loop when the timeline gate has already been closed"); + return; + }; let mut flush_loop_state = self.flush_loop_state.lock().unwrap(); match *flush_loop_state { FlushLoopState::NotStarted => (), FlushLoopState::Running { .. } => { info!( "skipping attempt to start flush_loop twice {}/{}", - self.tenant_id, self.timeline_id + self.tenant_shard_id, self.timeline_id ); return; } FlushLoopState::Exited => { warn!( "ignoring attempt to restart exited flush_loop {}/{}", - self.tenant_id, self.timeline_id + self.tenant_shard_id, self.timeline_id ); return; } @@ -1503,11 +1539,12 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), "layer flush task", false, async move { + let _guard = guard; let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); @@ -1515,7 +1552,7 @@ impl Timeline { *flush_loop_state = FlushLoopState::Exited; Ok(()) } - .instrument(info_span!(parent: None, "layer flush task", tenant_id = %self.tenant_id, timeline_id = %self.timeline_id)) + .instrument(info_span!(parent: None, "layer flush task", tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id)) ); } @@ -1530,7 +1567,7 @@ impl Timeline { ) { info!( "launching WAL receiver for timeline {} of tenant {}", - self.timeline_id, self.tenant_id + self.timeline_id, self.tenant_shard_id ); let tenant_conf_guard = self.tenant_conf.read().unwrap(); @@ -1591,12 +1628,15 @@ impl Timeline { // Scan timeline directory and create ImageFileName and DeltaFilename // structs representing all files on disk - let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id); + let timeline_path = self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id); let conf = self.conf; let span = tracing::Span::current(); // Copy to move into the task we're about to spawn let generation = self.generation; + let shard = self.get_shard_index(); let this = self.myself.upgrade().expect("&self method holds the arc"); let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({ @@ -1645,6 +1685,7 @@ impl Timeline { index_part.as_ref(), disk_consistent_lsn, generation, + shard, ); let mut loaded_layers = Vec::new(); @@ -1786,6 +1827,7 @@ impl Timeline { "spawning logical size computation from context of task kind {:?}", ctx.task_kind() ); + let causing_task_kind = ctx.task_kind(); // We need to start the computation task. // It gets a separate context since it will outlive the request that called this function. 
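`maybe_spawn_flush_loop` above enters the timeline gate before spawning and moves the guard into the task, so the `gate.close().await` during shutdown (seen earlier in this diff) returns only after the flush loop has exited. The sketch below illustrates just that wait-for-all-holders half of the pattern with a plain mpsc channel; it is an analogy, not the pageserver's `Gate` implementation.

// Standalone analogy: a Sender clone plays the role of a gate guard, and "close" is
// dropping our own Sender and waiting for recv() to return None, i.e. for every guard
// held by a running task to be dropped.
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    let (guard_tx, mut closed_rx) = mpsc::channel::<()>(1);

    // Spawning a background loop: take a guard first, move it into the task.
    let guard = guard_tx.clone();
    tokio::spawn(async move {
        let _guard = guard; // held for the lifetime of the task
        tokio::time::sleep(std::time::Duration::from_millis(50)).await; // background work
    });

    // Shutdown: stop handing out guards, then wait for existing holders to finish.
    drop(guard_tx);
    let _ = closed_rx.recv().await; // returns None once all senders are gone
    println!("all gate holders finished");
}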
let self_clone = Arc::clone(self); @@ -1796,7 +1838,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), "initial size calculation", false, @@ -1813,6 +1855,8 @@ impl Timeline { _ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {} }; + + // hold off background tasks from starting until all timelines get to try at least // once initial logical size calculation; though retry will rarely be useful. // holding off is done because heavier tasks execute blockingly on the same @@ -1820,7 +1864,12 @@ impl Timeline { // // dropping this at every outcome is probably better than trying to cling on to it, // delay will be terminated by a timeout regardless. - let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() }; + let completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() }; + + let metrics_guard = match &completion { + Some(_) => crate::metrics::initial_logical_size::START_CALCULATION.first(Some(causing_task_kind)), + None => crate::metrics::initial_logical_size::START_CALCULATION.retry(Some(causing_task_kind)), + }; let calculated_size = match self_clone .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx) @@ -1865,11 +1914,11 @@ impl Timeline { match self_clone .current_logical_size .initial_logical_size - .set(calculated_size) + .set((calculated_size, metrics_guard.calculation_result_saved())) { Ok(()) => (), Err(_what_we_just_attempted_to_set) => { - let existing_size = self_clone + let (existing_size, _) = self_clone .current_logical_size .initial_logical_size .get() @@ -1906,7 +1955,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::OndemandLogicalSizeCalculation, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), "ondemand logical size calculation", false, @@ -1982,7 +2031,7 @@ impl Timeline { fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| { if !self .conf - .metadata_path(&self.tenant_id, &self.timeline_id) + .metadata_path(&self.tenant_shard_id, &self.timeline_id) .exists() { error!("timeline-calculate-logical-size-pre metadata file does not exist") @@ -2023,16 +2072,14 @@ impl Timeline { // one value while current_logical_size is set to the // other. match logical_size.current_size() { - Ok(CurrentLogicalSize::Exact(new_current_size)) => self + CurrentLogicalSize::Exact(ref new_current_size) => self .metrics .current_logical_size_gauge - .set(new_current_size), - Ok(CurrentLogicalSize::Approximate(_)) => { + .set(new_current_size.into()), + CurrentLogicalSize::Approximate(_) => { // don't update the gauge yet, this allows us not to update the gauge back and // forth between the initial size calculation task. } - // this is overflow - Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"), } } @@ -2335,7 +2382,13 @@ impl Timeline { // FIXME: It's pointless to check the cache for things that are not 8kB pages. 
// We should look at the key to determine if it's a cacheable object let (lsn, read_guard) = cache - .lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn, ctx) + .lookup_materialized_page( + self.tenant_shard_id.tenant_id, + self.timeline_id, + key, + lsn, + ctx, + ) .await?; let img = Bytes::from(read_guard.to_vec()); Some((lsn, img)) @@ -2363,7 +2416,7 @@ impl Timeline { self.get_last_record_lsn(), self.conf, self.timeline_id, - self.tenant_id, + self.tenant_shard_id, ) .await?; Ok(layer) @@ -2529,7 +2582,7 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id, layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, @@ -2650,9 +2703,14 @@ impl Timeline { // If we updated our disk_consistent_lsn, persist the updated metadata to local disk. if let Some(metadata) = metadata { - save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata) - .await - .context("save_metadata")?; + save_metadata( + self.conf, + &self.tenant_shard_id, + &self.timeline_id, + &metadata, + ) + .await + .context("save_metadata")?; } Ok(()) } @@ -2716,9 +2774,14 @@ impl Timeline { ) -> anyhow::Result<()> { let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; - save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata) - .await - .context("save_metadata")?; + save_metadata( + self.conf, + &self.tenant_shard_id, + &self.timeline_id, + &metadata, + ) + .await + .context("save_metadata")?; Ok(()) } @@ -2766,7 +2829,7 @@ impl Timeline { par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?; par_fsync::par_fsync(&[self_clone .conf - .timeline_path(&self_clone.tenant_id, &self_clone.timeline_id)]) + .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) .context("fsync of timeline dir")?; anyhow::Ok(new_delta) @@ -2922,7 +2985,7 @@ impl Timeline { let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, - self.tenant_id, + self.tenant_shard_id, &img_range, lsn, ) @@ -2995,9 +3058,11 @@ impl Timeline { .await .context("fsync of newly created layer files")?; - par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.tenant_id, &self.timeline_id)]) - .await - .context("fsync of timeline dir")?; + par_fsync::par_fsync_async(&[self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) + .await + .context("fsync of timeline dir")?; let mut guard = self.layers.write().await; @@ -3147,13 +3212,8 @@ impl TryFrom for CompactLevel0Phase1Stats { impl Timeline { /// Level0 files first phase of compaction, explained in the [`Self::compact`] comment. - /// - /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are - /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the - /// start of level0 files compaction, the on-demand download should be revisited as well. 
async fn compact_level0_phase1( self: &Arc, - _layer_removal_cs: Arc>, guard: tokio::sync::OwnedRwLockReadGuard, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, @@ -3240,8 +3300,6 @@ impl Timeline { let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); - // FIXME: downloading while holding layer_removal_cs is not great, but we will remove that - // soon deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; @@ -3490,7 +3548,7 @@ impl Timeline { DeltaLayerWriter::new( self.conf, self.timeline_id, - self.tenant_id, + self.tenant_shard_id, key, if dup_end_lsn.is_valid() { // this is a layer containing slice of values of the same key @@ -3551,7 +3609,9 @@ impl Timeline { .await .context("fsync all new layers")?; - let timeline_dir = self.conf.timeline_path(&self.tenant_id, &self.timeline_id); + let timeline_dir = self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id); par_fsync::par_fsync_async(&[timeline_dir]) .await @@ -3591,7 +3651,6 @@ impl Timeline { /// async fn compact_level0( self: &Arc, - layer_removal_cs: Arc>, target_file_size: u64, ctx: &RequestContext, ) -> Result<(), CompactionError> { @@ -3603,7 +3662,7 @@ impl Timeline { let ctx = ctx.attached_child(); let mut stats = CompactLevel0Phase1StatsBuilder { version: Some(2), - tenant_id: Some(self.tenant_id), + tenant_id: Some(self.tenant_shard_id.tenant_id), timeline_id: Some(self.timeline_id), ..Default::default() }; @@ -3613,16 +3672,9 @@ impl Timeline { let now = tokio::time::Instant::now(); stats.read_lock_acquisition_micros = DurationRecorder::Recorded(RecordedDuration(now - begin), now); - let layer_removal_cs = layer_removal_cs.clone(); - self.compact_level0_phase1( - layer_removal_cs, - phase1_layers_locked, - stats, - target_file_size, - &ctx, - ) - .instrument(phase1_span) - .await? + self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) + .instrument(phase1_span) + .await? }; if new_layers.is_empty() && deltas_to_compact.is_empty() { @@ -3630,17 +3682,6 @@ impl Timeline { return Ok(()); } - // Before deleting any layers, we need to wait for their upload ops to finish. - // See remote_timeline_client module level comment on consistency. - // Do it here because we don't want to hold self.layers.write() while waiting. - if let Some(remote_client) = &self.remote_client { - debug!("waiting for upload ops to complete"); - remote_client - .wait_completion() - .await - .context("wait for layer upload ops to complete")?; - } - let mut guard = self.layers.write().await; let mut duplicated_layers = HashSet::new(); @@ -3672,12 +3713,7 @@ impl Timeline { }; // deletion will happen later, the layer file manager calls garbage_collect_on_drop - guard.finish_compact_l0( - &layer_removal_cs, - &remove_layers, - &insert_layers, - &self.metrics, - ); + guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); if let Some(remote_client) = self.remote_client.as_ref() { remote_client.schedule_compaction_update(&remove_layers, &new_layers)?; @@ -3788,19 +3824,17 @@ impl Timeline { Ok(()) } - /// /// Garbage collect layer files on a timeline that are no longer needed. /// /// Currently, we don't make any attempt at removing unneeded page versions /// within a layer file. We can only remove the whole file if it's fully /// obsolete. 
- /// pub(super) async fn gc(&self) -> anyhow::Result { + let _g = self.gc_lock.lock().await; let timer = self.metrics.garbage_collect_histo.start_timer(); fail_point!("before-timeline-gc"); - let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await); // Is the timeline being deleted? if self.is_stopping() { anyhow::bail!("timeline is Stopping"); @@ -3818,13 +3852,7 @@ impl Timeline { let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); let res = self - .gc_timeline( - layer_removal_cs.clone(), - horizon_cutoff, - pitr_cutoff, - retain_lsns, - new_gc_cutoff, - ) + .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) .instrument( info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff), ) @@ -3838,7 +3866,6 @@ impl Timeline { async fn gc_timeline( &self, - layer_removal_cs: Arc>, horizon_cutoff: Lsn, pitr_cutoff: Lsn, retain_lsns: Vec, @@ -3876,17 +3903,6 @@ impl Timeline { debug!("retain_lsns: {:?}", retain_lsns); - // Before deleting any layers, we need to wait for their upload ops to finish. - // See storage_sync module level comment on consistency. - // Do it here because we don't want to hold self.layers.write() while waiting. - if let Some(remote_client) = &self.remote_client { - debug!("waiting for upload ops to complete"); - remote_client - .wait_completion() - .await - .context("wait for layer upload ops to complete")?; - } - let mut layers_to_remove = Vec::new(); let mut wanted_image_layers = KeySpaceRandomAccum::default(); @@ -4002,6 +4018,11 @@ impl Timeline { // // This does not in fact have any effect as we no longer consider local metadata unless // running without remote storage. + // + // This unconditionally schedules also an index_part.json update, even though, we will + // be doing one a bit later with the unlinked gc'd layers. + // + // TODO: remove when implementing . self.update_metadata_file(self.disk_consistent_lsn.load(), None) .await?; @@ -4016,11 +4037,16 @@ impl Timeline { remote_client.schedule_gc_update(&gc_layers)?; } - guard.finish_gc_timeline(&layer_removal_cs, gc_layers); + guard.finish_gc_timeline(&gc_layers); if result.layers_removed != 0 { fail_point!("after-timeline-gc-removed-layers"); } + + #[cfg(feature = "testing")] + { + result.doomed_layers = gc_layers; + } } info!( @@ -4032,9 +4058,7 @@ impl Timeline { Ok(result) } - /// /// Reconstruct a value, using the given base image and WAL records in 'data'. 
- /// async fn reconstruct_value( &self, key: Key, @@ -4099,7 +4123,7 @@ impl Timeline { let cache = page_cache::get(); if let Err(e) = cache .memorize_materialized_page( - self.tenant_id, + self.tenant_shard_id.tenant_id, self.timeline_id, key, last_rec_lsn, @@ -4143,7 +4167,7 @@ impl Timeline { let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), "download all remote layers task", false, @@ -4165,7 +4189,7 @@ impl Timeline { }; Ok(()) } - .instrument(info_span!(parent: None, "download_all_remote_layers", tenant_id = %self.tenant_id, timeline_id = %self.timeline_id)) + .instrument(info_span!(parent: None, "download_all_remote_layers", tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id)) ); let initial_info = DownloadRemoteLayersTaskInfo { @@ -4364,6 +4388,13 @@ impl Timeline { resident_layers, } } + + pub(crate) fn get_shard_index(&self) -> ShardIndex { + ShardIndex { + shard_number: self.tenant_shard_id.shard_number, + shard_count: self.tenant_shard_id.shard_count, + } + } } type TraversalPathItem = ( diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 56a99a25cf75..497796c80ab5 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -4,13 +4,10 @@ use std::{ }; use anyhow::Context; -use pageserver_api::models::TimelineState; +use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; use tracing::{debug, error, info, instrument, warn, Instrument, Span}; -use utils::{ - crashsafe, fs_ext, - id::{TenantId, TimelineId}, -}; +use utils::{crashsafe, fs_ext, id::TimelineId}; use crate::{ config::PageServerConf, @@ -47,7 +44,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { // Shut down the layer flush task before the remote client, as one depends on the other task_mgr::shutdown_tasks( Some(TaskKind::LayerFlushTask), - Some(timeline.tenant_id), + Some(timeline.tenant_shard_id.tenant_id), Some(timeline.timeline_id), ) .await; @@ -73,7 +70,12 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { // NB: This and other delete_timeline calls do not run as a task_mgr task, // so, they are not affected by this shutdown_tasks() call. info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await; + task_mgr::shutdown_tasks( + None, + Some(timeline.tenant_shard_id.tenant_id), + Some(timeline.timeline_id), + ) + .await; fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { Err(anyhow::anyhow!( @@ -110,40 +112,11 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi Ok(()) } -// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued. -// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json -// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote" -// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file. -// After index part is deleted presence of this mark file indentifies that it was a deletion intention. 
-// So we can just remove the mark file. -async fn create_delete_mark( - conf: &PageServerConf, - tenant_id: TenantId, - timeline_id: TimelineId, -) -> Result<(), DeleteTimelineError> { - fail::fail_point!("timeline-delete-before-delete-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: timeline-delete-before-delete-mark" - ))? - }); - let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id); - - // Note: we're ok to replace existing file. - let _ = std::fs::OpenOptions::new() - .write(true) - .create(true) - .open(&marker_path) - .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; - - crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?; - Ok(()) -} - -/// Grab the layer_removal_cs lock, and actually perform the deletion. +/// Grab the compaction and gc locks, and actually perform the deletion. /// -/// This lock prevents prevents GC or compaction from running at the same time. -/// The GC task doesn't register itself with the timeline it's operating on, -/// so it might still be running even though we called `shutdown_tasks`. +/// The locks prevent GC or compaction from running at the same time. The background tasks do not +/// register themselves with the timeline it's operating on, so it might still be running even +/// though we called `shutdown_tasks`. /// /// Note that there are still other race conditions between /// GC, compaction and timeline deletion. See @@ -151,19 +124,24 @@ async fn create_delete_mark( /// /// No timeout here, GC & Compaction should be responsive to the /// `TimelineState::Stopping` change. -async fn delete_local_layer_files( +// pub(super): documentation link +pub(super) async fn delete_local_layer_files( conf: &PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline: &Timeline, ) -> anyhow::Result<()> { - info!("waiting for layer_removal_cs.lock()"); - let layer_removal_guard = timeline.layer_removal_cs.lock().await; - info!("got layer_removal_cs.lock(), deleting layer files"); + let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) }; + let guards = crate::timed( + guards, + "acquire gc and compaction locks", + std::time::Duration::from_secs(5), + ) + .await; // NB: storage_sync upload tasks that reference these layers have been cancelled // by the caller. - let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id); + let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id); fail::fail_point!("timeline-delete-before-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? @@ -179,8 +157,8 @@ async fn delete_local_layer_files( // because of a previous failure/cancellation at/after // failpoint timeline-delete-after-rm. // - // It can also happen if we race with tenant detach, because, - // it doesn't grab the layer_removal_cs lock. + // ErrorKind::NotFound can also happen if we race with tenant detach, because, + // no locks are shared. // // For now, log and continue. // warn! 
level is technically not appropriate for the @@ -199,7 +177,7 @@ async fn delete_local_layer_files( return Ok(()); } - let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id); + let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id); for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) { #[cfg(feature = "testing")] @@ -248,8 +226,8 @@ async fn delete_local_layer_files( .with_context(|| format!("Failed to remove: {}", entry.path().display()))?; } - info!("finished deleting layer files, releasing layer_removal_cs.lock()"); - drop(layer_removal_guard); + info!("finished deleting layer files, releasing locks"); + drop(guards); fail::fail_point!("timeline-delete-after-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? @@ -274,11 +252,11 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<( // (nothing can fail after its deletion) async fn cleanup_remaining_timeline_fs_traces( conf: &PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> anyhow::Result<()> { // Remove local metadata - tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id)) + tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id)) .await .or_else(fs_ext::ignore_not_found) .context("remove metadata")?; @@ -290,7 +268,7 @@ async fn cleanup_remaining_timeline_fs_traces( }); // Remove timeline dir - tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id)) + tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id)) .await .or_else(fs_ext::ignore_not_found) .context("timeline dir")?; @@ -305,13 +283,15 @@ async fn cleanup_remaining_timeline_fs_traces( // to be reordered later and thus missed if a crash occurs. // Note that we dont need to sync after mark file is removed // because we can tolerate the case when mark file reappears on startup. - let timeline_path = conf.timelines_path(&tenant_id); + let timeline_path = conf.timelines_path(&tenant_shard_id); crashsafe::fsync_async(timeline_path) .await .context("fsync_pre_mark_remove")?; // Remove delete mark - tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id)) + // TODO: once we are confident that no more exist in the field, remove this + // line. It cleans up a legacy marker file that might in rare cases be present. + tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id)) .await .or_else(fs_ext::ignore_not_found) .context("remove delete mark") @@ -377,7 +357,7 @@ impl DeleteTimelineFlow { // NB: If this fails half-way through, and is retried, the retry will go through // all the same steps again. Make sure the code here is idempotent, and don't // error out if some of the shutdown tasks have already been completed! 
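With `layer_removal_cs` gone, timeline deletion serializes against background work by taking the new per-timeline `gc_lock` and `compaction_lock` together, as `delete_local_layer_files` above does with `tokio::join!`. Because gc only ever takes `gc_lock` and compaction only ever takes `compaction_lock`, the acquisition order does not matter, which is what the "in whatever order" note on those fields means. A standalone sketch of that acquisition (the field names mirror the change; everything else is illustrative):

use std::sync::Arc;
use tokio::sync::Mutex;

struct TimelineLocks {
    compaction_lock: Mutex<()>,
    gc_lock: Mutex<()>,
}

#[tokio::main]
async fn main() {
    let locks = Arc::new(TimelineLocks {
        compaction_lock: Mutex::new(()),
        gc_lock: Mutex::new(()),
    });

    // Deletion path: hold both guards so neither gc nor compaction can run concurrently.
    let (_gc, _compaction) = tokio::join!(locks.gc_lock.lock(), locks.compaction_lock.lock());
    // ... delete local layer files here, then drop both guards ...
}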
- #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))] + #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))] pub async fn run( tenant: &Arc, timeline_id: TimelineId, @@ -391,8 +371,6 @@ impl DeleteTimelineFlow { set_deleted_in_remote_index(&timeline).await?; - create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?; - fail::fail_point!("timeline-delete-before-schedule", |_| { Err(anyhow::anyhow!( "failpoint: timeline-delete-before-schedule" @@ -464,10 +442,6 @@ impl DeleteTimelineFlow { guard.mark_in_progress()?; - // Note that delete mark can be missing on resume - // because we create delete mark after we set deleted_at in the index part. - create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?; - Self::schedule_background(guard, tenant.conf, tenant, timeline); Ok(()) @@ -479,7 +453,8 @@ impl DeleteTimelineFlow { timeline_id: TimelineId, ) -> anyhow::Result<()> { let r = - cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await; + cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id) + .await; info!("Done"); r } @@ -550,13 +525,13 @@ impl DeleteTimelineFlow { tenant: Arc, timeline: Arc, ) { - let tenant_id = timeline.tenant_id; + let tenant_shard_id = timeline.tenant_shard_id; let timeline_id = timeline.timeline_id; task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_id), + Some(tenant_shard_id.tenant_id), Some(timeline_id), "timeline_delete", false, @@ -569,7 +544,7 @@ impl DeleteTimelineFlow { } .instrument({ let span = - tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id); + tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id); span.follows_from(Span::current()); span }), @@ -582,13 +557,14 @@ impl DeleteTimelineFlow { tenant: &Tenant, timeline: &Timeline, ) -> Result<(), DeleteTimelineError> { - delete_local_layer_files(conf, tenant.tenant_id, timeline).await?; + delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?; delete_remote_layers_and_index(timeline).await?; pausable_failpoint!("in_progress_delete"); - cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?; + cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id) + .await?; remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index f4a4c26c06c3..3fe4bc0f83d2 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -60,9 +60,12 @@ impl Timeline { task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, - Some(self.tenant_id), + Some(self.tenant_shard_id.tenant_id), Some(self.timeline_id), - &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id), + &format!( + "layer eviction for {}/{}", + self.tenant_shard_id, self.timeline_id + ), false, async move { let cancel = task_mgr::shutdown_token(); @@ -77,7 +80,7 @@ impl Timeline { ); } - #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] + #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = 
%self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, cancel: CancellationToken) { use crate::tenant::tasks::random_init_delay; { @@ -296,7 +299,6 @@ impl Timeline { stats.evicted += 1; } Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { - // compaction/gc removed the file while we were waiting on layer_removal_cs stats.not_evictable += 1; } } @@ -341,7 +343,7 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) { + let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) { Ok(t) => t, Err(_) => { return ControlFlow::Break(()); diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 96bf847fbb4b..916ebfc6d9f2 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -13,6 +13,7 @@ use crate::{ }; use anyhow::Context; use camino::Utf8Path; +use pageserver_api::shard::ShardIndex; use std::{collections::HashMap, str::FromStr}; use utils::lsn::Lsn; @@ -107,6 +108,7 @@ pub(super) fn reconcile( index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, generation: Generation, + shard: ShardIndex, ) -> Vec<(LayerFileName, Result)> { use Decision::*; @@ -118,10 +120,13 @@ pub(super) fn reconcile( .map(|(name, file_size)| { ( name, - // The generation here will be corrected to match IndexPart in the merge below, unless + // The generation and shard here will be corrected to match IndexPart in the merge below, unless // it is not in IndexPart, in which case using our current generation makes sense // because it will be uploaded in this generation. - (Some(LayerFileMetadata::new(file_size, generation)), None), + ( + Some(LayerFileMetadata::new(file_size, generation, shard)), + None, + ), ) }) .collect::(); diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index e4991e08659c..dcd82949dd0b 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,8 +1,9 @@ use anyhow::{bail, ensure, Context, Result}; +use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; use tracing::trace; use utils::{ - id::{TenantId, TimelineId}, + id::TimelineId, lsn::{AtomicLsn, Lsn}, }; @@ -73,7 +74,7 @@ impl LayerManager { last_record_lsn: Lsn, conf: &'static PageServerConf, timeline_id: TimelineId, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, ) -> Result> { ensure!(lsn.is_aligned()); @@ -109,7 +110,8 @@ impl LayerManager { lsn ); - let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?; + let new_layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); @@ -190,7 +192,6 @@ impl LayerManager { /// Called when compaction is completed. 
pub(crate) fn finish_compact_l0( &mut self, - layer_removal_cs: &Arc>, compact_from: &[Layer], compact_to: &[ResidentLayer], metrics: &TimelineMetrics, @@ -201,25 +202,16 @@ impl LayerManager { metrics.record_new_file_metrics(l.layer_desc().file_size); } for l in compact_from { - Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr); + Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); } updates.flush(); } - /// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map. - pub(crate) fn finish_gc_timeline( - &mut self, - layer_removal_cs: &Arc>, - gc_layers: Vec, - ) { + /// Called when garbage collect has selected the layers to be removed. + pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) { let mut updates = self.layer_map.batch_update(); for doomed_layer in gc_layers { - Self::delete_historic_layer( - layer_removal_cs, - &doomed_layer, - &mut updates, - &mut self.layer_fmgr, - ); + Self::delete_historic_layer(doomed_layer, &mut updates, &mut self.layer_fmgr); } updates.flush() } @@ -238,7 +230,6 @@ impl LayerManager { /// Remote storage is not affected by this operation. fn delete_historic_layer( // we cannot remove layers otherwise, since gc and compaction will race - _layer_removal_cs: &Arc>, layer: &Layer, updates: &mut BatchedUpdates<'_>, mapping: &mut LayerFileManager, diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index d9c2bc4cb970..a33fb28ebd83 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -4,7 +4,7 @@ use once_cell::sync::OnceCell; use tokio::sync::Semaphore; use utils::lsn::Lsn; -use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; +use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; use std::sync::Arc; /// Internal structure to hold all data needed for logical size calculation. @@ -23,7 +23,10 @@ pub(super) struct LogicalSize { /// /// NOTE: size at a given LSN is constant, but after a restart we will calculate /// the initial size at a different LSN. - pub initial_logical_size: OnceCell, + pub initial_logical_size: OnceCell<( + u64, + crate::metrics::initial_logical_size::FinishedCalculationGuard, + )>, /// Semaphore to track ongoing calculation of `initial_logical_size`. pub initial_size_computation: Arc, @@ -52,25 +55,57 @@ pub(super) struct LogicalSize { /// see `current_logical_size_gauge`. Use the `update_current_logical_size` /// to modify this, it will also keep the prometheus metric in sync. pub size_added_after_initial: AtomicI64, + + /// For [`crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE`]. + pub(super) did_return_approximate_to_walreceiver: AtomicBool, } /// Normalized current size, that the data in pageserver occupies. #[derive(Debug, Clone, Copy)] -pub(super) enum CurrentLogicalSize { +pub(crate) enum CurrentLogicalSize { /// The size is not yet calculated to the end, this is an intermediate result, /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative, /// yet total logical size cannot be below 0. - Approximate(u64), + Approximate(Approximate), // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are // available for observation without any calculations. 
- Exact(u64), + Exact(Exact), +} + +#[derive(Debug, Copy, Clone)] +pub(crate) enum Accuracy { + Approximate, + Exact, +} + +#[derive(Debug, Clone, Copy)] +pub(crate) struct Approximate(u64); +#[derive(Debug, Clone, Copy)] +pub(crate) struct Exact(u64); + +impl From<&Approximate> for u64 { + fn from(value: &Approximate) -> Self { + value.0 + } +} + +impl From<&Exact> for u64 { + fn from(val: &Exact) -> Self { + val.0 + } } impl CurrentLogicalSize { - pub(super) fn size(&self) -> u64 { - *match self { - Self::Approximate(size) => size, - Self::Exact(size) => size, + pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 { + match self { + Self::Approximate(size) => size.into(), + Self::Exact(size) => size.into(), + } + } + pub(crate) fn accuracy(&self) -> Accuracy { + match self { + Self::Approximate(_) => Accuracy::Approximate, + Self::Exact(_) => Accuracy::Exact, } } } @@ -78,11 +113,16 @@ impl CurrentLogicalSize { impl LogicalSize { pub(super) fn empty_initial() -> Self { Self { - initial_logical_size: OnceCell::with_value(0), + initial_logical_size: OnceCell::with_value((0, { + crate::metrics::initial_logical_size::START_CALCULATION + .first(None) + .calculation_result_saved() + })), // initial_logical_size already computed, so, don't admit any calculations initial_size_computation: Arc::new(Semaphore::new(0)), initial_part_end: None, size_added_after_initial: AtomicI64::new(0), + did_return_approximate_to_walreceiver: AtomicBool::new(false), } } @@ -92,22 +132,24 @@ impl LogicalSize { initial_size_computation: Arc::new(Semaphore::new(1)), initial_part_end: Some(compute_to), size_added_after_initial: AtomicI64::new(0), + did_return_approximate_to_walreceiver: AtomicBool::new(false), } } - pub(super) fn current_size(&self) -> anyhow::Result { + pub(super) fn current_size(&self) -> CurrentLogicalSize { let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire); // ^^^ keep this type explicit so that the casts in this function break if // we change the type. match self.initial_logical_size.get() { - Some(initial_size) => { - initial_size.checked_add_signed(size_increment) + Some((initial_size, _)) => { + CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment) .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) - .map(CurrentLogicalSize::Exact) + .unwrap())) } None => { + let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0); - Ok(CurrentLogicalSize::Approximate(non_negative_size_increment)) + CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment)) } } } @@ -121,7 +163,7 @@ impl LogicalSize { /// available for re-use. This doesn't contain the incremental part. pub(super) fn initialized_size(&self, lsn: Lsn) -> Option { match self.initial_part_end { - Some(v) if v == lsn => self.initial_logical_size.get().copied(), + Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s), _ => None, } } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index f9bb6ca4195a..61130f541a0c 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -43,11 +43,11 @@ impl<'t> UninitializedTimeline<'t> { /// The caller is responsible for activating the timeline (function `.activate()`). 
pub(crate) fn finish_creation(mut self) -> anyhow::Result> { let timeline_id = self.timeline_id; - let tenant_id = self.owning_tenant.tenant_id; + let tenant_shard_id = self.owning_tenant.tenant_shard_id; if self.raw_timeline.is_none() { return Err(anyhow::anyhow!( - "No timeline for initialization found for {tenant_id}/{timeline_id}" + "No timeline for initialization found for {tenant_shard_id}/{timeline_id}" )); } @@ -61,13 +61,13 @@ impl<'t> UninitializedTimeline<'t> { anyhow::ensure!( new_disk_consistent_lsn.is_valid(), - "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn" + "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" ); let mut timelines = self.owning_tenant.timelines.lock().unwrap(); match timelines.entry(timeline_id) { Entry::Occupied(_) => anyhow::bail!( - "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map" + "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" ), Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not @@ -79,7 +79,7 @@ impl<'t> UninitializedTimeline<'t> { // this should be an assertion. uninit_mark.remove_uninit_mark().with_context(|| { format!( - "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" + "Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}" ) })?; v.insert(Arc::clone(&new_timeline)); @@ -134,7 +134,7 @@ impl<'t> UninitializedTimeline<'t> { .with_context(|| { format!( "No raw timeline {}/{} found", - self.owning_tenant.tenant_id, self.timeline_id + self.owning_tenant.tenant_shard_id, self.timeline_id ) })? .0) @@ -144,7 +144,7 @@ impl<'t> UninitializedTimeline<'t> { impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { if let Some((_, uninit_mark)) = self.raw_timeline.take() { - let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_id, timeline_id = %self.timeline_id).entered(); + let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); error!("Timeline got dropped without initializing, cleaning its files"); cleanup_timeline_directory(uninit_mark); } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 842bc3675c5a..04ff8602d65b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -71,7 +71,7 @@ impl WalReceiver { mut broker_client: BrokerClientChannel, ctx: &RequestContext, ) -> Self { - let tenant_id = timeline.tenant_id; + let tenant_id = timeline.tenant_shard_id.tenant_id; let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 30777124457a..7bfa246eeb0f 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -75,7 +75,7 @@ pub(super) async fn connection_manager_loop_step( } let id = TenantTimelineId { - tenant_id: connection_manager_state.timeline.tenant_id, + tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id, timeline_id: 
connection_manager_state.timeline.timeline_id, }; @@ -388,7 +388,7 @@ struct BrokerSkTimeline { impl ConnectionManagerState { pub(super) fn new(timeline: Arc, conf: WalReceiverConf) -> Self { let id = TenantTimelineId { - tenant_id: timeline.tenant_id, + tenant_id: timeline.tenant_shard_id.tenant_id, timeline_id: timeline.timeline_id, }; Self { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3e56753ad495..7045658f2415 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -163,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection( task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverConnectionPoller, - Some(timeline.tenant_id), + Some(timeline.tenant_shard_id.tenant_id), Some(timeline.timeline_id), "walreceiver connection", false, @@ -396,11 +396,12 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. - let (timeline_logical_size, _) = timeline + let current_timeline_size = timeline .get_current_logical_size(&ctx) - .context("Status update creation failed to get current logical size")?; + // FIXME: https://github.com/neondatabase/neon/issues/5963 + .size_dont_care_about_accuracy(); let status_update = PageserverFeedback { - current_timeline_size: timeline_logical_size, + current_timeline_size, last_received_lsn, disk_consistent_lsn, remote_consistent_lsn, diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index b47878aacce8..32f14f40c532 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,6 +1,5 @@ use super::storage_layer::LayerFileName; use super::storage_layer::ResidentLayer; -use super::Generation; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; @@ -15,6 +14,9 @@ use utils::lsn::AtomicLsn; use std::sync::atomic::AtomicU32; use utils::lsn::Lsn; +#[cfg(feature = "testing")] +use utils::generation::Generation; + // clippy warns that Uninitialized is much smaller than Initialized, which wastes // memory for Uninitialized variants. Doesn't matter in practice, there are not // that many upload queues in a running pageserver, and most of them are initialized @@ -88,6 +90,14 @@ pub(crate) struct UploadQueueInitialized { /// bug causing leaks, then it's better to not leave this enabled for production builds. #[cfg(feature = "testing")] pub(crate) dangling_files: HashMap, + + /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`. + pub(crate) shutting_down: bool, + + /// Permitless semaphore on which any number of `RemoteTimelineClient::shutdown` futures can + /// wait on until one of them stops the queue. The semaphore is closed when + /// `RemoteTimelineClient::launch_queued_tasks` encounters `UploadOp::Shutdown`. 
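
The new doc comment above describes the shutdown handshake: a semaphore created with zero permits that any number of shutdown futures wait on, and that the queue task closes once it reaches UploadOp::Shutdown. A minimal sketch of that closed-semaphore-as-broadcast pattern with tokio::sync::Semaphore follows; it is illustrative only, not the RemoteTimelineClient code, and assumes the tokio runtime.

use std::sync::Arc;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    // Zero permits: acquire() can only ever return once the semaphore is closed.
    let shutdown_ready = Arc::new(Semaphore::new(0));

    let waiter = {
        let sem = shutdown_ready.clone();
        tokio::spawn(async move {
            // Any number of shutdown() callers can wait here concurrently.
            match sem.acquire().await {
                Ok(_permit) => unreachable!("no permits are ever added"),
                Err(_closed) => println!("queue stopped"),
            }
        })
    };

    // The task draining the queue closes the semaphore when it sees the
    // Shutdown operation, which releases every waiter at once.
    shutdown_ready.close();
    waiter.await.unwrap();
}
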
+ pub(crate) shutdown_ready: Arc, } impl UploadQueueInitialized { @@ -146,6 +156,8 @@ impl UploadQueue { queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), + shutting_down: false, + shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), }; *self = UploadQueue::Initialized(state); @@ -193,6 +205,8 @@ impl UploadQueue { queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), + shutting_down: false, + shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), }; *self = UploadQueue::Initialized(state); @@ -204,7 +218,13 @@ impl UploadQueue { UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { anyhow::bail!("queue is in state {}", self.as_str()) } - UploadQueue::Initialized(x) => Ok(x), + UploadQueue::Initialized(x) => { + if !x.shutting_down { + Ok(x) + } else { + anyhow::bail!("queue is shutting down") + } + } } } @@ -232,7 +252,7 @@ pub(crate) struct UploadTask { /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug)] pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerFileName, Generation)>, + pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>, } #[derive(Debug)] @@ -248,6 +268,10 @@ pub(crate) enum UploadOp { /// Barrier. When the barrier operation is reached, Barrier(tokio::sync::watch::Sender<()>), + + /// Shutdown; upon encountering this operation no new operations will be spawned, otherwise + /// this is the same as a Barrier. + Shutdown, } impl std::fmt::Display for UploadOp { @@ -269,6 +293,7 @@ impl std::fmt::Display for UploadOp { write!(f, "Delete({} layers)", delete.layers.len()) } UploadOp::Barrier(_) => write!(f, "Barrier"), + UploadOp::Shutdown => write!(f, "Shutdown"), } } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 23367928d35a..cbb08f7ff147 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -98,261 +98,258 @@ impl<'a> WalIngest<'a> { self.checkpoint_modified = true; } - // Heap AM records need some special handling, because they modify VM pages - // without registering them with the standard mechanism. 
- if decoded.xl_rmid == pg_constants::RM_HEAP_ID - || decoded.xl_rmid == pg_constants::RM_HEAP2_ID - { - self.ingest_heapam_record(&mut buf, modification, decoded, ctx) - .await?; - } - if decoded.xl_rmid == pg_constants::RM_NEON_ID { - self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx) - .await?; - } - // Handle other special record types - if decoded.xl_rmid == pg_constants::RM_SMGR_ID - && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_SMGR_CREATE - { - let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(modification, &create, ctx) - .await?; - } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID - && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_SMGR_TRUNCATE - { - let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate, ctx) - .await?; - } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { - debug!( - "handle RM_DBASE_ID for Postgres version {:?}", - self.timeline.pg_version - ); - if self.timeline.pg_version == 14 { - if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE - { - let createdb = XlCreateDatabase::decode(&mut buf); - debug!("XLOG_DBASE_CREATE v14"); + match decoded.xl_rmid { + pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { + // Heap AM records need some special handling, because they modify VM pages + // without registering them with the standard mechanism. + self.ingest_heapam_record(&mut buf, modification, decoded, ctx) + .await?; + } + pg_constants::RM_NEON_ID => { + self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx) + .await?; + } + // Handle other special record types + pg_constants::RM_SMGR_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - self.ingest_xlog_dbase_create(modification, &createdb, ctx) + if info == pg_constants::XLOG_SMGR_CREATE { + let create = XlSmgrCreate::decode(&mut buf); + self.ingest_xlog_smgr_create(modification, &create, ctx) .await?; - } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == postgres_ffi::v14::bindings::XLOG_DBASE_DROP - { - let dropdb = XlDropDatabase::decode(&mut buf); - for tablespace_id in dropdb.tablespace_ids { - trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification - .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + } else if info == pg_constants::XLOG_SMGR_TRUNCATE { + let truncate = XlSmgrTruncate::decode(&mut buf); + self.ingest_xlog_smgr_truncate(modification, &truncate, ctx) + .await?; + } + } + pg_constants::RM_DBASE_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID"); + + if self.timeline.pg_version == 14 { + if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { + let createdb = XlCreateDatabase::decode(&mut buf); + debug!("XLOG_DBASE_CREATE v14"); + + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; + } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; + } } - } - } else if self.timeline.pg_version == 15 { - if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG - { - debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } 
else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY - { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. - debug!("XLOG_DBASE_CREATE_FILE_COPY"); - let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb, ctx) - .await?; - } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == postgres_ffi::v15::bindings::XLOG_DBASE_DROP - { - let dropdb = XlDropDatabase::decode(&mut buf); - for tablespace_id in dropdb.tablespace_ids { - trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification - .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + } else if self.timeline.pg_version == 15 { + if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; + } } - } - } else if self.timeline.pg_version == 16 { - if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG - { - debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY - { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. - debug!("XLOG_DBASE_CREATE_FILE_COPY"); - let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb, ctx) - .await?; - } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == postgres_ffi::v16::bindings::XLOG_DBASE_DROP - { - let dropdb = XlDropDatabase::decode(&mut buf); - for tablespace_id in dropdb.tablespace_ids { - trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification - .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + } else if self.timeline.pg_version == 16 { + if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. 
+ debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; + } } } } - } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID { - trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet"); - } else if decoded.xl_rmid == pg_constants::RM_CLOG_ID { - let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; - if info == pg_constants::CLOG_ZEROPAGE { - let pageno = buf.get_u32_le(); - let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( - modification, - SlruKind::Clog, - segno, - rpageno, - ZERO_PAGE.clone(), - ctx, - ) - .await?; - } else { - assert!(info == pg_constants::CLOG_TRUNCATE); - let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(modification, &xlrec, ctx) + pg_constants::RM_TBLSPC_ID => { + trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet"); + } + pg_constants::RM_CLOG_ID => { + let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; + + if info == pg_constants::CLOG_ZEROPAGE { + let pageno = buf.get_u32_le(); + let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + self.put_slru_page_image( + modification, + SlruKind::Clog, + segno, + rpageno, + ZERO_PAGE.clone(), + ctx, + ) .await?; + } else { + assert!(info == pg_constants::CLOG_TRUNCATE); + let xlrec = XlClogTruncate::decode(&mut buf); + self.ingest_clog_truncate_record(modification, &xlrec, ctx) + .await?; + } } - } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { - let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; - if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT { - let parsed_xact = - XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( - modification, - &parsed_xact, - info == pg_constants::XLOG_XACT_COMMIT, - ctx, - ) - .await?; - } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED - || info == pg_constants::XLOG_XACT_ABORT_PREPARED - { - let parsed_xact = - XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( - modification, - &parsed_xact, - info == pg_constants::XLOG_XACT_COMMIT_PREPARED, - ctx, - ) - .await?; - // Remove twophase file. 
see RemoveTwoPhaseFile() in postgres code - trace!( - "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", - decoded.xl_xid, - parsed_xact.xid, - lsn, - ); - modification - .drop_twophase_file(parsed_xact.xid, ctx) + pg_constants::RM_XACT_ID => { + let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; + + if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT { + let parsed_xact = + XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); + self.ingest_xact_record( + modification, + &parsed_xact, + info == pg_constants::XLOG_XACT_COMMIT, + ctx, + ) .await?; - } else if info == pg_constants::XLOG_XACT_PREPARE { - modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) + } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED + || info == pg_constants::XLOG_XACT_ABORT_PREPARED + { + let parsed_xact = + XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); + self.ingest_xact_record( + modification, + &parsed_xact, + info == pg_constants::XLOG_XACT_COMMIT_PREPARED, + ctx, + ) .await?; + // Remove twophase file. see RemoveTwoPhaseFile() in postgres code + trace!( + "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", + decoded.xl_xid, + parsed_xact.xid, + lsn, + ); + modification + .drop_twophase_file(parsed_xact.xid, ctx) + .await?; + } else if info == pg_constants::XLOG_XACT_PREPARE { + modification + .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) + .await?; + } } - } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - - if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE { - let pageno = buf.get_u32_le(); - let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( - modification, - SlruKind::MultiXactOffsets, - segno, - rpageno, - ZERO_PAGE.clone(), - ctx, - ) - .await?; - } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { - let pageno = buf.get_u32_le(); - let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( - modification, - SlruKind::MultiXactMembers, - segno, - rpageno, - ZERO_PAGE.clone(), - ctx, - ) - .await?; - } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { - let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(modification, &xlrec)?; - } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { - let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(modification, &xlrec, ctx) + pg_constants::RM_MULTIXACT_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + + if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE { + let pageno = buf.get_u32_le(); + let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + self.put_slru_page_image( + modification, + SlruKind::MultiXactOffsets, + segno, + rpageno, + ZERO_PAGE.clone(), + ctx, + ) .await?; - } - } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { - let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded, ctx) - .await?; - } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_NEXTOID { - let next_oid = 
buf.get_u32_le(); - if self.checkpoint.nextOid != next_oid { - self.checkpoint.nextOid = next_oid; - self.checkpoint_modified = true; + } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { + let pageno = buf.get_u32_le(); + let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + self.put_slru_page_image( + modification, + SlruKind::MultiXactMembers, + segno, + rpageno, + ZERO_PAGE.clone(), + ctx, + ) + .await?; + } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { + let xlrec = XlMultiXactCreate::decode(&mut buf); + self.ingest_multixact_create_record(modification, &xlrec)?; + } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { + let xlrec = XlMultiXactTruncate::decode(&mut buf); + self.ingest_multixact_truncate_record(modification, &xlrec, ctx) + .await?; } - } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE - || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN - { - let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; - buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - trace!( - "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", - xlog_checkpoint.oldestXid, - self.checkpoint.oldestXid - ); - if (self - .checkpoint - .oldestXid - .wrapping_sub(xlog_checkpoint.oldestXid) as i32) - < 0 + } + pg_constants::RM_RELMAP_ID => { + let xlrec = XlRelmapUpdate::decode(&mut buf); + self.ingest_relmap_page(modification, &xlrec, decoded, ctx) + .await?; + } + pg_constants::RM_XLOG_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + + if info == pg_constants::XLOG_NEXTOID { + let next_oid = buf.get_u32_le(); + if self.checkpoint.nextOid != next_oid { + self.checkpoint.nextOid = next_oid; + self.checkpoint_modified = true; + } + } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE + || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { - self.checkpoint.oldestXid = xlog_checkpoint.oldestXid; - self.checkpoint_modified = true; + let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; + buf.copy_to_slice(&mut checkpoint_bytes); + let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; + trace!( + "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", + xlog_checkpoint.oldestXid, + self.checkpoint.oldestXid + ); + if (self + .checkpoint + .oldestXid + .wrapping_sub(xlog_checkpoint.oldestXid) as i32) + < 0 + { + self.checkpoint.oldestXid = xlog_checkpoint.oldestXid; + self.checkpoint_modified = true; + } } } - } else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_LOGICAL_MESSAGE { - let xlrec = XlLogicalMessage::decode(&mut buf); - let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; - let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size]; - if prefix == "neon-test" { - // This is a convenient way to make the WAL ingestion pause at - // particular point in the WAL. For more fine-grained control, - // we could peek into the message and only pause if it contains - // a particular string, for example, but this is enough for now. 
- crate::failpoint_support::sleep_millis_async!( - "wal-ingest-logical-message-sleep" - ); - } else if let Some(path) = prefix.strip_prefix("neon-file:") { - modification.put_file(path, message, ctx).await?; + pg_constants::RM_LOGICALMSG_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + + if info == pg_constants::XLOG_LOGICAL_MESSAGE { + let xlrec = XlLogicalMessage::decode(&mut buf); + let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; + let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size]; + if prefix == "neon-test" { + // This is a convenient way to make the WAL ingestion pause at + // particular point in the WAL. For more fine-grained control, + // we could peek into the message and only pause if it contains + // a particular string, for example, but this is enough for now. + crate::failpoint_support::sleep_millis_async!( + "wal-ingest-logical-message-sleep" + ); + } else if let Some(path) = prefix.strip_prefix("neon-file:") { + modification.put_file(path, message, ctx).await?; + } } } + _x => { + // TODO: should probably log & fail here instead of blindly + // doing something without understanding the protocol + } } // Iterate through all the blocks that the record modifies, and @@ -1440,7 +1437,16 @@ impl<'a> WalIngest<'a> { // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self + + // Get current size and put rel creation if rel doesn't exist + // + // NOTE: we check the cache first even though get_rel_exists and get_rel_size would + // check the cache too. This is because eagerly checking the cache results in + // less work overall and 10% better performance. It's more work on cache miss + // but cache miss is rare. + let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) { + nblocks + } else if !self .timeline .get_rel_exists(rel, last_lsn, true, ctx) .await? @@ -2079,4 +2085,88 @@ mod tests { Ok(()) } + + /// Replay a wal segment file taken directly from safekeepers. + /// + /// This test is useful for benchmarking since it allows us to profile only + /// the walingest code in a single-threaded executor, and iterate more quickly + /// without waiting for unrelated steps. + #[tokio::test] + async fn test_ingest_real_wal() { + use crate::tenant::harness::*; + use postgres_ffi::waldecoder::WalStreamDecoder; + use postgres_ffi::WAL_SEGMENT_SIZE; + + // Define test data path and constants. + // + // Steps to reconstruct the data, if needed: + // 1. Run the pgbench python test + // 2. Take the first wal segment file from safekeeper + // 3. Compress it using `zstd --long input_file` + // 4. Copy initdb.tar.zst from local_fs_remote_storage + // 5. Grep sk logs for "restart decoder" to get startpoint + // 6. Run just the decoder from this test to get the endpoint. + // It's the last LSN the decoder will output. + let pg_version = 15; // The test data was generated by pg15 + let path = "test_data/sk_wal_segment_from_pgbench"; + let wal_segment_path = format!("{path}/000000010000000000000001.zst"); + let startpoint = Lsn::from_hex("14AEC08").unwrap(); + let endpoint = Lsn::from_hex("1FFFF98").unwrap(); + + // Bootstrap a real timeline. We can't use create_test_timeline because + // it doesn't create a real checkpoint, and Walingest::new tries to parse + // the garbage data. 
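
The get_relsize change above explains why the relation-size cache is consulted eagerly before get_rel_exists/get_rel_size: the cheap check wins overall even though the slower paths also consult the cache. A tiny sketch of that cache-first control flow is below; a plain HashMap stands in for the real cache, and the type and method names are invented.

use std::collections::HashMap;

struct RelSizeCache {
    cache: HashMap<String, u32>,
}

impl RelSizeCache {
    fn get_cached(&self, rel: &str) -> Option<u32> {
        self.cache.get(rel).copied()
    }

    // Stand-ins for the slower, authoritative lookups.
    fn rel_exists_slow(&self, _rel: &str) -> bool { false }
    fn rel_size_slow(&self, _rel: &str) -> u32 { 0 }

    fn old_nblocks(&self, rel: &str) -> u32 {
        if let Some(nblocks) = self.get_cached(rel) {
            // Hot path: a cache hit avoids both slower calls entirely.
            nblocks
        } else if !self.rel_exists_slow(rel) {
            // Relation does not exist yet: it is being created by this record.
            0
        } else {
            self.rel_size_slow(rel)
        }
    }
}

fn main() {
    let cache = RelSizeCache {
        cache: HashMap::from([("rel_a".to_string(), 128)]),
    };
    assert_eq!(cache.old_nblocks("rel_a"), 128); // cache hit
    assert_eq!(cache.old_nblocks("rel_b"), 0);   // miss, relation being created
    println!("ok");
}
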
+ // + // TODO use the initdb.tar.zst file stored with the test data to avoid + // problems with inconsistent initdb results after pg minor version bumps. + let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal") + .unwrap() + .load() + .await; + let tline = tenant + .bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx) + .await + .unwrap(); + + // We fully read and decompress this into memory before decoding + // to get a more accurate perf profile of the decoder. + let bytes = { + use async_compression::tokio::bufread::ZstdDecoder; + let file = tokio::fs::File::open(wal_segment_path).await.unwrap(); + let reader = tokio::io::BufReader::new(file); + let decoder = ZstdDecoder::new(reader); + let mut reader = tokio::io::BufReader::new(decoder); + let mut buffer = Vec::new(); + tokio::io::copy_buf(&mut reader, &mut buffer).await.unwrap(); + buffer + }; + + // TODO start a profiler too + let started_at = std::time::Instant::now(); + + // Initialize walingest + let xlogoff: usize = startpoint.segment_offset(WAL_SEGMENT_SIZE); + let mut decoder = WalStreamDecoder::new(startpoint, pg_version); + let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx) + .await + .unwrap(); + let mut modification = tline.begin_modification(endpoint); + let mut decoded = DecodedWALRecord::default(); + println!("decoding {} bytes", bytes.len() - xlogoff); + + // Decode and ingest wal. We process the wal in chunks because + // that's what happens when we get bytes from safekeepers. + for chunk in bytes[xlogoff..].chunks(50) { + decoder.feed_bytes(chunk); + while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .await + .unwrap(); + } + } + + let duration = started_at.elapsed(); + println!("done in {:?}", duration); + } } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 5d8cc0e181d1..edce158e75fe 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -41,10 +41,14 @@ use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; #[cfg(feature = "testing")] use std::sync::atomic::{AtomicUsize, Ordering}; +#[cfg(feature = "testing")] +use pageserver_api::shard::TenantShardId; + use crate::config::PageServerConf; use crate::metrics::{ WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, - WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, + WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, }; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::repository::Key; @@ -238,10 +242,13 @@ impl PostgresRedoManager { let mut proc_guard = self.redo_process.write().unwrap(); match &*proc_guard { None => { + let timer = + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer(); let proc = Arc::new( WalRedoProcess::launch(self.conf, self.tenant_id, pg_version) .context("launch walredo process")?, ); + timer.observe_duration(); *proc_guard = Some(Arc::clone(&proc)); proc } @@ -991,7 +998,11 @@ impl WalRedoProcess { // these files will be collected to an allure report let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - let path = self.conf.tenant_path(&self.tenant_id).join(&filename); + // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId. 
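
The walredo hunk above wraps WalRedoProcess::launch in a WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM start_timer()/observe_duration() pair. A short sketch of that timer pattern using the prometheus crate directly follows; the metric name below is made up, and the pageserver registers its histograms through its own metrics module rather than like this.

use prometheus::{Histogram, HistogramOpts};

fn main() {
    let histogram =
        Histogram::with_opts(HistogramOpts::new("example_launch_seconds", "example help"))
            .unwrap();

    // start_timer() records the elapsed wall-clock time when the guard is observed.
    let timer = histogram.start_timer();
    std::thread::sleep(std::time::Duration::from_millis(5)); // the work being measured
    timer.observe_duration();

    assert_eq!(histogram.get_sample_count(), 1);
    println!("recorded {} sample(s)", histogram.get_sample_count());
}
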
+ let path = self + .conf + .tenant_path(&TenantShardId::unsharded(self.tenant_id)) + .join(&filename); let res = std::fs::OpenOptions::new() .write(true) @@ -1182,7 +1193,7 @@ mod tests { #[tokio::test] async fn short_v14_redo() { - let expected = std::fs::read("fixtures/short_v14_redo.page").unwrap(); + let expected = std::fs::read("test_data/short_v14_redo.page").unwrap(); let h = RedoHarness::new().unwrap(); diff --git a/pageserver/fixtures/short_v14_redo.page b/pageserver/test_data/short_v14_redo.page similarity index 100% rename from pageserver/fixtures/short_v14_redo.page rename to pageserver/test_data/short_v14_redo.page diff --git a/pageserver/test_data/sk_wal_segment_from_pgbench/000000010000000000000001.zst b/pageserver/test_data/sk_wal_segment_from_pgbench/000000010000000000000001.zst new file mode 100644 index 000000000000..3c478e78272c Binary files /dev/null and b/pageserver/test_data/sk_wal_segment_from_pgbench/000000010000000000000001.zst differ diff --git a/pageserver/test_data/sk_wal_segment_from_pgbench/initdb.tar.zst b/pageserver/test_data/sk_wal_segment_from_pgbench/initdb.tar.zst new file mode 100644 index 000000000000..17e9c7ea0849 Binary files /dev/null and b/pageserver/test_data/sk_wal_segment_from_pgbench/initdb.tar.zst differ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index cc09fb849d59..8eb9ebb9159a 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -21,6 +21,7 @@ #include "storage/buf_internals.h" #include "storage/lwlock.h" #include "storage/ipc.h" +#include "storage/pg_shmem.h" #include "c.h" #include "postmaster/interrupt.h" @@ -87,6 +88,12 @@ bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = static bool pageserver_flush(void); static void pageserver_disconnect(void); +static bool +PagestoreShmemIsValid() +{ + return pagestore_shared && UsedShmemSegAddr; +} + static bool CheckPageserverConnstring(char **newval, void **extra, GucSource source) { @@ -96,7 +103,7 @@ CheckPageserverConnstring(char **newval, void **extra, GucSource source) static void AssignPageserverConnstring(const char *newval, void *extra) { - if(!pagestore_shared) + if(!PagestoreShmemIsValid()) return; LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE); strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE); @@ -107,7 +114,7 @@ AssignPageserverConnstring(const char *newval, void *extra) static bool CheckConnstringUpdated() { - if(!pagestore_shared) + if(!PagestoreShmemIsValid()) return false; return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter); } @@ -115,7 +122,7 @@ CheckConnstringUpdated() static void ReloadConnstring() { - if(!pagestore_shared) + if(!PagestoreShmemIsValid()) return; LWLockAcquire(pagestore_shared->lock, LW_SHARED); strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring)); diff --git a/poetry.lock b/poetry.lock index 58ab4e70f231..a85325b696d2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,112 +1,100 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
[[package]] name = "aiohttp" -version = "3.8.6" +version = "3.9.0" description = "Async http client/server framework (asyncio)" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "aiohttp-3.8.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:41d55fc043954cddbbd82503d9cc3f4814a40bcef30b3569bc7b5e34130718c1"}, - {file = "aiohttp-3.8.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1d84166673694841d8953f0a8d0c90e1087739d24632fe86b1a08819168b4566"}, - {file = "aiohttp-3.8.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:253bf92b744b3170eb4c4ca2fa58f9c4b87aeb1df42f71d4e78815e6e8b73c9e"}, - {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3fd194939b1f764d6bb05490987bfe104287bbf51b8d862261ccf66f48fb4096"}, - {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c5f938d199a6fdbdc10bbb9447496561c3a9a565b43be564648d81e1102ac22"}, - {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2817b2f66ca82ee699acd90e05c95e79bbf1dc986abb62b61ec8aaf851e81c93"}, - {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fa375b3d34e71ccccf172cab401cd94a72de7a8cc01847a7b3386204093bb47"}, - {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9de50a199b7710fa2904be5a4a9b51af587ab24c8e540a7243ab737b45844543"}, - {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e1d8cb0b56b3587c5c01de3bf2f600f186da7e7b5f7353d1bf26a8ddca57f965"}, - {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8e31e9db1bee8b4f407b77fd2507337a0a80665ad7b6c749d08df595d88f1cf5"}, - {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7bc88fc494b1f0311d67f29fee6fd636606f4697e8cc793a2d912ac5b19aa38d"}, - {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ec00c3305788e04bf6d29d42e504560e159ccaf0be30c09203b468a6c1ccd3b2"}, - {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad1407db8f2f49329729564f71685557157bfa42b48f4b93e53721a16eb813ed"}, - {file = "aiohttp-3.8.6-cp310-cp310-win32.whl", hash = "sha256:ccc360e87341ad47c777f5723f68adbb52b37ab450c8bc3ca9ca1f3e849e5fe2"}, - {file = "aiohttp-3.8.6-cp310-cp310-win_amd64.whl", hash = "sha256:93c15c8e48e5e7b89d5cb4613479d144fda8344e2d886cf694fd36db4cc86865"}, - {file = "aiohttp-3.8.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6e2f9cc8e5328f829f6e1fb74a0a3a939b14e67e80832975e01929e320386b34"}, - {file = "aiohttp-3.8.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e6a00ffcc173e765e200ceefb06399ba09c06db97f401f920513a10c803604ca"}, - {file = "aiohttp-3.8.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:41bdc2ba359032e36c0e9de5a3bd00d6fb7ea558a6ce6b70acedf0da86458321"}, - {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14cd52ccf40006c7a6cd34a0f8663734e5363fd981807173faf3a017e202fec9"}, - {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2d5b785c792802e7b275c420d84f3397668e9d49ab1cb52bd916b3b3ffcf09ad"}, - {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1bed815f3dc3d915c5c1e556c397c8667826fbc1b935d95b0ad680787896a358"}, - {file = 
"aiohttp-3.8.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96603a562b546632441926cd1293cfcb5b69f0b4159e6077f7c7dbdfb686af4d"}, - {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d76e8b13161a202d14c9584590c4df4d068c9567c99506497bdd67eaedf36403"}, - {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e3f1e3f1a1751bb62b4a1b7f4e435afcdade6c17a4fd9b9d43607cebd242924a"}, - {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:76b36b3124f0223903609944a3c8bf28a599b2cc0ce0be60b45211c8e9be97f8"}, - {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:a2ece4af1f3c967a4390c284797ab595a9f1bc1130ef8b01828915a05a6ae684"}, - {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:16d330b3b9db87c3883e565340d292638a878236418b23cc8b9b11a054aaa887"}, - {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:42c89579f82e49db436b69c938ab3e1559e5a4409eb8639eb4143989bc390f2f"}, - {file = "aiohttp-3.8.6-cp311-cp311-win32.whl", hash = "sha256:efd2fcf7e7b9d7ab16e6b7d54205beded0a9c8566cb30f09c1abe42b4e22bdcb"}, - {file = "aiohttp-3.8.6-cp311-cp311-win_amd64.whl", hash = "sha256:3b2ab182fc28e7a81f6c70bfbd829045d9480063f5ab06f6e601a3eddbbd49a0"}, - {file = "aiohttp-3.8.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:fdee8405931b0615220e5ddf8cd7edd8592c606a8e4ca2a00704883c396e4479"}, - {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d25036d161c4fe2225d1abff2bd52c34ed0b1099f02c208cd34d8c05729882f0"}, - {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d791245a894be071d5ab04bbb4850534261a7d4fd363b094a7b9963e8cdbd31"}, - {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0cccd1de239afa866e4ce5c789b3032442f19c261c7d8a01183fd956b1935349"}, - {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f13f60d78224f0dace220d8ab4ef1dbc37115eeeab8c06804fec11bec2bbd07"}, - {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a9b5a0606faca4f6cc0d338359d6fa137104c337f489cd135bb7fbdbccb1e39"}, - {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:13da35c9ceb847732bf5c6c5781dcf4780e14392e5d3b3c689f6d22f8e15ae31"}, - {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:4d4cbe4ffa9d05f46a28252efc5941e0462792930caa370a6efaf491f412bc66"}, - {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:229852e147f44da0241954fc6cb910ba074e597f06789c867cb7fb0621e0ba7a"}, - {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:713103a8bdde61d13490adf47171a1039fd880113981e55401a0f7b42c37d071"}, - {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:45ad816b2c8e3b60b510f30dbd37fe74fd4a772248a52bb021f6fd65dff809b6"}, - {file = "aiohttp-3.8.6-cp36-cp36m-win32.whl", hash = "sha256:2b8d4e166e600dcfbff51919c7a3789ff6ca8b3ecce16e1d9c96d95dd569eb4c"}, - {file = "aiohttp-3.8.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0912ed87fee967940aacc5306d3aa8ba3a459fcd12add0b407081fbefc931e53"}, - {file = "aiohttp-3.8.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e2a988a0c673c2e12084f5e6ba3392d76c75ddb8ebc6c7e9ead68248101cd446"}, - {file = 
"aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebf3fd9f141700b510d4b190094db0ce37ac6361a6806c153c161dc6c041ccda"}, - {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3161ce82ab85acd267c8f4b14aa226047a6bee1e4e6adb74b798bd42c6ae1f80"}, - {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d95fc1bf33a9a81469aa760617b5971331cdd74370d1214f0b3109272c0e1e3c"}, - {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c43ecfef7deaf0617cee936836518e7424ee12cb709883f2c9a1adda63cc460"}, - {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca80e1b90a05a4f476547f904992ae81eda5c2c85c66ee4195bb8f9c5fb47f28"}, - {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:90c72ebb7cb3a08a7f40061079817133f502a160561d0675b0a6adf231382c92"}, - {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bb54c54510e47a8c7c8e63454a6acc817519337b2b78606c4e840871a3e15349"}, - {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:de6a1c9f6803b90e20869e6b99c2c18cef5cc691363954c93cb9adeb26d9f3ae"}, - {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:a3628b6c7b880b181a3ae0a0683698513874df63783fd89de99b7b7539e3e8a8"}, - {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:fc37e9aef10a696a5a4474802930079ccfc14d9f9c10b4662169671ff034b7df"}, - {file = "aiohttp-3.8.6-cp37-cp37m-win32.whl", hash = "sha256:f8ef51e459eb2ad8e7a66c1d6440c808485840ad55ecc3cafefadea47d1b1ba2"}, - {file = "aiohttp-3.8.6-cp37-cp37m-win_amd64.whl", hash = "sha256:b2fe42e523be344124c6c8ef32a011444e869dc5f883c591ed87f84339de5976"}, - {file = "aiohttp-3.8.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9e2ee0ac5a1f5c7dd3197de309adfb99ac4617ff02b0603fd1e65b07dc772e4b"}, - {file = "aiohttp-3.8.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:01770d8c04bd8db568abb636c1fdd4f7140b284b8b3e0b4584f070180c1e5c62"}, - {file = "aiohttp-3.8.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c68330a59506254b556b99a91857428cab98b2f84061260a67865f7f52899f5"}, - {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89341b2c19fb5eac30c341133ae2cc3544d40d9b1892749cdd25892bbc6ac951"}, - {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71783b0b6455ac8f34b5ec99d83e686892c50498d5d00b8e56d47f41b38fbe04"}, - {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f628dbf3c91e12f4d6c8b3f092069567d8eb17814aebba3d7d60c149391aee3a"}, - {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b04691bc6601ef47c88f0255043df6f570ada1a9ebef99c34bd0b72866c217ae"}, - {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ee912f7e78287516df155f69da575a0ba33b02dd7c1d6614dbc9463f43066e3"}, - {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9c19b26acdd08dd239e0d3669a3dddafd600902e37881f13fbd8a53943079dbc"}, - {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:99c5ac4ad492b4a19fc132306cd57075c28446ec2ed970973bbf036bcda1bcc6"}, - {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = 
"sha256:f0f03211fd14a6a0aed2997d4b1c013d49fb7b50eeb9ffdf5e51f23cfe2c77fa"}, - {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:8d399dade330c53b4106160f75f55407e9ae7505263ea86f2ccca6bfcbdb4921"}, - {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ec4fd86658c6a8964d75426517dc01cbf840bbf32d055ce64a9e63a40fd7b771"}, - {file = "aiohttp-3.8.6-cp38-cp38-win32.whl", hash = "sha256:33164093be11fcef3ce2571a0dccd9041c9a93fa3bde86569d7b03120d276c6f"}, - {file = "aiohttp-3.8.6-cp38-cp38-win_amd64.whl", hash = "sha256:bdf70bfe5a1414ba9afb9d49f0c912dc524cf60141102f3a11143ba3d291870f"}, - {file = "aiohttp-3.8.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d52d5dc7c6682b720280f9d9db41d36ebe4791622c842e258c9206232251ab2b"}, - {file = "aiohttp-3.8.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4ac39027011414dbd3d87f7edb31680e1f430834c8cef029f11c66dad0670aa5"}, - {file = "aiohttp-3.8.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f5c7ce535a1d2429a634310e308fb7d718905487257060e5d4598e29dc17f0b"}, - {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b30e963f9e0d52c28f284d554a9469af073030030cef8693106d918b2ca92f54"}, - {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:918810ef188f84152af6b938254911055a72e0f935b5fbc4c1a4ed0b0584aed1"}, - {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:002f23e6ea8d3dd8d149e569fd580c999232b5fbc601c48d55398fbc2e582e8c"}, - {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4fcf3eabd3fd1a5e6092d1242295fa37d0354b2eb2077e6eb670accad78e40e1"}, - {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:255ba9d6d5ff1a382bb9a578cd563605aa69bec845680e21c44afc2670607a95"}, - {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d67f8baed00870aa390ea2590798766256f31dc5ed3ecc737debb6e97e2ede78"}, - {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:86f20cee0f0a317c76573b627b954c412ea766d6ada1a9fcf1b805763ae7feeb"}, - {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:39a312d0e991690ccc1a61f1e9e42daa519dcc34ad03eb6f826d94c1190190dd"}, - {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e827d48cf802de06d9c935088c2924e3c7e7533377d66b6f31ed175c1620e05e"}, - {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bd111d7fc5591ddf377a408ed9067045259ff2770f37e2d94e6478d0f3fc0c17"}, - {file = "aiohttp-3.8.6-cp39-cp39-win32.whl", hash = "sha256:caf486ac1e689dda3502567eb89ffe02876546599bbf915ec94b1fa424eeffd4"}, - {file = "aiohttp-3.8.6-cp39-cp39-win_amd64.whl", hash = "sha256:3f0e27e5b733803333bb2371249f41cf42bae8884863e8e8965ec69bebe53132"}, - {file = "aiohttp-3.8.6.tar.gz", hash = "sha256:b0cf2a4501bff9330a8a5248b4ce951851e415bdcce9dc158e76cfd55e15085c"}, + {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6896b8416be9ada4d22cd359d7cb98955576ce863eadad5596b7cdfbf3e17c6c"}, + {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1736d87dad8ef46a8ec9cddd349fa9f7bd3a064c47dd6469c0d6763d3d49a4fc"}, + {file = "aiohttp-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c9e5f4d7208cda1a2bb600e29069eecf857e6980d0ccc922ccf9d1372c16f4b"}, + {file = 
"aiohttp-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8488519aa05e636c5997719fe543c8daf19f538f4fa044f3ce94bee608817cff"}, + {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ab16c254e2312efeb799bc3c06897f65a133b38b69682bf75d1f1ee1a9c43a9"}, + {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a94bde005a8f926d0fa38b88092a03dea4b4875a61fbcd9ac6f4351df1b57cd"}, + {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b777c9286b6c6a94f50ddb3a6e730deec327e9e2256cb08b5530db0f7d40fd8"}, + {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:571760ad7736b34d05597a1fd38cbc7d47f7b65deb722cb8e86fd827404d1f6b"}, + {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:deac0a32aec29608eb25d730f4bc5a261a65b6c48ded1ed861d2a1852577c932"}, + {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4ee1b4152bc3190cc40ddd6a14715e3004944263ea208229ab4c297712aa3075"}, + {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:3607375053df58ed6f23903aa10cf3112b1240e8c799d243bbad0f7be0666986"}, + {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:65b0a70a25456d329a5e1426702dde67be0fb7a4ead718005ba2ca582d023a94"}, + {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a2eb5311a37fe105aa35f62f75a078537e1a9e4e1d78c86ec9893a3c97d7a30"}, + {file = "aiohttp-3.9.0-cp310-cp310-win32.whl", hash = "sha256:2cbc14a13fb6b42d344e4f27746a4b03a2cb0c1c3c5b932b0d6ad8881aa390e3"}, + {file = "aiohttp-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ac9669990e2016d644ba8ae4758688534aabde8dbbc81f9af129c3f5f01ca9cd"}, + {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f8e05f5163528962ce1d1806fce763ab893b1c5b7ace0a3538cd81a90622f844"}, + {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4afa8f71dba3a5a2e1e1282a51cba7341ae76585345c43d8f0e624882b622218"}, + {file = "aiohttp-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f929f4c9b9a00f3e6cc0587abb95ab9c05681f8b14e0fe1daecfa83ea90f8318"}, + {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28185e36a78d247c55e9fbea2332d16aefa14c5276a582ce7a896231c6b1c208"}, + {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a486ddf57ab98b6d19ad36458b9f09e6022de0381674fe00228ca7b741aacb2f"}, + {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70e851f596c00f40a2f00a46126c95c2e04e146015af05a9da3e4867cfc55911"}, + {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5b7bf8fe4d39886adc34311a233a2e01bc10eb4e842220235ed1de57541a896"}, + {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c67a51ea415192c2e53e4e048c78bab82d21955b4281d297f517707dc836bf3d"}, + {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:694df243f394629bcae2d8ed94c589a181e8ba8604159e6e45e7b22e58291113"}, + {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3dd8119752dd30dd7bca7d4bc2a92a59be6a003e4e5c2cf7e248b89751b8f4b7"}, + {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = 
"sha256:eb6dfd52063186ac97b4caa25764cdbcdb4b10d97f5c5f66b0fa95052e744eb7"}, + {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d97c3e286d0ac9af6223bc132dc4bad6540b37c8d6c0a15fe1e70fb34f9ec411"}, + {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:816f4db40555026e4cdda604a1088577c1fb957d02f3f1292e0221353403f192"}, + {file = "aiohttp-3.9.0-cp311-cp311-win32.whl", hash = "sha256:3abf0551874fecf95f93b58f25ef4fc9a250669a2257753f38f8f592db85ddea"}, + {file = "aiohttp-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:e18d92c3e9e22553a73e33784fcb0ed484c9874e9a3e96c16a8d6a1e74a0217b"}, + {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:99ae01fb13a618b9942376df77a1f50c20a281390dad3c56a6ec2942e266220d"}, + {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:05857848da443c8c12110d99285d499b4e84d59918a21132e45c3f0804876994"}, + {file = "aiohttp-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:317719d7f824eba55857fe0729363af58e27c066c731bc62cd97bc9c3d9c7ea4"}, + {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1e3b3c107ccb0e537f309f719994a55621acd2c8fdf6d5ce5152aed788fb940"}, + {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45820ddbb276113ead8d4907a7802adb77548087ff5465d5c554f9aa3928ae7d"}, + {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:05a183f1978802588711aed0dea31e697d760ce9055292db9dc1604daa9a8ded"}, + {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a4cd44788ea0b5e6bb8fa704597af3a30be75503a7ed1098bc5b8ffdf6c982"}, + {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:673343fbc0c1ac44d0d2640addc56e97a052504beacd7ade0dc5e76d3a4c16e8"}, + {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e8a3b79b6d186a9c99761fd4a5e8dd575a48d96021f220ac5b5fa856e5dd029"}, + {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6777a390e41e78e7c45dab43a4a0196c55c3b8c30eebe017b152939372a83253"}, + {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7ae5f99a32c53731c93ac3075abd3e1e5cfbe72fc3eaac4c27c9dd64ba3b19fe"}, + {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f1e4f254e9c35d8965d377e065c4a8a55d396fe87c8e7e8429bcfdeeb229bfb3"}, + {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11ca808f9a6b63485059f5f6e164ef7ec826483c1212a44f268b3653c91237d8"}, + {file = "aiohttp-3.9.0-cp312-cp312-win32.whl", hash = "sha256:de3cc86f4ea8b4c34a6e43a7306c40c1275e52bfa9748d869c6b7d54aa6dad80"}, + {file = "aiohttp-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca4fddf84ac7d8a7d0866664936f93318ff01ee33e32381a115b19fb5a4d1202"}, + {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f09960b5bb1017d16c0f9e9f7fc42160a5a49fa1e87a175fd4a2b1a1833ea0af"}, + {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8303531e2c17b1a494ffaeba48f2da655fe932c4e9a2626c8718403c83e5dd2b"}, + {file = "aiohttp-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4790e44f46a4aa07b64504089def5744d3b6780468c4ec3a1a36eb7f2cae9814"}, + {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a1d7edf74a36de0e5ca50787e83a77cf352f5504eb0ffa3f07000a911ba353fb"}, + {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94697c7293199c2a2551e3e3e18438b4cba293e79c6bc2319f5fd652fccb7456"}, + {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1b66dbb8a7d5f50e9e2ea3804b01e766308331d0cac76eb30c563ac89c95985"}, + {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9623cfd9e85b76b83ef88519d98326d4731f8d71869867e47a0b979ffec61c73"}, + {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f32c86dc967ab8c719fd229ce71917caad13cc1e8356ee997bf02c5b368799bf"}, + {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f50b4663c3e0262c3a361faf440761fbef60ccdde5fe8545689a4b3a3c149fb4"}, + {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dcf71c55ec853826cd70eadb2b6ac62ec577416442ca1e0a97ad875a1b3a0305"}, + {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:42fe4fd9f0dfcc7be4248c162d8056f1d51a04c60e53366b0098d1267c4c9da8"}, + {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:76a86a9989ebf82ee61e06e2bab408aec4ea367dc6da35145c3352b60a112d11"}, + {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f9e09a1c83521d770d170b3801eea19b89f41ccaa61d53026ed111cb6f088887"}, + {file = "aiohttp-3.9.0-cp38-cp38-win32.whl", hash = "sha256:a00ce44c21612d185c5275c5cba4bab8d7c1590f248638b667ed8a782fa8cd6f"}, + {file = "aiohttp-3.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:d5b9345ab92ebe6003ae11d8092ce822a0242146e6fa270889b9ba965457ca40"}, + {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98d21092bf2637c5fa724a428a69e8f5955f2182bff61f8036827cf6ce1157bf"}, + {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:35a68cd63ca6aaef5707888f17a70c36efe62b099a4e853d33dc2e9872125be8"}, + {file = "aiohttp-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7f6235c7475658acfc1769d968e07ab585c79f6ca438ddfecaa9a08006aee2"}, + {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db04d1de548f7a62d1dd7e7cdf7c22893ee168e22701895067a28a8ed51b3735"}, + {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:536b01513d67d10baf6f71c72decdf492fb7433c5f2f133e9a9087379d4b6f31"}, + {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c8b0a6487e8109427ccf638580865b54e2e3db4a6e0e11c02639231b41fc0f"}, + {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7276fe0017664414fdc3618fca411630405f1aaf0cc3be69def650eb50441787"}, + {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23170247ef89ffa842a02bbfdc425028574d9e010611659abeb24d890bc53bb8"}, + {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1a2ea8252cacc7fd51df5a56d7a2bb1986ed39be9397b51a08015727dfb69bd"}, + {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2d71abc15ff7047412ef26bf812dfc8d0d1020d664617f4913df2df469f26b76"}, + {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:2d820162c8c2bdbe97d328cd4f417c955ca370027dce593345e437b2e9ffdc4d"}, + {file = 
"aiohttp-3.9.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:2779f5e7c70f7b421915fd47db332c81de365678180a9f3ab404088f87ba5ff9"}, + {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:366bc870d7ac61726f32a489fbe3d1d8876e87506870be66b01aeb84389e967e"}, + {file = "aiohttp-3.9.0-cp39-cp39-win32.whl", hash = "sha256:1df43596b826022b14998f0460926ce261544fedefe0d2f653e1b20f49e96454"}, + {file = "aiohttp-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:9c196b30f1b1aa3363a69dd69079ae9bec96c2965c4707eaa6914ba099fb7d4f"}, + {file = "aiohttp-3.9.0.tar.gz", hash = "sha256:09f23292d29135025e19e8ff4f0a68df078fe4ee013bca0105b2e803989de92d"}, ] [package.dependencies] aiosignal = ">=1.1.2" -async-timeout = ">=4.0.0a3,<5.0" +async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} attrs = ">=17.3.0" -charset-normalizer = ">=2.0,<4.0" frozenlist = ">=1.1.1" multidict = ">=4.5,<7.0" yarl = ">=1.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns", "cchardet"] +speedups = ["Brotli", "aiodns", "brotlicffi"] [[package]] name = "aiopg" @@ -887,34 +875,34 @@ files = [ [[package]] name = "cryptography" -version = "41.0.4" +version = "41.0.6" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839"}, - {file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f"}, - {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714"}, - {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb"}, - {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13"}, - {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143"}, - {file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397"}, - {file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860"}, - {file = "cryptography-41.0.4-cp37-abi3-win32.whl", hash = "sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd"}, - {file = "cryptography-41.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d"}, - {file = "cryptography-41.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67"}, - {file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e"}, - {file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829"}, - {file = "cryptography-41.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca"}, - {file = 
"cryptography-41.0.4-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d"}, - {file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac"}, - {file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9"}, - {file = "cryptography-41.0.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f"}, - {file = "cryptography-41.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91"}, - {file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8"}, - {file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6"}, - {file = "cryptography-41.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311"}, - {file = "cryptography-41.0.4.tar.gz", hash = "sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a"}, + {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c"}, + {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b"}, + {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8"}, + {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86"}, + {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae"}, + {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d"}, + {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c"}, + {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596"}, + {file = "cryptography-41.0.6-cp37-abi3-win32.whl", hash = "sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660"}, + {file = "cryptography-41.0.6-cp37-abi3-win_amd64.whl", hash = "sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7"}, + {file = "cryptography-41.0.6-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c"}, + {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9"}, + {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da"}, + {file = "cryptography-41.0.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36"}, + {file = 
"cryptography-41.0.6-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65"}, + {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead"}, + {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09"}, + {file = "cryptography-41.0.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c"}, + {file = "cryptography-41.0.6-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed"}, + {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6"}, + {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43"}, + {file = "cryptography-41.0.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4"}, + {file = "cryptography-41.0.6.tar.gz", hash = "sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3"}, ] [package.dependencies] @@ -1979,18 +1967,18 @@ pytest = [ [[package]] name = "pytest-rerunfailures" -version = "11.1.2" +version = "13.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"}, - {file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"}, + {file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"}, + {file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"}, ] [package.dependencies] packaging = ">=17.1" -pytest = ">=5.3" +pytest = ">=7" [[package]] name = "pytest-split" @@ -2488,16 +2476,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2719,4 +2697,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "0834e5cb69e5457741d4f476c3e49a4dc83598b5730685c8755da651b96ad3ec" +content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0ec7efd3167a..48c8604d86fa 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -24,6 +24,7 @@ hostname.workspace = true humantime.workspace = true hyper-tungstenite.workspace = true hyper.workspace = true +ipnet.workspace = true itertools.workspace = true md5.workspace = true metrics.workspace = true @@ -68,6 +69,7 @@ webpki-roots.workspace = true x509-parser.workspace = true native-tls.workspace = true postgres-native-tls.workspace = true +smol_str.workspace = true workspace_hack.workspace = true tokio-util.workspace = true @@ -76,3 +78,4 @@ tokio-util.workspace = true rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true +postgres-protocol.workspace = true diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 58dceb3bb6e1..7d79d3404525 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -4,7 +4,7 @@ pub mod backend; pub use backend::BackendType; mod credentials; -pub use credentials::ClientCredentials; +pub use credentials::{check_peer_addr_is_in_list, ClientCredentials}; mod password_hack; pub use password_hack::parse_endpoint_param; @@ -56,6 +56,12 @@ pub enum AuthErrorImpl { /// Errors produced by e.g. [`crate::stream::PqStream`]. #[error(transparent)] Io(#[from] io::Error), + + #[error( + "This IP address is not allowed to connect to this endpoint. \ + Please add it to the allowed list in the Neon console." 
+ )] + IpAddressNotAllowed, } #[derive(Debug, Error)] @@ -70,6 +76,10 @@ impl AuthError { pub fn auth_failed(user: impl Into>) -> Self { AuthErrorImpl::AuthFailed(user.into()).into() } + + pub fn ip_address_not_allowed() -> Self { + AuthErrorImpl::IpAddressNotAllowed.into() + } } impl> From for AuthError { @@ -91,6 +101,7 @@ impl UserFacingError for AuthError { MalformedPassword(_) => self.to_string(), MissingEndpointName => self.to_string(), Io(_) => "Internal error".to_string(), + IpAddressNotAllowed => self.to_string(), } } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 9cf45c0eec30..aa872285b169 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -5,7 +5,13 @@ mod link; pub use link::LinkAuthError; use tokio_postgres::config::AuthKeys; +use crate::auth::credentials::check_peer_addr_is_in_list; +use crate::console::errors::GetAuthInfoError; +use crate::console::provider::AuthInfo; +use crate::console::AuthSecret; use crate::proxy::{handle_try_wake, retry_after, LatencyTimer}; +use crate::scram; +use crate::stream::Stream; use crate::{ auth::{self, ClientCredentials}, config::AuthenticationConfig, @@ -19,6 +25,7 @@ use crate::{ use futures::TryFutureExt; use std::borrow::Cow; use std::ops::ControlFlow; +use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{error, info, warn}; @@ -63,6 +70,7 @@ pub enum BackendType<'a, T> { pub trait TestBackend: Send + Sync + 'static { fn wake_compute(&self) -> Result; + fn get_allowed_ips(&self) -> Result>, console::errors::GetAuthInfoError>; } impl std::fmt::Display for BackendType<'_, ()> { @@ -131,7 +139,7 @@ async fn auth_quirks_creds( api: &impl console::Api, extra: &ConsoleReqExtra<'_>, creds: &mut ClientCredentials<'_>, - client: &mut stream::PqStream, + client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, latency_timer: &mut LatencyTimer, @@ -139,14 +147,38 @@ async fn auth_quirks_creds( // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. // We now expect to see a very specific payload in the place of password. - if creds.project.is_none() { + let maybe_success = if creds.project.is_none() { // Password will be checked by the compute node later. - return hacks::password_hack(creds, client, latency_timer).await; - } + Some(hacks::password_hack(creds, client, latency_timer).await?) + } else { + None + }; // Password hack should set the project name. // TODO: make `creds.project` more type-safe. assert!(creds.project.is_some()); + info!("fetching user's authentication info"); + // TODO(anna): this will slow down both "hacks" below; we probably need a cache. + let AuthInfo { + secret, + allowed_ips, + } = api.get_auth_info(extra, creds).await?; + + // check allowed list + if !check_peer_addr_is_in_list(&creds.peer_addr.ip(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed()); + } + let secret = secret.unwrap_or_else(|| { + // If we don't have an authentication secret, we mock one to + // prevent malicious probing (possible due to missing protocol steps). + // This mocked secret will never lead to successful authentication. + info!("authentication info not found, mocking it"); + AuthSecret::Scram(scram::ServerSecret::mock(creds.user, rand::random())) + }); + + if let Some(success) = maybe_success { + return Ok(success); + } // Perform cleartext auth if we're allowed to do that. // Currently, we use it for websocket connections (latency). 
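The hunk above reorders auth_quirks_creds so that the proxy now fetches AuthInfo (a stored secret plus the endpoint's allowed_ips) once, rejects peers whose address fails check_peer_addr_is_in_list with AuthError::ip_address_not_allowed, and only then substitutes a mocked SCRAM secret when no secret exists. The following is a minimal standalone sketch of that ordering only; the names AuthInfo (as defined here), addr_allowed and gate are illustrative stand-ins, not the proxy's actual API, and the allow-list matching is deliberately simplified to exact addresses.

use std::net::IpAddr;

// Illustrative stand-in for the AuthInfo { secret, allowed_ips } pair the
// console returns in this change; the real secret type is AuthSecret.
struct AuthInfo {
    secret: Option<String>,
    allowed_ips: Vec<String>,
}

// Simplified allow-list check: an empty list means "no restriction",
// mirroring check_peer_addr_is_in_list; only exact matches are handled here.
fn addr_allowed(peer: &IpAddr, list: &[String]) -> bool {
    list.is_empty() || list.iter().any(|p| p.parse::<IpAddr>().ok().as_ref() == Some(peer))
}

fn gate(peer: IpAddr, info: AuthInfo) -> Result<String, &'static str> {
    // 1) Reject disallowed peers before any authentication exchange happens.
    if !addr_allowed(&peer, &info.allowed_ips) {
        return Err("this IP address is not allowed to connect to this endpoint");
    }
    // 2) If no secret is stored, hand back a mock so a missing user is
    //    indistinguishable from a wrong password (the anti-probing step
    //    that ServerSecret::mock provides in the real code).
    Ok(info
        .secret
        .unwrap_or_else(|| "mocked-scram-secret".to_string()))
}

fn main() {
    let info = AuthInfo {
        secret: None,
        allowed_ips: vec!["127.0.0.1".to_string()],
    };
    // Allowed peer with no stored secret: the gate still returns a (mock) secret.
    println!("{:?}", gate("127.0.0.1".parse().unwrap(), info));
}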
@@ -156,7 +188,7 @@ async fn auth_quirks_creds( } // Finally, proceed with the main auth flow (SCRAM-based). - classic::authenticate(api, extra, creds, client, config, latency_timer).await + classic::authenticate(creds, client, config, latency_timer, secret).await } /// True to its name, this function encapsulates our current auth trade-offs. @@ -165,7 +197,7 @@ async fn auth_quirks( api: &impl console::Api, extra: &ConsoleReqExtra<'_>, creds: &mut ClientCredentials<'_>, - client: &mut stream::PqStream, + client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, latency_timer: &mut LatencyTimer, @@ -241,7 +273,7 @@ impl BackendType<'_, ClientCredentials<'_>> { pub async fn authenticate( &mut self, extra: &ConsoleReqExtra<'_>, - client: &mut stream::PqStream, + client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, latency_timer: &mut LatencyTimer, @@ -304,6 +336,19 @@ impl BackendType<'_, ClientCredentials<'_>> { Ok(res) } + pub async fn get_allowed_ips( + &self, + extra: &ConsoleReqExtra<'_>, + ) -> Result>, GetAuthInfoError> { + use BackendType::*; + match self { + Console(api, creds) => api.get_allowed_ips(extra, creds).await, + Postgres(api, creds) => api.get_allowed_ips(extra, creds).await, + Link(_) => Ok(Arc::new(vec![])), + Test(x) => x.get_allowed_ips(), + } + } + /// When applicable, wake the compute node, gaining its connection info in the process. /// The link auth flow doesn't support this, so we return [`None`] in that case. pub async fn wake_compute( diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index aee00576062d..bb210821cd9a 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -3,38 +3,28 @@ use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, config::AuthenticationConfig, - console::{self, AuthInfo, ConsoleReqExtra}, + console::AuthSecret, proxy::LatencyTimer, - sasl, scram, - stream::PqStream, + sasl, + stream::{PqStream, Stream}, }; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( - api: &impl console::Api, - extra: &ConsoleReqExtra<'_>, creds: &ClientCredentials<'_>, - client: &mut PqStream, + client: &mut PqStream>, config: &'static AuthenticationConfig, latency_timer: &mut LatencyTimer, + secret: AuthSecret, ) -> auth::Result> { - info!("fetching user's authentication info"); - let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| { - // If we don't have an authentication secret, we mock one to - // prevent malicious probing (possible due to missing protocol steps). - // This mocked secret will never lead to successful authentication. 
- info!("authentication info not found, mocking it"); - AuthInfo::Scram(scram::ServerSecret::mock(creds.user, rand::random())) - }); - let flow = AuthFlow::new(client); - let scram_keys = match info { - AuthInfo::Md5(_) => { + let scram_keys = match secret { + AuthSecret::Md5(_) => { info!("auth endpoint chooses MD5"); return Err(auth::AuthError::bad_auth_method("MD5")); } - AuthInfo::Scram(secret) => { + AuthSecret::Scram(secret) => { info!("auth endpoint chooses SCRAM"); let scram = auth::Scram(&secret); diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 895683af1b2d..4448dbc56aaf 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -2,7 +2,7 @@ use super::{AuthSuccess, ComputeCredentials}; use crate::{ auth::{self, AuthFlow, ClientCredentials}, proxy::LatencyTimer, - stream, + stream::{self, Stream}, }; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; @@ -12,7 +12,7 @@ use tracing::{info, warn}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. pub async fn cleartext_hack( - client: &mut stream::PqStream, + client: &mut stream::PqStream>, latency_timer: &mut LatencyTimer, ) -> auth::Result> { warn!("cleartext auth flow override is enabled, proceeding"); @@ -37,7 +37,7 @@ pub async fn cleartext_hack( /// Very similar to [`cleartext_hack`], but there's a specific password format. pub async fn password_hack( creds: &mut ClientCredentials<'_>, - client: &mut stream::PqStream, + client: &mut stream::PqStream>, latency_timer: &mut LatencyTimer, ) -> auth::Result> { warn!("project not specified, resorting to the password hack auth flow"); diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index da43cf11c403..3a77d7e5cae2 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -106,7 +106,7 @@ pub(super) async fn authenticate( reported_auth_ok: true, value: NodeInfo { config, - aux: db_info.aux.into(), + aux: db_info.aux, allow_self_signed_compute: false, // caller may override }, }) diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 9fe9c26f0c27..facb8da8cd2d 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -7,9 +7,12 @@ use crate::{ }; use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::collections::HashSet; +use std::{ + collections::HashSet, + net::{IpAddr, SocketAddr}, +}; use thiserror::Error; -use tracing::info; +use tracing::{info, warn}; #[derive(Debug, Error, PartialEq, Eq, Clone)] pub enum ClientCredsParseError { @@ -44,6 +47,7 @@ pub struct ClientCredentials<'a> { pub project: Option, pub cache_key: String, + pub peer_addr: SocketAddr, } impl ClientCredentials<'_> { @@ -54,19 +58,11 @@ impl ClientCredentials<'_> { } impl<'a> ClientCredentials<'a> { - #[cfg(test)] - pub fn new_noop() -> Self { - ClientCredentials { - user: "", - project: None, - cache_key: "".to_string(), - } - } - pub fn parse( params: &'a StartupMessageParams, sni: Option<&str>, common_names: Option>, + peer_addr: SocketAddr, ) -> Result { use ClientCredsParseError::*; @@ -153,10 +149,59 @@ impl<'a> ClientCredentials<'a> { user, project, cache_key, + peer_addr, }) } } +pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec) -> bool { + if ip_list.is_empty() { + return true; + } + for ip in ip_list { + // We expect that all ip addresses from control plane are correct. 
+ // However, if some of them are broken, we still can check the others. + match parse_ip_pattern(ip) { + Ok(pattern) => { + if check_ip(peer_addr, &pattern) { + return true; + } + } + Err(err) => warn!("Cannot parse ip: {}; err: {}", ip, err), + } + } + false +} + +#[derive(Debug, Clone, Eq, PartialEq)] +enum IpPattern { + Subnet(ipnet::IpNet), + Range(IpAddr, IpAddr), + Single(IpAddr), +} + +fn parse_ip_pattern(pattern: &str) -> anyhow::Result { + if pattern.contains('/') { + let subnet: ipnet::IpNet = pattern.parse()?; + return Ok(IpPattern::Subnet(subnet)); + } + if let Some((start, end)) = pattern.split_once('-') { + let start: IpAddr = start.parse()?; + let end: IpAddr = end.parse()?; + return Ok(IpPattern::Range(start, end)); + } + let addr: IpAddr = pattern.parse()?; + Ok(IpPattern::Single(addr)) +} + +fn check_ip(ip: &IpAddr, pattern: &IpPattern) -> bool { + match pattern { + IpPattern::Subnet(subnet) => subnet.contains(ip), + IpPattern::Range(start, end) => start <= ip && ip <= end, + IpPattern::Single(addr) => addr == ip, + } +} + fn project_name_valid(name: &str) -> bool { name.chars().all(|c| c.is_alphanumeric() || c == '-') } @@ -176,8 +221,8 @@ mod tests { fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. let options = StartupMessageParams::new([("user", "john_doe")]); - - let creds = ClientCredentials::parse(&options, None, None)?; + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project, None); @@ -191,8 +236,8 @@ mod tests { ("database", "world"), // should be ignored ("foo", "bar"), // should be ignored ]); - - let creds = ClientCredentials::parse(&options, None, None)?; + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project, None); @@ -206,7 +251,8 @@ mod tests { let sni = Some("foo.localhost"); let common_names = Some(["localhost".into()].into()); - let creds = ClientCredentials::parse(&options, sni, common_names)?; + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("foo")); assert_eq!(creds.cache_key, "foo"); @@ -221,7 +267,8 @@ mod tests { ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let creds = ClientCredentials::parse(&options, None, None)?; + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("bar")); @@ -235,7 +282,8 @@ mod tests { ("options", "-ckey=1 endpoint=bar -c geqo=off"), ]); - let creds = ClientCredentials::parse(&options, None, None)?; + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("bar")); @@ -252,7 +300,8 @@ mod tests { ), ]); - let creds = ClientCredentials::parse(&options, None, None)?; + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert!(creds.project.is_none()); @@ -266,7 +315,8 @@ mod tests { 
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"), ]); - let creds = ClientCredentials::parse(&options, None, None)?; + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let creds = ClientCredentials::parse(&options, None, None, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert!(creds.project.is_none()); @@ -280,7 +330,8 @@ mod tests { let sni = Some("baz.localhost"); let common_names = Some(["localhost".into()].into()); - let creds = ClientCredentials::parse(&options, sni, common_names)?; + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("baz")); @@ -293,12 +344,14 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); - let creds = ClientCredentials::parse(&options, sni, common_names)?; + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; assert_eq!(creds.project.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); - let creds = ClientCredentials::parse(&options, sni, common_names)?; + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; assert_eq!(creds.project.as_deref(), Some("p1")); Ok(()) @@ -312,7 +365,9 @@ mod tests { let sni = Some("second.localhost"); let common_names = Some(["localhost".into()].into()); - let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail"); + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let err = ClientCredentials::parse(&options, sni, common_names, peer_addr) + .expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -329,7 +384,9 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); - let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail"); + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let err = ClientCredentials::parse(&options, sni, common_names, peer_addr) + .expect_err("should fail"); match err { UnknownCommonName { cn } => { assert_eq!(cn, "localhost"); @@ -347,7 +404,8 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); - let creds = ClientCredentials::parse(&options, sni, common_names)?; + let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234)); + let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?; assert_eq!(creds.project.as_deref(), Some("project")); assert_eq!( creds.cache_key, @@ -356,4 +414,91 @@ mod tests { Ok(()) } + + #[test] + fn test_check_peer_addr_is_in_list() { + let peer_addr = IpAddr::from([127, 0, 0, 1]); + assert!(check_peer_addr_is_in_list(&peer_addr, &vec![])); + assert!(check_peer_addr_is_in_list( + &peer_addr, + &vec!["127.0.0.1".into()] + )); + assert!(!check_peer_addr_is_in_list( + &peer_addr, + &vec!["8.8.8.8".into()] + )); + // If there is an incorrect address, it will be skipped. 
+ assert!(check_peer_addr_is_in_list( + &peer_addr, + &vec!["88.8.8".into(), "127.0.0.1".into()] + )); + } + #[test] + fn test_parse_ip_v4() -> anyhow::Result<()> { + let peer_addr = IpAddr::from([127, 0, 0, 1]); + // Ok + assert_eq!(parse_ip_pattern("127.0.0.1")?, IpPattern::Single(peer_addr)); + assert_eq!( + parse_ip_pattern("127.0.0.1/31")?, + IpPattern::Subnet(ipnet::IpNet::new(peer_addr, 31)?) + ); + assert_eq!( + parse_ip_pattern("0.0.0.0-200.0.1.2")?, + IpPattern::Range(IpAddr::from([0, 0, 0, 0]), IpAddr::from([200, 0, 1, 2])) + ); + + // Error + assert!(parse_ip_pattern("300.0.1.2").is_err()); + assert!(parse_ip_pattern("30.1.2").is_err()); + assert!(parse_ip_pattern("127.0.0.1/33").is_err()); + assert!(parse_ip_pattern("127.0.0.1-127.0.3").is_err()); + assert!(parse_ip_pattern("1234.0.0.1-127.0.3.0").is_err()); + Ok(()) + } + + #[test] + fn test_check_ipv4() -> anyhow::Result<()> { + let peer_addr = IpAddr::from([127, 0, 0, 1]); + let peer_addr_next = IpAddr::from([127, 0, 0, 2]); + let peer_addr_prev = IpAddr::from([127, 0, 0, 0]); + // Success + assert!(check_ip(&peer_addr, &IpPattern::Single(peer_addr))); + assert!(check_ip( + &peer_addr, + &IpPattern::Subnet(ipnet::IpNet::new(peer_addr_prev, 31)?) + )); + assert!(check_ip( + &peer_addr, + &IpPattern::Subnet(ipnet::IpNet::new(peer_addr_next, 30)?) + )); + assert!(check_ip( + &peer_addr, + &IpPattern::Range(IpAddr::from([0, 0, 0, 0]), IpAddr::from([200, 0, 1, 2])) + )); + assert!(check_ip( + &peer_addr, + &IpPattern::Range(peer_addr, peer_addr) + )); + + // Not success + assert!(!check_ip(&peer_addr, &IpPattern::Single(peer_addr_prev))); + assert!(!check_ip( + &peer_addr, + &IpPattern::Subnet(ipnet::IpNet::new(peer_addr_next, 31)?) + )); + assert!(!check_ip( + &peer_addr, + &IpPattern::Range(IpAddr::from([0, 0, 0, 0]), peer_addr_prev) + )); + assert!(!check_ip( + &peer_addr, + &IpPattern::Range(peer_addr_next, IpAddr::from([128, 0, 0, 0])) + )); + // There is no check that for range start <= end. But it's fine as long as for all this cases the result is false. + assert!(!check_ip( + &peer_addr, + &IpPattern::Range(peer_addr, peer_addr_prev) + )); + Ok(()) + } } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 190abc9b2e0d..efb90733d6fe 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,16 +1,21 @@ //! Main authentication flow. use super::{AuthErrorImpl, PasswordHackPayload}; -use crate::{sasl, scram, stream::PqStream}; +use crate::{ + config::TlsServerEndPoint, + sasl, scram, + stream::{PqStream, Stream}, +}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; use std::io; use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::info; /// Every authentication selector is supposed to implement this trait. pub trait AuthMethod { /// Any authentication selector should provide initial backend message /// containing auth method name and parameters, e.g. md5 salt. - fn first_message(&self) -> BeMessage<'_>; + fn first_message(&self, channel_binding: bool) -> BeMessage<'_>; } /// Initial state of [`AuthFlow`]. 
@@ -21,8 +26,14 @@ pub struct Scram<'a>(pub &'a scram::ServerSecret); impl AuthMethod for Scram<'_> { #[inline(always)] - fn first_message(&self) -> BeMessage<'_> { - Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(scram::METHODS)) + fn first_message(&self, channel_binding: bool) -> BeMessage<'_> { + if channel_binding { + Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(scram::METHODS)) + } else { + Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods( + scram::METHODS_WITHOUT_PLUS, + )) + } } } @@ -32,7 +43,7 @@ pub struct PasswordHack; impl AuthMethod for PasswordHack { #[inline(always)] - fn first_message(&self) -> BeMessage<'_> { + fn first_message(&self, _channel_binding: bool) -> BeMessage<'_> { Be::AuthenticationCleartextPassword } } @@ -43,37 +54,44 @@ pub struct CleartextPassword; impl AuthMethod for CleartextPassword { #[inline(always)] - fn first_message(&self) -> BeMessage<'_> { + fn first_message(&self, _channel_binding: bool) -> BeMessage<'_> { Be::AuthenticationCleartextPassword } } /// This wrapper for [`PqStream`] performs client authentication. #[must_use] -pub struct AuthFlow<'a, Stream, State> { +pub struct AuthFlow<'a, S, State> { /// The underlying stream which implements libpq's protocol. - stream: &'a mut PqStream, + stream: &'a mut PqStream>, /// State might contain ancillary data (see [`Self::begin`]). state: State, + tls_server_end_point: TlsServerEndPoint, } /// Initial state of the stream wrapper. -impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { +impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { /// Create a new wrapper for client authentication. - pub fn new(stream: &'a mut PqStream) -> Self { + pub fn new(stream: &'a mut PqStream>) -> Self { + let tls_server_end_point = stream.get_ref().tls_server_end_point(); + Self { stream, state: Begin, + tls_server_end_point, } } /// Move to the next step by sending auth method's name & params to client. pub async fn begin(self, method: M) -> io::Result> { - self.stream.write_message(&method.first_message()).await?; + self.stream + .write_message(&method.first_message(self.tls_server_end_point.supported())) + .await?; Ok(AuthFlow { stream: self.stream, state: method, + tls_server_end_point: self.tls_server_end_point, }) } } @@ -123,9 +141,15 @@ impl AuthFlow<'_, S, Scram<'_>> { return Err(super::AuthError::bad_auth_method(sasl.method)); } + info!("client chooses {}", sasl.method); + let secret = self.state.0; let outcome = sasl::SaslStream::new(self.stream, sasl.message) - .authenticate(scram::Exchange::new(secret, rand::random, None)) + .authenticate(scram::Exchange::new( + secret, + rand::random, + self.tls_server_end_point, + )) .await?; Ok(outcome) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 42aecdb6fe89..bedbdbcc8358 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -6,6 +6,8 @@ use std::{net::SocketAddr, sync::Arc}; use futures::future::Either; +use itertools::Itertools; +use proxy::config::TlsServerEndPoint; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; @@ -65,7 +67,7 @@ async fn main() -> anyhow::Result<()> { let destination: String = args.get_one::("dest").unwrap().parse()?; // Configure TLS - let tls_config: Arc = match ( + let (tls_config, tls_server_end_point): (Arc, TlsServerEndPoint) = match ( args.get_one::("tls-key"), args.get_one::("tls-cert"), ) { @@ -89,16 +91,22 @@ async fn main() -> anyhow::Result<()> { ))? 
.into_iter() .map(rustls::Certificate) - .collect() + .collect_vec() }; - rustls::ServerConfig::builder() + // needed for channel bindings + let first_cert = cert_chain.first().context("missing certificate")?; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let tls_config = rustls::ServerConfig::builder() .with_safe_default_cipher_suites() .with_safe_default_kx_groups() .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? .with_no_client_auth() .with_single_cert(cert_chain, key)? - .into() + .into(); + + (tls_config, tls_server_end_point) } _ => bail!("tls-key and tls-cert must be specified"), }; @@ -113,6 +121,7 @@ async fn main() -> anyhow::Result<()> { let main = tokio::spawn(task_main( Arc::new(destination), tls_config, + tls_server_end_point, proxy_listener, cancellation_token.clone(), )); @@ -134,6 +143,7 @@ async fn main() -> anyhow::Result<()> { async fn task_main( dest_suffix: Arc, tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, ) -> anyhow::Result<()> { @@ -159,7 +169,7 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - handle_client(dest_suffix, tls_config, socket).await + handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. @@ -207,6 +217,7 @@ const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmod async fn ssl_handshake( raw_stream: S, tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, ) -> anyhow::Result> { let mut stream = PqStream::new(Stream::from_raw(raw_stream)); @@ -231,7 +242,11 @@ async fn ssl_handshake( if !read_buf.is_empty() { bail!("data is sent before server replied with EncryptionResponse"); } - Ok(raw.upgrade(tls_config).await?) 
+ + Ok(Stream::Tls { + tls: Box::new(raw.upgrade(tls_config).await?), + tls_server_end_point, + }) } unexpected => { info!( @@ -246,9 +261,10 @@ async fn ssl_handshake( async fn handle_client( dest_suffix: Arc, tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let tls_stream = ssl_handshake(stream, tls_config).await?; + let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of @@ -268,5 +284,5 @@ async fn handle_client( let client = tokio::net::TcpStream::connect(destination).await?; let metrics_aux: MetricsAuxInfo = Default::default(); - proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await + proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 570cf0943a0e..7457e268679d 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,8 +1,11 @@ use futures::future::Either; use proxy::auth; use proxy::config::AuthenticationConfig; +use proxy::config::CacheOptions; use proxy::config::HttpConfig; use proxy::console; +use proxy::console::provider::AllowedIpsCache; +use proxy::console::provider::NodeInfoCache; use proxy::http; use proxy::rate_limiter::RateLimiterConfig; use proxy::usage_metrics; @@ -90,6 +93,9 @@ struct ProxyCliArgs { /// timeout for http connections #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] sql_over_http_timeout: tokio::time::Duration, + /// Whether the SQL over http pool is opt-in + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + sql_over_http_pool_opt_in: bool, /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, @@ -110,6 +116,12 @@ struct ProxyCliArgs { initial_limit: usize, #[clap(flatten)] aimd_config: proxy::rate_limiter::AimdConfig, + /// cache for `allowed_ips` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] + allowed_ips_cache: String, + /// disable ip check for http requests. If it is too time consuming, it could be turned off. 
+ #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + disable_ip_check_for_http: bool, } #[tokio::main] @@ -238,11 +250,24 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let auth_backend = match &args.auth_backend { AuthBackend::Console => { - let config::CacheOptions { size, ttl } = args.wake_compute_cache.parse()?; + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?; - info!("Using NodeInfoCache (wake_compute) with size={size} ttl={ttl:?}"); + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches { - node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl), + node_info: NodeInfoCache::new( + "node_info_cache", + wake_compute_cache_config.size, + wake_compute_cache_config.ttl, + true, + ), + allowed_ips: AllowedIpsCache::new( + "allowed_ips_cache", + allowed_ips_cache_config.size, + allowed_ips_cache_config.ttl, + false, + ), })); let config::WakeComputeLockOptions { @@ -275,7 +300,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } }; let http_config = HttpConfig { - sql_over_http_timeout: args.sql_over_http_timeout, + timeout: args.sql_over_http_timeout, + pool_opt_in: args.sql_over_http_pool_opt_in, }; let authentication_config = AuthenticationConfig { scram_protocol_timeout: args.scram_protocol_timeout, @@ -288,6 +314,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { http_config, authentication_config, require_client_ip: args.require_client_ip, + disable_ip_check_for_http: args.disable_ip_check_for_http, })); Ok(config) diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index a9d6793bbda9..f54f360b0195 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -55,7 +55,7 @@ pub mod timed_lru { /// * Whenever a new entry is inserted, the least recently accessed one is evicted. /// The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`). /// - /// * When the entry is about to be retrieved, we check its expiration timestamp. + /// * If `update_ttl_on_retrieval` is `true`. When the entry is about to be retrieved, we check its expiration timestamp. /// If the entry has expired, we remove it from the cache; Otherwise we bump the /// expiration timestamp (e.g. +5mins) and change its place in LRU list to prolong /// its existence. @@ -79,6 +79,8 @@ pub mod timed_lru { /// Default time-to-live of a single entry. ttl: Duration, + + update_ttl_on_retrieval: bool, } impl Cache for TimedLru { @@ -99,11 +101,17 @@ pub mod timed_lru { impl TimedLru { /// Construct a new LRU cache with timed entries. - pub fn new(name: &'static str, capacity: usize, ttl: Duration) -> Self { + pub fn new( + name: &'static str, + capacity: usize, + ttl: Duration, + update_ttl_on_retrieval: bool, + ) -> Self { Self { name, cache: LruCache::new(capacity).into(), ttl, + update_ttl_on_retrieval, } } @@ -165,7 +173,9 @@ pub mod timed_lru { let (created_at, expires_at) = (entry.created_at, entry.expires_at); // Update the deadline and the entry's position in the LRU list. 
- raw_entry.get_mut().expires_at = deadline; + if self.update_ttl_on_retrieval { + raw_entry.get_mut().expires_at = deadline; + } raw_entry.to_back(); drop(cache); // drop lock before logging diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 0741ad06230f..c838c8fc3814 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,9 +1,6 @@ use crate::{ - auth::parse_endpoint_param, - cancellation::CancelClosure, - console::errors::WakeComputeError, - error::{io_error, UserFacingError}, - proxy::is_neon_param, + auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError, + error::UserFacingError, proxy::is_neon_param, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; @@ -28,12 +25,9 @@ pub enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] TlsError(#[from] native_tls::Error), -} -impl From for ConnectionError { - fn from(value: WakeComputeError) -> Self { - io_error(value).into() - } + #[error("{COULD_NOT_CONNECT}: {0}")] + WakeComputeError(#[from] WakeComputeError), } impl UserFacingError for ConnectionError { @@ -46,6 +40,7 @@ impl UserFacingError for ConnectionError { Some(err) => err.message().to_owned(), None => err.to_string(), }, + WakeComputeError(err) => err.to_string_client(), _ => COULD_NOT_CONNECT.to_owned(), } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index bd00123905eb..182d71f9be0b 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,12 +1,15 @@ use crate::auth; use anyhow::{bail, ensure, Context, Ok}; -use rustls::sign; +use rustls::{sign, Certificate, PrivateKey}; +use sha2::{Digest, Sha256}; use std::{ collections::{HashMap, HashSet}, str::FromStr, sync::Arc, time::Duration, }; +use tracing::{error, info}; +use x509_parser::oid_registry; pub struct ProxyConfig { pub tls_config: Option, @@ -16,6 +19,7 @@ pub struct ProxyConfig { pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, + pub disable_ip_check_for_http: bool, } #[derive(Debug)] @@ -27,10 +31,12 @@ pub struct MetricCollectionConfig { pub struct TlsConfig { pub config: Arc, pub common_names: Option>, + pub cert_resolver: Arc, } pub struct HttpConfig { - pub sql_over_http_timeout: tokio::time::Duration, + pub timeout: tokio::time::Duration, + pub pool_opt_in: bool, } pub struct AuthenticationConfig { @@ -52,7 +58,7 @@ pub fn configure_tls( let mut cert_resolver = CertResolver::new(); // add default certificate - cert_resolver.add_cert(key_path, cert_path, true)?; + cert_resolver.add_cert_path(key_path, cert_path, true)?; // add extra certificates if let Some(certs_dir) = certs_dir { @@ -64,7 +70,7 @@ pub fn configure_tls( let key_path = path.join("tls.key"); let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { - cert_resolver.add_cert( + cert_resolver.add_cert_path( &key_path.to_string_lossy(), &cert_path.to_string_lossy(), false, @@ -76,35 +82,97 @@ pub fn configure_tls( let common_names = cert_resolver.get_common_names(); + let cert_resolver = Arc::new(cert_resolver); + let config = rustls::ServerConfig::builder() .with_safe_default_cipher_suites() .with_safe_default_kx_groups() // allow TLS 1.2 to be compatible with older client libraries .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? 
.with_no_client_auth() - .with_cert_resolver(Arc::new(cert_resolver)) + .with_cert_resolver(cert_resolver.clone()) .into(); Ok(TlsConfig { config, common_names: Some(common_names), + cert_resolver, }) } -struct CertResolver { - certs: HashMap>, - default: Option>, +/// Channel binding parameter +/// +/// +/// Description: The hash of the TLS server's certificate as it +/// appears, octet for octet, in the server's Certificate message. Note +/// that the Certificate message contains a certificate_list, in which +/// the first element is the server's certificate. +/// +/// The hash function is to be selected as follows: +/// +/// * if the certificate's signatureAlgorithm uses a single hash +/// function, and that hash function is either MD5 or SHA-1, then use SHA-256; +/// +/// * if the certificate's signatureAlgorithm uses a single hash +/// function and that hash function neither MD5 nor SHA-1, then use +/// the hash function associated with the certificate's +/// signatureAlgorithm; +/// +/// * if the certificate's signatureAlgorithm uses no hash functions or +/// uses multiple hash functions, then this channel binding type's +/// channel bindings are undefined at this time (updates to is channel +/// binding type may occur to address this issue if it ever arises). +#[derive(Debug, Clone, Copy)] +pub enum TlsServerEndPoint { + Sha256([u8; 32]), + Undefined, } -impl CertResolver { - fn new() -> Self { - Self { - certs: HashMap::new(), - default: None, +impl TlsServerEndPoint { + pub fn new(cert: &Certificate) -> anyhow::Result { + let sha256_oids = [ + // I'm explicitly not adding MD5 or SHA1 here... They're bad. + oid_registry::OID_SIG_ECDSA_WITH_SHA256, + oid_registry::OID_PKCS1_SHA256WITHRSA, + ]; + + let pem = x509_parser::parse_x509_certificate(&cert.0) + .context("Failed to parse PEM object from cerficiate")? + .1; + + info!(subject = %pem.subject, "parsing TLS certificate"); + + let reg = oid_registry::OidRegistry::default().with_all_crypto(); + let oid = pem.signature_algorithm.oid(); + let alg = reg.get(oid); + if sha256_oids.contains(oid) { + let tls_server_end_point: [u8; 32] = + Sha256::new().chain_update(&cert.0).finalize().into(); + info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); + Ok(Self::Sha256(tls_server_end_point)) + } else { + error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding"); + Ok(Self::Undefined) } } - fn add_cert( + pub fn supported(&self) -> bool { + !matches!(self, TlsServerEndPoint::Undefined) + } +} + +#[derive(Default)] +pub struct CertResolver { + certs: HashMap, TlsServerEndPoint)>, + default: Option<(Arc, TlsServerEndPoint)>, +} + +impl CertResolver { + pub fn new() -> Self { + Self::default() + } + + fn add_cert_path( &mut self, key_path: &str, cert_path: &str, @@ -120,57 +188,65 @@ impl CertResolver { keys.pop().map(rustls::PrivateKey).unwrap() }; - let key = sign::any_supported_type(&priv_key).context("invalid private key")?; - let cert_chain_bytes = std::fs::read(cert_path) .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; let cert_chain = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context(format!( + .with_context(|| { + format!( "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ))? + ) + })? 
.into_iter() .map(rustls::Certificate) .collect() }; - let common_name = { - let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes) - .context(format!( - "Failed to parse PEM object from bytes from file at '{cert_path}'." - ))? - .1; - let common_name = pem.parse_x509()?.subject().to_string(); - - // We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as - // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so - // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names - // and passed None instead, which blows up number of cases downstream code should handle. Proper coding - // here should better avoid Option for common_names, and do wildcard-based certificate selection instead - // of cutting off '*.' parts. - if common_name.starts_with("CN=*.") { - common_name.strip_prefix("CN=*.").map(|s| s.to_string()) - } else { - common_name.strip_prefix("CN=").map(|s| s.to_string()) - } + self.add_cert(priv_key, cert_chain, is_default) + } + + pub fn add_cert( + &mut self, + priv_key: PrivateKey, + cert_chain: Vec, + is_default: bool, + ) -> anyhow::Result<()> { + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let first_cert = &cert_chain[0]; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + let pem = x509_parser::parse_x509_certificate(&first_cert.0) + .context("Failed to parse PEM object from cerficiate")? + .1; + + let common_name = pem.subject().to_string(); + + // We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as + // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so + // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names + // and passed None instead, which blows up number of cases downstream code should handle. Proper coding + // here should better avoid Option for common_names, and do wildcard-based certificate selection instead + // of cutting off '*.' parts. + let common_name = if common_name.starts_with("CN=*.") { + common_name.strip_prefix("CN=*.").map(|s| s.to_string()) + } else { + common_name.strip_prefix("CN=").map(|s| s.to_string()) } - .context(format!( - "Failed to parse common name from certificate at '{cert_path}'." - ))?; + .context("Failed to parse common name from certificate")?; let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); if is_default { - self.default = Some(cert.clone()); + self.default = Some((cert.clone(), tls_server_end_point)); } - self.certs.insert(common_name, cert); + self.certs.insert(common_name, (cert, tls_server_end_point)); Ok(()) } - fn get_common_names(&self) -> HashSet { + pub fn get_common_names(&self) -> HashSet { self.certs.keys().map(|s| s.to_string()).collect() } } @@ -178,15 +254,24 @@ impl CertResolver { impl rustls::server::ResolvesServerCert for CertResolver { fn resolve( &self, - _client_hello: rustls::server::ClientHello, + client_hello: rustls::server::ClientHello, ) -> Option> { + self.resolve(client_hello.server_name()).map(|x| x.0) + } +} + +impl CertResolver { + pub fn resolve( + &self, + server_name: Option<&str>, + ) -> Option<(Arc, TlsServerEndPoint)> { // loop here and cut off more and more subdomains until we find // a match to get a proper wildcard support. OTOH, we now do not // use nested domains, so keep this simple for now. 
// // With the current coding foo.com will match *.foo.com and that // repeats behavior of the old code. - if let Some(mut sni_name) = _client_hello.server_name() { + if let Some(mut sni_name) = server_name { loop { if let Some(cert) = self.certs.get(sni_name) { return Some(cert.clone()); @@ -214,6 +299,7 @@ impl rustls::server::ResolvesServerCert for CertResolver { } /// Helper for cmdline cache options parsing. +#[derive(Debug)] pub struct CacheOptions { /// Max number of entries. pub size: usize, diff --git a/proxy/src/console.rs b/proxy/src/console.rs index 6da627389e9f..07bc807950cc 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -6,7 +6,7 @@ pub mod messages; /// Wrappers for console APIs and their mocks. pub mod provider; -pub use provider::{errors, Api, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo}; +pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo}; /// Various cache-related types. pub mod caches { diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index e5f1615b149e..837379b21ff4 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,4 +1,5 @@ use serde::Deserialize; +use smol_str::SmolStr; use std::fmt; /// Generic error response with human-readable description. @@ -88,11 +89,11 @@ impl fmt::Debug for DatabaseInfo { /// Various labels for prometheus metrics. /// Also known as `ProxyMetricsAuxInfo` in the console. -#[derive(Debug, Deserialize, Default)] +#[derive(Debug, Deserialize, Clone, Default)] pub struct MetricsAuxInfo { - pub endpoint_id: Box, - pub project_id: Box, - pub branch_id: Box, + pub endpoint_id: SmolStr, + pub project_id: SmolStr, + pub branch_id: SmolStr, } impl MetricsAuxInfo { diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 54bcd1f081c2..e735b9f66c21 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -204,7 +204,7 @@ pub struct ConsoleReqExtra<'a> { } /// Auth secret which is managed by the cloud. -pub enum AuthInfo { +pub enum AuthSecret { /// Md5 hash of user's password. Md5([u8; 16]), @@ -212,6 +212,13 @@ pub enum AuthInfo { Scram(scram::ServerSecret), } +#[derive(Default)] +pub struct AuthInfo { + pub secret: Option, + /// List of IP addresses allowed for the autorization. + pub allowed_ips: Vec, +} + /// Info for establishing a connection to a compute node. /// This is what we get after auth succeeded, but not before! #[derive(Clone)] @@ -222,7 +229,7 @@ pub struct NodeInfo { pub config: compute::ConnCfg, /// Labels for proxy's metrics. - pub aux: Arc, + pub aux: MetricsAuxInfo, /// Whether we should accept self-signed certificates (for testing) pub allow_self_signed_compute: bool, @@ -230,6 +237,7 @@ pub struct NodeInfo { pub type NodeInfoCache = TimedLru, NodeInfo>; pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>; +pub type AllowedIpsCache = TimedLru, Arc>>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. @@ -240,7 +248,13 @@ pub trait Api { &self, extra: &ConsoleReqExtra<'_>, creds: &ClientCredentials, - ) -> Result, errors::GetAuthInfoError>; + ) -> Result; + + async fn get_allowed_ips( + &self, + extra: &ConsoleReqExtra<'_>, + creds: &ClientCredentials, + ) -> Result>, errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. 
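// A sketch of the SNI fallback loop in CertResolver::resolve above, assuming
// certificates are keyed by their wildcard-stripped common name: the lookup drops
// leading labels until a key matches, so "foo.bar.com" can be served by a
// "*.bar.com" certificate stored under "bar.com".
use std::collections::HashMap;

fn resolve_by_sni<'a, V>(certs: &'a HashMap<String, V>, mut sni: &str) -> Option<&'a V> {
    loop {
        if let Some(v) = certs.get(sni) {
            return Some(v);
        }
        // Cut off one subdomain level ("a.b.c" -> "b.c") and retry.
        match sni.split_once('.') {
            Some((_, rest)) => sni = rest,
            None => return None,
        }
    }
}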
async fn wake_compute( @@ -254,6 +268,8 @@ pub trait Api { pub struct ApiCaches { /// Cache for the `wake_compute` API method. pub node_info: NodeInfoCache, + /// Cache for the `get_allowed_ips`. TODO(anna): use notifications listener instead. + pub allowed_ips: TimedLru, Arc>>, } /// Various caches for [`console`](super). diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 750a2d141ea3..4cc68f0ac143 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -1,14 +1,16 @@ //! Mock console backend which relies on a user-provided postgres instance. +use std::sync::Arc; + use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, - AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo, + AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, }; use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUrl}; use async_trait::async_trait; use futures::TryFutureExt; use thiserror::Error; -use tokio_postgres::config::SslMode; +use tokio_postgres::{config::SslMode, Client}; use tracing::{error, info, info_span, warn, Instrument}; #[derive(Debug, Error)] @@ -46,8 +48,8 @@ impl Api { async fn do_get_auth_info( &self, creds: &ClientCredentials<'_>, - ) -> Result, GetAuthInfoError> { - async { + ) -> Result { + let (secret, allowed_ips) = async { // Perhaps we could persist this connection, but then we'd have to // write more code for reopening it if it got closed, which doesn't // seem worth it. @@ -55,32 +57,48 @@ impl Api { tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; tokio::spawn(connection); - let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; - let rows = client.query(query, &[&creds.user]).await?; - - // We can get at most one row, because `rolname` is unique. - let row = match rows.first() { - Some(row) => row, - // This means that the user doesn't exist, so there can be no secret. - // However, this is still a *valid* outcome which is very similar - // to getting `404 Not found` from the Neon console. + let secret = match get_execute_postgres_query( + &client, + "select rolpassword from pg_catalog.pg_authid where rolname = $1", + &[&creds.user], + "rolpassword", + ) + .await? + { + Some(entry) => { + info!("got a secret: {entry}"); // safe since it's not a prod scenario + let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram); + secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5)) + } None => { warn!("user '{}' does not exist", creds.user); - return Ok(None); + None } }; + let allowed_ips = match get_execute_postgres_query( + &client, + "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1", + &[&creds.project.clone().unwrap_or_default().as_str()], + "allowed_ips", + ) + .await? 
+ { + Some(s) => { + info!("got allowed_ips: {s}"); + s.split(',').map(String::from).collect() + } + None => vec![], + }; - let entry = row - .try_get("rolpassword") - .map_err(MockApiError::PasswordNotSet)?; - - info!("got a secret: {entry}"); // safe since it's not a prod scenario - let secret = scram::ServerSecret::parse(entry).map(AuthInfo::Scram); - Ok(secret.or_else(|| parse_md5(entry).map(AuthInfo::Md5))) + Ok((secret, allowed_ips)) } - .map_err(crate::error::log_error) + .map_err(crate::error::log_error::) .instrument(info_span!("postgres", url = self.endpoint.as_str())) - .await + .await?; + Ok(AuthInfo { + secret, + allowed_ips, + }) } async fn do_wake_compute(&self) -> Result { @@ -100,6 +118,27 @@ impl Api { } } +async fn get_execute_postgres_query( + client: &Client, + query: &str, + params: &[&(dyn tokio_postgres::types::ToSql + Sync)], + idx: &str, +) -> Result, GetAuthInfoError> { + let rows = client.query(query, params).await?; + + // We can get at most one row, because `rolname` is unique. + let row = match rows.first() { + Some(row) => row, + // This means that the user doesn't exist, so there can be no secret. + // However, this is still a *valid* outcome which is very similar + // to getting `404 Not found` from the Neon console. + None => return Ok(None), + }; + + let entry = row.try_get(idx).map_err(MockApiError::PasswordNotSet)?; + Ok(Some(entry)) +} + #[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] @@ -107,10 +146,18 @@ impl super::Api for Api { &self, _extra: &ConsoleReqExtra<'_>, creds: &ClientCredentials, - ) -> Result, GetAuthInfoError> { + ) -> Result { self.do_get_auth_info(creds).await } + async fn get_allowed_ips( + &self, + _extra: &ConsoleReqExtra<'_>, + creds: &ClientCredentials, + ) -> Result>, GetAuthInfoError> { + Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips)) + } + #[tracing::instrument(skip_all)] async fn wake_compute( &self, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 0dc7c7153433..7828a7d7e43b 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -3,11 +3,17 @@ use super::{ super::messages::{ConsoleError, GetRoleSecret, WakeCompute}, errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo, + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, +}; +use crate::{ + auth::ClientCredentials, + compute, http, + proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, + scram, }; -use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; +use itertools::Itertools; use std::{net::SocketAddr, sync::Arc}; use tokio::time::Instant; use tokio_postgres::config::SslMode; @@ -48,7 +54,7 @@ impl Api { &self, extra: &ConsoleReqExtra<'_>, creds: &ClientCredentials<'_>, - ) -> Result, GetAuthInfoError> { + ) -> Result { let request_id = uuid::Uuid::new_v4().to_string(); async { let request = self @@ -72,16 +78,25 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. 
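// Illustrative only: in the mock backend above, the allowed IPs for an endpoint
// live in a single comma-separated text column, so a row such as
// "10.0.0.1,10.0.0.2" expands into one Vec entry per address. The helper name
// below is assumed, not part of the patch.
fn split_allowed_ips(raw: &str) -> Vec<String> {
    raw.split(',').map(String::from).collect()
}

#[test]
fn split_allowed_ips_example() {
    assert_eq!(split_allowed_ips("10.0.0.1,10.0.0.2"), vec!["10.0.0.1", "10.0.0.2"]);
}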
Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(None), + Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), _otherwise => return Err(e.into()), }, }; let secret = scram::ServerSecret::parse(&body.role_secret) - .map(AuthInfo::Scram) + .map(AuthSecret::Scram) .ok_or(GetAuthInfoError::BadSecret)?; - - Ok(Some(secret)) + let allowed_ips = body + .allowed_ips + .into_iter() + .flatten() + .map(String::from) + .collect_vec(); + ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); + Ok(AuthInfo { + secret: Some(secret), + allowed_ips, + }) } .map_err(crate::error::log_error) .instrument(info_span!("http", id = request_id)) @@ -129,7 +144,7 @@ impl Api { let node = NodeInfo { config, - aux: body.aux.into(), + aux: body.aux, allow_self_signed_compute: false, }; @@ -148,10 +163,32 @@ impl super::Api for Api { &self, extra: &ConsoleReqExtra<'_>, creds: &ClientCredentials, - ) -> Result, GetAuthInfoError> { + ) -> Result { self.do_get_auth_info(extra, creds).await } + async fn get_allowed_ips( + &self, + extra: &ConsoleReqExtra<'_>, + creds: &ClientCredentials, + ) -> Result>, GetAuthInfoError> { + let key: &str = creds.project().expect("impossible"); + if let Some(allowed_ips) = self.caches.allowed_ips.get(key) { + ALLOWED_IPS_BY_CACHE_OUTCOME + .with_label_values(&["hit"]) + .inc(); + return Ok(Arc::new(allowed_ips.to_vec())); + } + ALLOWED_IPS_BY_CACHE_OUTCOME + .with_label_values(&["miss"]) + .inc(); + let allowed_ips = Arc::new(self.do_get_auth_info(extra, creds).await?.allowed_ips); + self.caches + .allowed_ips + .insert(key.into(), allowed_ips.clone()); + Ok(allowed_ips) + } + #[tracing::instrument(skip_all)] async fn wake_compute( &self, diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 159b949da345..09423eca77c9 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -13,7 +13,7 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{rate_limiter, url::ApiUrl}; +use crate::{proxy::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, @@ -90,7 +90,13 @@ impl Endpoint { /// Execute a [request](reqwest::Request). 
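// The get_allowed_ips caching above follows a plain read-through pattern: serve
// from the per-project cache on a hit, otherwise fetch from the console, insert,
// and return. Sketched here with a HashMap standing in for the project's TimedLru;
// the hit/miss counters are omitted.
use std::collections::HashMap;
use std::sync::Arc;

fn get_or_fetch(
    cache: &mut HashMap<String, Arc<Vec<String>>>,
    key: &str,
    fetch: impl FnOnce() -> Vec<String>,
) -> Arc<Vec<String>> {
    if let Some(hit) = cache.get(key) {
        return hit.clone(); // cache hit: no console round-trip
    }
    let fresh = Arc::new(fetch()); // cache miss: ask the console
    cache.insert(key.to_string(), fresh.clone());
    fresh
}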
pub async fn execute(&self, request: Request) -> Result { - self.client.execute(request).await + let path = request.url().path().to_string(); + let start = Instant::now(); + let res = self.client.execute(request).await; + CONSOLE_REQUEST_LATENCY + .with_label_values(&[&path]) + .observe(start.elapsed().as_secs_f64()); + res } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index adcb1bffaf9e..36d01f9acc36 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -24,7 +24,7 @@ use prometheus::{ IntGaugeVec, }; use regex::Regex; -use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant}; +use std::{error::Error, io, net::SocketAddr, ops::ControlFlow, sync::Arc, time::Instant}; use tokio::{ io::{AsyncRead, AsyncWrite, AsyncWriteExt}, time, @@ -110,12 +110,34 @@ static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { .unwrap() }); +pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { + register_histogram_vec!( + "proxy_console_request_latency", + "Time it took for proxy to establish a connection to the compute endpoint", + // proxy_wake_compute/proxy_get_role_info + &["request"], + // largest bucket = 2^16 * 0.2ms = 13s + exponential_buckets(0.0002, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_allowed_ips_cache_misses", + "Number of cache hits/misses for allowed ips", + // hit/miss + &["outcome"], + ) + .unwrap() +}); + pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { register_histogram!( "semaphore_control_plane_token_acquire_seconds", "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), + // largest bucket = 3^16 * 0.00005ms = 2.15s + exponential_buckets(0.00005, 3.0, 16).unwrap(), ) .unwrap() }); @@ -138,6 +160,15 @@ pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { .unwrap() }); +pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_allowed_ips_number", + "Number of allowed ips", + vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], + ) + .unwrap() +}); + pub struct LatencyTimer { // time since the stopwatch was started start: Option, @@ -265,7 +296,7 @@ pub async fn task_main( loop { tokio::select! { accept_result = listener.accept() => { - let (socket, _) = accept_result?; + let (socket, peer_addr) = accept_result?; let session_id = uuid::Uuid::new_v4(); let cancel_map = Arc::clone(&cancel_map); @@ -274,7 +305,9 @@ pub async fn task_main( info!("accepted postgres client connection"); let mut socket = WithClientIp::new(socket); + let mut peer_addr = peer_addr; if let Some(ip) = socket.wait_for_addr().await? 
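// The Endpoint::execute change above amounts to wrapping the request in a
// stopwatch and recording elapsed seconds into a histogram labeled by request
// path. A self-contained sketch of that pattern; the metric name is illustrative.
use once_cell::sync::Lazy;
use prometheus::{exponential_buckets, register_histogram_vec, HistogramVec};
use std::time::Instant;

static REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "example_request_latency",
        "Time spent per request path",
        &["request"],
        // 16 buckets starting at 0.2ms, doubling each step
        exponential_buckets(0.0002, 2.0, 16).unwrap(),
    )
    .unwrap()
});

fn timed<T>(path: &str, run: impl FnOnce() -> T) -> T {
    let start = Instant::now();
    let result = run();
    REQUEST_LATENCY
        .with_label_values(&[path])
        .observe(start.elapsed().as_secs_f64());
    result
}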
{ + peer_addr = ip; tracing::Span::current().record("peer_addr", &tracing::field::display(ip)); } else if config.require_client_ip { bail!("missing required client IP"); @@ -285,7 +318,7 @@ pub async fn task_main( .set_nodelay(true) .context("failed to set socket option")?; - handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp).await + handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp, peer_addr).await } .instrument(info_span!("handle_client", ?session_id, peer_addr = tracing::field::Empty)) .unwrap_or_else(move |e| { @@ -375,6 +408,7 @@ pub async fn handle_client( session_id: uuid::Uuid, stream: S, mode: ClientMode, + peer_addr: SocketAddr, ) -> anyhow::Result<()> { info!( protocol = mode.protocol_label(), @@ -408,7 +442,7 @@ pub async fn handle_client( let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_names)) + .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_names, peer_addr)) .transpose(); match result { @@ -470,7 +504,17 @@ async fn handshake( if !read_buf.is_empty() { bail!("data is sent before server replied with EncryptionResponse"); } - stream = PqStream::new(raw.upgrade(tls.to_server_config()).await?); + let tls_stream = raw.upgrade(tls.to_server_config()).await?; + + let (_, tls_server_end_point) = tls + .cert_resolver + .resolve(tls_stream.get_ref().1.server_name()) + .context("missing certificate")?; + + stream = PqStream::new(Stream::Tls { + tls: Box::new(tls_stream), + tls_server_end_point, + }); } } _ => bail!(ERR_PROTO_VIOLATION), @@ -833,11 +877,11 @@ async fn prepare_client_connection( pub async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, - aux: &MetricsAuxInfo, + aux: MetricsAuxInfo, ) -> anyhow::Result<()> { let usage = USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.to_string(), - branch_id: aux.branch_id.to_string(), + endpoint_id: aux.endpoint_id.clone(), + branch_id: aux.branch_id.clone(), }); let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); @@ -875,7 +919,7 @@ pub async fn proxy_pass( /// Thin connection context. struct Client<'a, S> { /// The underlying libpq protocol stream. - stream: PqStream, + stream: PqStream>, /// Client credentials that we care about. creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, /// KV-dictionary with PostgreSQL connection params. @@ -889,7 +933,7 @@ struct Client<'a, S> { impl<'a, S> Client<'a, S> { /// Construct a new connection context. fn new( - stream: PqStream, + stream: PqStream>, creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, params: &'a StartupMessageParams, session_id: uuid::Uuid, @@ -988,7 +1032,7 @@ impl Client<'_, S> { // immediately after opening the connection. let (stream, read_buf) = stream.into_inner(); node.stream.write_all(&read_buf).await?; - proxy_pass(stream, node.stream, &aux).await + proxy_pass(stream, node.stream, aux).await } } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 3ae4df46ef83..b97c0efce47f 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,19 +1,23 @@ //! A group of high-level tests for connection establishing logic and auth. -//! 
+ +mod mitm; + use super::*; use crate::auth::backend::TestBackend; use crate::auth::ClientCredentials; +use crate::config::CertResolver; use crate::console::{CachedNodeInfo, NodeInfo}; use crate::{auth, http, sasl, scram}; use async_trait::async_trait; use rstest::rstest; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::MakeRustlsConnect; +use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; /// Generate a set of TLS certificates: CA + server. fn generate_certs( hostname: &str, + common_name: &str, ) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, rustls::PrivateKey)> { let ca = rcgen::Certificate::from_params({ let mut params = rcgen::CertificateParams::default(); @@ -21,7 +25,15 @@ fn generate_certs( params })?; - let cert = rcgen::generate_simple_self_signed(vec![hostname.into()])?; + let cert = rcgen::Certificate::from_params({ + let mut params = rcgen::CertificateParams::new(vec![hostname.into()]); + params.distinguished_name = rcgen::DistinguishedName::new(); + params + .distinguished_name + .push(rcgen::DnType::CommonName, common_name); + params + })?; + Ok(( rustls::Certificate(ca.serialize_der()?), rustls::Certificate(cert.serialize_der_with_signer(&ca)?), @@ -37,7 +49,14 @@ struct ClientConfig<'a> { impl ClientConfig<'_> { fn make_tls_connect( self, - ) -> anyhow::Result> { + ) -> anyhow::Result< + impl tokio_postgres::tls::TlsConnect< + S, + Error = impl std::fmt::Debug, + Future = impl Send, + Stream = RustlsStream, + >, + > { let mut mk = MakeRustlsConnect::new(self.config); let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; Ok(tls) @@ -49,20 +68,24 @@ fn generate_tls_config<'a>( hostname: &'a str, common_name: &'a str, ) -> anyhow::Result<(ClientConfig<'a>, TlsConfig)> { - let (ca, cert, key) = generate_certs(hostname)?; + let (ca, cert, key) = generate_certs(hostname, common_name)?; let tls_config = { let config = rustls::ServerConfig::builder() .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![cert], key)? + .with_single_cert(vec![cert.clone()], key.clone())? .into(); - let common_names = Some([common_name.to_owned()].iter().cloned().collect()); + let mut cert_resolver = CertResolver::new(); + cert_resolver.add_cert(key, vec![cert], true)?; + + let common_names = Some(cert_resolver.get_common_names()); TlsConfig { config, common_names, + cert_resolver: Arc::new(cert_resolver), } }; @@ -253,6 +276,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { )); let (_client, _conn) = tokio_postgres::Config::new() + .channel_binding(tokio_postgres::config::ChannelBinding::Require) .user("user") .dbname("db") .password(password) @@ -263,6 +287,30 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { proxy.await? } +#[tokio::test] +async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { + let (client, server) = tokio::io::duplex(1024); + + let (client_config, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; + let proxy = tokio::spawn(dummy_proxy( + client, + Some(server_config), + Scram::new("password")?, + )); + + let (_client, _conn) = tokio_postgres::Config::new() + .channel_binding(tokio_postgres::config::ChannelBinding::Disable) + .user("user") + .dbname("db") + .password("password") + .ssl_mode(SslMode::Require) + .connect_raw(server, client_config.make_tls_connect()?) + .await?; + + proxy.await? 
+} + #[tokio::test] async fn scram_auth_mock() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); @@ -418,6 +466,10 @@ impl TestBackend for TestConnectMechanism { x => panic!("expecting action {:?}, wake_compute is called instead", x), } } + + fn get_allowed_ips(&self) -> Result>, console::errors::GetAuthInfoError> { + unimplemented!("not used in tests") + } } fn helper_create_cached_node_info() -> CachedNodeInfo { diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs new file mode 100644 index 000000000000..50b3034936e8 --- /dev/null +++ b/proxy/src/proxy/tests/mitm.rs @@ -0,0 +1,257 @@ +//! Man-in-the-middle tests +//! +//! Channel binding should prevent a proxy server +//! - that has access to create valid certificates - +//! from controlling the TLS connection. + +use std::fmt::Debug; + +use super::*; +use bytes::{Bytes, BytesMut}; +use futures::{SinkExt, StreamExt}; +use postgres_protocol::message::frontend; +use tokio::io::{AsyncReadExt, DuplexStream}; +use tokio_postgres::config::SslMode; +use tokio_postgres::tls::TlsConnect; +use tokio_util::codec::{Decoder, Encoder}; + +enum Intercept { + None, + Methods, + SASLResponse, +} + +async fn proxy_mitm( + intercept: Intercept, +) -> (DuplexStream, DuplexStream, ClientConfig<'static>, TlsConfig) { + let (end_server1, client1) = tokio::io::duplex(1024); + let (server2, end_client2) = tokio::io::duplex(1024); + + let (client_config1, server_config1) = + generate_tls_config("generic-project-name.localhost", "localhost").unwrap(); + let (client_config2, server_config2) = + generate_tls_config("generic-project-name.localhost", "localhost").unwrap(); + + tokio::spawn(async move { + // begin handshake with end_server + let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; + // process handshake with end_client + let (end_client, startup) = + handshake(client1, Some(&server_config1), &CancelMap::default()) + .await + .unwrap() + .unwrap(); + + let mut end_server = tokio_util::codec::Framed::new(end_server, PgFrame); + let (end_client, buf) = end_client.framed.into_inner(); + assert!(buf.is_empty()); + let mut end_client = tokio_util::codec::Framed::new(end_client, PgFrame); + + // give the end_server the startup parameters + let mut buf = BytesMut::new(); + frontend::startup_message(startup.iter(), &mut buf).unwrap(); + end_server.send(buf.freeze()).await.unwrap(); + + // proxy messages between end_client and end_server + loop { + tokio::select! 
{ + message = end_server.next() => { + match message { + Some(Ok(message)) => { + // intercept SASL and return only SCRAM-SHA-256 ;) + if matches!(intercept, Intercept::Methods) && message.starts_with(b"R") && message[5..].starts_with(&[0,0,0,10]) { + end_client.send(Bytes::from_static(b"R\0\0\0\x17\0\0\0\x0aSCRAM-SHA-256\0\0")).await.unwrap(); + continue; + } + end_client.send(message).await.unwrap() + } + _ => break, + } + } + message = end_client.next() => { + match message { + Some(Ok(message)) => { + // intercept SASL response and return SCRAM-SHA-256 with no channel binding ;) + if matches!(intercept, Intercept::SASLResponse) && message.starts_with(b"p") && message[5..].starts_with(b"SCRAM-SHA-256-PLUS\0") { + let sasl_message = &message[1+4+19+4..]; + let mut new_message = b"n,,".to_vec(); + new_message.extend_from_slice(sasl_message.strip_prefix(b"p=tls-server-end-point,,").unwrap()); + + let mut buf = BytesMut::new(); + frontend::sasl_initial_response("SCRAM-SHA-256", &new_message, &mut buf).unwrap(); + + end_server.send(buf.freeze()).await.unwrap(); + continue; + } + end_server.send(message).await.unwrap() + } + _ => break, + } + } + else => { break } + } + } + }); + + (end_server1, end_client2, client_config1, server_config2) +} + +/// taken from tokio-postgres +pub async fn connect_tls(mut stream: S, tls: T) -> T::Stream +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, + T::Error: Debug, +{ + let mut buf = BytesMut::new(); + frontend::ssl_request(&mut buf); + stream.write_all(&buf).await.unwrap(); + + let mut buf = [0]; + stream.read_exact(&mut buf).await.unwrap(); + + if buf[0] != b'S' { + panic!("ssl not supported by server"); + } + + tls.connect(stream).await.unwrap() +} + +struct PgFrame; +impl Decoder for PgFrame { + type Item = Bytes; + type Error = io::Error; + + fn decode(&mut self, src: &mut BytesMut) -> Result, Self::Error> { + if src.len() < 5 { + src.reserve(5 - src.len()); + return Ok(None); + } + let len = u32::from_be_bytes(src[1..5].try_into().unwrap()) as usize + 1; + if src.len() < len { + src.reserve(len - src.len()); + return Ok(None); + } + Ok(Some(src.split_to(len).freeze())) + } +} +impl Encoder for PgFrame { + type Error = io::Error; + + fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> Result<(), Self::Error> { + dst.extend_from_slice(&item); + Ok(()) + } +} + +/// If the client doesn't support channel bindings, it can be exploited. +#[tokio::test] +async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { + let (server, client, client_config, server_config) = proxy_mitm(Intercept::None).await; + let proxy = tokio::spawn(dummy_proxy( + client, + Some(server_config), + Scram::new("password")?, + )); + + let _client_err = tokio_postgres::Config::new() + .channel_binding(tokio_postgres::config::ChannelBinding::Disable) + .user("user") + .dbname("db") + .password("password") + .ssl_mode(SslMode::Require) + .connect_raw(server, client_config.make_tls_connect()?) + .await?; + + proxy.await? 
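// A worked example of the PgFrame length arithmetic above: every regular Postgres
// message is a 1-byte tag followed by a 4-byte big-endian length that counts
// itself and the payload but not the tag, so a whole frame occupies
// `length + 1` bytes.
#[test]
fn pg_frame_length_math() {
    // AuthenticationOk: tag 'R', length 8 (4 length bytes + 4 payload bytes), code 0.
    let frame: &[u8] = &[b'R', 0, 0, 0, 8, 0, 0, 0, 0];
    let len = u32::from_be_bytes(frame[1..5].try_into().unwrap()) as usize + 1;
    assert_eq!(len, frame.len());
}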
+} + +/// If the client chooses SCRAM-PLUS, it will fail +#[tokio::test] +async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> { + connect_failure( + Intercept::None, + tokio_postgres::config::ChannelBinding::Prefer, + ) + .await +} + +/// If the MITM pretends like SCRAM-PLUS isn't available, but the client supports it, it will fail +#[tokio::test] +async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> { + connect_failure( + Intercept::Methods, + tokio_postgres::config::ChannelBinding::Prefer, + ) + .await +} + +/// If the MITM pretends like the client doesn't support channel bindings, it will fail +#[tokio::test] +async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Result<()> { + connect_failure( + Intercept::SASLResponse, + tokio_postgres::config::ChannelBinding::Prefer, + ) + .await +} + +/// If the client chooses SCRAM-PLUS, it will fail +#[tokio::test] +async fn scram_auth_require_channel_binding() -> anyhow::Result<()> { + connect_failure( + Intercept::None, + tokio_postgres::config::ChannelBinding::Require, + ) + .await +} + +/// If the client requires SCRAM-PLUS, and it is spoofed to remove SCRAM-PLUS, it will fail +#[tokio::test] +async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> { + connect_failure( + Intercept::Methods, + tokio_postgres::config::ChannelBinding::Require, + ) + .await +} + +/// If the client requires SCRAM-PLUS, and it is spoofed to remove SCRAM-PLUS, it will fail +#[tokio::test] +async fn scram_auth_require_channel_binding_intercept_response() -> anyhow::Result<()> { + connect_failure( + Intercept::SASLResponse, + tokio_postgres::config::ChannelBinding::Require, + ) + .await +} + +async fn connect_failure( + intercept: Intercept, + channel_binding: tokio_postgres::config::ChannelBinding, +) -> anyhow::Result<()> { + let (server, client, client_config, server_config) = proxy_mitm(intercept).await; + let proxy = tokio::spawn(dummy_proxy( + client, + Some(server_config), + Scram::new("password")?, + )); + + let _client_err = tokio_postgres::Config::new() + .channel_binding(channel_binding) + .user("user") + .dbname("db") + .password("password") + .ssl_mode(SslMode::Require) + .connect_raw(server, client_config.make_tls_connect()?) + .await + .err() + .context("client shouldn't be able to connect")?; + + let _server_err = proxy + .await? + .err() + .context("server shouldn't accept client")?; + + Ok(()) +} diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs index 776adabe5587..13d681de6dc9 100644 --- a/proxy/src/sasl/channel_binding.rs +++ b/proxy/src/sasl/channel_binding.rs @@ -36,9 +36,9 @@ impl<'a> ChannelBinding<&'a str> { impl ChannelBinding { /// Encode channel binding data as base64 for subsequent checks. - pub fn encode( + pub fn encode<'a, E>( &self, - get_cbind_data: impl FnOnce(&T) -> Result, + get_cbind_data: impl FnOnce(&T) -> Result<&'a [u8], E>, ) -> Result, E> { use ChannelBinding::*; Ok(match self { @@ -51,12 +51,11 @@ impl ChannelBinding { "eSws".into() } Required(mode) => { - let msg = format!( - "p={mode},,{data}", - mode = mode, - data = get_cbind_data(mode)? 
- ); - base64::encode(msg).into() + use std::io::Write; + let mut cbind_input = vec![]; + write!(&mut cbind_input, "p={mode},,",).unwrap(); + cbind_input.extend_from_slice(get_cbind_data(mode)?); + base64::encode(&cbind_input).into() } }) } @@ -77,7 +76,7 @@ mod tests { ]; for (cb, input) in cases { - assert_eq!(cb.encode(|_| anyhow::Ok("bar".to_owned()))?, input); + assert_eq!(cb.encode(|_| anyhow::Ok(b"bar"))?, input); } Ok(()) diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 2de26af96b3b..63271309e17f 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -22,9 +22,12 @@ pub use secret::ServerSecret; use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; -// TODO: add SCRAM-SHA-256-PLUS +const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; +const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS"; + /// A list of supported SCRAM methods. -pub const METHODS: &[&str] = &["SCRAM-SHA-256"]; +pub const METHODS: &[&str] = &[SCRAM_SHA_256_PLUS, SCRAM_SHA_256]; +pub const METHODS_WITHOUT_PLUS: &[&str] = &[SCRAM_SHA_256]; /// Decode base64 into array without any heap allocations fn base64_decode_array(input: impl AsRef<[u8]>) -> Option<[u8; N]> { @@ -80,7 +83,11 @@ mod tests { const NONCE: [u8; 18] = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ]; - let mut exchange = Exchange::new(&secret, || NONCE, None); + let mut exchange = Exchange::new( + &secret, + || NONCE, + crate::config::TlsServerEndPoint::Undefined, + ); let client_first = "n,,n=user,r=rOprNGfwEbeRWgbNEkqO"; let client_final = "c=biws,r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,p=rw1r5Kph5ThxmaUBC2GAQ6MfXbPnNkFiTIvdb/Rear0="; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 882769a70d91..319d9b101484 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -5,9 +5,11 @@ use super::messages::{ }; use super::secret::ServerSecret; use super::signature::SignatureBuilder; +use crate::config; use crate::sasl::{self, ChannelBinding, Error as SaslError}; /// The only channel binding mode we currently support. 
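// What the encode() change above computes for the Required case, written out as a
// standalone sketch: cbind-input is the gs2-header ("p=<mode>,,") followed by the
// raw channel-binding data, and its base64 encoding must equal the `c=` attribute
// from the client-final-message. Function names are illustrative.
fn expected_cbind_attr(mode: &str, cbind_data: &[u8]) -> String {
    let mut cbind_input = format!("p={mode},,").into_bytes();
    cbind_input.extend_from_slice(cbind_data);
    base64::encode(&cbind_input)
}

fn channel_binding_matches(client_c_attr: &str, mode: &str, cbind_data: &[u8]) -> bool {
    expected_cbind_attr(mode, cbind_data) == client_c_attr
}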
+#[derive(Debug)] struct TlsServerEndPoint; impl std::fmt::Display for TlsServerEndPoint { @@ -43,20 +45,20 @@ pub struct Exchange<'a> { state: ExchangeState, secret: &'a ServerSecret, nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], - cert_digest: Option<&'a [u8]>, + tls_server_end_point: config::TlsServerEndPoint, } impl<'a> Exchange<'a> { pub fn new( secret: &'a ServerSecret, nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], - cert_digest: Option<&'a [u8]>, + tls_server_end_point: config::TlsServerEndPoint, ) -> Self { Self { state: ExchangeState::Initial, secret, nonce, - cert_digest, + tls_server_end_point, } } } @@ -71,6 +73,14 @@ impl sasl::Mechanism for Exchange<'_> { let client_first_message = ClientFirstMessage::parse(input) .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?; + // If the flag is set to "y" and the server supports channel + // binding, the server MUST fail authentication + if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer + && self.tls_server_end_point.supported() + { + return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used")); + } + let server_first_message = client_first_message.build_server_first_message( &(self.nonce)(), &self.secret.salt_base64, @@ -94,10 +104,11 @@ impl sasl::Mechanism for Exchange<'_> { let client_final_message = ClientFinalMessage::parse(input) .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?; - let channel_binding = cbind_flag.encode(|_| { - self.cert_digest - .map(base64::encode) - .ok_or(SaslError::ChannelBindingFailed("no cert digest provided")) + let channel_binding = cbind_flag.encode(|_| match &self.tls_server_end_point { + config::TlsServerEndPoint::Sha256(x) => Ok(x), + config::TlsServerEndPoint::Undefined => { + Err(SaslError::ChannelBindingFailed("no cert digest provided")) + } })?; // This might've been caused by a MITM attack diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 23deda3ae67e..45f8132393e8 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -23,6 +23,7 @@ use hyper::{ Body, Method, Request, Response, }; +use std::net::SocketAddr; use std::task::Poll; use std::{future::ready, sync::Arc}; use tls_listener::TlsListener; @@ -102,7 +103,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); request_handler( - req, config, conn_pool, cancel_map, session_id, sni_name, + req, config, conn_pool, cancel_map, session_id, sni_name, peer_addr, ) .instrument(info_span!( "serverless", @@ -170,6 +171,7 @@ async fn request_handler( cancel_map: Arc, session_id: uuid::Uuid, sni_hostname: Option, + peer_addr: SocketAddr, ) -> Result, ApiError> { let host = request .headers() @@ -187,9 +189,15 @@ async fn request_handler( tokio::spawn( async move { - if let Err(e) = - websocket::serve_websocket(websocket, config, &cancel_map, session_id, host) - .await + if let Err(e) = websocket::serve_websocket( + websocket, + config, + &cancel_map, + session_id, + host, + peer_addr, + ) + .await { error!(session_id = ?session_id, "error in websocket connection: {e:#}"); } @@ -205,6 +213,7 @@ async fn request_handler( sni_hostname, conn_pool, session_id, + peer_addr, &config.http_config, ) .await diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index b753bc8918bc..ca7a9ad0a0c7 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -8,7 +8,8 @@ use pbkdf2::{ Params, Pbkdf2, }; use pq_proto::StartupMessageParams; -use std::{collections::HashMap, sync::Arc}; +use smol_str::SmolStr; 
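// The new check in Exchange above enforces the RFC 5802 downgrade rule, shown here
// as a simple predicate: a gs2 cbind flag of "y" means "client supports channel
// binding but believes the server does not"; if the server did advertise
// SCRAM-SHA-256-PLUS (i.e. tls-server-end-point is available), that combination
// indicates a stripped offer and must be rejected. The types below are simplified
// stand-ins for the proxy's ChannelBinding.
#[derive(PartialEq)]
enum CbindFlag {
    NotSupportedClient, // "n"
    NotSupportedServer, // "y"
    Required,           // "p=<type>"
}

fn must_reject(flag: &CbindFlag, server_supports_binding: bool) -> bool {
    *flag == CbindFlag::NotSupportedServer && server_supports_binding
}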
+use std::{collections::HashMap, net::SocketAddr, sync::Arc}; use std::{ fmt, task::{ready, Poll}, @@ -21,7 +22,8 @@ use tokio::time; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; use crate::{ - auth, console, + auth::{self, check_peer_addr_is_in_list}, + console, proxy::{ neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER, @@ -40,16 +42,16 @@ const MAX_CONNS_PER_ENDPOINT: usize = 20; #[derive(Debug, Clone)] pub struct ConnInfo { - pub username: String, - pub dbname: String, - pub hostname: String, - pub password: String, - pub options: Option, + pub username: SmolStr, + pub dbname: SmolStr, + pub hostname: SmolStr, + pub password: SmolStr, + pub options: Option, } impl ConnInfo { // hm, change to hasher to avoid cloning? - pub fn db_and_user(&self) -> (String, String) { + pub fn db_and_user(&self) -> (SmolStr, SmolStr) { (self.dbname.clone(), self.username.clone()) } } @@ -69,7 +71,7 @@ struct ConnPoolEntry { // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. pub struct EndpointConnPool { - pools: HashMap<(String, String), DbUserConnPool>, + pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>, total_conns: usize, } @@ -94,7 +96,7 @@ pub struct GlobalConnPool { // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - global_pool: DashMap>>, + global_pool: DashMap>>, /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. @@ -144,6 +146,7 @@ impl GlobalConnPool { conn_info: &ConnInfo, force_new: bool, session_id: uuid::Uuid, + peer_addr: SocketAddr, ) -> anyhow::Result { let mut client: Option = None; let mut latency_timer = LatencyTimer::new("http"); @@ -203,6 +206,7 @@ impl GlobalConnPool { conn_id, session_id, latency_timer, + peer_addr, ) .await } else { @@ -225,6 +229,7 @@ impl GlobalConnPool { conn_id, session_id, latency_timer, + peer_addr, ) .await }; @@ -323,7 +328,7 @@ impl GlobalConnPool { Ok(()) } - fn get_or_create_endpoint_pool(&self, endpoint: &String) -> Arc> { + fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); @@ -401,6 +406,7 @@ async fn connect_to_compute( conn_id: uuid::Uuid, session_id: uuid::Uuid, latency_timer: LatencyTimer, + peer_addr: SocketAddr, ) -> anyhow::Result { let tls = config.tls_config.as_ref(); let common_names = tls.and_then(|tls| tls.common_names.clone()); @@ -411,12 +417,13 @@ async fn connect_to_compute( ("application_name", APP_NAME), ("options", conn_info.options.as_deref().unwrap_or("")), ]); - - let creds = config - .auth_backend - .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, Some(&conn_info.hostname), common_names)) - .transpose()?; + let creds = auth::ClientCredentials::parse( + ¶ms, + Some(&conn_info.hostname), + common_names, + peer_addr, + )?; + let backend = config.auth_backend.as_ref().map(|_| creds); let console_options = neon_options(¶ms); @@ -425,8 +432,14 @@ async fn connect_to_compute( application_name: Some(APP_NAME), options: console_options.as_deref(), }; - - let node_info = creds + // TODO(anna): this is a bit hacky way, consider using console notification listener. 
+ if !config.disable_ip_check_for_http { + let allowed_ips = backend.get_allowed_ips(&extra).await?; + if !check_peer_addr_is_in_list(&peer_addr.ip(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed().into()); + } + } + let node_info = backend .wake_compute(&extra) .await? .context("missing cache entry from wake_compute")?; @@ -439,7 +452,7 @@ async fn connect_to_compute( }, node_info, &extra, - &creds, + &backend, latency_timer, ) .await @@ -456,7 +469,7 @@ async fn connect_to_compute_once( let (client, mut connection) = config .user(&conn_info.username) - .password(&conn_info.password) + .password(&*conn_info.password) .dbname(&conn_info.dbname) .connect_timeout(timeout) .connect(tokio_postgres::NoTls) @@ -470,8 +483,8 @@ async fn connect_to_compute_once( info!(%conn_info, %session, "new connection"); }); let ids = Ids { - endpoint_id: node_info.aux.endpoint_id.to_string(), - branch_id: node_info.aux.branch_id.to_string(), + endpoint_id: node_info.aux.endpoint_id.clone(), + branch_id: node_info.aux.branch_id.clone(), }; tokio::spawn( diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 4a9829e360d7..6c337a837cdb 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,3 +1,4 @@ +use std::net::SocketAddr; use std::sync::Arc; use anyhow::bail; @@ -13,6 +14,7 @@ use hyper::{Body, HeaderMap, Request}; use serde_json::json; use serde_json::Map; use serde_json::Value; +use tokio_postgres::error::DbError; use tokio_postgres::types::Kind; use tokio_postgres::types::Type; use tokio_postgres::GenericClient; @@ -180,16 +182,16 @@ fn get_conn_info( for (key, value) in pairs { if key == "options" { - options = Some(value.to_string()); + options = Some(value.into()); break; } } Ok(ConnInfo { - username: username.to_owned(), - dbname: dbname.to_owned(), - hostname: hostname.to_owned(), - password: password.to_owned(), + username: username.into(), + dbname: dbname.into(), + hostname: hostname.into(), + password: password.into(), options, }) } @@ -200,11 +202,19 @@ pub async fn handle( sni_hostname: Option, conn_pool: Arc, session_id: uuid::Uuid, + peer_addr: SocketAddr, config: &'static HttpConfig, ) -> Result, ApiError> { let result = tokio::time::timeout( - config.sql_over_http_timeout, - handle_inner(request, sni_hostname, conn_pool, session_id), + config.timeout, + handle_inner( + config, + request, + sni_hostname, + conn_pool, + session_id, + peer_addr, + ), ) .await; let mut response = match result { @@ -212,14 +222,33 @@ pub async fn handle( Ok(r) => r, Err(e) => { let message = format!("{:?}", e); - let code = e.downcast_ref::().and_then(|e| { - e.code() - .map(|s| serde_json::to_value(s.code()).unwrap_or_default()) - }); - let code = match code { - Some(c) => c, - None => Value::Null, - }; + let db_error = e + .downcast_ref::() + .and_then(|e| e.as_db_error()); + fn get<'a, T: serde::Serialize>( + db: Option<&'a DbError>, + x: impl FnOnce(&'a DbError) -> T, + ) -> Value { + db.map(x) + .and_then(|t| serde_json::to_value(t).ok()) + .unwrap_or_default() + } + + // TODO(conrad): db_error.position() + let code = get(db_error, |db| db.code().code()); + let severity = get(db_error, |db| db.severity()); + let detail = get(db_error, |db| db.detail()); + let hint = get(db_error, |db| db.hint()); + let where_ = get(db_error, |db| db.where_()); + let table = get(db_error, |db| db.table()); + let column = get(db_error, |db| db.column()); + let schema = get(db_error, |db| db.schema()); + let datatype 
= get(db_error, |db| db.datatype()); + let constraint = get(db_error, |db| db.constraint()); + let file = get(db_error, |db| db.file()); + let line = get(db_error, |db| db.line()); + let routine = get(db_error, |db| db.routine()); + error!( ?code, "sql-over-http per-client task finished with an error: {e:#}" @@ -227,14 +256,29 @@ pub async fn handle( // TODO: this shouldn't always be bad request. json_response( StatusCode::BAD_REQUEST, - json!({ "message": message, "code": code }), + json!({ + "message": message, + "code": code, + "detail": detail, + "hint": hint, + "severity": severity, + "where": where_, + "table": table, + "column": column, + "schema": schema, + "datatype": datatype, + "constraint": constraint, + "file": file, + "line": line, + "routine": routine, + }), )? } }, Err(_) => { let message = format!( "HTTP-Connection timed out, execution time exeeded {} seconds", - config.sql_over_http_timeout.as_secs() + config.timeout.as_secs() ); error!(message); json_response( @@ -252,10 +296,12 @@ pub async fn handle( #[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)] async fn handle_inner( + config: &'static HttpConfig, request: Request, sni_hostname: Option, conn_pool: Arc, session_id: uuid::Uuid, + peer_addr: SocketAddr, ) -> anyhow::Result> { NUM_CONNECTIONS_ACCEPTED_COUNTER .with_label_values(&["http"]) @@ -276,7 +322,8 @@ async fn handle_inner( let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); // Allow connection pooling only if explicitly requested - let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + // or if we have decided that http pool is no longer opt-in + let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); // isolation level, read only and deferrable @@ -314,7 +361,9 @@ async fn handle_inner( let body = hyper::body::to_bytes(request.into_body()).await?; let payload: Payload = serde_json::from_slice(&body)?; - let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?; + let mut client = conn_pool + .get(&conn_info, !allow_pool, session_id, peer_addr) + .await?; let mut response = Response::builder() .status(StatusCode::OK) diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 86141ab64f81..8fb9a3dee4f4 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -11,6 +11,7 @@ use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; use pin_project_lite::pin_project; use std::{ + net::SocketAddr, pin::Pin, task::{ready, Context, Poll}, }; @@ -132,6 +133,7 @@ pub async fn serve_websocket( cancel_map: &CancelMap, session_id: uuid::Uuid, hostname: Option, + peer_addr: SocketAddr, ) -> anyhow::Result<()> { let websocket = websocket.await?; handle_client( @@ -140,6 +142,7 @@ pub async fn serve_websocket( session_id, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, + peer_addr, ) .await?; Ok(()) diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 6210601a80d4..f48b3fe39ff3 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,7 +1,8 @@ +use crate::config::TlsServerEndPoint; use crate::error::UserFacingError; use anyhow::bail; use bytes::BytesMut; -use pin_project_lite::pin_project; + use pq_proto::framed::{ConnectionError, Framed}; use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; use rustls::ServerConfig; @@ -17,7 +18,7 @@ use tokio_rustls::server::TlsStream; /// or [`AsyncWrite`] to prevent subtle errors 
(e.g. trying /// to pass random malformed bytes through the connection). pub struct PqStream { - framed: Framed, + pub(crate) framed: Framed, } impl PqStream { @@ -118,19 +119,21 @@ impl PqStream { } } -pin_project! { - /// Wrapper for upgrading raw streams into secure streams. - /// NOTE: it should be possible to decompose this object as necessary. - #[project = StreamProj] - pub enum Stream { - /// We always begin with a raw stream, - /// which may then be upgraded into a secure stream. - Raw { #[pin] raw: S }, +/// Wrapper for upgrading raw streams into secure streams. +pub enum Stream { + /// We always begin with a raw stream, + /// which may then be upgraded into a secure stream. + Raw { raw: S }, + Tls { /// We box [`TlsStream`] since it can be quite large. - Tls { #[pin] tls: Box> }, - } + tls: Box>, + /// Channel binding parameter + tls_server_end_point: TlsServerEndPoint, + }, } +impl Unpin for Stream {} + impl Stream { /// Construct a new instance from a raw stream. pub fn from_raw(raw: S) -> Self { @@ -141,7 +144,17 @@ impl Stream { pub fn sni_hostname(&self) -> Option<&str> { match self { Stream::Raw { .. } => None, - Stream::Tls { tls } => tls.get_ref().1.server_name(), + Stream::Tls { tls, .. } => tls.get_ref().1.server_name(), + } + } + + pub fn tls_server_end_point(&self) -> TlsServerEndPoint { + match self { + Stream::Raw { .. } => TlsServerEndPoint::Undefined, + Stream::Tls { + tls_server_end_point, + .. + } => *tls_server_end_point, } } } @@ -158,12 +171,9 @@ pub enum StreamUpgradeError { impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> Result { + pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { match self { - Stream::Raw { raw } => { - let tls = Box::new(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?); - Ok(Stream::Tls { tls }) - } + Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } @@ -171,50 +181,46 @@ impl Stream { impl AsyncRead for Stream { fn poll_read( - self: Pin<&mut Self>, + mut self: Pin<&mut Self>, context: &mut task::Context<'_>, buf: &mut ReadBuf<'_>, ) -> task::Poll> { - use StreamProj::*; - match self.project() { - Raw { raw } => raw.poll_read(context, buf), - Tls { tls } => tls.poll_read(context, buf), + match &mut *self { + Self::Raw { raw } => Pin::new(raw).poll_read(context, buf), + Self::Tls { tls, .. } => Pin::new(tls).poll_read(context, buf), } } } impl AsyncWrite for Stream { fn poll_write( - self: Pin<&mut Self>, + mut self: Pin<&mut Self>, context: &mut task::Context<'_>, buf: &[u8], ) -> task::Poll> { - use StreamProj::*; - match self.project() { - Raw { raw } => raw.poll_write(context, buf), - Tls { tls } => tls.poll_write(context, buf), + match &mut *self { + Self::Raw { raw } => Pin::new(raw).poll_write(context, buf), + Self::Tls { tls, .. } => Pin::new(tls).poll_write(context, buf), } } fn poll_flush( - self: Pin<&mut Self>, + mut self: Pin<&mut Self>, context: &mut task::Context<'_>, ) -> task::Poll> { - use StreamProj::*; - match self.project() { - Raw { raw } => raw.poll_flush(context), - Tls { tls } => tls.poll_flush(context), + match &mut *self { + Self::Raw { raw } => Pin::new(raw).poll_flush(context), + Self::Tls { tls, .. 
} => Pin::new(tls).poll_flush(context), } } fn poll_shutdown( - self: Pin<&mut Self>, + mut self: Pin<&mut Self>, context: &mut task::Context<'_>, ) -> task::Poll> { - use StreamProj::*; - match self.project() { - Raw { raw } => raw.poll_shutdown(context), - Tls { tls } => tls.poll_shutdown(context), + match &mut *self { + Self::Raw { raw } => Pin::new(raw).poll_shutdown(context), + Self::Tls { tls, .. } => Pin::new(tls).poll_shutdown(context), } } } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 180b5f7199b4..789a4c680ce7 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -6,6 +6,7 @@ use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_S use dashmap::{mapref::entry::Entry, DashMap}; use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; +use smol_str::SmolStr; use std::{ convert::Infallible, sync::{ @@ -29,8 +30,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// because we enrich the event with project_id in the control-plane endpoint. #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] pub struct Ids { - pub endpoint_id: String, - pub branch_id: String, + pub endpoint_id: SmolStr, + pub branch_id: SmolStr, } #[derive(Debug)] @@ -290,8 +291,8 @@ mod tests { // register a new counter let counter = metrics.register(Ids { - endpoint_id: "e1".to_string(), - branch_id: "b1".to_string(), + endpoint_id: "e1".into(), + branch_id: "b1".into(), }); // the counter should be observed despite 0 egress diff --git a/pyproject.toml b/pyproject.toml index 396edabe1006..536efeab5683 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,8 +33,8 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.8.6" -pytest-rerunfailures = "^11.1.2" +aiohttp = "3.9.0" +pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.21.0" diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 0f3e5630e85e..e26f2c6d6b34 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -6,8 +6,6 @@ license.workspace = true [dependencies] aws-sdk-s3.workspace = true -aws-smithy-http.workspace = true -aws-types.workspace = true either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true @@ -30,7 +28,7 @@ itertools.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } reqwest = { workspace = true, default-features = false, features = ["rustls-tls", "json"] } -aws-config = { workspace = true, default-features = false, features = ["rustls", "credentials-sso"] } +aws-config = { workspace = true, default-features = false, features = ["rustls", "sso"] } pageserver = { path = "../pageserver" } remote_storage = { path = "../libs/remote_storage" } diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 64702fca3d15..510a12866366 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -94,11 +94,10 @@ pub(crate) async fn branch_cleanup_and_check_errors( != index_part.get_disk_consistent_lsn() { result.errors.push(format!( - "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", - index_part.metadata.disk_consistent_lsn(), - index_part.get_disk_consistent_lsn(), - - )) + "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", + 
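// Why the stream.rs change above can drop pin_project: once the inner stream is
// Unpin, `Pin::new(&mut inner)` is safe and each poll_* method simply delegates to
// the active variant. A minimal sketch over a generic two-variant enum (the real
// Stream also carries tls_server_end_point in its Tls variant):
use std::io;
use std::pin::Pin;
use std::task::{Context, Poll};
use tokio::io::{AsyncRead, ReadBuf};

enum EitherStream<A, B> {
    Left(A),
    Right(B),
}

impl<A: AsyncRead + Unpin, B: AsyncRead + Unpin> AsyncRead for EitherStream<A, B> {
    fn poll_read(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> Poll<io::Result<()>> {
        // Both variants are Unpin, so the enum is Unpin and re-pinning a mutable
        // reference to either side is sound.
        match &mut *self {
            EitherStream::Left(a) => Pin::new(a).poll_read(cx, buf),
            EitherStream::Right(b) => Pin::new(b).poll_read(cx, buf),
        }
    }
}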
index_part.metadata.disk_consistent_lsn(), + index_part.get_disk_consistent_lsn(), + )) } if index_part.layer_metadata.is_empty() { @@ -109,8 +108,8 @@ pub(crate) async fn branch_cleanup_and_check_errors( for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( - "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(), - )) + "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(), + )) } let layer_map_key = (layer, metadata.generation); @@ -136,7 +135,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( // a new generation that didn't upload an index yet. // // Even so, a layer that is not referenced by the index could just - // be something enqueued for deletion, so while this check is valid + // be something enqueued for deletion, so while this check is valid // for indicating that a layer is garbage, it is not an indicator // of a problem. gen < &index_part_generation) @@ -251,10 +250,7 @@ pub(crate) async fn list_timeline_blobs( pin_mut!(stream); while let Some(obj) = stream.next().await { let obj = obj?; - let key = match obj.key() { - Some(k) => k, - None => continue, - }; + let key = obj.key(); let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket); match blob_name { @@ -287,7 +283,7 @@ pub(crate) async fn list_timeline_blobs( let (index_part_object, index_part_generation) = match index_parts .iter() .filter_map(|k| { - let key = k.key().unwrap(); + let key = k.key(); // Stripping the index key to the last part, because RemotePath doesn't // like absolute paths, and depending on prefix_in_bucket it's possible // for the keys we read back to start with a slash. @@ -308,8 +304,7 @@ pub(crate) async fn list_timeline_blobs( errors.push("S3 list response got no index_part.json file".to_string()); } - if let Some(index_part_object_key) = index_part_object.as_ref().and_then(|object| object.key()) - { + if let Some(index_part_object_key) = index_part_object.as_ref().map(|object| object.key()) { let index_part_bytes = download_object_with_retries( s3_client, &timeline_dir_target.bucket_name, diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs index daeb5e97778e..f27e1d7f6594 100644 --- a/s3_scrubber/src/garbage.rs +++ b/s3_scrubber/src/garbage.rs @@ -323,7 +323,7 @@ async fn do_delete( let delete_request = s3_client .delete_objects() .bucket(bucket_name) - .delete(Delete::builder().set_objects(Some(request_keys)).build()); + .delete(Delete::builder().set_objects(Some(request_keys)).build()?); delete_request .send() .await diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 777276a4d1fb..e5465952fbad 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -16,6 +16,7 @@ use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; use aws_config::sso::SsoCredentialsProvider; +use aws_config::BehaviorVersion; use aws_sdk_s3::config::Region; use aws_sdk_s3::{Client, Config}; @@ -245,6 +246,7 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie }; let mut builder = Config::builder() + .behavior_version(BehaviorVersion::v2023_11_09()) .region(bucket_region) .credentials_provider(credentials_provider); diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs index 8095071c1fd1..4cfa77cfc139 100644 --- 
a/s3_scrubber/src/metadata_stream.rs +++ b/s3_scrubber/src/metadata_stream.rs @@ -20,7 +20,6 @@ pub fn stream_tenants<'a>( let new_entry_ids = fetch_response .common_prefixes() - .unwrap_or_default() .iter() .filter_map(|prefix| prefix.prefix()) .filter_map(|prefix| -> Option<&str> { @@ -72,7 +71,6 @@ pub async fn stream_tenant_timelines<'a>( let new_entry_ids = fetch_response .common_prefixes() - .unwrap_or_default() .iter() .filter_map(|prefix| prefix.prefix()) .filter_map(|prefix| -> Option<&str> { @@ -116,15 +114,15 @@ pub(crate) fn stream_listing<'a>( list_objects_with_retries(s3_client, target, continuation_token.clone()).await?; if target.delimiter.is_empty() { - for object_id in fetch_response.contents().unwrap_or_default().iter().filter_map(|object| object.key()).map(|i| - ObjectIdentifier::builder().key(i).build() - ) { + for object_key in fetch_response.contents().iter().filter_map(|object| object.key()) + { + let object_id = ObjectIdentifier::builder().key(object_key).build()?; yield object_id; } } else { - for prefix in fetch_response.common_prefixes().unwrap_or_default() - .iter().filter_map(|p| p.prefix().map(|k| ObjectIdentifier::builder().key(k).build())) { - yield prefix; + for prefix in fetch_response.common_prefixes().iter().filter_map(|p| p.prefix()) { + let object_id = ObjectIdentifier::builder().key(prefix).build()?; + yield object_id; } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fc7e834bd2fd..9545dc2dd5c8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -434,8 +434,6 @@ def __init__( # Pageserver remote storage self.pageserver_remote_storage = pageserver_remote_storage - # Extensions remote storage - self.ext_remote_storage: Optional[S3Storage] = None # Safekeepers remote storage self.sk_remote_storage: Optional[RemoteStorage] = None @@ -534,24 +532,6 @@ def enable_pageserver_remote_storage( ) self.pageserver_remote_storage = ret - def enable_extensions_remote_storage(self, kind: RemoteStorageKind): - assert self.ext_remote_storage is None, "already configured extensions remote storage" - - # there is an assumption that REAL_S3 for extensions is never - # cleaned up these are also special in that they have a hardcoded - # bucket and region, which is most likely the same as our normal - ext = self._configure_and_create_remote_storage( - kind, - RemoteStorageUser.EXTENSIONS, - bucket_name="neon-dev-extensions-eu-central-1", - bucket_region="eu-central-1", - ) - assert isinstance( - ext, S3Storage - ), "unsure why, but only MOCK_S3 and REAL_S3 are currently supported for extensions" - ext.cleanup = False - self.ext_remote_storage = ext - def enable_safekeeper_remote_storage(self, kind: RemoteStorageKind): assert self.sk_remote_storage is None, "sk_remote_storage already configured" @@ -608,8 +588,7 @@ def cleanup_local_storage(self): directory_to_clean.rmdir() def cleanup_remote_storage(self): - # extensions are currently not cleaned up, disabled when creating - for x in [self.pageserver_remote_storage, self.ext_remote_storage, self.sk_remote_storage]: + for x in [self.pageserver_remote_storage, self.sk_remote_storage]: if isinstance(x, S3Storage): x.do_cleanup() @@ -713,7 +692,6 @@ def __init__(self, config: NeonEnvBuilder): self.pageservers: List[NeonPageserver] = [] self.broker = config.broker self.pageserver_remote_storage = config.pageserver_remote_storage - self.ext_remote_storage = config.ext_remote_storage self.safekeepers_remote_storage = 
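// A sketch of the aws-sdk-s3 1.x construction pattern used by the s3_scrubber
// changes above (explicit BehaviorVersion, slice-returning accessors, fallible
// builders). The credentials provider wiring is omitted here for brevity, so this
// is only a shape illustration, not the scrubber's full init_s3_client.
use aws_config::BehaviorVersion;
use aws_sdk_s3::config::Region;
use aws_sdk_s3::{Client, Config};

fn make_s3_client(region: Region) -> Client {
    let config = Config::builder()
        .behavior_version(BehaviorVersion::v2023_11_09())
        .region(region)
        .build();
    Client::from_conf(config)
}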
config.sk_remote_storage self.pg_version = config.pg_version # Binary path for pageserver, safekeeper, etc @@ -1436,45 +1414,25 @@ def endpoint_create( def endpoint_start( self, endpoint_id: str, - pg_port: int, - http_port: int, safekeepers: Optional[List[int]] = None, - tenant_id: Optional[TenantId] = None, - lsn: Optional[Lsn] = None, - branch_name: Optional[str] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", "start", - "--tenant-id", - str(tenant_id or self.env.initial_tenant), - "--pg-version", - self.env.pg_version, ] if remote_ext_config is not None: args.extend(["--remote-ext-config", remote_ext_config]) - if lsn is not None: - args.append(f"--lsn={lsn}") - args.extend(["--pg-port", str(pg_port)]) - args.extend(["--http-port", str(http_port)]) if safekeepers is not None: args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) - if branch_name is not None: - args.extend(["--branch-name", branch_name]) if endpoint_id is not None: args.append(endpoint_id) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) - storage = self.env.ext_remote_storage - s3_env_vars = None - if isinstance(storage, S3Storage): - s3_env_vars = storage.access_env_vars() - - res = self.raw_cli(args, extra_env_vars=s3_env_vars) + res = self.raw_cli(args) res.check_returncode() return res @@ -1495,15 +1453,12 @@ def endpoint_reconfigure( def endpoint_stop( self, endpoint_id: str, - tenant_id: Optional[TenantId] = None, destroy=False, check_return_code=True, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", "stop", - "--tenant-id", - str(tenant_id or self.env.initial_tenant), ] if destroy: args.append("--destroy") @@ -1599,7 +1554,7 @@ def stop(self, immediate: bool = False) -> "NeonAttachmentService": self.running = False return self - def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int: + def attach_hook_issue(self, tenant_id: TenantId, pageserver_id: int) -> int: response = requests.post( f"{self.env.control_plane_api}/attach-hook", json={"tenant_id": str(tenant_id), "node_id": pageserver_id}, @@ -1609,6 +1564,13 @@ def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int: assert isinstance(gen, int) return gen + def attach_hook_drop(self, tenant_id: TenantId): + response = requests.post( + f"{self.env.control_plane_api}/attach-hook", + json={"tenant_id": str(tenant_id), "node_id": None}, + ) + response.raise_for_status() + def __enter__(self) -> "NeonAttachmentService": return self @@ -1808,13 +1770,20 @@ def tenant_attach( to call into the pageserver HTTP client. 
""" if self.env.attachment_service is not None: - generation = self.env.attachment_service.attach_hook(tenant_id, self.id) + generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) else: generation = None client = self.http_client() return client.tenant_attach(tenant_id, config, config_null, generation=generation) + def tenant_detach(self, tenant_id: TenantId): + if self.env.attachment_service is not None: + self.env.attachment_service.attach_hook_drop(tenant_id) + + client = self.http_client() + return client.tenant_detach(tenant_id) + def append_pageserver_param_overrides( params_to_update: List[str], @@ -1889,7 +1858,8 @@ def run_capture( command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None, - **kwargs: Any, + with_command_header=True, + **popen_kwargs: Any, ) -> str: """ Run one of the postgres binaries, with stderr and stdout redirected to a file. @@ -1902,7 +1872,13 @@ def run_capture( log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) base_path, _, _ = subprocess_capture( - self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs + self.log_dir, + command, + env=env, + cwd=cwd, + check=True, + with_command_header=with_command_header, + **popen_kwargs, ) return base_path @@ -2145,6 +2121,7 @@ def extra_args(self) -> list[str]: # Console auth backend params *["--auth-backend", "console"], *["--auth-endpoint", self.endpoint], + *["--sql-over-http-pool-opt-in", "false"], ] if self.fixed_rate_limit is not None: args += [ @@ -2420,6 +2397,10 @@ def static_proxy( # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql` vanilla_pg.start() vanilla_pg.safe_psql("create user proxy with login superuser password 'password'") + vanilla_pg.safe_psql("CREATE SCHEMA IF NOT EXISTS neon_control_plane") + vanilla_pg.safe_psql( + "CREATE TABLE neon_control_plane.endpoints (endpoint_id VARCHAR(255) PRIMARY KEY, allowed_ips VARCHAR(255))" + ) proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() @@ -2520,9 +2501,6 @@ def start( self.env.neon_cli.endpoint_start( self.endpoint_id, - pg_port=self.pg_port, - http_port=self.http_port, - tenant_id=self.tenant_id, safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, @@ -2582,6 +2560,17 @@ def respec(self, **kwargs): with open(config_path, "w") as file: json.dump(dict(data_dict, **kwargs), file, indent=4) + # Mock the extension part of spec passed from control plane for local testing + # endpooint.rs adds content of this file as a part of the spec.json + def create_remote_extension_spec(self, spec: dict[str, Any]): + """Create a remote extension spec file for the endpoint.""" + remote_extensions_spec_path = os.path.join( + self.endpoint_path(), "remote_extensions_spec.json" + ) + + with open(remote_extensions_spec_path, "w") as file: + json.dump(spec, file, indent=4) + def stop(self) -> "Endpoint": """ Stop the Postgres instance if it's running. 
@@ -2591,7 +2580,7 @@ def stop(self) -> "Endpoint": if self.running: assert self.endpoint_id is not None self.env.neon_cli.endpoint_stop( - self.endpoint_id, self.tenant_id, check_return_code=self.check_stop_result + self.endpoint_id, check_return_code=self.check_stop_result ) self.running = False @@ -2605,7 +2594,7 @@ def stop_and_destroy(self) -> "Endpoint": assert self.endpoint_id is not None self.env.neon_cli.endpoint_stop( - self.endpoint_id, self.tenant_id, True, check_return_code=self.check_stop_result + self.endpoint_id, True, check_return_code=self.check_stop_result ) self.endpoint_id = None self.running = False @@ -3040,6 +3029,11 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: """Compute the working directory for an individual test.""" test_name = request.node.name test_dir = top_output_dir / test_name.replace("/", "-") + + # We rerun flaky tests multiple times, use a separate directory for each run. + if (suffix := getattr(request.node, "execution_count", None)) is not None: + test_dir = test_dir.parent / f"{test_dir.name}-{suffix}" + log.info(f"get_test_output_dir is {test_dir}") # make mypy happy assert isinstance(test_dir, Path) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 2f1d68b92c0e..76aa40122f53 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -4,7 +4,7 @@ import time from collections import defaultdict from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple import requests from requests.adapters import HTTPAdapter @@ -100,6 +100,15 @@ def kind_count(self) -> Dict[str, int]: counts[hist_layer.kind] += 1 return counts + def delta_layers(self) -> List[HistoricLayerInfo]: + return [x for x in self.historic_layers if x.kind == "Delta"] + + def image_layers(self) -> List[HistoricLayerInfo]: + return [x for x in self.historic_layers if x.kind == "Image"] + + def historic_by_name(self) -> Set[str]: + return set(x.layer_file_name for x in self.historic_layers) + @dataclass class TenantConfig: @@ -254,6 +263,7 @@ def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): def tenant_delete(self, tenant_id: TenantId): res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) + return res def tenant_load(self, tenant_id: TenantId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load") @@ -352,12 +362,16 @@ def timeline_create( new_timeline_id: TimelineId, ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, + existing_initdb_timeline_id: Optional[TimelineId] = None, **kwargs, ) -> Dict[Any, Any]: body: Dict[str, Any] = { "new_timeline_id": str(new_timeline_id), "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, + "existing_initdb_timeline_id": str(existing_initdb_timeline_id) + if existing_initdb_timeline_id + else None, } if pg_version != PgVersion.NOT_SET: body["pg_version"] = int(pg_version) @@ -416,6 +430,10 @@ def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs def timeline_gc( self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] ) -> dict[str, Any]: + """ + Unlike most handlers, this will wait for the layers to be actually + complete registering themselves to the deletion 
queue. + """ self.is_testing_enabled_or_skip() log.info( diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 007ff387f41c..e7b78cfb9a97 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,7 +1,7 @@ import time -from typing import TYPE_CHECKING, Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional -from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef +from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient @@ -235,10 +235,14 @@ def timeline_delete_wait_completed( from fixtures.neon_fixtures import NeonEnvBuilder -def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None): +def assert_prefix_empty( + neon_env_builder: "NeonEnvBuilder", + prefix: Optional[str] = None, + allowed_postfix: Optional[str] = None, +): response = list_prefix(neon_env_builder, prefix) keys = response["KeyCount"] - objects = response.get("Contents", []) + objects: List[ObjectTypeDef] = response.get("Contents", []) common_prefixes = response.get("CommonPrefixes", []) remote_storage = neon_env_builder.pageserver_remote_storage @@ -261,7 +265,18 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}" ) - assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}" + filtered_count = 0 + if allowed_postfix is None: + filtered_count = len(objects) + else: + for _obj in objects: + key: str = str(response.get("Key", [])) + if not (allowed_postfix.endswith(key)): + filtered_count += 1 + + assert ( + filtered_count == 0 + ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None): diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 6e857766e55f..cda788b2a480 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -49,7 +49,8 @@ def subprocess_capture( echo_stdout=False, capture_stdout=False, timeout=None, - **kwargs: Any, + with_command_header=True, + **popen_kwargs: Any, ) -> Tuple[str, Optional[str], int]: """Run a process and bifurcate its output to files and the `log` logger @@ -86,13 +87,23 @@ def __init__(self, in_file, out_file, echo: bool, capture: bool): self.captured = "" def run(self): + first = with_command_header for line in self.in_file: + if first: + # do this only after receiving any input so that we can + # keep deleting empty files, or leave it out completly if + # it was unwanted (using the file as input later for example) + first = False + # prefix the files with the command line so that we can + # later understand which file is for what command + self.out_file.write((f"# {' '.join(cmd)}\n\n").encode("utf-8")) + # Only bother decoding if we are going to do something more than stream to a file if self.echo or self.capture: string = line.decode(encoding="utf-8", errors="replace") if self.echo: - log.info(string) + log.info(string.strip()) if self.capture: self.captured += string @@ -107,7 +118,7 @@ def run(self): p = subprocess.Popen( cmd, - **kwargs, + **popen_kwargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) @@ -138,17 +149,19 @@ def run(self): 
_global_counter = 0 +_global_counter_lock = threading.Lock() def global_counter() -> int: - """A really dumb global counter. + """A really dumb but thread-safe global counter. This is useful for giving output files a unique number, so if we run the same command multiple times we can keep their output separate. """ - global _global_counter - _global_counter += 1 - return _global_counter + global _global_counter, _global_counter_lock + with _global_counter_lock: + _global_counter += 1 + return _global_counter def print_gc_result(row: Dict[str, Any]): diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index d113e9e6375e..7ad65821d45b 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -14,7 +14,7 @@ Some handy pytest flags for local development: - `-s` shows test output - `-k` selects a test to run - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`) -- `--cleanup-test-ouput` cleans up after each test +- `--preserve-database-files` to skip cleanup # What performance tests do we have and how we run them diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 4ea21eb378a1..3ac0f16e4bac 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -437,9 +437,9 @@ checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "openssl" -version = "0.10.57" +version = "0.10.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bac25ee399abb46215765b1cb35bc0212377e58a061560d8b29b024fd0430e7c" +checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" dependencies = [ "bitflags 2.4.1", "cfg-if", @@ -469,9 +469,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.93" +version = "0.9.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db4d56a4c0478783083cfafcc42493dd4a981d41669da64b4572a2a089b51b1d" +checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" dependencies = [ "cc", "libc", diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 4911fc09d63c..84a322039a66 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -114,7 +114,6 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) [ ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*", ".*Timeline got dropped without initializing, cleaning its files.*", - ".*Failed to load index_part from remote storage, failed creation?.*", ] ) @@ -144,8 +143,13 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) ), "pageserver should clean its temp timeline files on timeline creation failure" -def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() +# The "exit" case is for a reproducer of issue 6007: an unclean shutdown where we can't do local fs cleanups +@pytest.mark.parametrize("exit_or_return", ["return", "exit"]) +def test_timeline_init_break_before_checkpoint_recreate( + neon_env_builder: NeonEnvBuilder, exit_or_return: str +): + env = neon_env_builder.init_configs() + env.start() pageserver_http = env.pageserver.http_client() 
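A usage note on the with_command_header flag introduced in subprocess_capture/run_capture above: once a command produces any output, its capture file now starts with a comment naming the command, which helps when many *.stdout files accumulate in the test output directory, but the header has to be disabled whenever the file is consumed byte-for-byte. A small sketch (the command and data directory are illustrative):

    from pathlib import Path

    # default: the capture file is prefixed with "# <command line>"
    base = pg_bin.run_capture(["pg_controldata", str(pgdata_dir)])
    header = Path(base + ".stdout").read_text().splitlines()[0]
    # header == "# pg_controldata /path/to/pgdata"

    # opt out when the output must stay pristine, e.g. to be parsed or reused as input,
    # as the --sync-safekeepers call later in this patch does
    base = pg_bin.run_capture(["pg_controldata", str(pgdata_dir)], with_command_header=False)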
env.pageserver.allowed_errors.extend( @@ -156,6 +160,7 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn ] ) + pageserver_http.tenant_create(env.initial_tenant) tenant_id = env.initial_tenant timelines_dir = env.pageserver.timeline_dir(tenant_id) @@ -166,13 +171,17 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn timeline_id = TimelineId("1080243c1f76fe3c5147266663c9860b") # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed. - pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return")) - with pytest.raises(Exception, match="before-checkpoint-new-timeline"): - _ = env.neon_cli.create_timeline( - "test_timeline_init_break_before_checkpoint", tenant_id, timeline_id - ) + failpoint = "before-checkpoint-new-timeline" + pattern = failpoint + if exit_or_return == "exit": + # in reality a read error happens, but there are automatic retries which now fail because pageserver is dead + pattern = "Connection aborted." - # Restart the page server + pageserver_http.configure_failpoints((failpoint, exit_or_return)) + with pytest.raises(Exception, match=pattern): + _ = pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id) + + # Restart the page server (with the failpoint disabled) env.pageserver.restart(immediate=True) # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. @@ -186,11 +195,9 @@ def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEn timeline_dirs == initial_timeline_dirs ), "pageserver should clean its temp timeline files on timeline creation failure" - # Disable the failpoint again - pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "off")) # creating the branch should have worked now - new_timeline_id = env.neon_cli.create_timeline( - "test_timeline_init_break_before_checkpoint", tenant_id, timeline_id + new_timeline_id = TimelineId( + pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)["timeline_id"] ) assert timeline_id == new_timeline_id diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 98f6677c00a5..f3c6af442739 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -411,7 +411,6 @@ def check_neon_works( config.initial_tenant = snapshot_config["default_tenant_id"] config.pg_distrib_dir = pg_distrib_dir config.remote_storage = None - config.ext_remote_storage = None config.sk_remote_storage = None # Use the "target" binaries to launch the storage nodes @@ -435,8 +434,11 @@ def check_neon_works( pg_port = port_distributor.get_port() http_port = port_distributor.get_port() - cli_current.endpoint_start("main", pg_port=pg_port, http_port=http_port) - request.addfinalizer(lambda: cli_current.endpoint_stop("main")) + cli_current.endpoint_create( + branch_name="main", pg_port=pg_port, http_port=http_port, endpoint_id="ep-main" + ) + cli_current.endpoint_start("ep-main") + request.addfinalizer(lambda: cli_current.endpoint_stop("ep-main")) connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres" pg_bin.run_capture( diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 775ad102416a..27eb05ac0912 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -1,316 +1,137 
@@ import os import shutil -import threading from contextlib import closing from pathlib import Path +from typing import Any, Dict import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) -from fixtures.pg_version import PgVersion, skip_on_postgres -from fixtures.remote_storage import ( - RemoteStorageKind, - S3Storage, - available_s3_storages, -) - - -# Cleaning up downloaded files is important for local tests -# or else one test could reuse the files from another test or another test run -def cleanup(pg_version): - PGDIR = Path(f"pg_install/v{pg_version}") - - LIB_DIR = PGDIR / Path("lib/postgresql") - cleanup_lib_globs = ["anon*", "postgis*", "pg_buffercache*"] - cleanup_lib_glob_paths = [LIB_DIR.glob(x) for x in cleanup_lib_globs] - - SHARE_DIR = PGDIR / Path("share/postgresql/extension") - cleanup_ext_globs = [ - "anon*", - "address_standardizer*", - "postgis*", - "pageinspect*", - "pg_buffercache*", - "pgrouting*", - ] - cleanup_ext_glob_paths = [SHARE_DIR.glob(x) for x in cleanup_ext_globs] - - all_glob_paths = cleanup_lib_glob_paths + cleanup_ext_glob_paths - all_cleanup_files = [] - for file_glob in all_glob_paths: - for file in file_glob: - all_cleanup_files.append(file) - - for file in all_cleanup_files: - try: - os.remove(file) - log.info(f"removed file {file}") - except Exception as err: - log.info( - f"skipping remove of file {file} because it doesn't exist.\ - this may be expected or unexpected depending on the test {err}" - ) - - cleanup_folders = [SHARE_DIR / Path("anon"), PGDIR / Path("download_extensions")] - for folder in cleanup_folders: - try: - shutil.rmtree(folder) - log.info(f"removed folder {folder}") - except Exception as err: - log.info( - f"skipping remove of folder {folder} because it doesn't exist.\ - this may be expected or unexpected depending on the test {err}" - ) +from fixtures.pg_version import PgVersion +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response -def upload_files(env): - log.info("Uploading test files to mock bucket") - os.chdir("test_runner/regress/data/extension_test") - for path in os.walk("."): - prefix, _, files = path - for file in files: - # the [2:] is to remove the leading "./" - full_path = os.path.join(prefix, file)[2:] - - with open(full_path, "rb") as f: - log.info(f"UPLOAD {full_path} to ext/{full_path}") - assert isinstance(env.pageserver_remote_storage, S3Storage) - env.pageserver_remote_storage.client.upload_fileobj( - f, - env.ext_remote_storage.bucket_name, - f"ext/{full_path}", - ) - os.chdir("../../../..") - - -# Test downloading remote extension. 
-@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") -@pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) -@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") -def test_remote_extensions( +# use neon_env_builder_local fixture to override the default neon_env_builder fixture +# and use a test-specific pg_install instead of shared one +@pytest.fixture(scope="function") +def neon_env_builder_local( neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, + test_output_dir: Path, + pg_distrib_dir: Path, pg_version: PgVersion, -): - neon_env_builder.enable_extensions_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start() - tenant_id, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline("test_remote_extensions", tenant_id=tenant_id) - - assert env.ext_remote_storage is not None # satisfy mypy - - # For MOCK_S3 we upload test files. - # For REAL_S3 we use the files already in the bucket - if remote_storage_kind == RemoteStorageKind.MOCK_S3: - upload_files(env) - - # Start a compute node and check that it can download the extensions - # and use them to CREATE EXTENSION and LOAD - endpoint = env.endpoints.create_start( - "test_remote_extensions", - tenant_id=tenant_id, - remote_ext_config=env.ext_remote_storage.to_string(), - # config_lines=["log_min_messages=debug3"], +) -> NeonEnvBuilder: + test_local_pginstall = test_output_dir / "pg_install" + log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}") + shutil.copytree( + pg_distrib_dir / pg_version.v_prefixed, test_local_pginstall / pg_version.v_prefixed ) - try: - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - # Check that appropriate control files were downloaded - cur.execute("SELECT * FROM pg_available_extensions") - all_extensions = [x[0] for x in cur.fetchall()] - log.info(all_extensions) - assert "anon" in all_extensions - # postgis is on real s3 but not mock s3. - # it's kind of a big file, would rather not upload to github - if remote_storage_kind == RemoteStorageKind.REAL_S3: - assert "postgis" in all_extensions - # this may fail locally if dependency is missing - # we don't really care about the error, - # we just want to make sure it downloaded - try: - cur.execute("CREATE EXTENSION postgis") - except Exception as err: - log.info(f"(expected) error creating postgis extension: {err}") - # we do not check the error, so this is basically a NO-OP - # however checking the log you can make sure that it worked - # and also get valuable information about how long loading the extension took + neon_env_builder.pg_distrib_dir = test_local_pginstall + log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}") - # this is expected to fail on my computer because I don't have the pgcrypto extension - try: - cur.execute("CREATE EXTENSION anon") - except Exception as err: - log.info("error creating anon extension") - assert "pgcrypto" in str(err), "unexpected error creating anon extension" - finally: - cleanup(pg_version) + return neon_env_builder -# Test downloading remote library. 
-@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") -@pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) -@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") -def test_remote_library( - neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, - pg_version: PgVersion, +def test_remote_extensions( + httpserver: HTTPServer, + neon_env_builder_local: NeonEnvBuilder, + httpserver_listen_address, + pg_version, ): - neon_env_builder.enable_extensions_remote_storage(remote_storage_kind) - env = neon_env_builder.init_start() - tenant_id, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline("test_remote_library", tenant_id=tenant_id) - - assert env.ext_remote_storage is not None # satisfy mypy - - # For MOCK_S3 we upload test files. - # For REAL_S3 we use the files already in the bucket - if remote_storage_kind == RemoteStorageKind.MOCK_S3: - upload_files(env) + if pg_version == PgVersion.V16: + pytest.skip("TODO: PG16 extension building") + + # setup mock http server + # that expects request for anon.tar.zst + # and returns the requested file + (host, port) = httpserver_listen_address + extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" + + build_tag = os.environ.get("BUILD_TAG", "latest") + archive_path = f"{build_tag}/v{pg_version}/extensions/anon.tar.zst" + + def endpoint_handler_build_tag(request: Request) -> Response: + log.info(f"request: {request}") + + file_name = "anon.tar.zst" + file_path = f"test_runner/regress/data/extension_test/5670669815/v{pg_version}/extensions/anon.tar.zst" + file_size = os.path.getsize(file_path) + fh = open(file_path, "rb") + + return Response( + fh, + mimetype="application/octet-stream", + headers=[ + ("Content-Length", str(file_size)), + ("Content-Disposition", 'attachment; filename="%s"' % file_name), + ], + direct_passthrough=True, + ) + + httpserver.expect_request( + f"/pg-ext-s3-gateway/{archive_path}", method="GET" + ).respond_with_handler(endpoint_handler_build_tag) + + # Start a compute node with remote_extension spec + # and check that it can download the extensions and use them to CREATE EXTENSION. 
+ env = neon_env_builder_local.init_start() + env.neon_cli.create_branch("test_remote_extensions") + endpoint = env.endpoints.create( + "test_remote_extensions", + config_lines=["log_min_messages=debug3"], + ) - # and use them to run LOAD library - endpoint = env.endpoints.create_start( - "test_remote_library", - tenant_id=tenant_id, - remote_ext_config=env.ext_remote_storage.to_string(), - # config_lines=["log_min_messages=debug3"], + # mock remote_extensions spec + spec: Dict[str, Any] = { + "library_index": { + "anon": "anon", + }, + "extension_data": { + "anon": { + "archive_path": "", + "control_data": { + "anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = 'Data anonymization tools'\ndefault_version = '1.1.0'\ndirectory='extension/anon'\nrelocatable = false\nrequires = 'pgcrypto'\nsuperuser = false\nmodule_pathname = '$libdir/anon'\ntrusted = true\n" + }, + }, + }, + } + spec["extension_data"]["anon"]["archive_path"] = archive_path + + endpoint.create_remote_extension_spec(spec) + + endpoint.start( + remote_ext_config=extensions_endpoint, ) + + # this is expected to fail if there's no pgcrypto extension, that's ok + # we just want to check that the extension was downloaded try: with closing(endpoint.connect()) as conn: with conn.cursor() as cur: - # try to load library - try: - cur.execute("LOAD 'anon'") - except Exception as err: - log.info(f"error loading anon library: {err}") - raise AssertionError("unexpected error loading anon library") from err + # Check that appropriate files were downloaded + cur.execute("CREATE EXTENSION anon") + res = [x[0] for x in cur.fetchall()] + log.info(res) + except Exception as err: + assert "pgcrypto" in str(err), f"unexpected error creating anon extension {err}" - # test library which name is different from extension name - # this may fail locally if dependency is missing - # however, it does successfully download the postgis archive - if remote_storage_kind == RemoteStorageKind.REAL_S3: - try: - cur.execute("LOAD 'postgis_topology-3'") - except Exception as err: - log.info("error loading postgis_topology-3") - assert "No such file or directory" in str( - err - ), "unexpected error loading postgis_topology-3" - finally: - cleanup(pg_version) + httpserver.check() -# Here we test a complex extension -# which has multiple extensions in one archive +# TODO +# 1. Test downloading remote library. +# +# 2. 
Test a complex extension, which has multiple extensions in one archive # using postgis as an example -# @pytest.mark.skipif( -# RemoteStorageKind.REAL_S3 not in available_s3_storages(), -# reason="skipping test because real s3 not enabled", -# ) -@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") -@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") -def test_multiple_extensions_one_archive( - neon_env_builder: NeonEnvBuilder, - pg_version: PgVersion, -): - neon_env_builder.enable_extensions_remote_storage(RemoteStorageKind.REAL_S3) - env = neon_env_builder.init_start() - tenant_id, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline("test_multiple_extensions_one_archive", tenant_id=tenant_id) - - assert env.ext_remote_storage is not None # satisfy mypy - - endpoint = env.endpoints.create_start( - "test_multiple_extensions_one_archive", - tenant_id=tenant_id, - remote_ext_config=env.ext_remote_storage.to_string(), - ) - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE EXTENSION address_standardizer;") - cur.execute("CREATE EXTENSION address_standardizer_data_us;") - # execute query to ensure that it works - cur.execute( - "SELECT house_num, name, suftype, city, country, state, unit \ - FROM standardize_address('us_lex', 'us_gaz', 'us_rules', \ - 'One Rust Place, Boston, MA 02109');" - ) - res = cur.fetchall() - log.info(res) - assert len(res) > 0 - - cleanup(pg_version) - - -# Test that extension is downloaded after endpoint restart, -# when the library is used in the query. # +# 3.Test that extension is downloaded after endpoint restart, +# when the library is used in the query. # Run the test with mutliple simultaneous connections to an endpoint. # to ensure that the extension is downloaded only once. # -@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") -def test_extension_download_after_restart( - neon_env_builder: NeonEnvBuilder, - pg_version: PgVersion, -): - # TODO: PG15 + PG16 extension building - if "v14" not in pg_version: # test set only has extension built for v14 - return None - - neon_env_builder.enable_extensions_remote_storage(RemoteStorageKind.MOCK_S3) - env = neon_env_builder.init_start() - tenant_id, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline("test_extension_download_after_restart", tenant_id=tenant_id) - - assert env.ext_remote_storage is not None # satisfy mypy - - # For MOCK_S3 we upload test files. 
- upload_files(env) - - endpoint = env.endpoints.create_start( - "test_extension_download_after_restart", - tenant_id=tenant_id, - remote_ext_config=env.ext_remote_storage.to_string(), - config_lines=["log_min_messages=debug3"], - ) - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE extension pg_buffercache;") - cur.execute("SELECT * from pg_buffercache;") - res = cur.fetchall() - assert len(res) > 0 - log.info(res) - - # shutdown compute node - endpoint.stop() - # remove extension files locally - cleanup(pg_version) - - # spin up compute node again (there are no extension files available, because compute is stateless) - endpoint = env.endpoints.create_start( - "test_extension_download_after_restart", - tenant_id=tenant_id, - remote_ext_config=env.ext_remote_storage.to_string(), - config_lines=["log_min_messages=debug3"], - ) - - # connect to compute node and run the query - # that will trigger the download of the extension - def run_query(endpoint, thread_id: int): - log.info("thread_id {%d} starting", thread_id) - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("SELECT * from pg_buffercache;") - res = cur.fetchall() - assert len(res) > 0 - log.info("thread_id {%d}, res = %s", thread_id, res) - - threads = [threading.Thread(target=run_query, args=(endpoint, i)) for i in range(2)] - - for thread in threads: - thread.start() - for thread in threads: - thread.join() - - cleanup(pg_version) +# 4. Test that private extensions are only downloaded when they are present in the spec. +# diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index 214f1f33a836..a456c0686267 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -20,6 +20,7 @@ def test_fullbackup( pg_bin: PgBin, port_distributor: PortDistributor, pg_distrib_dir: Path, + test_output_dir: Path, ): env = neon_env_builder.init_start() @@ -49,10 +50,12 @@ def test_fullbackup( restored_dir_path = env.repo_dir / "restored_datadir" os.mkdir(restored_dir_path, 0o750) query = f"fullbackup {env.initial_tenant} {timeline} {lsn}" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] - result_basepath = pg_bin.run_capture(cmd, env=psql_env) - tar_output_file = result_basepath + ".stdout" - subprocess_capture(env.repo_dir, ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)]) + tar_output_file = test_output_dir / "fullbackup.tar" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] + pg_bin.run_capture(cmd, env=psql_env) + subprocess_capture( + env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)] + ) # HACK # fullbackup returns neon specific pg_control and first WAL segment diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index d357bd0ee451..8da5f1eec2ea 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -163,7 +163,9 @@ def import_tar(base, wal): assert endpoint.safe_psql("select count(*) from t") == [(300000,)] -def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): +def test_import_from_pageserver_small( + pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path +): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() @@ -177,7 +179,7 @@ def test_import_from_pageserver_small(pg_bin: PgBin, 
neon_env_builder: NeonEnvBu num_rows = 3000 lsn = _generate_data(num_rows, endpoint) - _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir) + _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir) @pytest.mark.timeout(1800) @@ -185,7 +187,9 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu # the test back after finding the failure cause. # @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build") @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255") -def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): +def test_import_from_pageserver_multisegment( + pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path +): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() @@ -205,7 +209,9 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") assert logical_size > 1024**3 # = 1GB - tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir) + tar_output_file = _import( + num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir + ) # Check if the backup data contains multiple segment files cnt_seg_files = 0 @@ -246,7 +252,8 @@ def _import( pg_bin: PgBin, timeline: TimelineId, pg_distrib_dir: Path, -) -> str: + test_output_dir: Path, +) -> Path: """Test importing backup data to the pageserver. Args: @@ -263,9 +270,9 @@ def _import( # Get a fullbackup from pageserver query = f"fullbackup { env.initial_tenant} {timeline} {lsn}" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] - result_basepath = pg_bin.run_capture(cmd, env=psql_env) - tar_output_file = result_basepath + ".stdout" + tar_output_file = test_output_dir / "fullbackup.tar" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] + pg_bin.run_capture(cmd, env=psql_env) # Stop the first pageserver instance, erase all its data env.endpoints.stop_all() @@ -299,7 +306,7 @@ def _import( "--base-lsn", str(lsn), "--base-tarfile", - os.path.join(tar_output_file), + str(tar_output_file), "--pg-version", env.pg_version, ] @@ -315,9 +322,17 @@ def _import( # Take another fullbackup query = f"fullbackup { tenant} {timeline} {lsn}" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] - result_basepath = pg_bin.run_capture(cmd, env=psql_env) - new_tar_output_file = result_basepath + ".stdout" + new_tar_output_file = test_output_dir / "fullbackup-new.tar" + cmd = [ + "psql", + "--no-psqlrc", + env.pageserver.connstr(), + "-c", + query, + "-o", + str(new_tar_output_file), + ] + pg_bin.run_capture(cmd, env=psql_env) # Check it's the same as the first fullbackup # TODO pageserver should be checking checksum diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index becdd9ff80ca..46b72fbca500 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -1,3 +1,4 @@ +import pytest from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.port_distributor import PortDistributor @@ -11,19 +12,50 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por env.neon_cli.start() env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True) + main_branch_name = "main" 
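The fullbackup changes above (test_fullbackup and test_import) now write the tar directly with psql's -o flag instead of recovering it from run_capture's stdout file. Condensed, the pattern the two tests share looks like this (variable names follow the tests):

    # take a fullbackup from the pageserver over its libpq port and unpack it
    tar_output_file = test_output_dir / "fullbackup.tar"
    query = f"fullbackup {env.initial_tenant} {timeline} {lsn}"
    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
    pg_bin.run_capture(cmd, env=psql_env)
    subprocess_capture(
        env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)]
    )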
pg_port = port_distributor.get_port() http_port = port_distributor.get_port() - env.neon_cli.endpoint_start( - endpoint_id="ep-basic-main", pg_port=pg_port, http_port=http_port + env.neon_cli.endpoint_create( + main_branch_name, pg_port, http_port, endpoint_id="ep-basic-main" ) + env.neon_cli.endpoint_start("ep-basic-main") branch_name = "migration-check" - - env.neon_cli.create_branch(new_branch_name=branch_name) + env.neon_cli.create_branch(branch_name) pg_port = port_distributor.get_port() http_port = port_distributor.get_port() - env.neon_cli.endpoint_start( - f"ep-{branch_name}", pg_port, http_port, branch_name=branch_name + env.neon_cli.endpoint_create( + branch_name, pg_port, http_port, endpoint_id=f"ep-{branch_name}" ) + env.neon_cli.endpoint_start(f"ep-{branch_name}") finally: env.neon_cli.stop() + + +def test_neon_two_primary_endpoints_fail( + neon_env_builder: NeonEnvBuilder, port_distributor: PortDistributor +): + """ + Two primary endpoints with same tenant and timeline will not run together + """ + env = neon_env_builder.init_start() + branch_name = "main" + + pg_port = port_distributor.get_port() + http_port = port_distributor.get_port() + env.neon_cli.endpoint_create(branch_name, pg_port, http_port, "ep1") + + pg_port = port_distributor.get_port() + http_port = port_distributor.get_port() + # ep1 is not running so create will succeed + env.neon_cli.endpoint_create(branch_name, pg_port, http_port, "ep2") + + env.neon_cli.endpoint_start("ep1") + + expected_message = f'attempting to create a duplicate primary endpoint on tenant {env.initial_tenant}, timeline {env.initial_timeline}: endpoint "ep1" exists already. please don\'t do this, it is not supported.' + with pytest.raises(RuntimeError): + assert expected_message in env.neon_cli.endpoint_start("ep2").stderr + + env.neon_cli.endpoint_stop("ep1") + # ep1 is stopped so create ep2 will succeed + env.neon_cli.endpoint_start("ep2") diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index c3f4ad476f6a..66cc286aba38 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -282,7 +282,7 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. - env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver) + env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) generate_uploads_and_deletions(env, init=False) assert_deletion_queue(ps_http, lambda n: n > 0) @@ -397,7 +397,7 @@ def assert_header_written(): if keep_attachment == KeepAttachment.LOSE: some_other_pageserver = 101010 assert env.attachment_service is not None - env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver) + env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) env.pageserver.start() diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py new file mode 100644 index 000000000000..f53357981162 --- /dev/null +++ b/test_runner/regress/test_proxy_allowed_ips.py @@ -0,0 +1,74 @@ +import psycopg2 +import pytest +from fixtures.neon_fixtures import ( + NeonProxy, + VanillaPostgres, +) + +TABLE_NAME = "neon_control_plane.endpoints" + + +# Proxy uses the same logic for psql and websockets. 
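The allowed-IPs tests that follow rely on the proxy consulting the neon_control_plane.endpoints table created in the static_proxy fixture earlier in this patch. Roughly, the behaviour they exercise is the check below, written as a Python sketch rather than the proxy's actual implementation:

    def connection_allowed(client_ip: str, allowed_ips: str) -> bool:
        # allowed_ips is the comma-separated list stored per endpoint_id, e.g.
        # "::1,127.0.0.1" lets the local test client in, "8.8.8.8" locks it out
        allowed = {ip.strip() for ip in allowed_ips.split(",") if ip.strip()}
        return client_ip in allowed

    # connection_allowed("127.0.0.1", "::1,127.0.0.1")  -> True   ("generic-project")
    # connection_allowed("127.0.0.1", "8.8.8.8")        -> False  ("private-project" is rejected
    #                                  with "This IP address is not allowed to connect")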
+@pytest.mark.asyncio +async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: VanillaPostgres): + # Shouldn't be able to connect to this project + vanilla_pg.safe_psql( + f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('private-project', '8.8.8.8')" + ) + # Should be able to connect to this project + vanilla_pg.safe_psql( + f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('generic-project', '::1,127.0.0.1')" + ) + + def check_cannot_connect(**kwargs): + with pytest.raises(psycopg2.Error) as exprinfo: + static_proxy.safe_psql(**kwargs) + text = str(exprinfo.value).strip() + assert "This IP address is not allowed to connect" in text + + # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) + check_cannot_connect(query="select 1", sslsni=0, options="project=private-project") + + # no SNI, new `options=endpoint` syntax + check_cannot_connect(query="select 1", sslsni=0, options="endpoint=private-project") + + # with SNI + check_cannot_connect(query="select 1", host="private-project.localtest.me") + + # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) + out = static_proxy.safe_psql(query="select 1", sslsni=0, options="project=generic-project") + assert out[0][0] == 1 + + # no SNI, new `options=endpoint` syntax + out = static_proxy.safe_psql(query="select 1", sslsni=0, options="endpoint=generic-project") + assert out[0][0] == 1 + + # with SNI + out = static_proxy.safe_psql(query="select 1", host="generic-project.localtest.me") + assert out[0][0] == 1 + + +@pytest.mark.asyncio +async def test_proxy_http_allowed_ips(static_proxy: NeonProxy, vanilla_pg: VanillaPostgres): + static_proxy.safe_psql("create user http_auth with password 'http' superuser") + + # Shouldn't be able to connect to this project + vanilla_pg.safe_psql( + f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('proxy', '8.8.8.8')" + ) + + def query(status: int, query: str, *args): + static_proxy.http_query( + query, + args, + user="http_auth", + password="http", + expected_code=status, + ) + + query(400, "select 1;") # ip address is not allowed + # Should be able to connect to this project + vanilla_pg.safe_psql( + f"UPDATE {TABLE_NAME} SET allowed_ips = '8.8.8.8,127.0.0.1' WHERE endpoint_id = 'proxy'" + ) + query(200, "select 1;") # should work now diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 31bc97703e5f..9c2bb2db115e 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -588,6 +588,7 @@ def assert_compacted_and_uploads_queued(): env.pageserver.allowed_errors.extend( [ ".* ERROR .*Error processing HTTP request: InternalServerError\\(The timeline or pageserver is shutting down", + ".* ERROR .*queue is in state Stopped.*", ".* ERROR .*[Cc]ould not flush frozen layer.*", ] ) @@ -602,7 +603,12 @@ def assert_compacted_and_uploads_queued(): assert isinstance(env.pageserver_remote_storage, LocalFsStorage) remote_timeline_path = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) - assert not list(remote_timeline_path.iterdir()) + filtered = [ + path + for path in remote_timeline_path.iterdir() + if not (path.name.endswith("initdb.tar.zst")) + ] + assert len(filtered) == 0 # timeline deletion should kill ongoing uploads, so, the metric will be gone assert get_queued_count(file_kind="index", op_kind="upload") is None @@ -763,9 +769,7 @@ def test_compaction_waits_for_upload( 
neon_env_builder: NeonEnvBuilder, ): """ - Compaction waits for outstanding uploads to complete, so that it avoids deleting layers - files that have not yet been uploaded. This test forces a race between upload and - compaction. + This test forces a race between upload and compaction. """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) @@ -784,6 +788,16 @@ def test_compaction_waits_for_upload( timeline_id = env.initial_timeline client = env.pageserver.http_client() + layers_at_creation = client.layer_map_info(tenant_id, timeline_id) + deltas_at_creation = len(layers_at_creation.delta_layers()) + assert ( + deltas_at_creation == 1 + ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle" + + # Make new layer uploads get stuck. + # Note that timeline creation waits for the initial layers to reach remote storage. + # So at this point, the `layers_at_creation` are in remote storage. + client.configure_failpoints(("before-upload-layer-pausable", "pause")) with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: # Build two tables with some data inside @@ -791,85 +805,71 @@ def test_compaction_waits_for_upload( wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) client.timeline_checkpoint(tenant_id, timeline_id) + deltas_at_first = len(client.layer_map_info(tenant_id, timeline_id).delta_layers()) + assert ( + deltas_at_first == 2 + ), "are you fixing #5863? just add one more checkpoint after 'CREATE TABLE bar ...' statement." endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)") + endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1") wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - # Now make the flushing hang and update one small piece of data - client.configure_failpoints(("before-upload-layer-pausable", "pause")) + layers_before_last_checkpoint = client.layer_map_info(tenant_id, timeline_id).historic_by_name() + upload_stuck_layers = layers_before_last_checkpoint - layers_at_creation.historic_by_name() - endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1") + assert len(upload_stuck_layers) > 0 - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + for name in upload_stuck_layers: + path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name + assert path.exists(), "while uploads are stuck the layers should be present on disk" - checkpoint_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue() - compact_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue() - compact_barrier = threading.Barrier(2) + # now this will do the L0 => L1 compaction and want to remove + # upload_stuck_layers and the original initdb L0 + client.timeline_checkpoint(tenant_id, timeline_id) - def checkpoint_in_background(): - try: - log.info("Checkpoint starting") - client.timeline_checkpoint(tenant_id, timeline_id) - log.info("Checkpoint complete") - checkpoint_result.put(None) - except PageserverApiException as e: - log.info("Checkpoint errored: {e}") - checkpoint_result.put(e) + # as uploads are paused, the the upload_stuck_layers should still be with us + for name in upload_stuck_layers: + path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name + assert path.exists(), "uploads are stuck still over compaction" - def compact_in_background(): - compact_barrier.wait() - try: - log.info("Compaction starting") - client.timeline_compact(tenant_id, timeline_id) - log.info("Compaction complete") - compact_result.put(None) - 
except PageserverApiException as e: - log.info("Compaction errored: {e}") - compact_result.put(e) - - checkpoint_thread = threading.Thread(target=checkpoint_in_background) - checkpoint_thread.start() + compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name() + overlap = compacted_layers.intersection(upload_stuck_layers) + assert len(overlap) == 0, "none of the L0's should remain after L0 => L1 compaction" + assert ( + len(compacted_layers) == 1 + ), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)" - compact_thread = threading.Thread(target=compact_in_background) - compact_thread.start() + def layer_deletes_completed(): + m = client.get_metric_value("pageserver_layer_gcs_count_total", {"state": "completed"}) + if m is None: + return 0 + return int(m) - try: - # Start the checkpoint, see that it blocks - log.info("Waiting to see checkpoint hang...") - time.sleep(5) - assert checkpoint_result.empty() - - # Start the compaction, see that it finds work to do but blocks - compact_barrier.wait() - log.info("Waiting to see compaction hang...") - time.sleep(5) - assert compact_result.empty() - - # This is logged once compaction is started, but before we wait for operations to complete - assert env.pageserver.log_contains("compact_level0_phase1 stats available.") - - # Once we unblock uploads the compaction should complete successfully - log.info("Disabling failpoint") - client.configure_failpoints(("before-upload-layer-pausable", "off")) - log.info("Awaiting compaction result") - assert compact_result.get(timeout=10) is None - log.info("Awaiting checkpoint result") - assert checkpoint_result.get(timeout=10) is None - - except Exception: - # Log the actual failure's backtrace here, before we proceed to join threads - log.exception("Failure, cleaning up...") - raise - finally: - compact_barrier.abort() + # if initdb created an initial delta layer, it might already be gc'd + # because it was uploaded before the failpoint was enabled. however, the + # deletion is not guaranteed to be complete. + assert layer_deletes_completed() <= 1 - checkpoint_thread.join() - compact_thread.join() + client.configure_failpoints(("before-upload-layer-pausable", "off")) # Ensure that this actually terminates wait_upload_queue_empty(client, tenant_id, timeline_id) - # We should not have hit the error handling path in uploads where the remote file is gone + def until_layer_deletes_completed(): + deletes = layer_deletes_completed() + log.info(f"layer_deletes: {deletes}") + # ensure that initdb delta layer AND the previously stuck are now deleted + assert deletes >= len(upload_stuck_layers) + 1 + + wait_until(10, 1, until_layer_deletes_completed) + + for name in upload_stuck_layers: + path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name + assert ( + not path.exists() + ), "l0 should now be removed because of L0 => L1 compaction and completed uploads" + + # We should not have hit the error handling path in uploads where a uploaded file is gone assert not env.pageserver.log_contains( "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more." 
) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 2fdcfca67153..fcc3243e817e 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -336,10 +336,15 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( ): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + # disable compaction so that it will not download the layer for repartitioning + "compaction_period": "0s" + } + ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - (tenant_id, timeline_id) = env.neon_cli.create_tenant() + (tenant_id, timeline_id) = env.initial_tenant, env.initial_timeline ps_http = env.pageserver.http_client() def get_metric(): diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 0dd1f9a29598..89c474286a03 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,3 +1,4 @@ +import concurrent.futures import enum import os import shutil @@ -284,6 +285,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( str(tenant_id), ) ), + allowed_postfix="initdb.tar.zst", ) @@ -474,4 +476,95 @@ def tenant_is_deleted(): deletion.join() -# TODO test concurrent deletions with "hang" failpoint +def test_tenant_delete_concurrent( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + """ + Validate that concurrent delete requests to the same tenant behave correctly: + exactly one should succeed. + + This is a reproducer for https://github.com/neondatabase/neon/issues/5936 + """ + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + ps_http = env.pageserver.http_client() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Populate some data + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + + CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken" + + env.pageserver.allowed_errors.extend( + [ + # lucky race with stopping from flushing a layer we fail to schedule any uploads + ".*layer flush task.+: could not flush frozen layer: update_metadata_file", + # Errors logged from our 4xx requests + f".*{CONFLICT_MESSAGE}.*", + ] + ) + + BEFORE_REMOVE_FAILPOINT = "tenant-delete-before-map-remove" + BEFORE_RUN_FAILPOINT = "tenant-delete-before-run" + + # We will let the initial delete run until right before it would remove + # the tenant's TenantSlot. This pauses it in a state where the tenant + # is visible in Stopping state, and concurrent requests should fail with 4xx. 
+ ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "pause")) + + def delete_tenant(): + return ps_http.tenant_delete(tenant_id) + + def hit_remove_failpoint(): + assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") + + def hit_run_failpoint(): + assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") + + with concurrent.futures.ThreadPoolExecutor() as executor: + background_200_req = executor.submit(delete_tenant) + assert background_200_req.result(timeout=10).status_code == 202 + + # Wait until the first request completes its work and is blocked on removing + # the TenantSlot from tenant manager. + wait_until(100, 0.1, hit_remove_failpoint) + + # Start another request: this should fail when it sees a tenant in Stopping state + with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): + ps_http.tenant_delete(tenant_id) + + # Start another background request, which will pause after acquiring a TenantSlotGuard + # but before completing. + ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "pause")) + background_4xx_req = executor.submit(delete_tenant) + wait_until(100, 0.1, hit_run_failpoint) + + # The TenantSlot is still present while the original request is hung before + # final removal + assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + + # Permit the original request to run to success + ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off")) + + # Permit the duplicate background request to run to completion and fail. + ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off")) + with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): + background_4xx_req.result(timeout=10) + + # Physical deletion should have happened + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + # Zero tenants remain (we deleted the default tenant) + assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 090d58672161..c81be4153007 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -290,10 +290,12 @@ def test_pageserver_with_empty_tenants( env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append( - ".*marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + ".*marking .* as locally complete, while it doesnt exist in remote index.*", + ".*load failed.*list timelines directory.*", + ] ) - env.pageserver.allowed_errors.append(".*load failed.*list timelines directory.*") client = env.pageserver.http_client() diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 2e1fcd38fe07..b1a2755394e3 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -308,8 +308,10 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ) timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id) + # Check local is empty - assert not timeline_dir.exists() + assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0 + # Check no delete mark present assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists() diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index eb983488231b..24cbe344572e 100644 --- a/test_runner/regress/test_timeline_size.py +++ 
b/test_runner/regress/test_timeline_size.py @@ -146,6 +146,72 @@ def wait_for_pageserver_catchup(endpoint_main: Endpoint, polling_interval=1, tim time.sleep(polling_interval) +def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup") + + wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + + endpoint_main = env.endpoints.create( + "test_timeline_size_quota_on_startup", + # Set small limit for the test + config_lines=["neon.max_cluster_size=30MB"], + ) + endpoint_main.start() + + log.info("postgres is running on 'test_timeline_size_quota_on_startup' branch") + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE foo (t text)") + + # Insert many rows. This query must fail because of space limit + try: + for _i in range(5000): + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100) g + """ + ) + + # If we get here, the timeline size limit failed + log.error("Query unexpectedly succeeded") + raise AssertionError() + + except psycopg2.errors.DiskFull as err: + log.info(f"Query expectedly failed with: {err}") + + # Restart endpoint that reached the limit to ensure that it doesn't fail on startup + # i.e. the size limit is not enforced during startup. + endpoint_main.stop() + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + # which is needed for neon.pg_cluster_size() to work + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + # ensure that the limit is enforced after startup + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + # This query must fail because of space limit + try: + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + # If we get here, the timeline size limit failed + log.error("Query unexpectedly succeeded") + raise AssertionError() + + except psycopg2.errors.DiskFull as err: + log.info(f"Query expectedly failed with: {err}") + + def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() client = env.pageserver.http_client() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 05c60eb102dd..b7eaaf39bc37 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1,6 +1,5 @@ import filecmp import os -import pathlib import random import shutil import signal @@ -639,7 +638,7 @@ class ProposerPostgres(PgProtocol): def __init__( self, pgdata_dir: str, - pg_bin, + pg_bin: PgBin, tenant_id: TenantId, timeline_id: TimelineId, listen_addr: str, @@ -665,7 +664,7 @@ def config_file_path(self) -> str: def create_dir_config(self, safekeepers: str): """Create dir and config for running --sync-safekeepers""" - pathlib.Path(self.pg_data_dir_path()).mkdir(exist_ok=True) + Path(self.pg_data_dir_path()).mkdir(exist_ok=True) with open(self.config_file_path(), "w") as f: cfg = [ "synchronous_standby_names = 'walproposer'\n", @@ -691,7 +690,7 @@ def sync_safekeepers(self) -> Lsn: "PGDATA": self.pg_data_dir_path(), } - basepath = self.pg_bin.run_capture(command, env) + basepath = self.pg_bin.run_capture(command, env, with_command_header=False) log.info(f"postgres --sync-safekeepers 
output: {basepath}") diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index b039b3625578..4a9ffeee4b70 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -1,6 +1,7 @@ import sys import tarfile import tempfile +import time from pathlib import Path import pytest @@ -125,3 +126,43 @@ def test_wal_restore_initdb( ) log.info(f"original lsn: {original_lsn}, restored lsn: {restored_lsn}") assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] + + +def test_wal_restore_http( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, +): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql("create table t as select generate_series(1,300000)") + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps_client = env.pageserver.http_client() + + # shut down the endpoint and delete the timeline from the pageserver + endpoint.stop() + + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + test_output_dir / "initdb.tar.zst" + + (env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / "initdb.tar.zst") + + ps_client.timeline_delete(tenant_id, timeline_id) + time.sleep(2) + + # verify that it is indeed deleted + # TODO + + # issue the restoration command + ps_client.timeline_create( + tenant_id=tenant_id, + new_timeline_id=timeline_id, + existing_initdb_timeline_id=timeline_id, + pg_version=env.pg_version, + ) + + # the table is back now! + restored = env.endpoints.create_start("main") + assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 763000f1d087..e3a22b729220 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 763000f1d0873b827829c41f2f6f799ffc0de55c +Subproject commit e3a22b72922055f9212eca12700190f118578362 diff --git a/vendor/revisions.json b/vendor/revisions.json index 377357e13178..c4cea208eeb0 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "763000f1d0873b827829c41f2f6f799ffc0de55c", + "postgres-v16": "e3a22b72922055f9212eca12700190f118578362", "postgres-v15": "bc88f539312fcc4bb292ce94ae9db09ab6656e8a", "postgres-v14": "dd067cf656f6810a25aca6025633d32d02c5085a" } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 2aa935fac675..6f0ebe5f665b 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -13,6 +13,10 @@ commands: user: nobody sysvInitAction: respawn shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter' + - name: sql-exporter + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: @@ -46,6 +50,77 @@ files: } memory {} } + - filename: sql_exporter.yml + content: | + # Configuration for sql_exporter + # Global defaults. + global: + # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: 10s + # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: 500ms + # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. 
+ min_interval: 0s + # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + # as will concurrent scrapes. + max_connections: 1 + # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + # always be the same as max_connections. + max_idle_connections: 1 + # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + # If 0, connections are not closed due to a connection's age. + max_connection_lifetime: 5m + + # The target to monitor and the collectors to execute on it. + target: + # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + # the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable' + + # Collectors (referenced by name) to execute on the target. + # Glob patterns are supported (see for syntax). + collectors: [neon_collector] + + # Collector files specifies a list of globs. One collector definition is read from each matching file. + # Glob patterns are supported (see for syntax). + collector_files: + - "neon_collector.yml" + - filename: neon_collector.yml + content: | + collector_name: neon_collector + metrics: + - metric_name: lfc_misses + type: gauge + help: 'lfc_misses' + key_labels: + values: [lfc_misses] + query: | + select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; + + - metric_name: lfc_used + type: gauge + help: 'lfc_used' + key_labels: + values: [lfc_used] + query: | + select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; + + - metric_name: lfc_hits + type: gauge + help: 'lfc_hits' + key_labels: + values: [lfc_hits] + query: | + select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; + + - metric_name: lfc_writes + type: gauge + help: 'lfc_writes' + key_labels: + values: [lfc_writes] + query: | + select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + build: | # Build cgroup-tools # @@ -82,17 +157,20 @@ build: | FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter + FROM burningalchemist/sql_exporter:0.13 AS sql-exporter + # Build pgbouncer # FROM debian:bullseye-slim AS pgbouncer RUN set -e \ && apt-get update \ && apt-get install -y \ - curl \ build-essential \ - pkg-config \ + curl \ libevent-dev \ - libssl-dev + libssl-dev \ + patchutils \ + pkg-config ENV PGBOUNCER_VERSION 1.21.0 ENV PGBOUNCER_GITPATH 1_21_0 @@ -100,6 +178,7 @@ build: | && curl -sfSL https://github.com/pgbouncer/pgbouncer/releases/download/pgbouncer_${PGBOUNCER_GITPATH}/pgbouncer-${PGBOUNCER_VERSION}.tar.gz -o pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ && tar xzvf pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ && cd pgbouncer-${PGBOUNCER_VERSION} \ + && curl https://github.com/pgbouncer/pgbouncer/commit/a7b3c0a5f4caa9dbe92743d04cf1e28c4c05806c.patch | filterdiff --include a/src/server.c | patch -p1 \ && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ && make -j $(nproc) \ && make install @@ -114,13 +193,19 @@ merge: | COPY cgconfig.conf /etc/cgconfig.conf COPY pgbouncer.ini /etc/pgbouncer.ini + COPY sql_exporter.yml /etc/sql_exporter.yml + COPY neon_collector.yml /etc/neon_collector.yml + RUN set -e \ && chown postgres:postgres /etc/pgbouncer.ini \ && chmod 0644 
/etc/pgbouncer.ini \ - && chmod 0644 /etc/cgconfig.conf + && chmod 0644 /etc/cgconfig.conf \ + && chmod 0644 /etc/sql_exporter.yml \ + && chmod 0644 /etc/neon_collector.yml COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter + COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 66828fb53d39..3e46731adf8e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -14,12 +14,16 @@ publish = false ### BEGIN HAKARI SECTION [dependencies] anyhow = { version = "1", features = ["backtrace"] } -aws-config = { version = "0.56", default-features = false, features = ["credentials-sso", "rustls"] } -aws-runtime = { version = "0.56", default-features = false, features = ["event-stream"] } -aws-sigv4 = { version = "0.56", features = ["sign-eventstream"] } -aws-smithy-http = { version = "0.56", default-features = false, features = ["event-stream", "rt-tokio"] } +aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] } +aws-runtime = { version = "1", default-features = false, features = ["event-stream", "sigv4a"] } +aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } +aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } +aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } +aws-smithy-runtime-api = { version = "1", features = ["client", "http-02x", "http-auth"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio"] } axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } +base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "string"] } @@ -36,6 +40,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } hex = { version = "0.4", features = ["serde"] } +hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } @@ -56,13 +61,14 @@ scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } smallvec = { version = "1", default-features = false, features = ["write"] } +subtle = { version = "2" } time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "io"] } toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } toml_edit = { version = "0.19", features = ["serde"] } -tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", 
"timeout", "util"] } +tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } tungstenite = { version = "0.20" }