diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index d6219c31b4a5..d07e3e32e8f2 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -7,10 +7,9 @@ inputs: type: boolean required: false default: false - aws_oicd_role_arn: - description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' - required: false - default: '' + aws-oicd-role-arn: + description: 'OIDC role arn to interact with S3' + required: true outputs: base-url: @@ -84,12 +83,11 @@ runs: ALLURE_VERSION: 2.27.0 ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777 - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 3c83656c8940..8548a886cf34 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -8,10 +8,9 @@ inputs: unique-key: description: 'string to distinguish different results in the same run' required: true - aws_oicd_role_arn: - description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' - required: false - default: '' + aws-oicd-role-arn: + description: 'OIDC role arn to interact with S3' + required: true runs: using: "composite" steps: @@ -36,12 +35,11 @@ runs: env: REPORT_DIR: ${{ inputs.report-dir }} - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report - name: Upload test results diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index d6b1fac9f70c..14b2ef8eace6 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -15,19 +15,17 @@ inputs: prefix: description: "S3 prefix. 
Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false - aws_oicd_role_arn: - description: "the OIDC role arn for aws auth" - required: false - default: "" + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true runs: using: "composite" steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 - name: Download artifact diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index dd5c890f5bbd..9a0261d43045 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -48,10 +48,9 @@ inputs: description: 'benchmark durations JSON' required: false default: '{}' - aws_oicd_role_arn: - description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' - required: false - default: '' + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true runs: using: "composite" @@ -62,7 +61,7 @@ runs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' @@ -71,7 +70,7 @@ runs: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Download compatibility snapshot if: inputs.build_type != 'remote' @@ -83,7 +82,7 @@ runs: # The lack of compatibility snapshot (for example, for the new Postgres version) # shouldn't fail the whole job. Only relevant test should fail. 
skip-if-does-not-exist: true - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Checkout if: inputs.needs_postgres_source == 'true' @@ -221,19 +220,19 @@ runs: # The lack of compatibility snapshot shouldn't fail the job # (for example if we didn't run the test for non build-and-test workflow) skip-if-does-not-exist: true - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report + - name: Upload test results if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-store with: report-dir: /tmp/test_output/allure/results unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }} - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml index 9e3a7cba24f2..1bbea5400fe2 100644 --- a/.github/actions/save-coverage-data/action.yml +++ b/.github/actions/save-coverage-data/action.yml @@ -14,11 +14,11 @@ runs: name: coverage-data-artifact path: /tmp/coverage skip-if-does-not-exist: true # skip if there's no previous coverage to download - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Upload coverage data uses: ./.github/actions/upload with: name: coverage-data-artifact path: /tmp/coverage - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index 6616d0889933..ac5579ccea61 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -14,7 +14,7 @@ inputs: prefix: description: "S3 prefix. 
Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false - aws_oicd_role_arn: + aws-oicd-role-arn: description: "the OIDC role arn for aws auth" required: false default: "" @@ -61,7 +61,7 @@ runs: uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 - name: Upload artifact diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index 371d815fc8a6..fd328586b3c0 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -70,7 +70,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # we create a table that has one row for each database that we want to restore with the status whether the restore is done - name: Create benchmark_restore_status table if it does not exist diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 456399f3c360..4263bacce8d1 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -264,7 +264,7 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact path: /tmp/neon - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data @@ -308,7 +308,7 @@ jobs: real_s3_region: eu-central-1 rerun_failed: true pg_version: ${{ matrix.pg_version }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 2d37be883723..ab0f2a615517 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -105,7 +105,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -123,7 +123,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests @@ -153,7 +153,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -205,7 +205,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set @@ -216,7 +216,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -233,7 +233,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -245,7 +245,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -308,6 +308,7 @@ jobs: "image": [ "'"$image_default"'" ], "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, @@ -407,10 +408,10 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project - if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) + if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-new-many-tables", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", 
"neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -429,7 +430,7 @@ jobs: neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) + neonvm-captest-new | neonvm-captest-new-many-tables | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -446,6 +447,26 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB + # without (neonvm-captest-new) + # and with (neonvm-captest-new-many-tables) many relations in the database + - name: Create many relations before the run + if: contains(fromJson('["neonvm-captest-new-many-tables"]'), matrix.platform) + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations + pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + TEST_NUM_RELATIONS: 10000 + - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -455,7 +476,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -470,7 +491,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -485,7 +506,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -503,7 +524,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -614,7 +635,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ 
secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -629,7 +650,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -640,7 +661,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -711,7 +732,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -743,7 +764,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 43200 -k test_clickbench pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -757,7 +778,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -822,7 +843,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get Connstring Secret Name run: | @@ -861,7 +882,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -873,7 +894,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -931,7 +952,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -963,7 +984,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -974,7 +995,7 @@ jobs: if: ${{ !cancelled() }} uses: 
./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a3943cba9127..12b1ac98ac87 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -212,7 +212,7 @@ jobs: fi echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - name: Run cargo clippy (debug) - run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS + run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS - name: Check documentation generation run: cargo doc --workspace --no-deps --document-private-items @@ -303,6 +303,11 @@ jobs: benchmarks: if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -331,6 +336,7 @@ jobs: extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} pg_version: v16 + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -343,6 +349,11 @@ jobs: report-benchmarks-failures: needs: [ benchmarks, create-test-report ] if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write runs-on: ubuntu-22.04 steps: @@ -383,7 +394,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -445,14 +456,14 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get coverage artifact uses: ./.github/actions/download with: name: coverage-data-artifact path: /tmp/coverage - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge @@ -1024,6 +1035,11 @@ jobs: trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] runs-on: ubuntu-22.04 + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write steps: - name: Set PR's status to pending and request a remote CI test run: | @@ -1264,6 +1280,12 @@ jobs: echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT} echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT} + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + 
role-duration-seconds: 3600 + - name: Promote compatibility snapshot and Neon artifact env: BUCKET: neon-github-public-dev diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 2fc26baa212a..09d6acd32561 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -21,6 +21,8 @@ concurrency: permissions: id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write jobs: regress: @@ -79,7 +81,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create a new branch id: create-branch @@ -95,10 +97,12 @@ jobs: test_selection: cloud_regress pg_version: ${{matrix.pg-version}} extra_params: -m remote_cluster + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}} - name: Delete branch + if: always() uses: ./.github/actions/neon-branch-delete with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} @@ -110,7 +114,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 677303226395..fc33c0a980a6 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -13,7 +13,7 @@ on: # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 9 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually - + defaults: run: shell: bash -euxo pipefail {0} @@ -28,7 +28,7 @@ jobs: strategy: fail-fast: false # allow other variants to continue even if one fails matrix: - target_project: [new_empty_project, large_existing_project] + target_project: [new_empty_project, large_existing_project] permissions: contents: write statuses: write @@ -56,7 +56,7 @@ jobs: with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role - name: Download Neon artifact uses: ./.github/actions/download @@ -64,7 +64,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: ${{ matrix.target_project == 'new_empty_project' }} @@ -95,7 +95,7 @@ jobs: project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - name: Initialize Neon project + - name: Initialize Neon project if: ${{ matrix.target_project == 'large_existing_project' }} env: BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }} @@ -123,7 +123,7 @@ jobs: ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" echo "BENCHMARK_INGEST_TARGET_CONNSTR=${BENCHMARK_INGEST_TARGET_CONNSTR}" >> $GITHUB_ENV - - name: Invoke pgcopydb + - name: Invoke pgcopydb uses: ./.github/actions/run-python-test-set with: build_type: remote @@ -132,7 +132,7 @@ 
jobs: extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb pg_version: v16 save_perf_report: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} TARGET_PROJECT_TYPE: ${{ matrix.target_project }} @@ -144,7 +144,7 @@ jobs: run: | export LD_LIBRARY_PATH=${PG_16_LIB_PATH} ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+" - + - name: Delete Neon Project if: ${{ always() && matrix.target_project == 'new_empty_project' }} uses: ./.github/actions/neon-project-delete diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 9f5a16feca6e..af877029e49a 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -21,15 +21,17 @@ defaults: run: shell: bash -euo pipefail {0} -permissions: - id-token: write # aws-actions/configure-aws-credentials - concurrency: group: ${{ github.workflow }} cancel-in-progress: false jobs: trigger_bench_on_ec2_machine_in_eu_central_1: + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write runs-on: [ self-hosted, small ] container: image: neondatabase/build-tools:pinned-bookworm @@ -135,7 +137,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 5c999d3810c6..4947907eb068 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -96,7 +96,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -113,6 +113,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -129,7 +130,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -163,7 +164,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -180,6 +181,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -196,7 +198,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git 
a/Cargo.lock b/Cargo.lock index e2d5e03613b1..420def152d0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,9 +10,9 @@ checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" [[package]] name = "addr2line" -version = "0.21.0" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] @@ -23,6 +23,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "ahash" version = "0.8.11" @@ -871,17 +877,17 @@ dependencies = [ [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" dependencies = [ "addr2line", - "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.8.0", "object", "rustc-demangle", + "windows-targets 0.52.6", ] [[package]] @@ -1127,7 +1133,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -1268,6 +1274,7 @@ dependencies = [ "chrono", "clap", "compute_api", + "fail", "flate2", "futures", "hyper 0.14.30", @@ -2107,7 +2114,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.7.1", ] [[package]] @@ -2308,9 +2315,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.1" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "git-version" @@ -3404,6 +3411,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "0.8.11" @@ -3638,9 +3654,9 @@ dependencies = [ [[package]] name = "object" -version = "0.32.2" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" dependencies = [ "memchr", ] @@ -4401,11 +4417,13 @@ dependencies = [ "bindgen", "bytes", "crc32c", + "criterion", "env_logger", "log", "memoffset 0.9.0", "once_cell", "postgres", + "pprof", "regex", "serde", "thiserror", @@ -5062,6 +5080,7 @@ dependencies = [ "once_cell", "pin-project-lite", "rand 0.8.5", + "reqwest", "scopeguard", "serde", "serde_json", @@ -5320,9 +5339,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" @@ -5535,6 +5554,7 @@ dependencies = [ "remote_storage", "reqwest", "safekeeper_api", + "safekeeper_client", "scopeguard", "sd-notify", "serde", @@ -5565,10 +5585,25 @@ name = "safekeeper_api" version = "0.1.0" dependencies = [ "const_format", + "postgres_ffi", + "pq_proto", "serde", + "tokio", "utils", ] +[[package]] +name = "safekeeper_client" +version = "0.1.0" +dependencies = [ + "reqwest", + "safekeeper_api", + "serde", + "thiserror", + "utils", + "workspace_hack", +] + [[package]] name = "same-file" version = "1.0.6" @@ -7200,6 +7235,7 @@ dependencies = [ "anyhow", "arc-swap", "async-compression", + "backtrace", "bincode", "byteorder", "bytes", @@ -7210,12 +7246,14 @@ dependencies = [ "criterion", "diatomic-waker", "fail", + "flate2", "futures", "git-version", "hex", "hex-literal", "humantime", "hyper 0.14.30", + "itertools 0.10.5", "jemalloc_pprof", "jsonwebtoken", "metrics", @@ -7572,7 +7610,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ "windows-core", - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -7581,7 +7619,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -7599,7 +7637,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -7619,17 +7657,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -7640,9 +7679,9 @@ checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" @@ -7652,9 +7691,9 @@ checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = 
"09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" @@ -7664,9 +7703,15 @@ checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" @@ -7676,9 +7721,9 @@ checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" @@ -7688,9 +7733,9 @@ checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" @@ -7700,9 +7745,9 @@ checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" @@ -7712,9 +7757,9 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" diff --git a/Cargo.toml b/Cargo.toml index 0654c25a3d67..885f02ba8190 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ members = [ "pageserver/pagebench", "proxy", "safekeeper", + "safekeeper/client", "storage_broker", "storage_controller", "storage_controller/client", @@ -51,6 +52,7 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" +backtrace = "0.3.74" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" @@ -233,6 +235,7 @@ postgres_initdb = { path = "./libs/postgres_initdb" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } +safekeeper_client = { path = "./safekeeper/client" } desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside 
the binary crate, so linking with the library shouldn't be heavy. storage_controller_client = { path = "./storage_controller/client" } diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 33d2a1028521..06aaf9e7f423 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -35,10 +35,12 @@ RUN case $DEBIAN_VERSION in \ ;; \ esac && \ apt update && \ - apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \ - $VERSION_INSTALLS + $VERSION_INSTALLS \ + && apt clean && rm -rf /var/lib/apt/lists/* ######################################################################################### # @@ -113,10 +115,12 @@ ARG DEBIAN_VERSION ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install --no-install-recommends -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ + apt install --no-install-recommends --no-install-suggests -y \ + gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ - protobuf-c-compiler xsltproc + protobuf-c-compiler xsltproc \ + && apt clean && rm -rf /var/lib/apt/lists/* # Postgis 3.5.0 requires SFCGAL 1.4+ @@ -143,9 +147,9 @@ RUN case "${DEBIAN_VERSION}" in \ wget https://gitlab.com/sfcgal/SFCGAL/-/archive/v${SFCGAL_VERSION}/SFCGAL-v${SFCGAL_VERSION}.tar.gz -O SFCGAL.tar.gz && \ echo "${SFCGAL_CHECKSUM} SFCGAL.tar.gz" | sha256sum --check && \ mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ - cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ - make clean && cp -R /sfcgal/* / + cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ + ninja clean && cp -R /sfcgal/* / ENV PATH="/usr/local/pgsql/bin:$PATH" @@ -213,9 +217,9 @@ RUN case "${PG_VERSION}" in \ echo "${PGROUTING_CHECKSUM} pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ - cmake -DCMAKE_BUILD_TYPE=Release .. && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. 
&& \ + ninja -j $(getconf _NPROCESSORS_ONLN) && \ + ninja -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \ @@ -235,7 +239,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch RUN apt update && \ - apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build python3-dev libncurses5 binutils clang \ + && apt clean && rm -rf /var/lib/apt/lists/* # plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 @@ -301,9 +307,10 @@ RUN mkdir -p /h3/usr/ && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/h3 make install && \ + cmake .. -GNinja -DBUILD_BENCHMARKS=0 -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_FUZZERS=0 -DBUILD_FILTERS=0 -DBUILD_GENERATORS=0 -DBUILD_TESTING=0 \ + && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 ninja install && \ cp -R /h3/usr / && \ rm -rf build @@ -650,14 +657,15 @@ FROM build-deps AS rdkit-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ - apt-get install --no-install-recommends -y \ +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ libeigen3-dev \ - libboost-all-dev + libboost-all-dev \ + && apt clean && rm -rf /var/lib/apt/lists/* # rdkit Release_2024_09_1 supports v17 # last release Release_2024_09_1 - Sep 27, 2024 @@ -693,6 +701,8 @@ RUN case "${PG_VERSION}" in \ -D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \ -D RDK_BUILD_YAEHMOP_SUPPORT=OFF \ -D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \ + -D RDK_TEST_MULTITHREADED=OFF \ + -D RDK_BUILD_CPP_TESTS=OFF \ -D RDK_USE_URF=OFF \ -D RDK_BUILD_PGSQL=ON \ -D RDK_PGSQL_STATIC=ON \ @@ -704,9 +714,10 @@ RUN case "${PG_VERSION}" in \ -D RDK_INSTALL_COMIC_FONTS=OFF \ -D RDK_BUILD_FREETYPE_SUPPORT=OFF \ -D CMAKE_BUILD_TYPE=Release \ + -GNinja \ . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + ninja -j $(getconf _NPROCESSORS_ONLN) && \ + ninja -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control ######################################################################################### @@ -849,8 +860,9 @@ FROM build-deps AS rust-extensions-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ - apt-get install --no-install-recommends -y curl libclang-dev && \ +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ + apt clean && rm -rf /var/lib/apt/lists/* && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot @@ -885,8 +897,9 @@ FROM build-deps AS rust-extensions-build-pgrx12 ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ - apt-get install --no-install-recommends -y curl libclang-dev && \ +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ + apt clean && rm -rf /var/lib/apt/lists/* && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot @@ -914,18 +927,22 @@ FROM rust-extensions-build-pgrx12 AS pg-onnx-build # cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25). # Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise -RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \ +RUN apt update && apt install --no-install-recommends --no-install-suggests -y \ + python3 python3-pip python3-venv && \ + apt clean && rm -rf /var/lib/apt/lists/* && \ python3 -m venv venv && \ . venv/bin/activate && \ python3 -m pip install cmake==3.30.5 && \ wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ - ./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root + ./build.sh --config Release --parallel --cmake_generator Ninja \ + --skip_submodule_sync --skip_tests --allow_running_as_root FROM pg-onnx-build AS pgrag-pg-build -RUN apt-get install -y protobuf-compiler && \ +RUN apt update && apt install --no-install-recommends --no-install-suggests -y protobuf-compiler \ + && apt clean && rm -rf /var/lib/apt/lists/* && \ wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . 
&& \ @@ -1168,6 +1185,25 @@ RUN case "${PG_VERSION}" in \ make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control +######################################################################################### +# +# Layer "pg_repack" +# compile pg_repack extension +# +######################################################################################### + +FROM build-deps AS pg-repack-build +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH="/usr/local/pgsql/bin/:$PATH" + +RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ + echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ + mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -1213,6 +1249,7 @@ COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-repack-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -1248,7 +1285,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ ######################################################################################### # -# Compile and run the Neon-specific `compute_ctl` and `fast_import` binaries +# Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries # ######################################################################################### FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools @@ -1258,7 +1295,7 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
-RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto +RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy ######################################################################################### # @@ -1279,8 +1316,8 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_ FROM debian:$DEBIAN_FLAVOR AS pgbouncer RUN set -e \ - && apt-get update \ - && apt-get install --no-install-recommends -y \ + && apt update \ + && apt install --no-install-suggests --no-install-recommends -y \ build-essential \ git \ ca-certificates \ @@ -1288,7 +1325,8 @@ RUN set -e \ automake \ libevent-dev \ libtool \ - pkg-config + pkg-config \ + && apt clean && rm -rf /var/lib/apt/lists/* # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) ENV PGBOUNCER_TAG=pgbouncer_1_22_1 @@ -1300,20 +1338,6 @@ RUN set -e \ && make -j $(nproc) dist_man_MANS= \ && make install dist_man_MANS= -######################################################################################### -# -# Compile the Neon-specific `local_proxy` binary -# -######################################################################################### -FROM $REPOSITORY/$IMAGE:$TAG AS local_proxy -ARG BUILD_TAG -ENV BUILD_TAG=$BUILD_TAG - -USER nonroot -# Copy entire project to get Cargo.* files with proper dependencies for the whole project -COPY --chown=nonroot . . -RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin local_proxy - ######################################################################################### # # Layers "postgres-exporter" and "sql-exporter" @@ -1453,7 +1477,7 @@ COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/ COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini # local_proxy and its config -COPY --from=local_proxy --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy # Metrics exporter binaries and configuration files @@ -1518,28 +1542,30 @@ RUN apt update && \ locales \ procps \ ca-certificates \ + curl \ + unzip \ $VERSION_INSTALLS && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -# s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2 -# used by fast_import +# aws cli is used by fast_import (curl and unzip above are at this time only used for this installation step) ARG TARGETARCH -ADD https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_linux_$TARGETARCH.deb /tmp/s5cmd.deb RUN set -ex; \ - \ - # Determine the expected checksum based on TARGETARCH if [ "${TARGETARCH}" = "amd64" ]; then \ - CHECKSUM="392c385320cd5ffa435759a95af77c215553d967e4b1c0fffe52e4f14c29cf85"; \ + TARGETARCH_ALT="x86_64"; \ + CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ elif [ "${TARGETARCH}" = "arm64" ]; then \ - CHECKSUM="939bee3cf4b5604ddb00e67f8c157b91d7c7a5b553d1fbb6890fad32894b7b46"; \ + TARGETARCH_ALT="aarch64"; \ + CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ else \ echo "Unsupported architecture: 
${TARGETARCH}"; exit 1; \ fi; \ - \ - # Compute and validate the checksum - echo "${CHECKSUM} /tmp/s5cmd.deb" | sha256sum -c - -RUN dpkg -i /tmp/s5cmd.deb && rm /tmp/s5cmd.deb + curl -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ + echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ + unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ + /tmp/awscliv2/aws/install; \ + rm -rf /tmp/awscliv2.zip /tmp/awscliv2; \ + true ENV LANG=en_US.utf8 USER postgres diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index aa6cc1cfc8a9..f8f4cab63ba2 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -3,7 +3,7 @@ metrics: [ import 'sql_exporter/checkpoints_req.libsonnet', import 'sql_exporter/checkpoints_timed.libsonnet', - import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet', + import 'sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet', import 'sql_exporter/compute_current_lsn.libsonnet', import 'sql_exporter/compute_logical_snapshot_files.libsonnet', import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet', diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet similarity index 61% rename from compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet rename to compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet index 02c803cfa6e6..31725bd179af 100644 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet @@ -1,10 +1,10 @@ { - metric_name: 'compute_backpressure_throttling_seconds', - type: 'gauge', + metric_name: 'compute_backpressure_throttling_seconds_total', + type: 'counter', help: 'Time compute has spent throttled', key_labels: null, values: [ 'throttled', ], - query: importstr 'sql_exporter/compute_backpressure_throttling_seconds.sql', + query: importstr 'sql_exporter/compute_backpressure_throttling_seconds_total.sql', } diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql similarity index 100% rename from compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql rename to compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql diff --git a/compute/patches/cloud_regress_pg16.patch b/compute/patches/cloud_regress_pg16.patch index a4b93d0260a3..3f0bb84ae737 100644 --- a/compute/patches/cloud_regress_pg16.patch +++ b/compute/patches/cloud_regress_pg16.patch @@ -981,7 +981,7 @@ index fc42d418bf..e38f517574 100644 CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out -index 8475231735..1afae5395f 100644 +index 8475231735..0653946337 100644 --- a/src/test/regress/expected/password.out +++ b/src/test/regress/expected/password.out @@ -12,11 +12,11 @@ SET password_encryption = 'md5'; -- ok @@ -1006,65 +1006,63 @@ index 8475231735..1afae5395f 100644 -----------------+--------------------------------------------------- - regress_passwd1 | md5783277baca28003b33453252be4dbb34 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3 -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ 
regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: - regress_passwd4 | + regress_passwd4 | SCRAM-SHA-256$4096:$: (4 rows) -- Rename a role -@@ -54,24 +54,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -54,24 +54,16 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; -- passwords. SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +--- already encrypted with MD5, use as it is +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. +--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- This looks like a valid SCRAM-SHA-256 secret, but it is not - -- so it should be hashed with SCRAM-SHA-256. - CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- These may look like valid MD5 secrets, but they are not, so they - -- should be hashed with SCRAM-SHA-256. 
- -- trailing garbage at the end - CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- invalid length - CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount'; -@@ -81,63 +87,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -81,11 +73,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: regress_passwd4 | SCRAM-SHA-256$4096:$: - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023 -- regress_passwd6 | SCRAM-SHA-256$4096:$: -- regress_passwd7 | SCRAM-SHA-256$4096:$: -- regress_passwd8 | SCRAM-SHA-256$4096:$: - regress_passwd9 | SCRAM-SHA-256$1024:$: --(9 rows) -+(5 rows) - ++ regress_passwd5 | SCRAM-SHA-256$4096:$: + regress_passwd6 | SCRAM-SHA-256$4096:$: + regress_passwd7 | SCRAM-SHA-256$4096:$: + regress_passwd8 | SCRAM-SHA-256$4096:$: +@@ -95,23 +87,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ -- An empty password is not allowed, in any form CREATE ROLE regress_passwd_empty PASSWORD ''; NOTICE: empty string is not a valid password, clearing password @@ -1082,56 +1080,37 @@ index 8475231735..1afae5395f 100644 -(1 row) +(0 rows) - -- Test with invalid stored and server keys. - -- - -- The first is valid, to act as a control. The others have too long - -- stored/server keys. They will be re-hashed. 
- CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed - FROM pg_authid - WHERE rolname LIKE 'regress_passwd_sha_len%' +@@ -120,7 +109,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw ORDER BY rolname; -- rolname | is_rolpassword_rehashed ---------------------------+------------------------- + rolname | is_rolpassword_rehashed + -------------------------+------------------------- - regress_passwd_sha_len0 | f -- regress_passwd_sha_len1 | t -- regress_passwd_sha_len2 | t --(3 rows) -+ rolname | is_rolpassword_rehashed -+---------+------------------------- -+(0 rows) - - DROP ROLE regress_passwd1; - DROP ROLE regress_passwd2; - DROP ROLE regress_passwd3; - DROP ROLE regress_passwd4; - DROP ROLE regress_passwd5; -+ERROR: role "regress_passwd5" does not exist - DROP ROLE regress_passwd6; -+ERROR: role "regress_passwd6" does not exist - DROP ROLE regress_passwd7; -+ERROR: role "regress_passwd7" does not exist ++ regress_passwd_sha_len0 | t + regress_passwd_sha_len1 | t + regress_passwd_sha_len2 | t + (3 rows) +@@ -135,6 +124,7 @@ DROP ROLE regress_passwd7; DROP ROLE regress_passwd8; -+ERROR: role "regress_passwd8" does not exist DROP ROLE regress_passwd9; DROP ROLE regress_passwd_empty; +ERROR: role "regress_passwd_empty" does not exist DROP ROLE regress_passwd_sha_len0; -+ERROR: role "regress_passwd_sha_len0" does not exist DROP ROLE regress_passwd_sha_len1; -+ERROR: role "regress_passwd_sha_len1" does not exist DROP ROLE regress_passwd_sha_len2; -+ERROR: role "regress_passwd_sha_len2" does not exist - -- all entries should have been removed - SELECT rolname, rolpassword - FROM pg_authid diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 5b9dba7b32..cc408dad42 100644 --- a/src/test/regress/expected/privileges.out @@ -3194,7 +3173,7 @@ index 1a6c61f49d..1c31ac6a53 100644 -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql -index 53e86b0b6c..f07cf1ec54 100644 +index 53e86b0b6c..0303fdfe96 100644 --- a/src/test/regress/sql/password.sql +++ b/src/test/regress/sql/password.sql @@ -10,11 +10,11 @@ SET password_encryption = 'scram-sha-256'; -- ok @@ -3213,23 +3192,59 @@ index 53e86b0b6c..f07cf1ec54 100644 -- check list of created entries -- -@@ -42,14 +42,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -42,26 +42,18 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +--- already encrypted with MD5, use as it is +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +ALTER ROLE 
regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; + +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. +--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Changing the SCRAM iteration count + SET scram_iterations = 1024; +@@ -78,13 +70,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; + ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; + SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Check that the invalid secrets were re-hashed. A re-hashed secret + -- should not contain the original salt. 
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 249df17a58..b258e7f26a 100644 --- a/src/test/regress/sql/privileges.sql diff --git a/compute/patches/cloud_regress_pg17.patch b/compute/patches/cloud_regress_pg17.patch index cbe84ef54be7..e57447a2c6ee 100644 --- a/compute/patches/cloud_regress_pg17.patch +++ b/compute/patches/cloud_regress_pg17.patch @@ -1014,10 +1014,10 @@ index fc42d418bf..e38f517574 100644 CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out -index 924d6e001d..5966531db6 100644 +index 924d6e001d..7fdda73439 100644 --- a/src/test/regress/expected/password.out +++ b/src/test/regress/expected/password.out -@@ -12,13 +12,13 @@ SET password_encryption = 'md5'; -- ok +@@ -12,13 +12,11 @@ SET password_encryption = 'md5'; -- ok SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 'md5'; @@ -1026,9 +1026,7 @@ index 924d6e001d..5966531db6 100644 -CREATE ROLE regress_passwd2; -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; @@ -1037,71 +1035,69 @@ index 924d6e001d..5966531db6 100644 -- check list of created entries -- -- The scram secret will look something like: -@@ -32,10 +32,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -32,10 +30,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5783277baca28003b33453252be4dbb34 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3 -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: - regress_passwd4 | + regress_passwd4 | SCRAM-SHA-256$4096:$: (4 rows) -- Rename a role -@@ -56,24 +56,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -56,24 +54,17 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; -- passwords. 
SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- This looks like a valid SCRAM-SHA-256 secret, but it is not - -- so it should be hashed with SCRAM-SHA-256. - CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- These may look like valid MD5 secrets, but they are not, so they - -- should be hashed with SCRAM-SHA-256. - -- trailing garbage at the end - CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- invalid length - CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. 
+--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount'; -@@ -83,63 +89,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -83,11 +74,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: regress_passwd4 | SCRAM-SHA-256$4096:$: - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023 -- regress_passwd6 | SCRAM-SHA-256$4096:$: -- regress_passwd7 | SCRAM-SHA-256$4096:$: -- regress_passwd8 | SCRAM-SHA-256$4096:$: - regress_passwd9 | SCRAM-SHA-256$1024:$: --(9 rows) -+(5 rows) - ++ regress_passwd5 | SCRAM-SHA-256$4096:$: + regress_passwd6 | SCRAM-SHA-256$4096:$: + regress_passwd7 | SCRAM-SHA-256$4096:$: + regress_passwd8 | SCRAM-SHA-256$4096:$: +@@ -97,23 +88,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ -- An empty password is not allowed, in any form CREATE ROLE regress_passwd_empty PASSWORD ''; NOTICE: empty string is not a valid password, clearing password @@ -1119,56 +1115,37 @@ index 924d6e001d..5966531db6 100644 -(1 row) +(0 rows) - -- Test with invalid stored and server keys. - -- - -- The first is valid, to act as a control. The others have too long - -- stored/server keys. They will be re-hashed. - CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +--- Test with invalid stored and server keys. 
+--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed - FROM pg_authid - WHERE rolname LIKE 'regress_passwd_sha_len%' +@@ -122,7 +110,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw ORDER BY rolname; -- rolname | is_rolpassword_rehashed ---------------------------+------------------------- + rolname | is_rolpassword_rehashed + -------------------------+------------------------- - regress_passwd_sha_len0 | f -- regress_passwd_sha_len1 | t -- regress_passwd_sha_len2 | t --(3 rows) -+ rolname | is_rolpassword_rehashed -+---------+------------------------- -+(0 rows) - - DROP ROLE regress_passwd1; - DROP ROLE regress_passwd2; - DROP ROLE regress_passwd3; - DROP ROLE regress_passwd4; - DROP ROLE regress_passwd5; -+ERROR: role "regress_passwd5" does not exist - DROP ROLE regress_passwd6; -+ERROR: role "regress_passwd6" does not exist - DROP ROLE regress_passwd7; -+ERROR: role "regress_passwd7" does not exist ++ regress_passwd_sha_len0 | t + regress_passwd_sha_len1 | t + regress_passwd_sha_len2 | t + (3 rows) +@@ -137,6 +125,7 @@ DROP ROLE regress_passwd7; DROP ROLE regress_passwd8; -+ERROR: role "regress_passwd8" does not exist DROP ROLE regress_passwd9; DROP ROLE regress_passwd_empty; +ERROR: role "regress_passwd_empty" does not exist DROP ROLE regress_passwd_sha_len0; -+ERROR: role "regress_passwd_sha_len0" does not exist DROP ROLE regress_passwd_sha_len1; -+ERROR: role "regress_passwd_sha_len1" does not exist DROP ROLE regress_passwd_sha_len2; -+ERROR: role "regress_passwd_sha_len2" does not exist - -- all entries should have been removed - SELECT rolname, rolpassword - FROM pg_authid diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 1296da0d57..f43fffa44c 100644 --- a/src/test/regress/expected/privileges.out @@ -3249,10 +3226,10 @@ index 1a6c61f49d..1c31ac6a53 100644 -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql -index bb82aa4aa2..7424c91b10 100644 +index bb82aa4aa2..dd8a05e24d 100644 --- a/src/test/regress/sql/password.sql +++ b/src/test/regress/sql/password.sql -@@ -10,13 +10,13 @@ SET password_encryption = 'scram-sha-256'; -- ok +@@ -10,13 +10,11 @@ 
SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 'md5'; @@ -3261,9 +3238,7 @@ index bb82aa4aa2..7424c91b10 100644 -CREATE ROLE regress_passwd2; -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; @@ -3272,23 +3247,59 @@ index bb82aa4aa2..7424c91b10 100644 -- check list of created entries -- -@@ -44,14 +44,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -44,26 +42,19 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; + +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. +--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Changing the SCRAM iteration count + SET scram_iterations = 1024; +@@ -80,13 +71,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; + ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; + SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. 
+-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Check that the invalid secrets were re-hashed. A re-hashed secret + -- should not contain the original salt. diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 5880bc018d..27aa952b18 100644 --- a/src/test/regress/sql/privileges.sql diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index c0c390caef81..9525b278183f 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -7,7 +7,7 @@ license.workspace = true [features] default = [] # Enables test specific features. -testing = [] +testing = ["fail/failpoints"] [dependencies] base64.workspace = true @@ -19,6 +19,7 @@ camino.workspace = true chrono.workspace = true cfg-if.workspace = true clap.workspace = true +fail.workspace = true flate2.workspace = true futures.workspace = true hyper0 = { workspace = true, features = ["full"] } diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index bb248734a8cf..26ae25ec2047 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -67,12 +67,15 @@ use compute_tools::params::*; use compute_tools::spec::*; use compute_tools::swap::resize_swap; use rlimit::{setrlimit, Resource}; +use utils::failpoint_support; // this is an arbitrary build tag. Fine as a default / for testing purposes // in-case of not-set environment var const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { + let scenario = failpoint_support::init(); + let (build_tag, clap_args) = init()?; // enable core dumping for all child processes @@ -100,6 +103,8 @@ fn main() -> Result<()> { maybe_delay_exit(delay_exit); + scenario.teardown(); + deinit_and_exit(wait_pg_result); } @@ -419,9 +424,13 @@ fn start_postgres( "running compute with features: {:?}", state.pspec.as_ref().unwrap().spec.features ); - // before we release the mutex, fetch the swap size (if any) for later. - let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; - let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes; + // before we release the mutex, fetch some parameters for later. + let &ComputeSpec { + swap_size_bytes, + disk_quota_bytes, + disable_lfc_resizing, + .. 
+ } = &state.pspec.as_ref().unwrap().spec; drop(state); // Launch remaining service threads @@ -526,11 +535,18 @@ fn start_postgres( // This token is used internally by the monitor to clean up all threads let token = CancellationToken::new(); + // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC + let pgconnstr = if disable_lfc_resizing.unwrap_or(false) { + None + } else { + file_cache_connstr.cloned() + }; + let vm_monitor = rt.as_ref().map(|rt| { rt.spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { cgroup: cgroup.cloned(), - pgconnstr: file_cache_connstr.cloned(), + pgconnstr, addr: vm_monitor_addr.clone(), })), token.clone(), diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index b6db3eb11abc..793ec4cf1094 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -34,12 +34,12 @@ use nix::unistd::Pid; use tracing::{info, info_span, warn, Instrument}; use utils::fs_ext::is_directory_empty; +#[path = "fast_import/aws_s3_sync.rs"] +mod aws_s3_sync; #[path = "fast_import/child_stdio_to_log.rs"] mod child_stdio_to_log; #[path = "fast_import/s3_uri.rs"] mod s3_uri; -#[path = "fast_import/s5cmd.rs"] -mod s5cmd; #[derive(clap::Parser)] struct Args { @@ -326,7 +326,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { } info!("upload pgdata"); - s5cmd::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/")) + aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) .await .context("sync dump directory to destination")?; @@ -334,10 +334,10 @@ pub(crate) async fn main() -> anyhow::Result<()> { { let status_dir = working_directory.join("status"); std::fs::create_dir(&status_dir).context("create status directory")?; - let status_file = status_dir.join("status"); + let status_file = status_dir.join("pgdata"); std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) .context("write status file")?; - s5cmd::sync(&status_file, &s3_prefix.append("/status/pgdata")) + aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) .await .context("sync status directory to destination")?; } diff --git a/compute_tools/src/bin/fast_import/s5cmd.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs similarity index 50% rename from compute_tools/src/bin/fast_import/s5cmd.rs rename to compute_tools/src/bin/fast_import/aws_s3_sync.rs index d2d9a79736fe..5fa58c8f875f 100644 --- a/compute_tools/src/bin/fast_import/s5cmd.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -4,24 +4,21 @@ use camino::Utf8Path; use super::s3_uri::S3Uri; pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { - let mut builder = tokio::process::Command::new("s5cmd"); - // s5cmd uses aws-sdk-go v1, hence doesn't support AWS_ENDPOINT_URL - if let Some(val) = std::env::var_os("AWS_ENDPOINT_URL") { - builder.arg("--endpoint-url").arg(val); - } + let mut builder = tokio::process::Command::new("aws"); builder + .arg("s3") .arg("sync") .arg(local.as_str()) .arg(remote.to_string()); let st = builder .spawn() - .context("spawn s5cmd")? + .context("spawn aws s3 sync")? 
.wait() .await - .context("wait for s5cmd")?; + .context("wait for aws s3 sync")?; if st.success() { Ok(()) } else { - Err(anyhow::anyhow!("s5cmd failed")) + Err(anyhow::anyhow!("aws s3 sync failed")) } } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d72a04f2f979..78f6033429e9 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1181,8 +1181,19 @@ impl ComputeNode { let mut conf = postgres::config::Config::from(conf); conf.application_name("compute_ctl:migrations"); - let mut client = conf.connect(NoTls)?; - handle_migrations(&mut client).context("apply_config handle_migrations") + match conf.connect(NoTls) { + Ok(mut client) => { + if let Err(e) = handle_migrations(&mut client) { + error!("Failed to run migrations: {}", e); + } + } + Err(e) => { + error!( + "Failed to connect to the compute for running migrations: {}", + e + ); + } + }; }); Ok::<(), anyhow::Error>(()) diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 7fa6426d8f9e..a4b1a63e6d5d 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -24,8 +24,11 @@ use metrics::proto::MetricFamily; use metrics::Encoder; use metrics::TextEncoder; use tokio::task; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; use tracing_utils::http::OtelName; +use utils::failpoint_support::failpoints_handler; +use utils::http::error::ApiError; use utils::http::request::must_get_query_param; fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { @@ -310,6 +313,18 @@ async fn routes(req: Request, compute: &Arc) -> Response { + match failpoints_handler(req, CancellationToken::new()).await { + Ok(r) => r, + Err(ApiError::BadRequest(e)) => { + render_json_error(&e.to_string(), StatusCode::BAD_REQUEST) + } + Err(_) => { + render_json_error("Internal server error", StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + // download extension files from remote extension storage on demand (&Method::POST, route) if route.starts_with("/extension_server/") => { info!("serving {:?} POST request", route); diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 22ab145edaec..1f3de65806a8 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -1,13 +1,16 @@ use anyhow::{Context, Result}; +use fail::fail_point; use postgres::Client; use tracing::info; +/// Runs a series of migrations on a target database pub(crate) struct MigrationRunner<'m> { client: &'m mut Client, migrations: &'m [&'m str], } impl<'m> MigrationRunner<'m> { + /// Create a new migration runner pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64 assert!(migrations.len() + 1 < i64::MAX as usize); @@ -15,6 +18,7 @@ impl<'m> MigrationRunner<'m> { Self { client, migrations } } + /// Get the current value neon_migration.migration_id fn get_migration_id(&mut self) -> Result { let query = "SELECT id FROM neon_migration.migration_id"; let row = self @@ -25,37 +29,61 @@ impl<'m> MigrationRunner<'m> { Ok(row.get::<&str, i64>("id")) } + /// Update the neon_migration.migration_id value + /// + /// This function has a fail point called compute-migration, which can be + /// used if you would like to fail the application of a series of migrations + /// at some point. 
     fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
-        let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);
+        // We use this fail point in order to check that failing in the
+        // middle of applying a series of migrations fails in an expected
+        // manner
+        if cfg!(feature = "testing") {
+            let fail = (|| {
+                fail_point!("compute-migration", |fail_migration_id| {
+                    migration_id == fail_migration_id.unwrap().parse::<i64>().unwrap()
+                });
+
+                false
+            })();
+
+            if fail {
+                return Err(anyhow::anyhow!(format!(
+                    "migration {} was configured to fail because of a failpoint",
+                    migration_id
+                )));
+            }
+        }
 
         self.client
-            .simple_query(&setval)
+            .query(
+                "UPDATE neon_migration.migration_id SET id = $1",
+                &[&migration_id],
+            )
             .context("run_migrations update id")?;
 
         Ok(())
     }
 
-    fn prepare_migrations(&mut self) -> Result<()> {
-        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-        self.client.simple_query(query)?;
-
-        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-        self.client.simple_query(query)?;
-
-        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-        self.client.simple_query(query)?;
-
-        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-        self.client.simple_query(query)?;
-
-        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-        self.client.simple_query(query)?;
+    /// Prepare the target database for handling migrations
+    fn prepare_database(&mut self) -> Result<()> {
+        self.client
+            .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")?;
+        self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)")?;
+        self.client.simple_query(
+            "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",
+        )?;
+        self.client
+            .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin")?;
+        self.client
+            .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC")?;
 
         Ok(())
     }
 
+    /// Run the configured set of migrations
     pub fn run_migrations(mut self) -> Result<()> {
-        self.prepare_migrations()?;
+        self.prepare_database()?;
 
         let mut current_migration = self.get_migration_id()? as usize;
         while current_migration < self.migrations.len() {
@@ -69,6 +97,11 @@ impl<'m> MigrationRunner<'m> {
 
             if migration.starts_with("-- SKIP") {
                 info!("Skipping migration id={}", migration_id!(current_migration));
+
+                // Even though we are skipping the migration, updating the
+                // migration ID should help keep logic easy to understand when
+                // trying to understand the state of a cluster.
+ self.update_migration_id(migration_id!(current_migration))?; } else { info!( "Running migration id={}:\n{}\n", @@ -87,7 +120,6 @@ impl<'m> MigrationRunner<'m> { ) })?; - // Migration IDs start at 1 self.update_migration_id(migration_id!(current_migration))?; self.client diff --git a/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql new file mode 100644 index 000000000000..0c81cef1c41d --- /dev/null +++ b/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql @@ -0,0 +1,9 @@ +DO $$ +DECLARE + bypassrls boolean; +BEGIN + SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser'; + IF NOT bypassrls THEN + RAISE EXCEPTION 'neon_superuser cannot bypass RLS'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0002-alter_roles.sql b/compute_tools/src/migrations/tests/0002-alter_roles.sql new file mode 100644 index 000000000000..433f7b34f7df --- /dev/null +++ b/compute_tools/src/migrations/tests/0002-alter_roles.sql @@ -0,0 +1,25 @@ +DO $$ +DECLARE + role record; +BEGIN + FOR role IN + SELECT rolname AS name, rolinherit AS inherit + FROM pg_roles + WHERE pg_has_role(rolname, 'neon_superuser', 'member') + LOOP + IF NOT role.inherit THEN + RAISE EXCEPTION '% cannot inherit', quote_ident(role.name); + END IF; + END LOOP; + + FOR role IN + SELECT rolname AS name, rolbypassrls AS bypassrls + FROM pg_roles + WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member') + AND NOT starts_with(rolname, 'pg_') + LOOP + IF role.bypassrls THEN + RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name); + END IF; + END LOOP; +END $$; diff --git a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql new file mode 100644 index 000000000000..b164d61295ec --- /dev/null +++ b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql @@ -0,0 +1,10 @@ +DO $$ +BEGIN + IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN + RETURN; + END IF; + + IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN + RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql new file mode 100644 index 000000000000..acb8dd417d26 --- /dev/null +++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql @@ -0,0 +1,19 @@ +DO $$ +DECLARE + monitor record; +BEGIN + SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member, + admin_option AS admin + INTO monitor + FROM pg_auth_members + WHERE roleid = 'pg_monitor'::regrole + AND member = 'pg_monitor'::regrole; + + IF NOT monitor.member THEN + RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor'; + END IF; + + IF NOT monitor.admin THEN + RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql new file mode 100644 index 000000000000..f99101bd6544 --- /dev/null +++ b/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql @@ -0,0 +1,2 @@ +-- This test was never written 
because at the time migration tests were added
+-- the accompanying migration was already skipped.
diff --git a/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql
new file mode 100644
index 000000000000..f99101bd6544
--- /dev/null
+++ b/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql
@@ -0,0 +1,2 @@
+-- This test was never written because at the time migration tests were added
+-- the accompanying migration was already skipped.
diff --git a/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
new file mode 100644
index 000000000000..f99101bd6544
--- /dev/null
+++ b/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
@@ -0,0 +1,2 @@
+-- This test was never written because at the time migration tests were added
+-- the accompanying migration was already skipped.
diff --git a/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
new file mode 100644
index 000000000000..f99101bd6544
--- /dev/null
+++ b/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
@@ -0,0 +1,2 @@
+-- This test was never written because at the time migration tests were added
+-- the accompanying migration was already skipped.
diff --git a/compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql
new file mode 100644
index 000000000000..f99101bd6544
--- /dev/null
+++ b/compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql
@@ -0,0 +1,2 @@
+-- This test was never written because at the time migration tests were added
+-- the accompanying migration was already skipped.
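
The migration test files added above are self-contained `DO $$ ... $$` blocks that `RAISE EXCEPTION` when the state a migration should have produced is missing, so running a test amounts to executing its SQL and checking for an error. The harness that drives them is not shown in this diff; the sketch below is a hypothetical illustration of how they could be executed with the same `postgres` crate the migration runner uses. The connection string and the `include_str!` paths are assumptions made for the example.

```rust
// Hypothetical harness sketch -- not part of this diff.
use anyhow::Context;
use postgres::{Client, NoTls};

/// Run each per-migration test. A failing check RAISEs an EXCEPTION inside
/// its DO block, which surfaces here as an error from `simple_query`.
fn run_migration_tests(connstr: &str, tests: &[(&str, &str)]) -> anyhow::Result<()> {
    let mut client = Client::connect(connstr, NoTls).context("connect for migration tests")?;
    for &(name, sql) in tests {
        client
            .simple_query(sql)
            .with_context(|| format!("migration test {name} failed"))?;
    }
    Ok(())
}

fn main() -> anyhow::Result<()> {
    // Assumed layout: the SQL files added above, embedded at compile time.
    let tests = [(
        "0001-neon_superuser_bypass_rls",
        include_str!("migrations/tests/0001-neon_superuser_bypass_rls.sql"),
    )];
    // Placeholder connection string for wherever the compute is listening.
    run_migration_tests("postgresql://cloud_admin@localhost:55432/postgres", &tests)
}
```

Keeping each check as a plain SQL file means the harness stays independent of what any individual test asserts; a test either runs cleanly or fails with the exception message it raised.
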
diff --git a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql new file mode 100644 index 000000000000..af7f50e95d18 --- /dev/null +++ b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql @@ -0,0 +1,13 @@ +DO $$ +DECLARE + can_execute boolean; +BEGIN + SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute')) + INTO can_execute + FROM pg_proc + WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot') + AND pronamespace = 'pg_catalog'::regnamespace; + IF NOT can_execute THEN + RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql new file mode 100644 index 000000000000..e55dcdc3b60e --- /dev/null +++ b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql @@ -0,0 +1,13 @@ +DO $$ +DECLARE + can_execute boolean; +BEGIN + SELECT has_function_privilege('neon_superuser', oid, 'execute') + INTO can_execute + FROM pg_proc + WHERE proname = 'pg_show_replication_origin_status' + AND pronamespace = 'pg_catalog'::regnamespace; + IF NOT can_execute THEN + RAISE EXCEPTION 'neon_superuser cannot execute pg_show_replication_origin_status'; + END IF; +END $$; diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 1ea443b026a8..c73debae4c40 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -19,6 +19,7 @@ use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; use control_plane::{broker, local_env}; +use nix::fcntl::{flock, FlockArg}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, @@ -36,6 +37,8 @@ use safekeeper_api::{ }; use std::borrow::Cow; use std::collections::{BTreeSet, HashMap}; +use std::fs::File; +use std::os::fd::AsRawFd; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; @@ -689,6 +692,21 @@ struct TimelineTreeEl { pub children: BTreeSet, } +/// A flock-based guard over the neon_local repository directory +struct RepoLock { + _file: File, +} + +impl RepoLock { + fn new() -> Result { + let repo_dir = File::open(local_env::base_path())?; + let repo_dir_fd = repo_dir.as_raw_fd(); + flock(repo_dir_fd, FlockArg::LockExclusive)?; + + Ok(Self { _file: repo_dir }) + } +} + // Main entry point for the 'neon_local' CLI utility // // This utility helps to manage neon installation. That includes following: @@ -700,9 +718,14 @@ fn main() -> Result<()> { let cli = Cli::parse(); // Check for 'neon init' command first. - let subcommand_result = if let NeonLocalCmd::Init(args) = cli.command { - handle_init(&args).map(|env| Some(Cow::Owned(env))) + let (subcommand_result, _lock) = if let NeonLocalCmd::Init(args) = cli.command { + (handle_init(&args).map(|env| Some(Cow::Owned(env))), None) } else { + // This tool uses a collection of simple files to store its state, and consequently + // it is not generally safe to run multiple commands concurrently. 
Rather than expect + // all callers to know this, use a lock file to protect against concurrent execution. + let _repo_lock = RepoLock::new().unwrap(); + // all other commands need an existing config let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; let original_env = env.clone(); @@ -728,11 +751,12 @@ fn main() -> Result<()> { NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env), }; - if &original_env != env { + let subcommand_result = if &original_env != env { subcommand_result.map(|()| Some(Cow::Borrowed(env))) } else { subcommand_result.map(|()| None) - } + }; + (subcommand_result, Some(_repo_lock)) }; match subcommand_result { @@ -922,7 +946,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { } else { // User (likely interactive) did not provide a description of the environment, give them the default NeonLocalInitConf { - control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())), + control_plane_api: Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap()), broker: NeonBroker { listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(), }, @@ -1718,18 +1742,15 @@ async fn handle_start_all_impl( broker::start_broker_process(env, &retry_timeout).await }); - // Only start the storage controller if the pageserver is configured to need it - if env.control_plane_api.is_some() { - js.spawn(async move { - let storage_controller = StorageController::from_env(env); - storage_controller - .start(NeonStorageControllerStartArgs::with_default_instance_id( - retry_timeout, - )) - .await - .map_err(|e| e.context("start storage_controller")) - }); - } + js.spawn(async move { + let storage_controller = StorageController::from_env(env); + storage_controller + .start(NeonStorageControllerStartArgs::with_default_instance_id( + retry_timeout, + )) + .await + .map_err(|e| e.context("start storage_controller")) + }); for ps_conf in &env.pageservers { js.spawn(async move { @@ -1774,10 +1795,6 @@ async fn neon_start_status_check( const RETRY_INTERVAL: Duration = Duration::from_millis(100); const NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5); - if env.control_plane_api.is_none() { - return Ok(()); - } - let storcon = StorageController::from_env(env); let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 1fdf32605161..5e47ec48117a 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -316,6 +316,10 @@ impl Endpoint { // and can cause errors like 'no unpinned buffers available', see // conf.append("shared_buffers", "1MB"); + // Postgres defaults to effective_io_concurrency=1, which does not exercise the pageserver's + // batching logic. 
Set this to 2 so that we exercise the code a bit without letting + // individual tests do a lot of concurrent work on underpowered test machines + conf.append("effective_io_concurrency", "2"); conf.append("fsync", "off"); conf.append("max_connections", "100"); conf.append("wal_level", "logical"); @@ -581,6 +585,7 @@ impl Endpoint { features: self.features.clone(), swap_size_bytes: None, disk_quota_bytes: None, + disable_lfc_resizing: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 032c88a829f7..5b82acb3a53f 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -76,7 +76,7 @@ pub struct LocalEnv { // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. - pub control_plane_api: Option, + pub control_plane_api: Url, // Control plane upcall API for storage controller. If set, this will be propagated into the // storage controller's configuration. @@ -133,7 +133,7 @@ pub struct NeonLocalInitConf { pub storage_controller: Option, pub pageservers: Vec, pub safekeepers: Vec, - pub control_plane_api: Option>, + pub control_plane_api: Option, pub control_plane_compute_hook_api: Option>, } @@ -180,7 +180,7 @@ impl NeonStorageControllerConf { const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); // Very tight heartbeat interval to speed up tests - const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100); + const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(1000); } impl Default for NeonStorageControllerConf { @@ -535,7 +535,7 @@ impl LocalEnv { storage_controller, pageservers, safekeepers, - control_plane_api, + control_plane_api: control_plane_api.unwrap(), control_plane_compute_hook_api, branch_name_mappings, } @@ -638,7 +638,7 @@ impl LocalEnv { storage_controller: self.storage_controller.clone(), pageservers: vec![], // it's skip_serializing anyway safekeepers: self.safekeepers.clone(), - control_plane_api: self.control_plane_api.clone(), + control_plane_api: Some(self.control_plane_api.clone()), control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), branch_name_mappings: self.branch_name_mappings.clone(), }, @@ -768,7 +768,7 @@ impl LocalEnv { storage_controller: storage_controller.unwrap_or_default(), pageservers: pageservers.iter().map(Into::into).collect(), safekeepers, - control_plane_api: control_plane_api.unwrap_or_default(), + control_plane_api: control_plane_api.unwrap(), control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), branch_name_mappings: Default::default(), }; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 9d3f0183458f..ef5b3d65934c 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -95,21 +95,19 @@ impl PageServerNode { let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; - if let Some(control_plane_api) = &self.env.control_plane_api { - overrides.push(format!( - "control_plane_api='{}'", - control_plane_api.as_str() - )); - - // Storage controller uses the same auth as pageserver: if JWT is enabled - // for us, we will also need it to talk to them. 
- if matches!(conf.http_auth_type, AuthType::NeonJWT) { - let jwt_token = self - .env - .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) - .unwrap(); - overrides.push(format!("control_plane_api_token='{}'", jwt_token)); - } + overrides.push(format!( + "control_plane_api='{}'", + self.env.control_plane_api.as_str() + )); + + // Storage controller uses the same auth as pageserver: if JWT is enabled + // for us, we will also need it to talk to them. + if matches!(conf.http_auth_type, AuthType::NeonJWT) { + let jwt_token = self + .env + .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) + .unwrap(); + overrides.push(format!("control_plane_api_token='{}'", jwt_token)); } if !conf.other.contains_key("remote_storage") { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index b70bd2e1b5ef..22d2420ed4dc 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -338,7 +338,7 @@ impl StorageController { .port(), ) } else { - let listen_url = self.env.control_plane_api.clone().unwrap(); + let listen_url = self.env.control_plane_api.clone(); let listen = format!( "{}:{}", @@ -708,7 +708,7 @@ impl StorageController { } else { // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out // for general purpose API access. - let listen_url = self.env.control_plane_api.clone().unwrap(); + let listen_url = self.env.control_plane_api.clone(); Url::from_str(&format!( "http://{}:{}/{path}", listen_url.host_str().unwrap(), diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index df07216fde78..6ee1044c1839 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -5,7 +5,8 @@ use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, + SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, + TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -211,6 +212,8 @@ enum Command { #[arg(long)] timeout: humantime::Duration, }, + /// List safekeepers known to the storage controller + Safekeepers {}, } #[derive(Parser)] @@ -1020,6 +1023,31 @@ async fn main() -> anyhow::Result<()> { "Fill was cancelled for node {node_id}. 
Schedulling policy is now {final_policy:?}" ); } + Command::Safekeepers {} => { + let mut resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/safekeeper".to_string(), + None, + ) + .await?; + + resp.sort_by(|a, b| a.id.cmp(&b.id)); + + let mut table = comfy_table::Table::new(); + table.set_header(["Id", "Version", "Host", "Port", "Http Port", "AZ Id"]); + for sk in resp { + table.add_row([ + format!("{}", sk.id), + format!("{}", sk.version), + sk.host, + format!("{}", sk.port), + format!("{}", sk.http_port), + sk.availability_zone_id.to_string(), + ]); + } + println!("{table}"); + } } Ok(()) diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json index 8e582e74e15d..0308cab4515a 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json @@ -132,11 +132,6 @@ "name": "cron.database", "value": "postgres", "vartype": "string" - }, - { - "name": "session_preload_libraries", - "value": "anon", - "vartype": "string" } ] }, diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index c97dfaa901e8..063664d0c67d 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -35,11 +35,11 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do echo "clean up containers if exists" cleanup PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version)) - # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option - if [ $pg_version -eq 17 ]; then + # The support of pg_anon not yet added to PG17, so we have to add the corresponding option for other PG versions + if [ "${pg_version}" -ne 17 ]; then SPEC_PATH="compute_wrapper/var/db/postgres/specs" mv $SPEC_PATH/spec.json $SPEC_PATH/spec.bak - jq 'del(.cluster.settings[] | select (.name == "session_preload_libraries"))' $SPEC_PATH/spec.bak > $SPEC_PATH/spec.json + jq '.cluster.settings += [{"name": "session_preload_libraries","value": "anon","vartype": "string"}]' "${SPEC_PATH}/spec.bak" > "${SPEC_PATH}/spec.json" fi PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d @@ -106,8 +106,8 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do fi fi cleanup - # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option - if [ $pg_version -eq 17 ]; then - mv $SPEC_PATH/spec.bak $SPEC_PATH/spec.json + # Restore the original spec.json + if [ "$pg_version" -ne 17 ]; then + mv "$SPEC_PATH/spec.bak" "$SPEC_PATH/spec.json" fi done diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 6d9c353cda1b..54d6a1d38f7f 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -67,6 +67,15 @@ pub struct ComputeSpec { #[serde(default)] pub disk_quota_bytes: Option, + /// Disables the vm-monitor behavior that resizes LFC on upscale/downscale, instead relying on + /// the initial size of LFC. + /// + /// This is intended for use when the LFC size is being overridden from the default but + /// autoscaling is still enabled, and we don't want the vm-monitor to interfere with the custom + /// LFC sizing. + #[serde(default)] + pub disable_lfc_resizing: Option, + /// Expected cluster state at the end of transition process. 
pub cluster: Cluster, pub delta_operations: Option>, diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs index 7bb71db95cf4..7ce605bda850 100644 --- a/libs/desim/src/time.rs +++ b/libs/desim/src/time.rs @@ -91,7 +91,7 @@ impl Timing { /// Return true if there is a ready event. fn is_event_ready(&self, queue: &mut BinaryHeap) -> bool { - queue.peek().map_or(false, |x| x.time <= self.now()) + queue.peek().is_some_and(|x| x.time <= self.now()) } /// Clear all pending events. diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index ec7b81423a44..faf11e487c55 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -372,6 +372,23 @@ pub struct MetadataHealthListOutdatedResponse { pub health_records: Vec, } +/// Publicly exposed safekeeper description +/// +/// The `active` flag which we have in the DB is not included on purpose: it is deprecated. +#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeeperDescribeResponse { + pub id: NodeId, + pub region_id: String, + /// 1 is special, it means just created (not currently posted to storcon). + /// Zero or negative is not really expected. + /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag. + pub version: i64, + pub host: String, + pub port: i32, + pub http_port: i32, + pub availability_zone_id: String, +} + #[cfg(test)] mod test { use super::*; diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 373329c9b464..f0cd713c38b6 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -565,6 +565,10 @@ impl Key { && self.field5 == 0 && self.field6 == u32::MAX } + + pub fn is_slru_dir_key(&self) -> bool { + slru_dir_kind(self).is_some() + } } #[inline(always)] diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5690b643f062..f3fc9fad760a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -6,6 +6,7 @@ pub mod utilization; use camino::Utf8PathBuf; pub use utilization::PageserverUtilization; +use core::ops::Range; use std::{ collections::HashMap, fmt::Display, @@ -28,6 +29,7 @@ use utils::{ }; use crate::{ + key::Key, reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, }; @@ -210,6 +212,68 @@ pub enum TimelineState { Broken { reason: String, backtrace: String }, } +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct CompactLsnRange { + pub start: Lsn, + pub end: Lsn, +} + +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct CompactKeyRange { + #[serde_as(as = "serde_with::DisplayFromStr")] + pub start: Key, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub end: Key, +} + +impl From> for CompactLsnRange { + fn from(range: Range) -> Self { + Self { + start: range.start, + end: range.end, + } + } +} + +impl From> for CompactKeyRange { + fn from(range: Range) -> Self { + Self { + start: range.start, + end: range.end, + } + } +} + +impl From for Range { + fn from(range: CompactLsnRange) -> Self { + range.start..range.end + } +} + +impl From for Range { + fn from(range: CompactKeyRange) -> Self { + range.start..range.end + } +} + +impl CompactLsnRange { + pub fn above(lsn: Lsn) -> Self { + Self { + start: lsn, + end: Lsn::MAX, + } + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct CompactInfoResponse { + pub compact_key_range: Option, + pub compact_lsn_range: Option, + pub 
sub_compaction: bool, +} + #[derive(Serialize, Deserialize, Clone)] pub struct TimelineCreateRequest { pub new_timeline_id: TimelineId, diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index cf0cd3a46b88..4cc0a739e871 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -173,7 +173,11 @@ impl ShardIdentity { /// Return true if the key should be stored on all shards, not just one. pub fn is_key_global(&self, key: &Key) -> bool { - if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() { + if key.is_slru_block_key() + || key.is_slru_segment_size_key() + || key.is_aux_file_key() + || key.is_slru_dir_key() + { // Special keys that are only stored on shard 0 false } else if key.is_rel_block_key() { diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index e1f5443cbef3..b7a376841d45 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -9,9 +9,11 @@ regex.workspace = true bytes.workspace = true anyhow.workspace = true crc32c.workspace = true +criterion.workspace = true once_cell.workspace = true log.workspace = true memoffset.workspace = true +pprof.workspace = true thiserror.workspace = true serde.workspace = true utils.workspace = true @@ -24,3 +26,7 @@ postgres.workspace = true [build-dependencies] anyhow.workspace = true bindgen.workspace = true + +[[bench]] +name = "waldecoder" +harness = false diff --git a/libs/postgres_ffi/benches/README.md b/libs/postgres_ffi/benches/README.md new file mode 100644 index 000000000000..00a8980174fd --- /dev/null +++ b/libs/postgres_ffi/benches/README.md @@ -0,0 +1,26 @@ +## Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package postgres_ffi + +# Specific file. +cargo bench --package postgres_ffi --bench waldecoder + +# Specific benchmark. +cargo bench --package postgres_ffi --bench waldecoder complete_record/size=1024 + +# List available benchmarks. +cargo bench --package postgres_ffi --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package postgres_ffi --bench waldecoder complete_record/size=1024 -- --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. \ No newline at end of file diff --git a/libs/postgres_ffi/benches/waldecoder.rs b/libs/postgres_ffi/benches/waldecoder.rs new file mode 100644 index 000000000000..c8cf0d322a54 --- /dev/null +++ b/libs/postgres_ffi/benches/waldecoder.rs @@ -0,0 +1,49 @@ +use std::ffi::CStr; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use postgres_ffi::v17::wal_generator::LogicalMessageGenerator; +use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler; +use postgres_ffi::waldecoder::WalStreamDecoder; +use pprof::criterion::{Output, PProfProfiler}; +use utils::lsn::Lsn; + +const KB: usize = 1024; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_complete_record, +); +criterion_main!(benches); + +/// Benchmarks WalStreamDecoder::complete_record() for a logical message of varying size. 
+fn bench_complete_record(c: &mut Criterion) { + let mut g = c.benchmark_group("complete_record"); + for size in [64, KB, 8 * KB, 128 * KB] { + // Kind of weird to change the group throughput per benchmark, but it's the only way + // to vary it per benchmark. It works. + g.throughput(criterion::Throughput::Bytes(size as u64)); + g.bench_function(format!("size={size}"), |b| run_bench(b, size).unwrap()); + } + + fn run_bench(b: &mut Bencher, size: usize) -> anyhow::Result<()> { + const PREFIX: &CStr = c""; + let value_size = LogicalMessageGenerator::make_value_size(size, PREFIX); + let value = vec![1; value_size]; + + let mut decoder = WalStreamDecoder::new(Lsn(0), 170000); + let msg = LogicalMessageGenerator::new(PREFIX, &value) + .next() + .unwrap() + .encode(Lsn(0)); + assert_eq!(msg.len(), size); + + b.iter(|| { + let msg = msg.clone(); // Bytes::clone() is cheap + decoder.complete_record(msg).unwrap(); + }); + + Ok(()) + } +} diff --git a/libs/postgres_ffi/src/wal_generator.rs b/libs/postgres_ffi/src/wal_generator.rs index dc679eea3302..a72b035e17bc 100644 --- a/libs/postgres_ffi/src/wal_generator.rs +++ b/libs/postgres_ffi/src/wal_generator.rs @@ -106,11 +106,11 @@ impl WalGenerator { const TIMELINE_ID: u32 = 1; /// Creates a new WAL generator with the given record generator. - pub fn new(record_generator: R) -> WalGenerator { + pub fn new(record_generator: R, start_lsn: Lsn) -> WalGenerator { Self { record_generator, - lsn: Lsn(0), - prev_lsn: Lsn(0), + lsn: start_lsn, + prev_lsn: start_lsn, } } @@ -231,6 +231,22 @@ impl LogicalMessageGenerator { }; [&header.encode(), prefix, message].concat().into() } + + /// Computes how large a value must be to get a record of the given size. Convenience method to + /// construct records of pre-determined size. Panics if the record size is too small. + pub fn make_value_size(record_size: usize, prefix: &CStr) -> usize { + let xlog_header_size = XLOG_SIZE_OF_XLOG_RECORD; + let lm_header_size = size_of::(); + let prefix_size = prefix.to_bytes_with_nul().len(); + let data_header_size = match record_size - xlog_header_size - 2 { + 0..=255 => 2, + 256..=258 => panic!("impossible record_size {record_size}"), + 259.. 
=> 5, + }; + record_size + .checked_sub(xlog_header_size + lm_header_size + prefix_size + data_header_size) + .expect("record_size too small") + } } impl Iterator for LogicalMessageGenerator { diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 9eb3f0e95abf..4a33dbe25b57 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -81,7 +81,7 @@ fn test_end_of_wal(test_name: &str) { continue; } let mut f = File::options().write(true).open(file.path()).unwrap(); - const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; + static ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; f.write_all( &ZEROS[0..min( WAL_SEGMENT_SIZE, diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml index f71c1599c7c2..f66a292d5eac 100644 --- a/libs/proxy/postgres-protocol2/Cargo.toml +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-protocol2" version = "0.1.0" -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-protocol2/src/lib.rs b/libs/proxy/postgres-protocol2/src/lib.rs index 947f2f835d4b..6032440f9ad5 100644 --- a/libs/proxy/postgres-protocol2/src/lib.rs +++ b/libs/proxy/postgres-protocol2/src/lib.rs @@ -9,8 +9,7 @@ //! //! This library assumes that the `client_encoding` backend parameter has been //! set to `UTF8`. It will most likely not behave properly if that is not the case. -#![doc(html_root_url = "https://docs.rs/postgres-protocol/0.6")] -#![warn(missing_docs, rust_2018_idioms, clippy::all)] +#![warn(missing_docs, clippy::all)] use byteorder::{BigEndian, ByteOrder}; use bytes::{BufMut, BytesMut}; diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index bc6168f33732..640f35ada3be 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -3,7 +3,6 @@ use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, BufMut, BytesMut}; -use std::convert::TryFrom; use std::error::Error; use std::io; use std::marker; diff --git a/libs/proxy/postgres-types2/Cargo.toml b/libs/proxy/postgres-types2/Cargo.toml index 58cfb5571f83..57efd94cd31b 100644 --- a/libs/proxy/postgres-types2/Cargo.toml +++ b/libs/proxy/postgres-types2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-types2" version = "0.1.0" -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index 18ba032151a9..d4f3afdfd46c 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -2,8 +2,7 @@ //! //! This crate is used by the `tokio-postgres` and `postgres` crates. You normally don't need to depend directly on it //! unless you want to define your own `ToSql` or `FromSql` definitions. 
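The `use std::convert::TryFrom;` removal in `frontend.rs` above goes hand in hand with the edition bump: the Rust 2021 prelude already exports `TryFrom`/`TryInto`, so the explicit import becomes redundant. A minimal standalone illustration (not code from these crates):

```rust
// Compiles on edition 2021 with no `use std::convert::TryFrom;`:
// the trait is part of the standard prelude.
fn main() {
    let len: usize = 300;
    // try_from is available without an explicit import.
    let as_u16 = u16::try_from(len).expect("fits in u16");
    assert_eq!(as_u16, 300);
}
```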
-#![doc(html_root_url = "https://docs.rs/postgres-types/0.2")] -#![warn(clippy::all, rust_2018_idioms, missing_docs)] +#![warn(clippy::all, missing_docs)] use fallible_iterator::FallibleIterator; use postgres_protocol2::types; diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index 7130c1b7266f..56e7c4da47ce 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tokio-postgres2" version = "0.1.0" -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs index 0aa5c77e2230..f478717e0d2d 100644 --- a/libs/proxy/tokio-postgres2/src/connection.rs +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -33,10 +33,14 @@ pub struct Response { #[derive(PartialEq, Debug)] enum State { Active, - Terminating, Closing, } +enum WriteReady { + Terminating, + WaitingOnRead, +} + /// A connection to a PostgreSQL database. /// /// This is one half of what is returned when a new connection is established. It performs the actual IO with the @@ -51,7 +55,6 @@ pub struct Connection { /// HACK: we need this in the Neon Proxy to forward params. pub parameters: HashMap, receiver: mpsc::UnboundedReceiver, - pending_request: Option, pending_responses: VecDeque, responses: VecDeque, state: State, @@ -72,7 +75,6 @@ where stream, parameters, receiver, - pending_request: None, pending_responses, responses: VecDeque::new(), state: State::Active, @@ -93,26 +95,23 @@ where .map(|o| o.map(|r| r.map_err(Error::io))) } - fn poll_read(&mut self, cx: &mut Context<'_>) -> Result, Error> { - if self.state != State::Active { - trace!("poll_read: done"); - return Ok(None); - } - + /// Read and process messages from the connection to postgres. + /// client <- postgres + fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll> { loop { let message = match self.poll_response(cx)? { Poll::Ready(Some(message)) => message, - Poll::Ready(None) => return Err(Error::closed()), + Poll::Ready(None) => return Poll::Ready(Err(Error::closed())), Poll::Pending => { trace!("poll_read: waiting on response"); - return Ok(None); + return Poll::Pending; } }; let (mut messages, request_complete) = match message { BackendMessage::Async(Message::NoticeResponse(body)) => { let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?; - return Ok(Some(AsyncMessage::Notice(error))); + return Poll::Ready(Ok(AsyncMessage::Notice(error))); } BackendMessage::Async(Message::NotificationResponse(body)) => { let notification = Notification { @@ -120,7 +119,7 @@ where channel: body.channel().map_err(Error::parse)?.to_string(), payload: body.message().map_err(Error::parse)?.to_string(), }; - return Ok(Some(AsyncMessage::Notification(notification))); + return Poll::Ready(Ok(AsyncMessage::Notification(notification))); } BackendMessage::Async(Message::ParameterStatus(body)) => { self.parameters.insert( @@ -139,8 +138,10 @@ where let mut response = match self.responses.pop_front() { Some(response) => response, None => match messages.next().map_err(Error::parse)? 
{ - Some(Message::ErrorResponse(error)) => return Err(Error::db(error)), - _ => return Err(Error::unexpected_message()), + Some(Message::ErrorResponse(error)) => { + return Poll::Ready(Err(Error::db(error))) + } + _ => return Poll::Ready(Err(Error::unexpected_message())), }, }; @@ -164,18 +165,14 @@ where request_complete, }); trace!("poll_read: waiting on sender"); - return Ok(None); + return Poll::Pending; } } } } + /// Fetch the next client request and enqueue the response sender. fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { - if let Some(messages) = self.pending_request.take() { - trace!("retrying pending request"); - return Poll::Ready(Some(messages)); - } - if self.receiver.is_closed() { return Poll::Ready(None); } @@ -193,74 +190,80 @@ where } } - fn poll_write(&mut self, cx: &mut Context<'_>) -> Result { + /// Process client requests and write them to the postgres connection, flushing if necessary. + /// client -> postgres + fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll> { loop { - if self.state == State::Closing { - trace!("poll_write: done"); - return Ok(false); - } - if Pin::new(&mut self.stream) .poll_ready(cx) .map_err(Error::io)? .is_pending() { trace!("poll_write: waiting on socket"); - return Ok(false); + + // poll_ready is self-flushing. + return Poll::Pending; } - let request = match self.poll_request(cx) { - Poll::Ready(Some(request)) => request, - Poll::Ready(None) if self.responses.is_empty() && self.state == State::Active => { + match self.poll_request(cx) { + // send the message to postgres + Poll::Ready(Some(RequestMessages::Single(request))) => { + Pin::new(&mut self.stream) + .start_send(request) + .map_err(Error::io)?; + } + // No more messages from the client, and no more responses to wait for. + // Send a terminate message to postgres + Poll::Ready(None) if self.responses.is_empty() => { trace!("poll_write: at eof, terminating"); - self.state = State::Terminating; let mut request = BytesMut::new(); frontend::terminate(&mut request); - RequestMessages::Single(FrontendMessage::Raw(request.freeze())) + let request = FrontendMessage::Raw(request.freeze()); + + Pin::new(&mut self.stream) + .start_send(request) + .map_err(Error::io)?; + + trace!("poll_write: sent eof, closing"); + trace!("poll_write: done"); + return Poll::Ready(Ok(WriteReady::Terminating)); } + // No more messages from the client, but there are still some responses to wait for. Poll::Ready(None) => { trace!( "poll_write: at eof, pending responses {}", self.responses.len() ); - return Ok(true); + ready!(self.poll_flush(cx))?; + return Poll::Ready(Ok(WriteReady::WaitingOnRead)); } + // Still waiting for a message from the client. Poll::Pending => { trace!("poll_write: waiting on request"); - return Ok(true); - } - }; - - match request { - RequestMessages::Single(request) => { - Pin::new(&mut self.stream) - .start_send(request) - .map_err(Error::io)?; - if self.state == State::Terminating { - trace!("poll_write: sent eof, closing"); - self.state = State::Closing; - } + ready!(self.poll_flush(cx))?; + return Poll::Pending; } } } } - fn poll_flush(&mut self, cx: &mut Context<'_>) -> Result<(), Error> { + fn poll_flush(&mut self, cx: &mut Context<'_>) -> Poll> { match Pin::new(&mut self.stream) .poll_flush(cx) .map_err(Error::io)? 
{ - Poll::Ready(()) => trace!("poll_flush: flushed"), - Poll::Pending => trace!("poll_flush: waiting on socket"), + Poll::Ready(()) => { + trace!("poll_flush: flushed"); + Poll::Ready(Ok(())) + } + Poll::Pending => { + trace!("poll_flush: waiting on socket"); + Poll::Pending + } } - Ok(()) } fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll> { - if self.state != State::Closing { - return Poll::Pending; - } - match Pin::new(&mut self.stream) .poll_close(cx) .map_err(Error::io)? @@ -289,18 +292,30 @@ where &mut self, cx: &mut Context<'_>, ) -> Poll>> { - let message = self.poll_read(cx)?; - let want_flush = self.poll_write(cx)?; - if want_flush { - self.poll_flush(cx)?; + if self.state != State::Closing { + // if the state is still active, try read from and write to postgres. + let message = self.poll_read(cx)?; + let closing = self.poll_write(cx)?; + if let Poll::Ready(WriteReady::Terminating) = closing { + self.state = State::Closing; + } + + if let Poll::Ready(message) = message { + return Poll::Ready(Some(Ok(message))); + } + + // poll_read returned Pending. + // poll_write returned Pending or Ready(WriteReady::WaitingOnRead). + // if poll_write returned Ready(WriteReady::WaitingOnRead), then we are waiting to read more data from postgres. + if self.state != State::Closing { + return Poll::Pending; + } } - match message { - Some(message) => Poll::Ready(Some(Ok(message))), - None => match self.poll_shutdown(cx) { - Poll::Ready(Ok(())) => Poll::Ready(None), - Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))), - Poll::Pending => Poll::Pending, - }, + + match self.poll_shutdown(cx) { + Poll::Ready(Ok(())) => Poll::Ready(None), + Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))), + Poll::Pending => Poll::Pending, } } } diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 901ed0c96c68..9155dd82792a 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -1,5 +1,5 @@ //! An asynchronous, pipelined, PostgreSQL client. 
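The reworked write path above no longer flips `State` from inside `poll_write`; instead it reports a `WriteReady` value and lets `poll_next` decide when to move to `State::Closing`. A stripped-down sketch of that decision, using stand-in types rather than the crate's real ones:

```rust
/// Simplified stand-in for the connection's write-side outcome.
enum WriteReady {
    Terminating,
    WaitingOnRead,
}

/// Decide what a write pass should report once the client side has hung up.
/// `pending_responses` mirrors `self.responses.len()` in the real code.
fn on_client_eof(pending_responses: usize) -> WriteReady {
    if pending_responses == 0 {
        // Nothing left to read back from postgres: send Terminate and close.
        WriteReady::Terminating
    } else {
        // Still waiting on postgres replies; keep the read half alive.
        WriteReady::WaitingOnRead
    }
}

fn main() {
    assert!(matches!(on_client_eof(0), WriteReady::Terminating));
    assert!(matches!(on_client_eof(3), WriteReady::WaitingOnRead));
}
```

In the diff, `poll_next` switches to `State::Closing` only when it sees `WriteReady::Terminating`, which keeps the read half polling while responses are still queued.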
-#![warn(rust_2018_idioms, clippy::all)] +#![warn(clippy::all)] pub use crate::cancel_token::CancelToken; pub use crate::client::{Client, SocketConfig}; diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs index 427f77dd79b2..7e12992728dd 100644 --- a/libs/proxy/tokio-postgres2/src/to_statement.rs +++ b/libs/proxy/tokio-postgres2/src/to_statement.rs @@ -11,7 +11,7 @@ mod private { Query(&'a str), } - impl<'a> ToStatementType<'a> { + impl ToStatementType<'_> { pub async fn into_statement(self, client: &Client) -> Result { match self { ToStatementType::Statement(s) => Ok(s.clone()), diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 1816825bda7a..33fa6e89f501 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -18,6 +18,7 @@ camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true hyper = { workspace = true, features = ["client"] } futures.workspace = true +reqwest.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 32c51bc2add5..c89f50ef2b24 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -8,6 +8,7 @@ use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; +use std::sync::Arc; use std::time::Duration; use std::time::SystemTime; @@ -15,6 +16,8 @@ use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Context; use anyhow::Result; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; +use azure_core::HttpClient; +use azure_core::TransportOptions; use azure_core::{Continuable, RetryOptions}; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; @@ -80,8 +83,13 @@ impl AzureBlobStorage { StorageCredentials::token_credential(token_credential) }; - // we have an outer retry - let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none()); + let builder = ClientBuilder::new(account, credentials) + // we have an outer retry + .retry(RetryOptions::none()) + // Customize transport to configure conneciton pooling + .transport(TransportOptions::new(Self::reqwest_client( + azure_config.conn_pool_size, + ))); let client = builder.container_client(azure_config.container_name.to_owned()); @@ -106,6 +114,14 @@ impl AzureBlobStorage { }) } + fn reqwest_client(conn_pool_size: usize) -> Arc { + let client = reqwest::ClientBuilder::new() + .pool_max_idle_per_host(conn_pool_size) + .build() + .expect("failed to build `reqwest` client"); + Arc::new(client) + } + pub fn relative_path_to_name(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); let path_string = path.get_path().as_str(); @@ -544,9 +560,9 @@ impl RemoteStorage for AzureBlobStorage { .await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Delete; diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index f6ef31077c76..dd49d4d5e710 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -114,6 +114,16 @@ fn default_max_keys_per_list_response() -> Option { DEFAULT_MAX_KEYS_PER_LIST_RESPONSE } +fn 
default_azure_conn_pool_size() -> usize { + // Conservative default: no connection pooling. At time of writing this is the Azure + // SDK's default as well, due to historic reports of hard-to-reproduce issues + // (https://github.com/hyperium/hyper/issues/2312) + // + // However, using connection pooling is important to avoid exhausting client ports when + // doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971) + 0 +} + impl Debug for S3Config { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("S3Config") @@ -146,6 +156,8 @@ pub struct AzureConfig { pub concurrency_limit: NonZeroUsize, #[serde(default = "default_max_keys_per_list_response")] pub max_keys_per_list_response: Option, + #[serde(default = "default_azure_conn_pool_size")] + pub conn_pool_size: usize, } fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { @@ -302,6 +314,7 @@ timeout = '5s'"; container_region = 'westeurope' upload_storage_class = 'INTELLIGENT_TIERING' timeout = '7s' + conn_pool_size = 8 "; let config = parse(toml).unwrap(); @@ -316,6 +329,7 @@ timeout = '5s'"; prefix_in_container: None, concurrency_limit: default_remote_storage_azure_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + conn_pool_size: 8, }), timeout: Duration::from_secs(7), small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 2a3468f98685..7a864151ecef 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -341,9 +341,9 @@ pub trait RemoteStorage: Send + Sync + 'static { /// If the operation fails because of timeout or cancellation, the root cause of the error will be /// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went /// through. 
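Because `conn_pool_size` is filled in by `default_azure_conn_pool_size`, configs that predate the field keep pooling disabled (0), while the test TOML above opts into a pool of 8 idle connections per host. A rough sketch of that serde behavior with a pared-down stand-in struct (field names taken from the diff; `toml`, `serde`, and `anyhow` assumed available):

```rust
use serde::Deserialize;

fn default_conn_pool_size() -> usize {
    0 // pooling disabled unless explicitly configured
}

/// Minimal stand-in for the Azure storage config, for illustration only.
#[derive(Deserialize)]
struct AzureishConfig {
    container_name: String,
    #[serde(default = "default_conn_pool_size")]
    conn_pool_size: usize,
}

fn main() -> anyhow::Result<()> {
    // Field omitted: falls back to 0 (no pooling).
    let old: AzureishConfig = toml::from_str(r#"container_name = "foo""#)?;
    assert_eq!(old.container_name, "foo");
    assert_eq!(old.conn_pool_size, 0);

    // Field set: idle connections are kept per host.
    let new: AzureishConfig = toml::from_str(
        r#"
        container_name = "foo"
        conn_pool_size = 8
        "#,
    )?;
    assert_eq!(new.conn_pool_size, 8);
    Ok(())
}
```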
- async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()>; diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1a2d421c6618..a8b00173ba51 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -562,9 +562,9 @@ impl RemoteStorage for LocalFs { } } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { for path in paths { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 2891f92d0796..d3f19f0b119a 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -813,9 +813,9 @@ impl RemoteStorage for S3Bucket { .await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Delete; diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 51833c1fe658..63c24beb516d 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -181,9 +181,9 @@ impl RemoteStorage for UnreliableWrapper { self.delete_inner(path, true, cancel).await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?; diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 92d579fec866..15004dbf83f8 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -218,6 +218,7 @@ async fn create_azure_client( prefix_in_container: Some(format!("test_{millis}_{random:08x}/")), concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + conn_pool_size: 8, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 14811232d33b..4234ec6779a2 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -5,6 +5,9 @@ edition.workspace = true license.workspace = true [dependencies] -serde.workspace = true const_format.workspace = true +serde.workspace = true +postgres_ffi.workspace = true +pq_proto.workspace = true +tokio.workspace = true utils.workspace = true diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs index 63c2c51188b8..be6923aca902 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -1,10 +1,27 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] use const_format::formatcp; +use pq_proto::SystemId; +use serde::{Deserialize, Serialize}; /// Public API types pub mod models; +/// Consensus logical timestamp. Note: it is a part of sk control file. +pub type Term = u64; +pub const INVALID_TERM: Term = 0; + +/// Information about Postgres. Safekeeper gets it once and then verifies all +/// further connections from computes match. Note: it is a part of sk control +/// file. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ServerInfo { + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + pub wal_seg_size: u32, +} + pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 28666d197afd..3e424a792c7f 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,10 +1,23 @@ +//! Types used in safekeeper http API. Many of them are also reused internally. + +use postgres_ffi::TimestampTz; use serde::{Deserialize, Serialize}; +use std::net::SocketAddr; +use tokio::time::Instant; use utils::{ - id::{NodeId, TenantId, TimelineId}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, + pageserver_feedback::PageserverFeedback, }; +use crate::{ServerInfo, Term}; + +#[derive(Debug, Serialize)] +pub struct SafekeeperStatus { + pub id: NodeId, +} + #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { pub tenant_id: TenantId, @@ -18,6 +31,161 @@ pub struct TimelineCreateRequest { pub local_start_lsn: Option, } +/// Same as TermLsn, but serializes LSN using display serializer +/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct TermSwitchApiEntry { + pub term: Term, + pub lsn: Lsn, +} + +/// Augment AcceptorState with last_log_term for convenience +#[derive(Debug, Serialize, Deserialize)] +pub struct AcceptorStateStatus { + pub term: Term, + pub epoch: Term, // aka last_log_term, old `epoch` name is left for compatibility + pub term_history: Vec, +} + +/// Things safekeeper should know about timeline state on peers. +/// Used as both model and internally. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PeerInfo { + pub sk_id: NodeId, + pub term: Term, + /// Term of the last entry. + pub last_log_term: Term, + /// LSN of the last record. + pub flush_lsn: Lsn, + pub commit_lsn: Lsn, + /// Since which LSN safekeeper has WAL. + pub local_start_lsn: Lsn, + /// When info was received. Serde annotations are not very useful but make + /// the code compile -- we don't rely on this field externally. + #[serde(skip)] + #[serde(default = "Instant::now")] + pub ts: Instant, + pub pg_connstr: String, + pub http_connstr: String, +} + +pub type FullTransactionId = u64; + +/// Hot standby feedback received from replica +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct HotStandbyFeedback { + pub ts: TimestampTz, + pub xmin: FullTransactionId, + pub catalog_xmin: FullTransactionId, +} + +pub const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0; + +impl HotStandbyFeedback { + pub fn empty() -> HotStandbyFeedback { + HotStandbyFeedback { + ts: 0, + xmin: 0, + catalog_xmin: 0, + } + } +} + +/// Standby status update +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct StandbyReply { + pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby. + pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby. + pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby. + pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01. 
+ pub reply_requested: bool, +} + +impl StandbyReply { + pub fn empty() -> Self { + StandbyReply { + write_lsn: Lsn::INVALID, + flush_lsn: Lsn::INVALID, + apply_lsn: Lsn::INVALID, + reply_ts: 0, + reply_requested: false, + } + } +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct StandbyFeedback { + pub reply: StandbyReply, + pub hs_feedback: HotStandbyFeedback, +} + +impl StandbyFeedback { + pub fn empty() -> Self { + StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: HotStandbyFeedback::empty(), + } + } +} + +/// Receiver is either pageserver or regular standby, which have different +/// feedbacks. +/// Used as both model and internally. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub enum ReplicationFeedback { + Pageserver(PageserverFeedback), + Standby(StandbyFeedback), +} + +/// Uniquely identifies a WAL service connection. Logged in spans for +/// observability. +pub type ConnectionId = u32; + +/// Serialize is used only for json'ing in API response. Also used internally. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalSenderState { + pub ttid: TenantTimelineId, + pub addr: SocketAddr, + pub conn_id: ConnectionId, + // postgres application_name + pub appname: Option, + pub feedback: ReplicationFeedback, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalReceiverState { + /// None means it is recovery initiated by us (this safekeeper). + pub conn_id: Option, + pub status: WalReceiverStatus, +} + +/// Walreceiver status. Currently only whether it passed voting stage and +/// started receiving the stream, but it is easy to add more if needed. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum WalReceiverStatus { + Voting, + Streaming, +} + +/// Info about timeline on safekeeper ready for reporting. +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineStatus { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub acceptor_state: AcceptorStateStatus, + pub pg_info: ServerInfo, + pub flush_lsn: Lsn, + pub timeline_start_lsn: Lsn, + pub local_start_lsn: Lsn, + pub commit_lsn: Lsn, + pub backup_lsn: Lsn, + pub peer_horizon_lsn: Lsn, + pub remote_consistent_lsn: Lsn, + pub peers: Vec, + pub walsenders: Vec, + pub walreceivers: Vec, +} + fn lsn_invalid() -> Lsn { Lsn::INVALID } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 66500fb141bc..02bf77760a8e 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -15,17 +15,20 @@ arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true anyhow.workspace = true +backtrace.workspace = true bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true diatomic-waker.workspace = true +flate2.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper0 = { workspace = true, features = ["full"] } +itertools.workspace = true fail.workspace = true -futures = { workspace = true} +futures = { workspace = true } jemalloc_pprof.workspace = true jsonwebtoken.workspace = true nix.workspace = true diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 870684b399c6..701ba2d42cb9 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -9,7 +9,7 @@ use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::*; -/// Declare a failpoint that can use the `pause` failpoint action. 
+/// Declare a failpoint that can use to `pause` failpoint action. /// We don't want to block the executor thread, hence, spawn_blocking + await. #[macro_export] macro_rules! pausable_failpoint { @@ -181,7 +181,7 @@ pub async fn failpoints_handler( ) -> Result, ApiError> { if !fail::has_failpoints() { return Err(ApiError::BadRequest(anyhow::anyhow!( - "Cannot manage failpoints because storage was compiled without failpoints support" + "Cannot manage failpoints because neon was compiled without failpoints support" ))); } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index d975b63677ac..9b37b699398e 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,15 +1,22 @@ use crate::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::http::error::{api_error_handler, route_error_handler, ApiError}; use crate::http::request::{get_query_param, parse_query_param}; +use crate::pprof; +use ::pprof::protos::Message as _; +use ::pprof::ProfilerGuardBuilder; use anyhow::{anyhow, Context}; +use bytes::{Bytes, BytesMut}; use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION}; use hyper::http::HeaderValue; use hyper::Method; use hyper::{header::CONTENT_TYPE, Body, Request, Response}; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; +use regex::Regex; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio::sync::{mpsc, Mutex}; +use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; @@ -18,11 +25,6 @@ use std::io::Write as _; use std::str::FromStr; use std::time::Duration; -use bytes::{Bytes, BytesMut}; -use pprof::protos::Message as _; -use tokio::sync::{mpsc, Mutex}; -use tokio_stream::wrappers::ReceiverStream; - static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { register_int_counter!( "libmetrics_metric_handler_requests_total", @@ -365,7 +367,7 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A // Take the profile. let report = tokio::task::spawn_blocking(move || { - let guard = pprof::ProfilerGuardBuilder::default() + let guard = ProfilerGuardBuilder::default() .frequency(frequency_hz) .blocklist(&["libc", "libgcc", "pthread", "vdso"]) .build()?; @@ -457,10 +459,34 @@ pub async fn profile_heap_handler(req: Request) -> Result, } Format::Pprof => { - let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(ApiError::InternalServerError)?; + let data = tokio::task::spawn_blocking(move || { + let bytes = prof_ctl.dump_pprof()?; + // Symbolize the profile. + // TODO: consider moving this upstream to jemalloc_pprof and avoiding the + // serialization roundtrip. + static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { + // Functions to strip from profiles. If true, also remove child frames. 
+ vec![ + (Regex::new("^__rust").unwrap(), false), + (Regex::new("^_start$").unwrap(), false), + (Regex::new("^irallocx_prof").unwrap(), true), + (Regex::new("^prof_alloc_prep").unwrap(), true), + (Regex::new("^std::rt::lang_start").unwrap(), false), + (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), + ] + }); + let profile = pprof::decode(&bytes)?; + let profile = pprof::symbolize(profile)?; + let profile = pprof::strip_locations( + profile, + &["libc", "libgcc", "pthread", "vdso"], + &STRIP_FUNCTIONS, + ); + pprof::encode(&profile) + }) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "application/octet-stream") diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index bccd0e048814..2c56dd750f75 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -96,6 +96,8 @@ pub mod circuit_breaker; pub mod try_rcu; +pub mod pprof; + // Re-export used in macro. Avoids adding git-version as dep in target crates. #[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/pprof.rs b/libs/utils/src/pprof.rs new file mode 100644 index 000000000000..90910897bf17 --- /dev/null +++ b/libs/utils/src/pprof.rs @@ -0,0 +1,190 @@ +use flate2::write::{GzDecoder, GzEncoder}; +use flate2::Compression; +use itertools::Itertools as _; +use once_cell::sync::Lazy; +use pprof::protos::{Function, Line, Message as _, Profile}; +use regex::Regex; + +use std::borrow::Cow; +use std::collections::{HashMap, HashSet}; +use std::ffi::c_void; +use std::io::Write as _; + +/// Decodes a gzip-compressed Protobuf-encoded pprof profile. +pub fn decode(bytes: &[u8]) -> anyhow::Result { + let mut gz = GzDecoder::new(Vec::new()); + gz.write_all(bytes)?; + Ok(Profile::parse_from_bytes(&gz.finish()?)?) +} + +/// Encodes a pprof profile as gzip-compressed Protobuf. +pub fn encode(profile: &Profile) -> anyhow::Result> { + let mut gz = GzEncoder::new(Vec::new(), Compression::default()); + profile.write_to_writer(&mut gz)?; + Ok(gz.finish()?) +} + +/// Symbolizes a pprof profile using the current binary. +pub fn symbolize(mut profile: Profile) -> anyhow::Result { + if !profile.function.is_empty() { + return Ok(profile); // already symbolized + } + + // Collect function names. + let mut functions: HashMap = HashMap::new(); + let mut strings: HashMap = profile + .string_table + .into_iter() + .enumerate() + .map(|(i, s)| (s, i as i64)) + .collect(); + + // Helper to look up or register a string. + let mut string_id = |s: &str| -> i64 { + // Don't use .entry() to avoid unnecessary allocations. + if let Some(id) = strings.get(s) { + return *id; + } + let id = strings.len() as i64; + strings.insert(s.to_string(), id); + id + }; + + for loc in &mut profile.location { + if !loc.line.is_empty() { + continue; + } + + // Resolve the line and function for each location. + backtrace::resolve(loc.address as *mut c_void, |symbol| { + let Some(symname) = symbol.name() else { + return; + }; + let mut name = symname.to_string(); + + // Strip the Rust monomorphization suffix from the symbol name. 
+ static SUFFIX_REGEX: Lazy = + Lazy::new(|| Regex::new("::h[0-9a-f]{16}$").expect("invalid regex")); + if let Some(m) = SUFFIX_REGEX.find(&name) { + name.truncate(m.start()); + } + + let function_id = match functions.get(&name) { + Some(function) => function.id, + None => { + let id = functions.len() as u64 + 1; + let system_name = String::from_utf8_lossy(symname.as_bytes()); + let filename = symbol + .filename() + .map(|path| path.to_string_lossy()) + .unwrap_or(Cow::Borrowed("")); + let function = Function { + id, + name: string_id(&name), + system_name: string_id(&system_name), + filename: string_id(&filename), + ..Default::default() + }; + functions.insert(name, function); + id + } + }; + loc.line.push(Line { + function_id, + line: symbol.lineno().unwrap_or(0) as i64, + ..Default::default() + }); + }); + } + + // Store the resolved functions, and mark the mapping as resolved. + profile.function = functions.into_values().sorted_by_key(|f| f.id).collect(); + profile.string_table = strings + .into_iter() + .sorted_by_key(|(_, i)| *i) + .map(|(s, _)| s) + .collect(); + + for mapping in &mut profile.mapping { + mapping.has_functions = true; + mapping.has_filenames = true; + } + + Ok(profile) +} + +/// Strips locations (stack frames) matching the given mappings (substring) or function names +/// (regex). The function bool specifies whether child frames should be stripped as well. +/// +/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all +/// string references. +pub fn strip_locations( + mut profile: Profile, + mappings: &[&str], + functions: &[(Regex, bool)], +) -> Profile { + // Strip mappings. + let mut strip_mappings: HashSet = HashSet::new(); + + profile.mapping.retain(|mapping| { + let Some(name) = profile.string_table.get(mapping.filename as usize) else { + return true; + }; + if mappings.iter().any(|substr| name.contains(substr)) { + strip_mappings.insert(mapping.id); + return false; + } + true + }); + + // Strip functions. + let mut strip_functions: HashMap = HashMap::new(); + + profile.function.retain(|function| { + let Some(name) = profile.string_table.get(function.name as usize) else { + return true; + }; + for (regex, strip_children) in functions { + if regex.is_match(name) { + strip_functions.insert(function.id, *strip_children); + return false; + } + } + true + }); + + // Strip locations. The bool specifies whether child frames should be stripped too. + let mut strip_locations: HashMap = HashMap::new(); + + profile.location.retain(|location| { + for line in &location.line { + if let Some(strip_children) = strip_functions.get(&line.function_id) { + strip_locations.insert(location.id, *strip_children); + return false; + } + } + if strip_mappings.contains(&location.mapping_id) { + strip_locations.insert(location.id, false); + return false; + } + true + }); + + // Strip sample locations. + for sample in &mut profile.sample { + // First, find the uppermost function with child removal and truncate the stack. + if let Some(truncate) = sample + .location_id + .iter() + .rposition(|id| strip_locations.get(id) == Some(&true)) + { + sample.location_id.drain(..=truncate); + } + // Next, strip any individual frames without child removal. 
+ sample + .location_id + .retain(|id| !strip_locations.contains_key(id)); + } + + profile +} diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 20f88868f91d..7779ffaf8b8d 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -272,7 +272,7 @@ struct CompactionJob { completed: bool, } -impl<'a, E> LevelCompactionState<'a, E> +impl LevelCompactionState<'_, E> where E: CompactionJobExecutor, { diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index 1853afffdd9d..e04bd153960f 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -224,9 +224,8 @@ impl Level { } // recalculate depth if this was the last event at this point - let more_events_at_this_key = events_iter - .peek() - .map_or(false, |next_e| next_e.key == e.key); + let more_events_at_this_key = + events_iter.peek().is_some_and(|next_e| next_e.key == e.key); if !more_events_at_this_key { let mut active_depth = 0; for (_end_lsn, is_image, _idx) in active_set.iter().rev() { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 5bc9b5ca1de9..8ed393a64586 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -148,7 +148,7 @@ pub trait CompactionDeltaLayer: CompactionLay Self: 'a; /// Return all keys in this delta layer. - fn load_keys<'a>( + fn load_keys( &self, ctx: &E::RequestContext, ) -> impl Future>>> + Send; diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 776c537d0308..673b80c313d9 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -143,7 +143,7 @@ impl interface::CompactionLayer for Arc { impl interface::CompactionDeltaLayer for Arc { type DeltaEntry<'a> = MockRecord; - async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result> { + async fn load_keys(&self, _ctx: &MockRequestContext) -> anyhow::Result> { Ok(self.records.clone()) } } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index cae0ffb9805b..e1b5676f464b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -248,7 +248,7 @@ where } } -impl<'a, W> Basebackup<'a, W> +impl Basebackup<'_, W> where W: AsyncWrite + Send + Sync + Unpin, { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 567a69da3b12..b92ff4ebf946 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,10 +53,12 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). -#[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +// TODO: disabled because concurrent CPU profiles cause seg faults. See: +// https://github.com/neondatabase/neon/issues/10225. 
+//#[allow(non_upper_case_globals)] +//#[export_name = "malloc_conf"] +//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; const PID_FILE_NAME: &str = "pageserver.pid"; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index db7d29385641..60ef4c3702f4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::DEFAULT_PG_VERSION; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, - TimelineInfo, + CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, + TimelineGcRequest, TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -2039,6 +2039,34 @@ async fn timeline_cancel_compact_handler( .await } +// Get compact info of a timeline +async fn timeline_compact_info_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + let res = tenant.get_scheduled_compaction_tasks(timeline_id); + let mut resp = Vec::new(); + for item in res { + resp.push(CompactInfoResponse { + compact_key_range: item.compact_key_range, + compact_lsn_range: item.compact_lsn_range, + sub_compaction: item.sub_compaction, + }); + } + json_response(StatusCode::OK, resp) + } + .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run compaction immediately on given timeline. async fn timeline_compact_handler( mut request: Request, @@ -3400,6 +3428,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", |r| api_handler(r, timeline_gc_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", + |r| api_handler(r, timeline_compact_info_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 255bd01e259e..14c7e0d2f86d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1242,7 +1242,7 @@ pub struct DatadirModification<'a> { pending_metadata_bytes: usize, } -impl<'a> DatadirModification<'a> { +impl DatadirModification<'_> { // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed. 
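The new `GET /v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact` route above exposes the scheduled compaction queue as a list of `CompactInfoResponse` objects. A hedged usage sketch for a test or ad-hoc tool; the port, placeholder IDs, and the `reqwest`/`serde_json`/`tokio` plumbing are illustrative assumptions, not part of this change:

```rust
use anyhow::Result;

/// Fetch the scheduled compaction jobs for one timeline from a pageserver's
/// HTTP API and print them. Assumes an unauthenticated local pageserver.
async fn print_scheduled_compactions(
    http_base: &str,      // e.g. "http://127.0.0.1:9898"
    tenant_shard_id: &str,
    timeline_id: &str,
) -> Result<()> {
    let url =
        format!("{http_base}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact");
    let body = reqwest::get(url).await?.text().await?;
    // Each element mirrors CompactInfoResponse: compact_key_range,
    // compact_lsn_range and sub_compaction.
    let jobs: Vec<serde_json::Value> = serde_json::from_str(&body)?;
    for job in &jobs {
        println!("scheduled compaction: {job}");
    }
    Ok(())
}

#[tokio::main]
async fn main() -> Result<()> {
    print_scheduled_compactions("http://127.0.0.1:9898", "<tenant_shard_id>", "<timeline_id>")
        .await
}
```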
@@ -1263,7 +1263,7 @@ impl<'a> DatadirModification<'a> { pub(crate) fn has_dirty_data(&self) -> bool { self.pending_data_batch .as_ref() - .map_or(false, |b| b.has_data()) + .is_some_and(|b| b.has_data()) } /// Set the current lsn @@ -1319,18 +1319,23 @@ impl<'a> DatadirModification<'a> { let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); let empty_dir = Value::Image(buf); - self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); - self.put( - slru_dir_to_key(SlruKind::MultiXactMembers), - empty_dir.clone(), - ); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); - self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + + // Initialize SLRUs on shard 0 only: creating these on other shards would be + // harmless but they'd just be dropped on later compaction. + if self.tline.tenant_shard_id.is_shard_zero() { + self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.put( + slru_dir_to_key(SlruKind::MultiXactMembers), + empty_dir.clone(), + ); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + } Ok(()) } @@ -2225,7 +2230,7 @@ impl<'a> DatadirModification<'a> { assert!(!self .pending_data_batch .as_ref() - .map_or(false, |b| b.updates_key(&key))); + .is_some_and(|b| b.updates_key(&key))); } } @@ -2294,7 +2299,7 @@ pub enum Version<'a> { Modified(&'a DatadirModification<'a>), } -impl<'a> Version<'a> { +impl Version<'_> { async fn get( &self, timeline: &Timeline, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 99289d5f15f7..2e4c47c6e40f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3122,6 +3122,23 @@ impl Tenant { } } + pub(crate) fn get_scheduled_compaction_tasks( + &self, + timeline_id: TimelineId, + ) -> Vec { + use itertools::Itertools; + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard + .get(&timeline_id) + .map(|tline_pending_tasks| { + tline_pending_tasks + .iter() + .map(|x| x.options.clone()) + .collect_vec() + }) + .unwrap_or_default() + } + /// Schedule a compaction task for a timeline. 
pub(crate) async fn schedule_compaction( &self, @@ -5759,13 +5776,13 @@ mod tests { use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; + #[cfg(feature = "testing")] + use models::CompactLsnRange; #[cfg(feature = "testing")] use pageserver_api::record::NeonWalRecord; #[cfg(feature = "testing")] use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; #[cfg(feature = "testing")] - use timeline::CompactLsnRange; - #[cfg(feature = "testing")] use timeline::GcInfo; static TEST_KEY: Lazy = @@ -9634,7 +9651,7 @@ mod tests { #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { - use timeline::CompactLsnRange; + use models::CompactLsnRange; let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; let (tenant, ctx) = harness.load().await; diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index dd70f6bbff8c..7b55df52a54a 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -35,7 +35,7 @@ pub struct CompressionInfo { pub compressed_size: Option, } -impl<'a> BlockCursor<'a> { +impl BlockCursor<'_> { /// Read a blob into a new buffer. pub async fn read_blob( &self, diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 2bd7f2d619aa..990211f80a92 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -89,7 +89,7 @@ pub(crate) enum BlockReaderRef<'a> { VirtualFile(&'a VirtualFile), } -impl<'a> BlockReaderRef<'a> { +impl BlockReaderRef<'_> { #[inline(always)] async fn read_blk( &self, diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index 1e8fa8d1d64e..f98356242e1c 100644 --- a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -1,12 +1,15 @@ use std::collections::BTreeSet; use itertools::Itertools; +use pageserver_compaction::helpers::overlaps_with; use super::storage_layer::LayerName; /// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). /// -/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, +/// The function implements a fast path check and a slow path check. +/// +/// The fast path checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, /// /// ```plain /// | | | | @@ -25,31 +28,47 @@ use super::storage_layer::LayerName; /// | | | 4 | | | /// /// If layer 2 and 4 contain the same single key, this is also a valid layer map. +/// +/// However, if a partial compaction is still going on, it is possible that we get a layer map not satisfying the above condition. +/// Therefore, we fallback to simply check if any of the two delta layers overlap. (See "A slow path...") pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) 
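The slow path described in the check_valid_layermap comment above only reports a violation when two delta layers overlap in both the key and the LSN dimension; otherwise a partially finished compaction is tolerated. A small sketch of the half-open range test this relies on (the real code uses overlaps_with from pageserver_compaction::helpers; the names below are illustrative):

use std::ops::Range;

// Two half-open ranges overlap iff each one starts before the other ends.
fn overlaps<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    a.start < b.end && b.start < a.end
}

// A pair of delta layers is only flagged when both dimensions overlap.
fn layers_conflict(
    a_key: &Range<u64>,
    a_lsn: &Range<u64>,
    b_key: &Range<u64>,
    b_lsn: &Range<u64>,
) -> bool {
    overlaps(a_key, b_key) && overlaps(a_lsn, b_lsn)
}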
let mut all_delta_layers = Vec::new(); for name in metadata { if let LayerName::Delta(layer) = name { - if layer.key_range.start.next() != layer.key_range.end { - all_delta_layers.push(layer.clone()); - } + all_delta_layers.push(layer.clone()); } } for layer in &all_delta_layers { - let lsn_range = &layer.lsn_range; - lsn_split_point.insert(lsn_range.start); - lsn_split_point.insert(lsn_range.end); + if layer.key_range.start.next() != layer.key_range.end { + let lsn_range = &layer.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } } - for layer in &all_delta_layers { + for (idx, layer) in all_delta_layers.iter().enumerate() { + if layer.key_range.start.next() == layer.key_range.end { + continue; + } let lsn_range = layer.lsn_range.clone(); let intersects = lsn_split_point.range(lsn_range).collect_vec(); if intersects.len() > 1 { - let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", - layer, - intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") - ); - return Some(err); + // A slow path to check if the layer intersects with any other delta layer. + for (other_idx, other_layer) in all_delta_layers.iter().enumerate() { + if other_idx == idx { + // do not check self intersects with self + continue; + } + if overlaps_with(&layer.lsn_range, &other_layer.lsn_range) + && overlaps_with(&layer.key_range, &other_layer.key_range) + { + let err = format!( + "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", + layer, other_layer + ); + return Some(err); + } + } } } None diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index b302cbc97559..c77342b144d5 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -532,7 +532,7 @@ pub struct DiskBtreeIterator<'a> { >, } -impl<'a> DiskBtreeIterator<'a> { +impl DiskBtreeIterator<'_> { pub async fn next(&mut self) -> Option, u64), DiskBtreeError>> { self.stream.next().await } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index aaec8a4c313a..ba79672bc79d 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -174,11 +174,11 @@ impl EphemeralFile { } impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, dst: tokio_epoll_uring::Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { let submitted_offset = self.buffered_writer.bytes_submitted(); diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 7f15baed10f4..1b6924425c25 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -392,8 +392,8 @@ impl LayerMap { image_layer: Option>, end_lsn: Lsn, ) -> Option { - assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta())); - assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta())); + assert!(delta_layer.as_ref().is_none_or(|l| l.is_delta())); + assert!(image_layer.as_ref().is_none_or(|l| !l.is_delta())); match (delta_layer, image_layer) { (None, None) => None, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 20e0536a00e5..fee11bc742bf 100644 --- 
a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -749,7 +749,7 @@ impl RemoteTimelineClient { // ahead of what's _actually_ on the remote during index upload. upload_queue.dirty.metadata = metadata.clone(); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Ok(()) } @@ -770,7 +770,7 @@ impl RemoteTimelineClient { upload_queue.dirty.metadata.apply(update); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Ok(()) } @@ -809,7 +809,7 @@ impl RemoteTimelineClient { if let Some(archived_at_set) = need_upload_scheduled { let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc()); upload_queue.dirty.archived_at = intended_archived_at; - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some(); @@ -824,7 +824,7 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; upload_queue.dirty.import_pgdata = state; - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Ok(()) } @@ -843,17 +843,14 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } Ok(()) } /// Launch an index-file upload operation in the background (internal function) - fn schedule_index_upload( - self: &Arc, - upload_queue: &mut UploadQueueInitialized, - ) -> Result<(), NotInitialized> { + fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); // fix up the duplicated field upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; @@ -880,7 +877,6 @@ impl RemoteTimelineClient { // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); - Ok(()) } /// Reparent this timeline to a new parent. @@ -909,7 +905,7 @@ impl RemoteTimelineClient { upload_queue.dirty.metadata.reparent(new_parent); upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } @@ -948,7 +944,7 @@ impl RemoteTimelineClient { assert!(prev.is_none(), "copied layer existed already {layer}"); } - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } @@ -1004,7 +1000,7 @@ impl RemoteTimelineClient { upload_queue.dirty.gc_blocking = current .map(|x| x.with_reason(reason)) .or_else(|| Some(index::GcBlocking::started_now_for(reason))); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } } @@ -1057,8 +1053,7 @@ impl RemoteTimelineClient { upload_queue.dirty.gc_blocking = current.as_ref().and_then(|x| x.without_reason(reason)); assert!(wanted(upload_queue.dirty.gc_blocking.as_ref())); - // FIXME: bogus ? 
- self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } } @@ -1125,8 +1120,8 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - let with_metadata = self - .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?; + let with_metadata = + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned()); self.schedule_deletion_of_unlinked0(upload_queue, with_metadata); @@ -1153,7 +1148,7 @@ impl RemoteTimelineClient { let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); @@ -1166,7 +1161,7 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Result, NotInitialized> + ) -> Vec<(LayerName, LayerFileMetadata)> where I: IntoIterator, { @@ -1208,10 +1203,10 @@ impl RemoteTimelineClient { // index_part update, because that needs to be uploaded before we can actually delete the // files. if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } - Ok(with_metadata) + with_metadata } /// Schedules deletion for layer files which have previously been unlinked from the @@ -1302,7 +1297,7 @@ impl RemoteTimelineClient { let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); Ok(()) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d15f161fb6da..b4d45dca7523 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -145,8 +145,8 @@ pub async fn download_layer_file<'a>( /// /// If Err() is returned, there was some error. The file at `dst_path` has been unlinked. /// The unlinking has _not_ been made durable. -async fn download_object<'a>( - storage: &'a GenericRemoteStorage, +async fn download_object( + storage: &GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate, diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 0cd5d05aa276..e434d24e5f9c 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -25,8 +25,8 @@ use utils::id::{TenantId, TimelineId}; use tracing::info; /// Serializes and uploads the given index part data to the remote storage. 
-pub(crate) async fn upload_index_part<'a>( - storage: &'a GenericRemoteStorage, +pub(crate) async fn upload_index_part( + storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, generation: Generation, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9e3a25cbbc53..b8206fca5a1c 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -345,10 +345,7 @@ impl LayerFringe { } pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { - let read_desc = match self.planned_visits_by_lsn.pop() { - Some(desc) => desc, - None => return None, - }; + let read_desc = self.planned_visits_by_lsn.pop()?; let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index fec8a0a16c50..ade1b794c65d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1486,7 +1486,7 @@ pub struct ValueRef<'a> { layer: &'a DeltaLayerInner, } -impl<'a> ValueRef<'a> { +impl ValueRef<'_> { /// Loads the value from disk pub async fn load(&self, ctx: &RequestContext) -> Result { let buf = self.load_raw(ctx).await?; @@ -1543,7 +1543,7 @@ pub struct DeltaLayerIterator<'a> { is_end: bool, } -impl<'a> DeltaLayerIterator<'a> { +impl DeltaLayerIterator<'_> { pub(crate) fn layer_dbg_info(&self) -> String { self.delta_layer.layer_dbg_info() } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 834d1931d00f..0d3c9d5a44ca 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1052,7 +1052,7 @@ pub struct ImageLayerIterator<'a> { is_end: bool, } -impl<'a> ImageLayerIterator<'a> { +impl ImageLayerIterator<'_> { pub(crate) fn layer_dbg_info(&self) -> String { self.image_layer.layer_dbg_info() } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index a4bb3a6bfc5d..1d86015fab1b 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -25,11 +25,11 @@ pub trait File: Send { /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`. /// /// No guarantees are made about the remaining bytes in `dst` in case of a short read. 
- async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, dst: Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)>; } @@ -479,11 +479,11 @@ mod tests { } impl File for InMemoryFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, mut dst: Slice, - _ctx: &'a RequestContext, + _ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); let nread = { @@ -609,12 +609,12 @@ mod tests { } } - impl<'x> File for RecorderFile<'x> { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + impl File for RecorderFile<'_> { + async fn read_exact_at_eof_ok( + &self, start: u64, dst: Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?; self.recorded.borrow_mut().push(RecordedRead { @@ -740,11 +740,11 @@ mod tests { } impl File for MockFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, mut dst: Slice, - _ctx: &'a RequestContext, + _ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let ExpectedRead { expect_pos, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b5c707922668..b36c2f487f59 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -31,9 +31,9 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ - CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, - DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, - LsnLease, TimelineState, + CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, + InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -792,63 +792,6 @@ pub(crate) struct CompactRequest { pub sub_compaction_max_job_size_mb: Option, } -#[serde_with::serde_as] -#[derive(Debug, Clone, serde::Deserialize)] -pub(crate) struct CompactLsnRange { - pub start: Lsn, - pub end: Lsn, -} - -#[serde_with::serde_as] -#[derive(Debug, Clone, serde::Deserialize)] -pub(crate) struct CompactKeyRange { - #[serde_as(as = "serde_with::DisplayFromStr")] - pub start: Key, - #[serde_as(as = "serde_with::DisplayFromStr")] - pub end: Key, -} - -impl From> for CompactLsnRange { - fn from(range: Range) -> Self { - Self { - start: range.start, - end: range.end, - } - } -} - -impl From> for CompactKeyRange { - fn from(range: Range) -> Self { - Self { - start: range.start, - end: range.end, - } - } -} - -impl From for Range { - fn from(range: CompactLsnRange) -> Self { - range.start..range.end - } -} - -impl From for Range { - fn from(range: CompactKeyRange) -> Self { - range.start..range.end - } -} - -impl CompactLsnRange { - #[cfg(test)] - #[cfg(feature = "testing")] - pub fn above(lsn: Lsn) -> Self { - Self { - start: lsn, - end: Lsn::MAX, - } - } -} - #[derive(Debug, Clone, Default)] pub(crate) struct CompactOptions { pub flags: EnumSet, @@ -4064,8 +4007,11 @@ impl Timeline { // NB: there are two callers, one is the compaction task, of which 
there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. + // Note that there are a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for + // gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own + // heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction. return Err(CompactionError::Other(anyhow!( - "repartition() called concurrently, this should not happen" + "repartition() called concurrently, this is rare and a retry should be fine" ))); }; let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; @@ -5861,7 +5807,7 @@ enum OpenLayerAction { None, } -impl<'a> TimelineWriter<'a> { +impl TimelineWriter<'_> { async fn handle_open_layer_action( &mut self, at: Lsn, diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 701247194ba4..94c65631b206 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::statvfs::Statvfs; +use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -1110,7 +1111,7 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + let same_key = prev_key == Some(key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { let mut next_key_size = 0u64; @@ -1821,10 +1822,12 @@ impl Timeline { let mut compact_jobs = Vec::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. - let Ok(partition) = self.partitioning.try_lock() else { - bail!("failed to acquire partition lock"); + let ((dense_ks, sparse_ks), _) = { + let Ok(partition) = self.partitioning.try_lock() else { + bail!("failed to acquire partition lock during gc-compaction"); + }; + partition.clone() }; - let ((dense_ks, sparse_ks), _) = &*partition; // Truncate the key range to be within user specified compaction range. fn truncate_to( source_start: &Key, @@ -2154,15 +2157,14 @@ impl Timeline { // Step 1: construct a k-merge iterator over all layers. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - // disable the check for now because we need to adjust the check for partial compactions, will enable later. 
- // let layer_names = job_desc - // .selected_layers - // .iter() - // .map(|layer| layer.layer_desc().layer_name()) - // .collect_vec(); - // if let Some(err) = check_valid_layermap(&layer_names) { - // warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); - // } + let layer_names = job_desc + .selected_layers + .iter() + .map(|layer| layer.layer_desc().layer_name()) + .collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { + bail!("gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err); + } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc .selected_layers @@ -2544,13 +2546,48 @@ impl Timeline { ); // Step 3: Place back to the layer map. + + // First, do a sanity check to ensure the newly-created layer map does not contain overlaps. + let all_layers = { + let guard = self.layers.read().await; + let layer_map = guard.layer_map()?; + layer_map.iter_historic_layers().collect_vec() + }; + + let mut final_layers = all_layers + .iter() + .map(|layer| layer.layer_name()) + .collect::>(); + for layer in &layer_selection { + final_layers.remove(&layer.layer_desc().layer_name()); + } + for layer in &compact_to { + final_layers.insert(layer.layer_desc().layer_name()); + } + let final_layers = final_layers.into_iter().collect_vec(); + + // TODO: move this check before we call `finish` on image layer writers. However, this will require us to get the layer name before we finish + // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are + // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. + if let Some(err) = check_valid_layermap(&final_layers) { + bail!("gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err); + } + + // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only + // operate on L1 layers. { - // TODO: sanity check if the layer map is valid (i.e., should not have overlaps) let mut guard = self.layers.write().await; guard .open_mut()? .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; + + // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json. + // Otherwise, after restart, the index_part only contains the old `latest_gc_cutoff` and + // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should + // be batched into `schedule_compaction_update`. 
+ let disk_consistent_lsn = self.disk_consistent_lsn.load(); + self.schedule_uploads(disk_consistent_lsn, None)?; self.remote_client .schedule_compaction_update(&layer_selection, &compact_to)?; @@ -2902,7 +2939,7 @@ impl CompactionLayer for ResidentDeltaLayer { impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; - async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result>> { + async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result>> { self.0.get_as_delta(ctx).await?.index_entries(ctx).await } } diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index b47b22cd20dc..59096a1bc8c0 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -428,6 +428,8 @@ MergeTable() hash_seq_init(&status, old_table->role_table); while ((entry = hash_seq_search(&status)) != NULL) { + RoleEntry * old; + bool found_old = false; RoleEntry *to_write = hash_search( CurrentDdlTable->role_table, entry->name, @@ -435,30 +437,23 @@ MergeTable() NULL); to_write->type = entry->type; - if (entry->password) - to_write->password = entry->password; + to_write->password = entry->password; strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); - if (entry->old_name[0] != '\0') - { - bool found_old = false; - RoleEntry *old = hash_search( - CurrentDdlTable->role_table, - entry->old_name, - HASH_FIND, - &found_old); - - if (found_old) - { - if (old->old_name[0] != '\0') - strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); - else - strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); - hash_search(CurrentDdlTable->role_table, - entry->old_name, - HASH_REMOVE, - NULL); - } - } + if (entry->old_name[0] == '\0') + continue; + + old = hash_search( + CurrentDdlTable->role_table, + entry->old_name, + HASH_FIND, + &found_old); + if (!found_old) + continue; + strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); + hash_search(CurrentDdlTable->role_table, + entry->old_name, + HASH_REMOVE, + NULL); } hash_destroy(old_table->role_table); } diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 70b250d3945d..ad5667cbab1c 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -365,6 +365,10 @@ lfc_change_limit_hook(int newval, void *extra) neon_log(LOG, "Failed to punch hole in file: %m"); #endif /* We remove the old entry, and re-enter a hole to the hash table */ + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; + } hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); memset(&holetag, 0, sizeof(holetag)); @@ -537,6 +541,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } else { + LWLockRelease(lfc_lock); return found; } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 6513ba4dd67c..88d0a5292bf7 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -827,7 +827,6 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) { while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? 
LOG : ERROR)) { - HandleMainLoopInterrupts(); shard->n_reconnect_attempts += 1; } shard->n_reconnect_attempts = 0; diff --git a/pgxn/neon/logical_replication_monitor.c b/pgxn/neon/logical_replication_monitor.c index 5eee5a167911..b94faafdfae9 100644 --- a/pgxn/neon/logical_replication_monitor.c +++ b/pgxn/neon/logical_replication_monitor.c @@ -131,8 +131,8 @@ get_snapshots_cutoff_lsn(void) { cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn; elog(LOG, - "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d", - LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files); + "ls_monitor: number of snapshot files, %zu, is larger than limit of %d", + snapshot_index, logical_replication_max_snap_files); } /* Is the size of the logical snapshots directory larger than specified? @@ -162,8 +162,8 @@ get_snapshots_cutoff_lsn(void) } if (cutoff != original) - elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB", - LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size); + elog(LOG, "ls_monitor: " SNAPDIR " is larger than %d KB", + logical_replication_max_logicalsnapdir_size); } pfree(snapshot_descriptors); @@ -214,9 +214,13 @@ InitLogicalReplicationMonitor(void) } /* - * Unused logical replication slots pins WAL and prevents deletion of snapshots. + * Unused logical replication slots pins WAL and prevent deletion of snapshots. * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which - * need too many .snap files. + * need too many .snap files. These files are stored as AUX files, which are a + * pageserver mechanism for storing non-relation data. AUX files are shipped in + * in the basebackup which is requested by compute_ctl before Postgres starts. + * The larger the time to retrieve the basebackup, the more likely it is the + * compute will be killed by the control plane due to a timeout. */ void LogicalSlotsMonitorMain(Datum main_arg) @@ -239,10 +243,7 @@ LogicalSlotsMonitorMain(Datum main_arg) ProcessConfigFile(PGC_SIGHUP); } - /* - * If there are too many .snap files, just drop all logical slots to - * prevent aux files bloat. - */ + /* Get the cutoff LSN */ cutoff_lsn = get_snapshots_cutoff_lsn(); if (cutoff_lsn > 0) { @@ -252,31 +253,37 @@ LogicalSlotsMonitorMain(Datum main_arg) ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; XLogRecPtr restart_lsn; - /* find the name */ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); - /* Consider only logical repliction slots */ + + /* Consider only active logical repliction slots */ if (!s->in_use || !SlotIsLogical(s)) { LWLockRelease(ReplicationSlotControlLock); continue; } - /* do we need to drop it? 
*/ + /* + * Retrieve the restart LSN to determine if we need to drop the + * slot + */ SpinLockAcquire(&s->mutex); restart_lsn = s->data.restart_lsn; SpinLockRelease(&s->mutex); + + strlcpy(slot_name, s->data.name.data, sizeof(slot_name)); + LWLockRelease(ReplicationSlotControlLock); + if (restart_lsn >= cutoff_lsn) { - LWLockRelease(ReplicationSlotControlLock); + elog(LOG, "ls_monitor: not dropping replication slot %s because restart LSN %X/%X is greater than cutoff LSN %X/%X", + slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); continue; } - strlcpy(slot_name, s->data.name.data, NAMEDATALEN); - elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X", + elog(LOG, "ls_monitor: dropping replication slot %s because restart LSN %X/%X lower than cutoff LSN %X/%X", slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); - LWLockRelease(ReplicationSlotControlLock); - /* now try to drop it, killing owner before if any */ + /* now try to drop it, killing owner before, if any */ for (;;) { pid_t active_pid; @@ -288,9 +295,9 @@ LogicalSlotsMonitorMain(Datum main_arg) if (active_pid == 0) { /* - * Slot is releasted, try to drop it. Though of course + * Slot is released, try to drop it. Though of course, * it could have been reacquired, so drop can ERROR - * out. Similarly it could have been dropped in the + * out. Similarly, it could have been dropped in the * meanwhile. * * In principle we could remove pg_try/pg_catch, that @@ -300,14 +307,14 @@ LogicalSlotsMonitorMain(Datum main_arg) PG_TRY(); { ReplicationSlotDrop(slot_name, true); - elog(LOG, "ls_monitor: slot %s dropped", slot_name); + elog(LOG, "ls_monitor: replication slot %s dropped", slot_name); } PG_CATCH(); { /* log ERROR and reset elog stack */ EmitErrorReport(); FlushErrorState(); - elog(LOG, "ls_monitor: failed to drop slot %s", slot_name); + elog(LOG, "ls_monitor: failed to drop replication slot %s", slot_name); } PG_END_TRY(); break; @@ -315,7 +322,7 @@ LogicalSlotsMonitorMain(Datum main_arg) else { /* kill the owner and wait for release */ - elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid); + elog(LOG, "ls_monitor: killing replication slot %s owner %d", slot_name, active_pid); (void) kill(active_pid, SIGTERM); /* We shouldn't get stuck, but to be safe add timeout. 
*/ ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP); diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 575d60be8559..c3de77b35278 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -187,7 +187,6 @@ async fn authenticate( NodeInfo { config, aux: db_info.aux, - allow_self_signed_compute: false, // caller may override }, db_info.allowed_ips, )) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index a258090b1582..df716f8455f0 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -776,6 +776,7 @@ impl From<&jose_jwk::Key> for KeyType { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::future::IntoFuture; use std::net::SocketAddr; diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index d4273fb52167..d10f0e82b283 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -37,7 +37,6 @@ impl LocalBackend { branch_id: BranchIdTag::get_interner().get_or_intern("local"), cold_start_info: ColdStartInfo::WarmCached, }, - allow_self_signed_compute: false, }, } } diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index f38ecf715f7d..0c9a7f7825de 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -463,6 +463,8 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { #[cfg(test)] mod tests { + #![allow(clippy::unimplemented, clippy::unwrap_used)] + use std::net::IpAddr; use std::sync::Arc; use std::time::Duration; @@ -676,6 +678,9 @@ mod tests { .await .unwrap(); + // flush the final server message + stream.flush().await.unwrap(); + handle.await.unwrap(); } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index f6bce9f2d8aa..eff49a402aaa 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -250,6 +250,7 @@ fn project_name_valid(name: &str) -> bool { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use serde_json::json; use ComputeUserInfoParseError::*; diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 60d1962d7f78..0992c6d8754b 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -10,7 +10,6 @@ use tracing::info; use super::backend::ComputeCredentialKeys; use super::{AuthError, PasswordHackPayload}; -use crate::config::TlsServerEndPoint; use crate::context::RequestContext; use crate::control_plane::AuthSecret; use crate::intern::EndpointIdInt; @@ -18,6 +17,7 @@ use crate::sasl; use crate::scram::threadpool::ThreadPool; use crate::scram::{self}; use crate::stream::{PqStream, Stream}; +use crate::tls::TlsServerEndPoint; /// Every authentication selector is supposed to implement this trait. 
pub(crate) trait AuthMethod { diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 968682cf0f75..644f670f8899 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -13,7 +13,9 @@ use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; use proxy::auth::{self}; use proxy::cancellation::CancellationHandlerMain; -use proxy::config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}; +use proxy::config::{ + self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, +}; use proxy::control_plane::locks::ApiLocks; use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; use proxy::http::health_server::AppMetrics; @@ -25,6 +27,7 @@ use proxy::rate_limiter::{ use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::{self, GlobalConnPoolOptions}; +use proxy::tls::client_config::compute_client_config_with_root_certs; use proxy::types::RoleName; use proxy::url::ApiUrl; @@ -209,6 +212,7 @@ async fn main() -> anyhow::Result<()> { http_listener, shutdown.clone(), Arc::new(CancellationHandlerMain::new( + &config.connect_to_compute, Arc::new(DashMap::new()), None, proxy::metrics::CancellationSource::Local, @@ -268,10 +272,15 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, }; + let compute_config = ComputeConfig { + retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + Ok(Box::leak(Box::new(ProxyConfig { tls_config: None, metric_collection: None, - allow_self_signed_compute: false, http_config, authentication_config: AuthenticationConfig { jwks_cache: JwkCache::default(), @@ -290,9 +299,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig region: "local".into(), wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, connect_compute_locks, - connect_to_compute_retry_config: RetryConfig::parse( - RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES, - )?, + connect_to_compute: compute_config, }))) } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 623a0fd3b2c9..97d870a83ac9 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -10,12 +10,12 @@ use clap::Arg; use futures::future::Either; use futures::TryFutureExt; use itertools::Itertools; -use proxy::config::TlsServerEndPoint; use proxy::context::RequestContext; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::protocol2::ConnectionInfo; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use proxy::stream::{PqStream, Stream}; +use proxy::tls::TlsServerEndPoint; use rustls::crypto::ring; use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; @@ -229,7 +229,7 @@ async fn ssl_handshake( let (raw, read_buf) = stream.into_inner(); // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. + // server says TLS handshake is ok and read_buf is empty. // However, you could imagine pipelining of postgres // SSLRequest + TLS ClientHello in one hunk similar to // pipelining in our node js driver. 
We should probably diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 97c4037009da..3b122d771cb1 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,6 +1,7 @@ use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; +use std::time::Duration; use anyhow::bail; use futures::future::Either; @@ -8,7 +9,7 @@ use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; use proxy::cancellation::{CancelMap, CancellationHandler}; use proxy::config::{ - self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig, + self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, }; use proxy::context::parquet::ParquetUploadArgs; @@ -23,6 +24,7 @@ use proxy::redis::{elasticache, notifications}; use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; +use proxy::tls::client_config::compute_client_config_with_root_certs; use proxy::{auth, control_plane, http, serverless, usage_metrics}; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; @@ -105,6 +107,9 @@ struct ProxyCliArgs { /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'c', long, alias = "ssl-cert")] tls_cert: Option, + /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. + #[clap(long, alias = "allow-ssl-keylogfile")] + allow_tls_keylogfile: bool, /// path to directory with TLS certificates for client postgres connections #[clap(long)] certs_dir: Option, @@ -126,9 +131,6 @@ struct ProxyCliArgs { /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). 
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] connect_compute_lock: String, - /// Allow self-signed certificates for compute nodes (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - allow_self_signed_compute: bool, #[clap(flatten)] sql_over_http: SqlOverHttpArgs, /// timeout for scram authentication protocol @@ -397,6 +399,7 @@ async fn main() -> anyhow::Result<()> { let cancellation_handler = Arc::new(CancellationHandler::< Option>>, >::new( + &config.connect_to_compute, cancel_map.clone(), redis_publisher, proxy::metrics::CancellationSource::FromClient, @@ -492,6 +495,7 @@ async fn main() -> anyhow::Result<()> { let cache = api.caches.project_info.clone(); if let Some(client) = client1 { maintenance_tasks.spawn(notifications::task_main( + config, client, cache.clone(), cancel_map.clone(), @@ -500,6 +504,7 @@ async fn main() -> anyhow::Result<()> { } if let Some(client) = client2 { maintenance_tasks.spawn(notifications::task_main( + config, client, cache.clone(), cancel_map.clone(), @@ -555,14 +560,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { key_path, cert_path, args.certs_dir.as_ref(), + args.allow_tls_keylogfile, )?), (None, None) => None, _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; - if args.allow_self_signed_compute { - warn!("allowing self-signed compute certificates"); - } let backup_metric_collection_config = config::MetricBackupCollectionConfig { interval: args.metric_backup_collection_interval, remote_storage_config: args.metric_backup_collection_remote_storage.clone(), @@ -634,10 +637,15 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, }; + let compute_config = ComputeConfig { + retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + let config = ProxyConfig { tls_config, metric_collection, - allow_self_signed_compute: args.allow_self_signed_compute, http_config, authentication_config, proxy_protocol_v2: args.proxy_protocol_v2, @@ -645,9 +653,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { region: args.region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, - connect_to_compute_retry_config: config::RetryConfig::parse( - &args.connect_to_compute_retry, - )?, + connect_to_compute: compute_config, }; let config = Box::leak(Box::new(config)); diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 20db1fbb147a..0136446d6dfb 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -12,6 +12,7 @@ use tracing::info; use crate::config::EndpointCacheConfig; use crate::context::RequestContext; +use crate::ext::LockExt; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; use crate::rate_limiter::GlobalRateLimiter; @@ -96,7 +97,7 @@ impl EndpointsCache { // If the limiter allows, we can pretend like it's valid // (incase it is, due to redis channel lag). 
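Several call sites below switch from lock().unwrap() to lock_propagate_poison(), which comes from the crate::ext::LockExt import added above. A hedged sketch of what such an extension trait might look like (the actual implementation in the ext module may differ; LockExtSketch is an illustrative name):

use std::sync::{Mutex, MutexGuard};

// Centralize the "a poisoned mutex means another thread already panicked,
// so panic here too" decision instead of unwrap()ing at every call site.
trait LockExtSketch<T> {
    fn lock_propagate_poison(&self) -> MutexGuard<'_, T>;
}

impl<T> LockExtSketch<T> for Mutex<T> {
    fn lock_propagate_poison(&self) -> MutexGuard<'_, T> {
        match self.lock() {
            Ok(guard) => guard,
            // Re-raise the failure rather than forcing every caller to handle it.
            Err(poisoned) => panic!("mutex poisoned: {poisoned}"),
        }
    }
}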
- if self.limiter.lock().unwrap().check() { + if self.limiter.lock_propagate_poison().check() { return true; } @@ -258,6 +259,7 @@ impl EndpointsCache { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 84430dc812e9..cab0b8b90594 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -365,6 +365,7 @@ impl Cache for ProjectInfoCacheImpl { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; use crate::scram::ServerSecret; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index ed717507ee40..df618cf24257 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -3,7 +3,8 @@ use std::sync::Arc; use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; -use postgres_client::{CancelToken, NoTls}; +use postgres_client::tls::MakeTlsConnect; +use postgres_client::CancelToken; use pq_proto::CancelKeyData; use thiserror::Error; use tokio::net::TcpStream; @@ -12,12 +13,15 @@ use tracing::{debug, info}; use uuid::Uuid; use crate::auth::{check_peer_addr_is_in_list, IpPattern}; +use crate::config::ComputeConfig; use crate::error::ReportableError; +use crate::ext::LockExt; use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; +use crate::tls::postgres_rustls::MakeRustlsConnect; pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; @@ -29,6 +33,7 @@ type IpSubnetKey = IpNet; /// /// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. pub struct CancellationHandler
<P>
{ + compute_config: &'static ComputeConfig, map: CancelMap, client: P, /// This field used for the monitoring purposes. @@ -114,7 +119,7 @@ impl CancellationHandler
<P>
{ IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), }; - if !self.limiter.lock().unwrap().check(subnet_key, 1) { + if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { // log only the subnet part of the IP address to know which subnet is rate limited tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); Metrics::get() @@ -173,8 +178,11 @@ impl CancellationHandler
<P>
{ source: self.from, kind: crate::metrics::CancellationOutcome::Found, }); - info!("cancelling query per user's request using key {key}"); - cancel_closure.try_cancel_query().await + info!( + "cancelling query per user's request using key {key}, hostname {}, address: {}", + cancel_closure.hostname, cancel_closure.socket_addr + ); + cancel_closure.try_cancel_query(self.compute_config).await } #[cfg(test)] @@ -189,8 +197,13 @@ impl CancellationHandler
<P>
{ } impl CancellationHandler<()> { - pub fn new(map: CancelMap, from: CancellationSource) -> Self { + pub fn new( + compute_config: &'static ComputeConfig, + map: CancelMap, + from: CancellationSource, + ) -> Self { Self { + compute_config, map, client: (), from, @@ -205,8 +218,14 @@ impl CancellationHandler<()> { } impl CancellationHandler>>> { - pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { + pub fn new( + compute_config: &'static ComputeConfig, + map: CancelMap, + client: Option>>, + from: CancellationSource, + ) -> Self { Self { + compute_config, map, client, from, @@ -228,6 +247,7 @@ pub struct CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, ip_allowlist: Vec, + hostname: String, // for pg_sni router } impl CancelClosure { @@ -235,17 +255,36 @@ impl CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, ip_allowlist: Vec, + hostname: String, ) -> Self { Self { socket_addr, cancel_token, ip_allowlist, + hostname, } } /// Cancels the query running on user's compute node. - pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { + pub(crate) async fn try_cancel_query( + self, + compute_config: &ComputeConfig, + ) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; - self.cancel_token.cancel_query_raw(socket, NoTls).await?; + + let mut mk_tls = + crate::tls::postgres_rustls::MakeRustlsConnect::new(compute_config.tls.clone()); + let tls = >::make_tls_connect( + &mut mk_tls, + &self.hostname, + ) + .map_err(|e| { + CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + )) + })?; + + self.cancel_token.cancel_query_raw(socket, tls).await?; debug!("query was cancelled"); Ok(()) } @@ -283,12 +322,32 @@ impl
<P>
Drop for Session
<P>
{ } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { + use std::time::Duration; + use super::*; + use crate::config::RetryConfig; + use crate::tls::client_config::compute_client_config_with_certs; + + fn config() -> ComputeConfig { + let retry = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + + ComputeConfig { + retry, + tls: Arc::new(compute_client_config_with_certs(std::iter::empty())), + timeout: Duration::from_secs(2), + } + } #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + Box::leak(Box::new(config())), CancelMap::default(), CancellationSource::FromRedis, )); @@ -304,8 +363,11 @@ mod tests { #[tokio::test] async fn cancel_session_noop_regression() { - let handler = - CancellationHandler::<()>::new(CancelMap::default(), CancellationSource::Local); + let handler = CancellationHandler::<()>::new( + Box::leak(Box::new(config())), + CancelMap::default(), + CancellationSource::Local, + ); handler .cancel_session( CancelKeyData { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 4113b5bb80e3..89de6692ad6b 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,17 +1,13 @@ use std::io; use std::net::SocketAddr; -use std::sync::Arc; use std::time::Duration; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use once_cell::sync::OnceCell; use postgres_client::tls::MakeTlsConnect; use postgres_client::{CancelToken, RawConnection}; use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; -use rustls::client::danger::ServerCertVerifier; -use rustls::crypto::ring; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; @@ -19,14 +15,15 @@ use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; use crate::cancellation::CancelClosure; +use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::MetricsAuxInfo; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; -use crate::postgres_rustls::MakeRustlsConnect; use crate::proxy::neon_option; +use crate::tls::postgres_rustls::MakeRustlsConnect; use crate::types::Host; pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -41,9 +38,6 @@ pub(crate) enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), - #[error("Couldn't load native TLS certificates: {0:?}")] - TlsCertificateError(Vec), - #[error("{COULD_NOT_CONNECT}: {0}")] TlsError(#[from] InvalidDnsNameError), @@ -90,7 +84,6 @@ impl ReportableError for ConnectionError { } ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, - ConnectionError::TlsCertificateError(_) => crate::error::ErrorKind::Service, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), @@ -200,11 +193,15 @@ impl ConnCfg { let connect_once = |host, port| { debug!("trying to connect to compute node at {host}:{port}"); - connect_with_timeout(host, port).and_then(|socket| async { - let socket_addr = socket.peer_addr()?; + connect_with_timeout(host, 
port).and_then(|stream| async { + let socket_addr = stream.peer_addr()?; + let socket = socket2::SockRef::from(&stream); + // Disable Nagle's algorithm to not introduce latency between + // client and compute. + socket.set_nodelay(true)?; // This prevents load balancer from severing the connection. - socket2::SockRef::from(&socket).set_keepalive(true)?; - Ok((socket_addr, socket)) + socket.set_keepalive(true)?; + Ok((socket_addr, stream)) }) }; @@ -251,35 +248,14 @@ impl ConnCfg { pub(crate) async fn connect( &self, ctx: &RequestContext, - allow_self_signed_compute: bool, aux: MetricsAuxInfo, - timeout: Duration, + config: &ComputeConfig, ) -> Result { let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (socket_addr, stream, host) = self.connect_raw(timeout).await?; + let (socket_addr, stream, host) = self.connect_raw(config.timeout).await?; drop(pause); - let client_config = if allow_self_signed_compute { - // Allow all certificates for creating the connection - let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") - .dangerous() - .with_custom_certificate_verifier(verifier) - } else { - let root_store = TLS_ROOTS - .get_or_try_init(load_certs) - .map_err(ConnectionError::TlsCertificateError)? - .clone(); - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") - .with_root_certificates(root_store) - }; - let client_config = client_config.with_no_client_auth(); - - let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); + let mut mk_tls = crate::tls::postgres_rustls::MakeRustlsConnect::new(config.tls.clone()); let tls = >::make_tls_connect( &mut mk_tls, host, @@ -319,6 +295,7 @@ impl ConnCfg { secret_key, }, vec![], + host.to_string(), ); let connection = PostgresConnection { @@ -350,63 +327,6 @@ fn filtered_options(options: &str) -> Option { Some(options) } -fn load_certs() -> Result, Vec> { - let der_certs = rustls_native_certs::load_native_certs(); - - if !der_certs.errors.is_empty() { - return Err(der_certs.errors); - } - - let mut store = rustls::RootCertStore::empty(); - store.add_parsable_certificates(der_certs.certs); - Ok(Arc::new(store)) -} -static TLS_ROOTS: OnceCell> = OnceCell::new(); - -#[derive(Debug)] -struct AcceptEverythingVerifier; -impl ServerCertVerifier for AcceptEverythingVerifier { - fn supported_verify_schemes(&self) -> Vec { - use rustls::SignatureScheme; - // The schemes for which `SignatureScheme::supported_in_tls13` returns true. 
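Returning to the connect_once change in compute.rs above: it sets two socket options on the freshly connected compute stream. A self-contained sketch of the same tuning, assuming the socket2 crate as used in the diff (tune_compute_socket is an illustrative name):

use tokio::net::TcpStream;

// Disable Nagle's algorithm (lower latency for small request/response
// round-trips) and enable keepalive (so idle connections are not severed
// by load balancers), mirroring the calls in connect_once.
fn tune_compute_socket(stream: &TcpStream) -> std::io::Result<()> {
    let sock = socket2::SockRef::from(stream);
    sock.set_nodelay(true)?;
    sock.set_keepalive(true)?;
    Ok(())
}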
- vec![ - SignatureScheme::ECDSA_NISTP521_SHA512, - SignatureScheme::ECDSA_NISTP384_SHA384, - SignatureScheme::ECDSA_NISTP256_SHA256, - SignatureScheme::RSA_PSS_SHA512, - SignatureScheme::RSA_PSS_SHA384, - SignatureScheme::RSA_PSS_SHA256, - SignatureScheme::ED25519, - ] - } - fn verify_server_cert( - &self, - _end_entity: &rustls::pki_types::CertificateDer<'_>, - _intermediates: &[rustls::pki_types::CertificateDer<'_>], - _server_name: &rustls::pki_types::ServerName<'_>, - _ocsp_response: &[u8], - _now: rustls::pki_types::UnixTime, - ) -> Result { - Ok(rustls::client::danger::ServerCertVerified::assertion()) - } - fn verify_tls12_signature( - &self, - _message: &[u8], - _cert: &rustls::pki_types::CertificateDer<'_>, - _dss: &rustls::DigitallySignedStruct, - ) -> Result { - Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) - } - fn verify_tls13_signature( - &self, - _message: &[u8], - _cert: &rustls::pki_types::CertificateDer<'_>, - _dss: &rustls::DigitallySignedStruct, - ) -> Result { - Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8bc8e3f96f59..8502edcfab09 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,17 +1,10 @@ -use std::collections::{HashMap, HashSet}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use anyhow::{bail, ensure, Context, Ok}; use clap::ValueEnum; -use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::crypto::ring::{self, sign}; -use rustls::pki_types::{CertificateDer, PrivateKeyDer}; -use sha2::{Digest, Sha256}; -use tracing::{error, info}; -use x509_parser::oid_registry; use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::AuthRateLimiter; @@ -20,12 +13,12 @@ use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig} use crate::scram::threadpool::ThreadPool; use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; +pub use crate::tls::server_config::{configure_tls, TlsConfig}; use crate::types::Host; pub struct ProxyConfig { pub tls_config: Option, pub metric_collection: Option, - pub allow_self_signed_compute: bool, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub proxy_protocol_v2: ProxyProtocolV2, @@ -33,7 +26,13 @@ pub struct ProxyConfig { pub handshake_timeout: Duration, pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: ApiLocks, - pub connect_to_compute_retry_config: RetryConfig, + pub connect_to_compute: ComputeConfig, +} + +pub struct ComputeConfig { + pub retry: RetryConfig, + pub tls: Arc, + pub timeout: Duration, } #[derive(Copy, Clone, Debug, ValueEnum, PartialEq)] @@ -53,12 +52,6 @@ pub struct MetricCollectionConfig { pub backup_metric_collection_config: MetricBackupCollectionConfig, } -pub struct TlsConfig { - pub config: Arc, - pub common_names: HashSet, - pub cert_resolver: Arc, -} - pub struct HttpConfig { pub accept_websockets: bool, pub pool_options: GlobalConnPoolOptions, @@ -81,271 +74,6 @@ pub struct AuthenticationConfig { pub console_redirect_confirmation_timeout: tokio::time::Duration, } -impl TlsConfig { - pub fn to_server_config(&self) -> Arc { - self.config.clone() - } -} - -/// -pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; - -/// Configure TLS for the main endpoint. 
-pub fn configure_tls( - key_path: &str, - cert_path: &str, - certs_dir: Option<&String>, -) -> anyhow::Result { - let mut cert_resolver = CertResolver::new(); - - // add default certificate - cert_resolver.add_cert_path(key_path, cert_path, true)?; - - // add extra certificates - if let Some(certs_dir) = certs_dir { - for entry in std::fs::read_dir(certs_dir)? { - let entry = entry?; - let path = entry.path(); - if path.is_dir() { - // file names aligned with default cert-manager names - let key_path = path.join("tls.key"); - let cert_path = path.join("tls.crt"); - if key_path.exists() && cert_path.exists() { - cert_resolver.add_cert_path( - &key_path.to_string_lossy(), - &cert_path.to_string_lossy(), - false, - )?; - } - } - } - } - - let common_names = cert_resolver.get_common_names(); - - let cert_resolver = Arc::new(cert_resolver); - - // allow TLS 1.2 to be compatible with older client libraries - let mut config = - rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("ring should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()); - - config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; - - Ok(TlsConfig { - config: Arc::new(config), - common_names, - cert_resolver, - }) -} - -/// Channel binding parameter -/// -/// -/// Description: The hash of the TLS server's certificate as it -/// appears, octet for octet, in the server's Certificate message. Note -/// that the Certificate message contains a certificate_list, in which -/// the first element is the server's certificate. -/// -/// The hash function is to be selected as follows: -/// -/// * if the certificate's signatureAlgorithm uses a single hash -/// function, and that hash function is either MD5 or SHA-1, then use SHA-256; -/// -/// * if the certificate's signatureAlgorithm uses a single hash -/// function and that hash function neither MD5 nor SHA-1, then use -/// the hash function associated with the certificate's -/// signatureAlgorithm; -/// -/// * if the certificate's signatureAlgorithm uses no hash functions or -/// uses multiple hash functions, then this channel binding type's -/// channel bindings are undefined at this time (updates to is channel -/// binding type may occur to address this issue if it ever arises). -#[derive(Debug, Clone, Copy)] -pub enum TlsServerEndPoint { - Sha256([u8; 32]), - Undefined, -} - -impl TlsServerEndPoint { - pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { - let sha256_oids = [ - // I'm explicitly not adding MD5 or SHA1 here... They're bad. - oid_registry::OID_SIG_ECDSA_WITH_SHA256, - oid_registry::OID_PKCS1_SHA256WITHRSA, - ]; - - let pem = x509_parser::parse_x509_certificate(cert) - .context("Failed to parse PEM object from cerficiate")? 
- .1; - - info!(subject = %pem.subject, "parsing TLS certificate"); - - let reg = oid_registry::OidRegistry::default().with_all_crypto(); - let oid = pem.signature_algorithm.oid(); - let alg = reg.get(oid); - if sha256_oids.contains(oid) { - let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); - info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); - Ok(Self::Sha256(tls_server_end_point)) - } else { - error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding"); - Ok(Self::Undefined) - } - } - - pub fn supported(&self) -> bool { - !matches!(self, TlsServerEndPoint::Undefined) - } -} - -#[derive(Default, Debug)] -pub struct CertResolver { - certs: HashMap, TlsServerEndPoint)>, - default: Option<(Arc, TlsServerEndPoint)>, -} - -impl CertResolver { - pub fn new() -> Self { - Self::default() - } - - fn add_cert_path( - &mut self, - key_path: &str, - cert_path: &str, - is_default: bool, - ) -> anyhow::Result<()> { - let priv_key = { - let key_bytes = std::fs::read(key_path) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .unwrap() - .context(format!("Failed to parse TLS keys at '{key_path}'"))?, - ) - }; - - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? - }; - - self.add_cert(priv_key, cert_chain, is_default) - } - - pub fn add_cert( - &mut self, - priv_key: PrivateKeyDer<'static>, - cert_chain: Vec>, - is_default: bool, - ) -> anyhow::Result<()> { - let key = sign::any_supported_type(&priv_key).context("invalid private key")?; - - let first_cert = &cert_chain[0]; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let pem = x509_parser::parse_x509_certificate(first_cert) - .context("Failed to parse PEM object from cerficiate")? - .1; - - let common_name = pem.subject().to_string(); - - // We need to get the canonical name for this certificate so we can match them against any domain names - // seen within the proxy codebase. - // - // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. - // We need to remove the wildcard prefix for the purposes of certificate selection. - // - // auth-broker does not use SNI and instead uses the Neon-Connection-String header. - // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. 
- // - // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string - // validation, so let's we can continue with any common-name - let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=") { - s.to_string() - } else { - bail!("Failed to parse common name from certificate") - }; - - let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); - - if is_default { - self.default = Some((cert.clone(), tls_server_end_point)); - } - - self.certs.insert(common_name, (cert, tls_server_end_point)); - - Ok(()) - } - - pub fn get_common_names(&self) -> HashSet { - self.certs.keys().map(|s| s.to_string()).collect() - } -} - -impl rustls::server::ResolvesServerCert for CertResolver { - fn resolve( - &self, - client_hello: rustls::server::ClientHello<'_>, - ) -> Option> { - self.resolve(client_hello.server_name()).map(|x| x.0) - } -} - -impl CertResolver { - pub fn resolve( - &self, - server_name: Option<&str>, - ) -> Option<(Arc, TlsServerEndPoint)> { - // loop here and cut off more and more subdomains until we find - // a match to get a proper wildcard support. OTOH, we now do not - // use nested domains, so keep this simple for now. - // - // With the current coding foo.com will match *.foo.com and that - // repeats behavior of the old code. - if let Some(mut sni_name) = server_name { - loop { - if let Some(cert) = self.certs.get(sni_name) { - return Some(cert.clone()); - } - if let Some((_, rest)) = sni_name.split_once('.') { - sni_name = rest; - } else { - return None; - } - } - } else { - // No SNI, use the default certificate, otherwise we can't get to - // options parameter which can be used to set endpoint name too. - // That means that non-SNI flow will not work for CNAME domains in - // verify-full mode. - // - // If that will be a problem we can: - // - // a) Instead of multi-cert approach use single cert with extra - // domains listed in Subject Alternative Name (SAN). - // b) Deploy separate proxy instances for extra domains. - self.default.clone() - } - } -} - #[derive(Debug)] pub struct EndpointCacheConfig { /// Batch size to receive all endpoints on the startup. 
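Note: the config.rs changes above fold the removed allow_self_signed_compute flag, the retry settings, and the previously hardcoded 2-second compute connect timeout into a single ComputeConfig carrying a shared rustls client config. Below is a minimal sketch of assembling one; it is not part of this patch, mirrors the patch's own test helper and the old CONNECT_TIMEOUT value, and assumes the `config` and `tls::client_config` modules are publicly reachable from the `proxy` crate root (the actual construction site in the proxy binary is outside this diff).

    // Illustrative sketch only; module paths and values are assumptions, not the binary's defaults.
    use std::sync::Arc;
    use std::time::Duration;

    use proxy::config::{ComputeConfig, RetryConfig};
    use proxy::tls::client_config::compute_client_config_with_root_certs;

    fn build_compute_config() -> anyhow::Result<ComputeConfig> {
        Ok(ComputeConfig {
            // Same shape as the former connect_to_compute_retry_config.
            retry: RetryConfig {
                base_delay: Duration::from_secs(1),
                max_retries: 5,
                backoff_factor: 2.0,
            },
            // Native root certificates loaded once (blocking call), replacing the
            // per-connection TLS_ROOTS/OnceCell logic removed from compute.rs.
            tls: Arc::new(compute_client_config_with_root_certs()?),
            // Replaces the hardcoded CONNECT_TIMEOUT removed from connect_compute.rs.
            timeout: Duration::from_secs(2),
        })
    }
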
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 65702e0e4c7a..25a549039ccc 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -115,7 +115,7 @@ pub async fn task_main( Ok(Some(p)) => { ctx.set_success(); let _disconnect = ctx.log_connect(); - match p.proxy_pass().await { + match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { error!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); @@ -215,9 +215,8 @@ pub(crate) async fn handle_client( locks: &config.connect_compute_locks, }, &user_info, - config.allow_self_signed_compute, config.wake_compute_retry_config, - config.connect_to_compute_retry_config, + &config.connect_to_compute, ) .or_else(|e| stream.throw_error(e)) .await?; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 3105d085260d..5f65b17374f2 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -23,6 +23,7 @@ use utils::backoff; use super::{RequestContextInner, LOG_CHAN}; use crate::config::remote_storage_from_toml; use crate::context::LOG_CHAN_DISCONNECT; +use crate::ext::TaskExt; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -171,7 +172,9 @@ pub async fn worker( }; let (tx, mut rx) = mpsc::unbounded_channel(); - LOG_CHAN.set(tx.downgrade()).unwrap(); + LOG_CHAN + .set(tx.downgrade()) + .expect("only one worker should set the channel"); // setup row stream that will close on cancellation let cancellation_token2 = cancellation_token.clone(); @@ -207,7 +210,9 @@ pub async fn worker( config.parquet_upload_disconnect_events_remote_storage { let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); - LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap(); + LOG_CHAN_DISCONNECT + .set(tx_disconnect.downgrade()) + .expect("only one worker should set the channel"); // setup row stream that will close on cancellation tokio::spawn(async move { @@ -326,7 +331,7 @@ where Ok::<_, parquet::errors::ParquetError>((rows, w, rg_meta)) }) .await - .unwrap()?; + .propagate_task_panic()?; rows.clear(); Ok((rows, w, rg_meta)) @@ -352,7 +357,7 @@ async fn upload_parquet( Ok((buffer, metadata)) }) .await - .unwrap()?; + .propagate_task_panic()?; let data = buffer.split().freeze(); @@ -409,6 +414,7 @@ async fn upload_parquet( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::net::Ipv4Addr; use std::num::NonZeroUsize; diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index e33a37f64366..00038a6ac6a1 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -250,7 +250,6 @@ impl NeonControlPlaneClient { let node = NodeInfo { config, aux: body.aux, - allow_self_signed_compute: false, }; Ok(node) diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index eaf692ab279b..5f8bda0f35ae 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -102,7 +102,9 @@ impl MockControlPlane { Some(s) => { info!("got allowed_ips: {s}"); s.split(',') - .map(|s| IpPattern::from_str(s).unwrap()) + .map(|s| { + IpPattern::from_str(s).expect("mocked ip pattern should be correct") + }) .collect() } None => vec![], @@ -174,7 +176,6 @@ impl MockControlPlane { branch_id: (&BranchId::from("branch")).into(), cold_start_info: 
crate::control_plane::messages::ColdStartInfo::Warm, }, - allow_self_signed_compute: false, }; Ok(node) diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 41972e4e44d0..c65041df0e37 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -10,13 +10,13 @@ pub mod client; pub(crate) mod errors; use std::sync::Arc; -use std::time::Duration; use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::IpPattern; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::cache::{Cached, TimedLru}; +use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; use crate::intern::ProjectIdInt; @@ -67,28 +67,18 @@ pub(crate) struct NodeInfo { /// Labels for proxy's metrics. pub(crate) aux: MetricsAuxInfo, - - /// Whether we should accept self-signed certificates (for testing) - pub(crate) allow_self_signed_compute: bool, } impl NodeInfo { pub(crate) async fn connect( &self, ctx: &RequestContext, - timeout: Duration, + config: &ComputeConfig, ) -> Result { - self.config - .connect( - ctx, - self.allow_self_signed_compute, - self.aux.clone(), - timeout, - ) - .await + self.config.connect(ctx, self.aux.clone(), config).await } + pub(crate) fn reuse_settings(&mut self, other: Self) { - self.allow_self_signed_compute = other.allow_self_signed_compute; self.config.reuse_password(other.config); } diff --git a/proxy/src/ext.rs b/proxy/src/ext.rs new file mode 100644 index 000000000000..8d00afbf51a4 --- /dev/null +++ b/proxy/src/ext.rs @@ -0,0 +1,41 @@ +use std::panic::resume_unwind; +use std::sync::{Mutex, MutexGuard}; + +use tokio::task::JoinError; + +pub(crate) trait LockExt { + fn lock_propagate_poison(&self) -> MutexGuard<'_, T>; +} + +impl LockExt for Mutex { + /// Lock the mutex and panic if the mutex was poisoned. + #[track_caller] + fn lock_propagate_poison(&self) -> MutexGuard<'_, T> { + match self.lock() { + Ok(guard) => guard, + // poison occurs when another thread panicked while holding the lock guard. + // since panicking is often unrecoverable, propagating the poison panic is reasonable. + Err(poison) => panic!("{poison}"), + } + } +} + +pub(crate) trait TaskExt { + fn propagate_task_panic(self) -> T; +} + +impl TaskExt for Result { + /// Unwrap the result and panic if the inner task panicked. + /// Also panics if the task was cancelled + #[track_caller] + fn propagate_task_panic(self) -> T { + match self { + Ok(t) => t, + // Using resume_unwind prevents the panic hook being called twice. + // Since we use this for structured concurrency, there is only + // 1 logical panic, so this is more correct. 
+ Err(e) if e.is_panic() => resume_unwind(e.into_panic()), + Err(e) => panic!("unexpected task error: {e}"), + } + } +} diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 978ad9f76131..6ca091feb716 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -14,6 +14,7 @@ use utils::http::error::ApiError; use utils::http::json::json_response; use utils::http::{RouterBuilder, RouterService}; +use crate::ext::{LockExt, TaskExt}; use crate::jemalloc; async fn status_handler(_: Request) -> Result, ApiError> { @@ -76,7 +77,7 @@ async fn prometheus_metrics_handler( let body = tokio::task::spawn_blocking(move || { let _span = span.entered(); - let mut state = state.lock().unwrap(); + let mut state = state.lock_propagate_poison(); let PrometheusHandler { encoder, metrics } = &mut *state; metrics @@ -94,13 +95,13 @@ async fn prometheus_metrics_handler( body }) .await - .unwrap(); + .propagate_task_panic(); let response = Response::builder() .status(200) .header(CONTENT_TYPE, "text/plain; version=0.0.4") .body(Body::from(body)) - .unwrap(); + .expect("response headers should be valid"); Ok(response) } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index f56d92a6b31e..79c6020302af 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -83,7 +83,7 @@ impl StringInterner { pub(crate) fn new() -> Self { StringInterner { inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( - Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()), + Capacity::new(2500, NonZeroUsize::new(1 << 16).expect("value is nonzero")), // unbounded MemoryLimits::for_memory_usage(usize::MAX), BuildHasherDefault::::default(), @@ -207,6 +207,7 @@ impl From for ProjectIdInt { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::sync::OnceLock; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ba69f9cf2d28..c56474edd729 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -22,8 +22,8 @@ clippy::string_add, clippy::string_to_string, clippy::todo, - // TODO: consider clippy::unimplemented - // TODO: consider clippy::unwrap_used + clippy::unimplemented, + clippy::unwrap_used, )] // List of permanently allowed lints. 
#![allow( @@ -82,13 +82,13 @@ pub mod console_redirect_proxy; pub mod context; pub mod control_plane; pub mod error; +mod ext; pub mod http; pub mod intern; pub mod jemalloc; pub mod logging; pub mod metrics; pub mod parse; -pub mod postgres_rustls; pub mod protocol2; pub mod proxy; pub mod rate_limiter; @@ -98,6 +98,7 @@ pub mod scram; pub mod serverless; pub mod signals; pub mod stream; +pub mod tls; pub mod types; pub mod url; pub mod usage_metrics; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 74d2b9a1d01e..41f10f052ffa 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -18,8 +18,16 @@ pub async fn init() -> anyhow::Result { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() - .add_directive("aws_config=info".parse().unwrap()) - .add_directive("azure_core::policies::transport=off".parse().unwrap()); + .add_directive( + "aws_config=info" + .parse() + .expect("this should be a valid filter directive"), + ) + .add_directive( + "azure_core::policies::transport=off" + .parse() + .expect("this should be a valid filter directive"), + ); let fmt_layer = tracing_subscriber::fmt::layer() .with_ansi(false) diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs index 8c0f25106662..095d6278cc51 100644 --- a/proxy/src/parse.rs +++ b/proxy/src/parse.rs @@ -8,14 +8,6 @@ pub(crate) fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { Some((cstr, other)) } -/// See . -pub(crate) fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { - (bytes.len() >= N).then(|| { - let (head, tail) = bytes.split_at(N); - (head.try_into().unwrap(), tail) - }) -} - #[cfg(test)] mod tests { use super::*; @@ -33,11 +25,4 @@ mod tests { assert_eq!(cstr.to_bytes(), b"foo"); assert_eq!(rest, b"bar"); } - - #[test] - fn test_split_at_const() { - assert!(split_at_const::<0>(b"").is_some()); - assert!(split_at_const::<1>(b"").is_none()); - assert!(matches!(split_at_const::<1>(b"ok"), Some((b"o", b"k")))); - } } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 33a5eb5e1e03..0dc97b709724 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -396,6 +396,7 @@ impl NetworkEndianIpv6 { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use tokio::io::AsyncReadExt; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index a3027abd7cae..8a804948606b 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -6,7 +6,7 @@ use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; use crate::auth::backend::ComputeCredentialKeys; use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; -use crate::config::RetryConfig; +use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::locks::ApiLocks; @@ -19,8 +19,6 @@ use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; use crate::proxy::wake_compute::wake_compute; use crate::types::Host; -const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); - /// If we couldn't connect, a cached connection info might be to blame /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. 
@@ -49,7 +47,7 @@ pub(crate) trait ConnectMechanism { &self, ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, - timeout: time::Duration, + config: &ComputeConfig, ) -> Result; fn update_connect_config(&self, conf: &mut compute::ConnCfg); @@ -86,11 +84,11 @@ impl ConnectMechanism for TcpMechanism<'_> { &self, ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, - timeout: time::Duration, + config: &ComputeConfig, ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; - permit.release_result(node_info.connect(ctx, timeout).await) + permit.release_result(node_info.connect(ctx, config).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -104,9 +102,8 @@ pub(crate) async fn connect_to_compute Result where M::ConnectError: CouldRetry + ShouldRetryWakeCompute + std::fmt::Debug, @@ -117,14 +114,10 @@ where wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; node_info.set_keys(user_info.get_keys()); - node_info.allow_self_signed_compute = allow_self_signed_compute; mechanism.update_connect_config(&mut node_info.config); // try once - let err = match mechanism - .connect_once(ctx, &node_info, CONNECT_TIMEOUT) - .await - { + let err = match mechanism.connect_once(ctx, &node_info, compute).await { Ok(res) => { ctx.success(); Metrics::get().proxy.retries_metric.observe( @@ -144,7 +137,7 @@ where let node_info = if !node_info.cached() || !err.should_retry_wake_compute() { // If we just recieved this from cplane and didn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. - if should_retry(&err, num_retries, connect_to_compute_retry_config) { + if should_retry(&err, num_retries, compute.retry) { Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, @@ -174,10 +167,7 @@ where debug!("wake_compute success. 
attempting to connect"); num_retries = 1; loop { - match mechanism - .connect_once(ctx, &node_info, CONNECT_TIMEOUT) - .await - { + match mechanism.connect_once(ctx, &node_info, compute).await { Ok(res) => { ctx.success(); Metrics::get().proxy.retries_metric.observe( @@ -192,7 +182,7 @@ where return Ok(res); } Err(e) => { - if !should_retry(&e, num_retries, connect_to_compute_retry_config) { + if !should_retry(&e, num_retries, compute.retry) { // Don't log an error here, caller will print the error Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { @@ -208,7 +198,7 @@ where } }; - let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); + let wait_duration = retry_after(num_retries, compute.retry); num_retries += 1; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 4e4af8863484..3336a9556a5b 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -257,6 +257,7 @@ impl CopyBuffer { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use tokio::io::AsyncWriteExt; diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index e27c211932ea..955f754497ec 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -8,12 +8,13 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, warn}; use crate::auth::endpoint_sni; -use crate::config::{TlsConfig, PG_ALPN_PROTOCOL}; +use crate::config::TlsConfig; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::Metrics; use crate::proxy::ERR_INSECURE_CONNECTION; use crate::stream::{PqStream, Stream, StreamUpgradeError}; +use crate::tls::PG_ALPN_PROTOCOL; #[derive(Error, Debug)] pub(crate) enum HandshakeError { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index cc04bc5e5ce9..3926c56fecc5 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -152,7 +152,7 @@ pub async fn task_main( Ok(Some(p)) => { ctx.set_success(); let _disconnect = ctx.log_connect(); - match p.proxy_pass().await { + match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { warn!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); @@ -191,13 +191,6 @@ impl ClientMode { } } - pub(crate) fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { - match self { - ClientMode::Tcp => config.allow_self_signed_compute, - ClientMode::Websockets { .. 
} => false, - } - } - fn hostname<'a, S>(&'a self, s: &'a Stream) -> Option<&'a str> { match self { ClientMode::Tcp => s.sni_hostname(), @@ -357,9 +350,8 @@ pub(crate) async fn handle_client( locks: &config.connect_compute_locks, }, &user_info, - mode.allow_self_signed_compute(config), config.wake_compute_retry_config, - config.connect_to_compute_retry_config, + &config.connect_to_compute, ) .or_else(|e| stream.throw_error(e)) .await?; @@ -494,7 +486,7 @@ impl NeonOptions { pub(crate) fn neon_option(bytes: &str) -> Option<(&str, &str)> { static RE: OnceCell = OnceCell::new(); - let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap()); + let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").expect("regex should be correct")); let cap = re.captures(bytes)?; let (_, [k, v]) = cap.extract(); diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index dcaa81e5cdda..a42f9aad39a7 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -5,6 +5,7 @@ use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; use crate::cancellation; use crate::compute::PostgresConnection; +use crate::config::ComputeConfig; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; use crate::stream::Stream; @@ -67,9 +68,17 @@ pub(crate) struct ProxyPassthrough { } impl ProxyPassthrough { - pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { + pub(crate) async fn proxy_pass( + self, + compute_config: &ComputeConfig, + ) -> Result<(), ErrorSource> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; - if let Err(err) = self.compute.cancel_closure.try_cancel_query().await { + if let Err(err) = self + .compute + .cancel_closure + .try_cancel_query(compute_config) + .await + { tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); } res diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 911b349416f2..10db2bcb303f 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -1,4 +1,5 @@ //! A group of high-level tests for connection establishing logic and auth. 
+#![allow(clippy::unimplemented, clippy::unwrap_used)] mod mitm; @@ -21,14 +22,16 @@ use super::*; use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, }; -use crate::config::{CertResolver, RetryConfig}; +use crate::config::{ComputeConfig, RetryConfig}; use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{ self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache, }; use crate::error::ErrorKind; -use crate::postgres_rustls::MakeRustlsConnect; +use crate::tls::client_config::compute_client_config_with_certs; +use crate::tls::postgres_rustls::MakeRustlsConnect; +use crate::tls::server_config::CertResolver; use crate::types::{BranchId, EndpointId, ProjectId}; use crate::{sasl, scram}; @@ -66,7 +69,7 @@ fn generate_certs( } struct ClientConfig<'a> { - config: rustls::ClientConfig, + config: Arc, hostname: &'a str, } @@ -109,16 +112,7 @@ fn generate_tls_config<'a>( }; let client_config = { - let config = - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .context("ring should support the default protocol versions")? - .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(ca)?; - store - }) - .with_no_client_auth(); + let config = Arc::new(compute_client_config_with_certs([ca])); ClientConfig { config, hostname } }; @@ -467,7 +461,7 @@ impl ConnectMechanism for TestConnectMechanism { &self, _ctx: &RequestContext, _node_info: &control_plane::CachedNodeInfo, - _timeout: std::time::Duration, + _config: &ComputeConfig, ) -> Result { let mut counter = self.counter.lock().unwrap(); let action = self.sequence[*counter]; @@ -553,7 +547,6 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn branch_id: (&BranchId::from("branch")).into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, - allow_self_signed_compute: false, }; let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); node2.map(|()| node) @@ -576,6 +569,20 @@ fn helper_create_connect_info( user_info } +fn config() -> ComputeConfig { + let retry = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + + ComputeConfig { + retry, + tls: Arc::new(compute_client_config_with_certs(std::iter::empty())), + timeout: Duration::from_secs(2), + } +} + #[tokio::test] async fn connect_to_compute_success() { let _ = env_logger::try_init(); @@ -583,12 +590,8 @@ async fn connect_to_compute_success() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -601,12 +604,8 @@ async fn connect_to_compute_retry() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - 
connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -620,12 +619,8 @@ async fn connect_to_compute_non_retry_1() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap_err(); mechanism.verify(); @@ -639,12 +634,8 @@ async fn connect_to_compute_non_retry_2() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -665,18 +656,13 @@ async fn connect_to_compute_non_retry_3() { max_retries: 1, backoff_factor: 2.0, }; - let connect_to_compute_retry_config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; + let config = config(); connect_to_compute( &ctx, &mechanism, &user_info, - false, wake_compute_retry_config, - connect_to_compute_retry_config, + &config, ) .await .unwrap_err(); @@ -691,12 +677,8 @@ async fn wake_retry() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -710,12 +692,8 @@ async fn wake_non_retry() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index 45f9630dde0f..bff800f0a2f0 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -83,7 +83,7 @@ impl From for utils::leaky_bucket::LeakyBucketConfig { } #[cfg(test)] -#[allow(clippy::float_cmp)] +#[allow(clippy::float_cmp, clippy::unwrap_used)] mod tests { use std::time::Duration; diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 3000cc4c2af2..04e136b6d543 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -63,6 +63,7 @@ impl LimitAlgorithm for Aimd { } 
#[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::time::Duration; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index a048721e77d7..6f6a8c9d4781 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -12,6 +12,7 @@ use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; +use crate::ext::LockExt; use crate::intern::EndpointIdInt; pub struct GlobalRateLimiter { @@ -246,12 +247,13 @@ impl BucketRateLimiter { let n = self.map.shards().len(); // this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide // (impossible, infact, unless we have 2048 threads) - let shard = self.rand.lock().unwrap().gen_range(0..n); + let shard = self.rand.lock_propagate_poison().gen_range(0..n); self.map.shards()[shard].write().clear(); } } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::hash::BuildHasherDefault; use std::time::Duration; diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 82139ea1d5e5..0f6e765b02cd 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -69,7 +69,11 @@ impl ConnectionWithCredentialsProvider { pub fn new_with_static_credentials(params: T) -> Self { Self { - credentials: Credentials::Static(params.into_connection_info().unwrap()), + credentials: Credentials::Static( + params + .into_connection_info() + .expect("static configured redis credentials should be a valid format"), + ), con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index f3aa97c03284..671305a3005a 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -6,14 +6,15 @@ use pq_proto::CancelKeyData; use redis::aio::PubSub; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; +use tracing::Instrument; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; use crate::cancellation::{CancelMap, CancellationHandler}; +use crate::config::ProxyConfig; use crate::intern::{ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; -use tracing::Instrument; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; @@ -39,6 +40,27 @@ pub(crate) enum Notification { AllowedIpsUpdate { allowed_ips_update: AllowedIpsUpdate, }, + #[serde( + rename = "/block_public_or_vpc_access_updated", + deserialize_with = "deserialize_json_string" + )] + BlockPublicOrVpcAccessUpdated { + block_public_or_vpc_access_updated: BlockPublicOrVpcAccessUpdated, + }, + #[serde( + rename = "/allowed_vpc_endpoints_updated_for_org", + deserialize_with = "deserialize_json_string" + )] + AllowedVpcEndpointsUpdatedForOrg { + allowed_vpc_endpoints_updated_for_org: AllowedVpcEndpointsUpdatedForOrg, + }, + #[serde( + rename = "/allowed_vpc_endpoints_updated_for_projects", + deserialize_with = "deserialize_json_string" + )] + AllowedVpcEndpointsUpdatedForProjects { + allowed_vpc_endpoints_updated_for_projects: AllowedVpcEndpointsUpdatedForProjects, + }, #[serde( rename = "/password_updated", deserialize_with = "deserialize_json_string" @@ -51,6 +73,24 @@ pub(crate) enum Notification { pub(crate) 
struct AllowedIpsUpdate { project_id: ProjectIdInt, } + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct BlockPublicOrVpcAccessUpdated { + project_id: ProjectIdInt, +} + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct AllowedVpcEndpointsUpdatedForOrg { + // TODO: change type once the implementation is more fully fledged. + // See e.g. https://github.com/neondatabase/neon/pull/10073. + account_id: ProjectIdInt, +} + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct AllowedVpcEndpointsUpdatedForProjects { + project_ids: Vec, +} + #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct PasswordUpdate { project_id: ProjectIdInt, @@ -164,7 +204,11 @@ impl MessageHandler { } } } - Notification::AllowedIpsUpdate { .. } | Notification::PasswordUpdate { .. } => { + Notification::AllowedIpsUpdate { .. } + | Notification::PasswordUpdate { .. } + | Notification::BlockPublicOrVpcAccessUpdated { .. } + | Notification::AllowedVpcEndpointsUpdatedForOrg { .. } + | Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { invalidate_cache(self.cache.clone(), msg.clone()); if matches!(msg, Notification::AllowedIpsUpdate { .. }) { Metrics::get() @@ -177,6 +221,8 @@ impl MessageHandler { .redis_events_count .inc(RedisEventsCount::PasswordUpdate); } + // TODO: add additional metrics for the other event types. + // It might happen that the invalid entry is on the way to be cached. // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. @@ -203,6 +249,15 @@ fn invalidate_cache(cache: Arc, msg: Notification) { password_update.role_name, ), Notification::Cancel(_) => unreachable!("cancel message should be handled separately"), + Notification::BlockPublicOrVpcAccessUpdated { .. } => { + // https://github.com/neondatabase/neon/pull/10073 + } + Notification::AllowedVpcEndpointsUpdatedForOrg { .. } => { + // https://github.com/neondatabase/neon/pull/10073 + } + Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { + // https://github.com/neondatabase/neon/pull/10073 + } } } @@ -249,6 +304,7 @@ async fn handle_messages( /// Handle console's invalidation messages. #[tracing::instrument(name = "redis_notifications", skip_all)] pub async fn task_main( + config: &'static ProxyConfig, redis: ConnectionWithCredentialsProvider, cache: Arc, cancel_map: CancelMap, @@ -258,6 +314,7 @@ where C: ProjectInfoCache + Send + Sync + 'static, { let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + &config.connect_to_compute, cancel_map, crate::metrics::CancellationSource::FromRedis, )); diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 1373dfba3d9a..4922ece61531 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -2,7 +2,7 @@ use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; -use crate::parse::{split_at_const, split_cstr}; +use crate::parse::split_cstr; /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). 
#[derive(Debug)] @@ -19,7 +19,7 @@ impl<'a> FirstMessage<'a> { let (method_cstr, tail) = split_cstr(bytes)?; let method = method_cstr.to_str().ok()?; - let (len_bytes, bytes) = split_at_const(tail)?; + let (len_bytes, bytes) = tail.split_first_chunk()?; let len = u32::from_be_bytes(*len_bytes) as usize; if len != bytes.len() { return None; @@ -51,6 +51,7 @@ impl<'a> ServerMessage<&'a str> { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index f1c916daa2b7..ac7755656683 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -50,6 +50,12 @@ impl SaslStream<'_, S> { self.stream.write_message(&msg.to_reply()).await?; Ok(()) } + + // Queue a SASL message for the client. + fn send_noflush(&mut self, msg: &ServerMessage<&str>) -> io::Result<()> { + self.stream.write_message_noflush(&msg.to_reply())?; + Ok(()) + } } /// SASL authentication outcome. @@ -85,7 +91,7 @@ impl SaslStream<'_, S> { continue; } Step::Success(result, reply) => { - self.send(&ServerMessage::Final(&reply)).await?; + self.send_noflush(&ServerMessage::Final(&reply))?; Outcome::Success(result) } Step::Failure(reason) => Outcome::Failure(reason), diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 6a13f645a5ab..77853db3db94 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -13,7 +13,6 @@ use super::secret::ServerSecret; use super::signature::SignatureBuilder; use super::threadpool::ThreadPool; use super::ScramKey; -use crate::config; use crate::intern::EndpointIdInt; use crate::sasl::{self, ChannelBinding, Error as SaslError}; @@ -59,14 +58,14 @@ enum ExchangeState { pub(crate) struct Exchange<'a> { state: ExchangeState, secret: &'a ServerSecret, - tls_server_end_point: config::TlsServerEndPoint, + tls_server_end_point: crate::tls::TlsServerEndPoint, } impl<'a> Exchange<'a> { pub(crate) fn new( secret: &'a ServerSecret, nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], - tls_server_end_point: config::TlsServerEndPoint, + tls_server_end_point: crate::tls::TlsServerEndPoint, ) -> Self { Self { state: ExchangeState::Initial(SaslInitial { nonce }), @@ -120,7 +119,7 @@ impl SaslInitial { fn transition( &self, secret: &ServerSecret, - tls_server_end_point: &config::TlsServerEndPoint, + tls_server_end_point: &crate::tls::TlsServerEndPoint, input: &str, ) -> sasl::Result> { let client_first_message = ClientFirstMessage::parse(input) @@ -155,7 +154,7 @@ impl SaslSentInner { fn transition( &self, secret: &ServerSecret, - tls_server_end_point: &config::TlsServerEndPoint, + tls_server_end_point: &crate::tls::TlsServerEndPoint, input: &str, ) -> sasl::Result> { let Self { @@ -168,8 +167,8 @@ impl SaslSentInner { .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?; let channel_binding = cbind_flag.encode(|_| match tls_server_end_point { - config::TlsServerEndPoint::Sha256(x) => Ok(x), - config::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding), + crate::tls::TlsServerEndPoint::Sha256(x) => Ok(x), + crate::tls::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding), })?; // This might've been caused by a MITM attack diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 5ee3a513527d..0e54e7ded9a7 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -185,6 +185,7 @@ impl fmt::Debug for OwnedServerFirstMessage { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git 
a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index 718445f61d48..cfa571cbe1a4 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -57,6 +57,7 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::threadpool::ThreadPool; use super::{Exchange, ServerSecret}; @@ -76,11 +77,8 @@ mod tests { const NONCE: [u8; 18] = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ]; - let mut exchange = Exchange::new( - &secret, - || NONCE, - crate::config::TlsServerEndPoint::Undefined, - ); + let mut exchange = + Exchange::new(&secret, || NONCE, crate::tls::TlsServerEndPoint::Undefined); let client_first = "n,,n=user,r=rOprNGfwEbeRWgbNEkqO"; let client_final = "c=biws,r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,p=rw1r5Kph5ThxmaUBC2GAQ6MfXbPnNkFiTIvdb/Rear0="; diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 8c6a08d432d8..eb21b26ab40e 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -72,6 +72,7 @@ impl ServerSecret { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index ebc6dd2a3cef..8f1684c75b0e 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -33,14 +33,11 @@ thread_local! { } impl ThreadPool { - pub fn new(n_workers: u8) -> Arc { + pub fn new(mut n_workers: u8) -> Arc { // rayon would be nice here, but yielding in rayon does not work well afaict. if n_workers == 0 { - return Arc::new(Self { - runtime: None, - metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), - }); + n_workers = 1; } Arc::new_cyclic(|pool| { @@ -66,7 +63,7 @@ impl ThreadPool { }); }) .build() - .unwrap(); + .expect("password threadpool runtime should be configured correctly"); Self { runtime: Some(runtime), @@ -79,7 +76,7 @@ impl ThreadPool { JobHandle( self.runtime .as_ref() - .unwrap() + .expect("runtime is always set") .spawn(JobSpec { pbkdf2, endpoint }), ) } @@ -87,7 +84,10 @@ impl ThreadPool { impl Drop for ThreadPool { fn drop(&mut self) { - self.runtime.take().unwrap().shutdown_background(); + self.runtime + .take() + .expect("runtime is always set") + .shutdown_background(); } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 251aa470843d..b398c3ddd07b 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -22,7 +22,7 @@ use crate::compute; use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; -use crate::config::ProxyConfig; +use crate::config::{ComputeConfig, ProxyConfig}; use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; @@ -195,9 +195,8 @@ impl PoolingBackend { locks: &self.config.connect_compute_locks, }, &backend, - false, // do not allow self signed compute for http flow self.config.wake_compute_retry_config, - self.config.connect_to_compute_retry_config, + &self.config.connect_to_compute, ) .await } @@ -237,9 +236,8 @@ impl PoolingBackend { locks: &self.config.connect_compute_locks, }, &backend, - false, // do not allow self signed compute for http flow self.config.wake_compute_retry_config, - self.config.connect_to_compute_retry_config, + &self.config.connect_to_compute, ) .await } @@ -270,7 +268,11 @@ impl PoolingBackend { if !self.local_pool.initialized(&conn_info) { // only install and grant usage 
one at a time. - let _permit = local_backend.initialize.acquire().await.unwrap(); + let _permit = local_backend + .initialize + .acquire() + .await + .expect("semaphore should never be closed"); // check again for race if !self.local_pool.initialized(&conn_info) { @@ -500,7 +502,7 @@ impl ConnectMechanism for TokioMechanism { &self, ctx: &RequestContext, node_info: &CachedNodeInfo, - timeout: Duration, + compute_config: &ComputeConfig, ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; @@ -509,7 +511,7 @@ impl ConnectMechanism for TokioMechanism { let config = config .user(&self.conn_info.user_info.user) .dbname(&self.conn_info.dbname) - .connect_timeout(timeout); + .connect_timeout(compute_config.timeout); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(postgres_client::NoTls).await; @@ -550,7 +552,7 @@ impl ConnectMechanism for HyperMechanism { &self, ctx: &RequestContext, node_info: &CachedNodeInfo, - timeout: Duration, + config: &ComputeConfig, ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; @@ -558,7 +560,7 @@ impl ConnectMechanism for HyperMechanism { let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let port = node_info.config.get_port(); - let res = connect_http2(&host, port, timeout).await; + let res = connect_http2(&host, port, config.timeout).await; drop(pause); let (client, connection) = permit.release_result(res)?; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index cac5a173cb16..447103edce53 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -186,8 +186,8 @@ impl ClientDataRemote { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { - use std::mem; use std::sync::atomic::AtomicBool; use super::*; @@ -269,39 +269,33 @@ mod tests { assert_eq!(0, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); assert_eq!(1, pool.get_global_connections_count()); } { - let mut closed_client = Client::new( + let closed_client = Client::new( create_inner_with(MockClient::new(true)), conn_info.clone(), ep_pool.clone(), ); - closed_client.do_drop().unwrap()(); - mem::forget(closed_client); // drop the client - // The closed client shouldn't be added to the pool. + drop(closed_client); assert_eq!(1, pool.get_global_connections_count()); } let is_closed: Arc = Arc::new(false.into()); { - let mut client = Client::new( + let client = Client::new( create_inner_with(MockClient(is_closed.clone())), conn_info.clone(), ep_pool.clone(), ); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client - + drop(client); // The client should be added to the pool. assert_eq!(2, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info, ep_pool); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info, ep_pool); + drop(client); // The client shouldn't be added to the pool. Because the ep-pool is full. 
assert_eq!(2, pool.get_global_connections_count()); @@ -319,15 +313,13 @@ mod tests { &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), ); { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); assert_eq!(3, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); // The client shouldn't be added to the pool. Because the global pool is full. assert_eq!(3, pool.get_global_connections_count()); diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 2a46c8f9c5cf..44eac77e8f94 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -187,19 +187,22 @@ impl EndpointConnPool { pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInnerCommon) { let conn_id = client.get_conn_id(); - let pool_name = pool.read().get_name().to_string(); + let (max_conn, conn_count, pool_name) = { + let pool = pool.read(); + ( + pool.global_pool_size_max_conns, + pool.global_connections_count + .load(atomic::Ordering::Relaxed), + pool.get_name().to_string(), + ) + }; + if client.inner.is_closed() { info!(%conn_id, "{}: throwing away connection '{conn_info}' because connection is closed", pool_name); return; } - let global_max_conn = pool.read().global_pool_size_max_conns; - if pool - .read() - .global_connections_count - .load(atomic::Ordering::Relaxed) - >= global_max_conn - { + if conn_count >= max_conn { info!(%conn_id, "{}: throwing away connection '{conn_info}' because pool is full", pool_name); return; } @@ -633,35 +636,29 @@ impl Client { } pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; + let aux = &self + .inner + .as_ref() + .expect("client inner should not be removed") + .aux; USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, }) } +} - pub(crate) fn do_drop(&mut self) -> Option> { +impl Drop for Client { + fn drop(&mut self) { let conn_info = self.conn_info.clone(); let client = self .inner .take() .expect("client inner should not be removed"); if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { - let current_span = self.span.clone(); + let _current_span = self.span.enter(); // return connection to the pool - return Some(move || { - let _span = current_span.enter(); - EndpointConnPool::put(&conn_pool, &conn_info, client); - }); - } - None - } -} - -impl Drop for Client { - fn drop(&mut self) { - if let Some(drop) = self.do_drop() { - tokio::task::spawn_blocking(drop); + EndpointConnPool::put(&conn_pool, &conn_info, client); } } } diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index c0208d4f68f1..d5c948777cae 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -81,11 +81,14 @@ impl HttpErrorBody { .header(http::header::CONTENT_TYPE, "application/json") // we do not have nested maps with non string keys so serialization shouldn't fail .body( - Full::new(Bytes::from(serde_json::to_string(self).unwrap())) - .map_err(|x| match x {}) - .boxed(), + Full::new(Bytes::from( + serde_json::to_string(self) + 
.expect("serialising HttpErrorBody should never fail"), + )) + .map_err(|x| match x {}) + .boxed(), ) - .unwrap() + .expect("content-type header should be valid") } } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 25b25c66d3fb..ab012bd020f1 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -204,7 +204,10 @@ fn pg_array_parse_inner( if c == '\\' { escaped = true; - (i, c) = pg_array_chr.next().unwrap(); + let Some(x) = pg_array_chr.next() else { + return Err(JsonConversionError::UnbalancedArray); + }; + (i, c) = x; } match c { @@ -253,6 +256,7 @@ fn pg_array_parse_inner( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use serde_json::json; diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index b84cde9e252a..c51a2bc9babb 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -179,7 +179,6 @@ pub(crate) fn poll_client( info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); let pool = Arc::downgrade(&global_pool); - let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); @@ -273,11 +272,7 @@ pub(crate) fn poll_client( }), }; - Client::new( - inner, - conn_info, - Arc::downgrade(&pool_clone.upgrade().unwrap().global_pool), - ) + Client::new(inner, conn_info, Arc::downgrade(&global_pool.global_pool)) } impl ClientInnerCommon { @@ -321,7 +316,8 @@ fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result(buffer.format(jti)).unwrap(); + let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)) + .expect("itoa formatted integer should be guaranteed valid json"); // update the jti in-place let payload = @@ -368,6 +364,7 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use p256::ecdsa::SigningKey; use typed_json::json; diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 80b42f9e5534..c2623e0ecae3 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -46,6 +46,7 @@ use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; +use crate::ext::TaskExt; use crate::metrics::Metrics; use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo}; use crate::proxy::run_until_cancelled; @@ -84,7 +85,7 @@ pub async fn task_main( cancellation_token.cancelled().await; tokio::task::spawn_blocking(move || conn_pool.shutdown()) .await - .unwrap(); + .propagate_task_panic(); } }); @@ -104,7 +105,7 @@ pub async fn task_main( cancellation_token.cancelled().await; tokio::task::spawn_blocking(move || http_conn_pool.shutdown()) .await - .unwrap(); + .propagate_task_panic(); } }); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5e85f5ec4019..3e42787a0964 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1110,6 +1110,7 @@ impl Discard<'_> { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index bdb83fe6be05..47326c11815d 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -168,7 +168,7 @@ pub(crate) async fn serve_websocket( Ok(Some(p)) => { ctx.set_success(); 
ctx.log_connect(); - match p.proxy_pass().await { + match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => Ok(()), Err(ErrorSource::Client(err)) => Err(err).context("client"), Err(ErrorSource::Compute(err)) => Err(err).context("compute"), @@ -178,6 +178,7 @@ pub(crate) async fn serve_websocket( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::pin::pin; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 11f426819de0..ace27a72847e 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -11,9 +11,9 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; use tracing::debug; -use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::Metrics; +use crate::tls::TlsServerEndPoint; /// Stream wrapper which implements libpq's protocol. /// diff --git a/proxy/src/tls/client_config.rs b/proxy/src/tls/client_config.rs new file mode 100644 index 000000000000..a2d695aae11d --- /dev/null +++ b/proxy/src/tls/client_config.rs @@ -0,0 +1,42 @@ +use std::sync::Arc; + +use anyhow::bail; +use rustls::crypto::ring; + +pub(crate) fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + bail!("could not parse certificates: {:?}", der_certs.errors); + } + + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs.certs); + Ok(Arc::new(store)) +} + +/// Loads the root certificates and constructs a client config suitable for connecting to the neon compute. +/// This function is blocking. +pub fn compute_client_config_with_root_certs() -> anyhow::Result { + Ok( + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .with_root_certificates(load_certs()?) + .with_no_client_auth(), + ) +} + +#[cfg(test)] +pub fn compute_client_config_with_certs( + certs: impl IntoIterator>, +) -> rustls::ClientConfig { + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(certs); + + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .with_root_certificates(store) + .with_no_client_auth() +} diff --git a/proxy/src/tls/mod.rs b/proxy/src/tls/mod.rs new file mode 100644 index 000000000000..d6ce6bd9fcf4 --- /dev/null +++ b/proxy/src/tls/mod.rs @@ -0,0 +1,72 @@ +pub mod client_config; +pub mod postgres_rustls; +pub mod server_config; + +use anyhow::Context; +use rustls::pki_types::CertificateDer; +use sha2::{Digest, Sha256}; +use tracing::{error, info}; +use x509_parser::oid_registry; + +/// +pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; + +/// Channel binding parameter +/// +/// +/// Description: The hash of the TLS server's certificate as it +/// appears, octet for octet, in the server's Certificate message. Note +/// that the Certificate message contains a certificate_list, in which +/// the first element is the server's certificate. 
+/// +/// The hash function is to be selected as follows: +/// +/// * if the certificate's signatureAlgorithm uses a single hash +/// function, and that hash function is either MD5 or SHA-1, then use SHA-256; +/// +/// * if the certificate's signatureAlgorithm uses a single hash +/// function and that hash function neither MD5 nor SHA-1, then use +/// the hash function associated with the certificate's +/// signatureAlgorithm; +/// +/// * if the certificate's signatureAlgorithm uses no hash functions or +/// uses multiple hash functions, then this channel binding type's +/// channel bindings are undefined at this time (updates to is channel +/// binding type may occur to address this issue if it ever arises). +#[derive(Debug, Clone, Copy)] +pub enum TlsServerEndPoint { + Sha256([u8; 32]), + Undefined, +} + +impl TlsServerEndPoint { + pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { + let sha256_oids = [ + // I'm explicitly not adding MD5 or SHA1 here... They're bad. + oid_registry::OID_SIG_ECDSA_WITH_SHA256, + oid_registry::OID_PKCS1_SHA256WITHRSA, + ]; + + let pem = x509_parser::parse_x509_certificate(cert) + .context("Failed to parse PEM object from cerficiate")? + .1; + + info!(subject = %pem.subject, "parsing TLS certificate"); + + let reg = oid_registry::OidRegistry::default().with_all_crypto(); + let oid = pem.signature_algorithm.oid(); + let alg = reg.get(oid); + if sha256_oids.contains(oid) { + let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); + info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); + Ok(Self::Sha256(tls_server_end_point)) + } else { + error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding"); + Ok(Self::Undefined) + } + } + + pub fn supported(&self) -> bool { + !matches!(self, TlsServerEndPoint::Undefined) + } +} diff --git a/proxy/src/postgres_rustls/mod.rs b/proxy/src/tls/postgres_rustls.rs similarity index 96% rename from proxy/src/postgres_rustls/mod.rs rename to proxy/src/tls/postgres_rustls.rs index 5ef20991c309..0ad279b6356d 100644 --- a/proxy/src/postgres_rustls/mod.rs +++ b/proxy/src/tls/postgres_rustls.rs @@ -18,7 +18,7 @@ mod private { use tokio_rustls::client::TlsStream; use tokio_rustls::TlsConnector; - use crate::config::TlsServerEndPoint; + use crate::tls::TlsServerEndPoint; pub struct TlsConnectFuture { inner: tokio_rustls::Connect, @@ -126,16 +126,14 @@ mod private { /// That way you can connect to PostgreSQL using `rustls` as the TLS stack. #[derive(Clone)] pub struct MakeRustlsConnect { - config: Arc, + pub config: Arc, } impl MakeRustlsConnect { /// Creates a new `MakeRustlsConnect` from the provided `ClientConfig`. 
#[must_use] - pub fn new(config: ClientConfig) -> Self { - Self { - config: Arc::new(config), - } + pub fn new(config: Arc) -> Self { + Self { config } } } diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs new file mode 100644 index 000000000000..2cc1657eea3d --- /dev/null +++ b/proxy/src/tls/server_config.rs @@ -0,0 +1,218 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use anyhow::{bail, Context}; +use itertools::Itertools; +use rustls::crypto::ring::{self, sign}; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; + +use super::{TlsServerEndPoint, PG_ALPN_PROTOCOL}; + +pub struct TlsConfig { + pub config: Arc, + pub common_names: HashSet, + pub cert_resolver: Arc, +} + +impl TlsConfig { + pub fn to_server_config(&self) -> Arc { + self.config.clone() + } +} + +/// Configure TLS for the main endpoint. +pub fn configure_tls( + key_path: &str, + cert_path: &str, + certs_dir: Option<&String>, + allow_tls_keylogfile: bool, +) -> anyhow::Result { + let mut cert_resolver = CertResolver::new(); + + // add default certificate + cert_resolver.add_cert_path(key_path, cert_path, true)?; + + // add extra certificates + if let Some(certs_dir) = certs_dir { + for entry in std::fs::read_dir(certs_dir)? { + let entry = entry?; + let path = entry.path(); + if path.is_dir() { + // file names aligned with default cert-manager names + let key_path = path.join("tls.key"); + let cert_path = path.join("tls.crt"); + if key_path.exists() && cert_path.exists() { + cert_resolver.add_cert_path( + &key_path.to_string_lossy(), + &cert_path.to_string_lossy(), + false, + )?; + } + } + } + } + + let common_names = cert_resolver.get_common_names(); + + let cert_resolver = Arc::new(cert_resolver); + + // allow TLS 1.2 to be compatible with older client libraries + let mut config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()); + + config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; + + if allow_tls_keylogfile { + // KeyLogFile will check for the SSLKEYLOGFILE environment variable. + config.key_log = Arc::new(rustls::KeyLogFile::new()); + } + + Ok(TlsConfig { + config: Arc::new(config), + common_names, + cert_resolver, + }) +} + +#[derive(Default, Debug)] +pub struct CertResolver { + certs: HashMap, TlsServerEndPoint)>, + default: Option<(Arc, TlsServerEndPoint)>, +} + +impl CertResolver { + pub fn new() -> Self { + Self::default() + } + + fn add_cert_path( + &mut self, + key_path: &str, + cert_path: &str, + is_default: bool, + ) -> anyhow::Result<()> { + let priv_key = { + let key_bytes = std::fs::read(key_path) + .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + rustls_pemfile::private_key(&mut &key_bytes[..]) + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? 
+ }; + + self.add_cert(priv_key, cert_chain, is_default) + } + + pub fn add_cert( + &mut self, + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, + is_default: bool, + ) -> anyhow::Result<()> { + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let first_cert = &cert_chain[0]; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + let pem = x509_parser::parse_x509_certificate(first_cert) + .context("Failed to parse PEM object from cerficiate")? + .1; + + let common_name = pem.subject().to_string(); + + // We need to get the canonical name for this certificate so we can match them against any domain names + // seen within the proxy codebase. + // + // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. + // We need to remove the wildcard prefix for the purposes of certificate selection. + // + // auth-broker does not use SNI and instead uses the Neon-Connection-String header. + // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. + // + // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string + // validation, so let's we can continue with any common-name + let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=") { + s.to_string() + } else { + bail!("Failed to parse common name from certificate") + }; + + let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); + + if is_default { + self.default = Some((cert.clone(), tls_server_end_point)); + } + + self.certs.insert(common_name, (cert, tls_server_end_point)); + + Ok(()) + } + + pub fn get_common_names(&self) -> HashSet { + self.certs.keys().map(|s| s.to_string()).collect() + } +} + +impl rustls::server::ResolvesServerCert for CertResolver { + fn resolve( + &self, + client_hello: rustls::server::ClientHello<'_>, + ) -> Option> { + self.resolve(client_hello.server_name()).map(|x| x.0) + } +} + +impl CertResolver { + pub fn resolve( + &self, + server_name: Option<&str>, + ) -> Option<(Arc, TlsServerEndPoint)> { + // loop here and cut off more and more subdomains until we find + // a match to get a proper wildcard support. OTOH, we now do not + // use nested domains, so keep this simple for now. + // + // With the current coding foo.com will match *.foo.com and that + // repeats behavior of the old code. + if let Some(mut sni_name) = server_name { + loop { + if let Some(cert) = self.certs.get(sni_name) { + return Some(cert.clone()); + } + if let Some((_, rest)) = sni_name.split_once('.') { + sni_name = rest; + } else { + return None; + } + } + } else { + // No SNI, use the default certificate, otherwise we can't get to + // options parameter which can be used to set endpoint name too. + // That means that non-SNI flow will not work for CNAME domains in + // verify-full mode. + // + // If that will be a problem we can: + // + // a) Instead of multi-cert approach use single cert with extra + // domains listed in Subject Alternative Name (SAN). + // b) Deploy separate proxy instances for extra domains. 
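// Illustrative sketch, not from the patch itself: computing the RFC 5929
// tls-server-end-point channel binding the same way TlsServerEndPoint::new does
// above, i.e. SHA-256 over the raw DER bytes of the leaf certificate.
use sha2::{Digest, Sha256};

fn tls_server_end_point(leaf_cert_der: &[u8]) -> [u8; 32] {
    // The binding hashes the certificate exactly as sent on the wire (DER),
    // not a PEM re-encoding and not the whole chain.
    Sha256::new().chain_update(leaf_cert_der).finalize().into()
}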
+ self.default.clone() + } + } +} diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 270cd7c24da0..d73a84057ae3 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -50,6 +50,7 @@ impl std::fmt::Display for ApiUrl { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 65e74466f2ec..487504d709ed 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -407,6 +407,7 @@ async fn upload_backup_events( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::fs; use std::io::BufReader; diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0422c46ab10c..3ebb7097f200 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -9,6 +9,7 @@ default = [] # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints"] +benchmarking = [] [dependencies] async-stream.workspace = true @@ -55,6 +56,7 @@ postgres_ffi.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true +safekeeper_client.workspace = true sha2.workspace = true sd-notify.workspace = true storage_broker.workspace = true @@ -76,3 +78,4 @@ tracing-subscriber = { workspace = true, features = ["json"] } [[bench]] name = "receive_wal" harness = false +required-features = ["benchmarking"] diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 313d945b942f..996c4d9b8c87 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -1,11 +1,7 @@ //! WAL ingestion benchmarks. -#[path = "benchutils.rs"] -mod benchutils; - use std::io::Write as _; -use benchutils::Env; use bytes::BytesMut; use camino_tempfile::tempfile; use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; @@ -16,6 +12,7 @@ use safekeeper::receive_wal::{self, WalAcceptor}; use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, }; +use safekeeper::test_utils::Env; use tokio::io::AsyncWriteExt as _; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -76,12 +73,15 @@ fn bench_process_msg(c: &mut Criterion) { assert!(size >= prefixlen); let message = vec![0; size - prefixlen]; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), Lsn(0)); // Set up the Safekeeper. let env = Env::new(fsync)?; - let mut safekeeper = - runtime.block_on(env.make_safekeeper(NodeId(1), TenantTimelineId::generate()))?; + let mut safekeeper = runtime.block_on(env.make_safekeeper( + NodeId(1), + TenantTimelineId::generate(), + Lsn(0), + ))?; b.iter_batched_ref( // Pre-construct WAL records and requests. Criterion will batch them. @@ -134,7 +134,8 @@ fn bench_wal_acceptor(c: &mut Criterion) { let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded let env = Env::new(fsync)?; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message")); + let walgen = + &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message"), Lsn(0)); // Create buffered channels that can fit all requests, to avoid blocking on channels. 
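// Illustrative sketch, not from the patch itself: the subdomain-stripping lookup that
// CertResolver::resolve performs above. The map is assumed to be keyed by the
// wildcard base domain, e.g. "foo.example.com" for a "*.foo.example.com" certificate.
use std::collections::HashMap;

fn resolve_by_sni<'a, V>(certs: &'a HashMap<String, V>, mut sni: &str) -> Option<&'a V> {
    loop {
        if let Some(cert) = certs.get(sni) {
            return Some(cert);
        }
        // "ep-1.region.foo.example.com" -> "region.foo.example.com" -> "foo.example.com" -> ...
        match sni.split_once('.') {
            Some((_, rest)) => sni = rest,
            None => return None,
        }
    }
}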
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(n); @@ -145,7 +146,7 @@ fn bench_wal_acceptor(c: &mut Criterion) { // TODO: WalAcceptor doesn't actually need a full timeline, only // Safekeeper::process_msg(). Consider decoupling them to simplify the setup. let tli = env - .make_timeline(NodeId(1), TenantTimelineId::generate()) + .make_timeline(NodeId(1), TenantTimelineId::generate(), Lsn(0)) .await? .wal_residence_guard() .await?; @@ -239,7 +240,7 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { assert!(size >= prefixlen); let message = vec![0; size - prefixlen]; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), Lsn(0)); // Construct and spawn the WalAcceptor task. let env = Env::new(fsync)?; @@ -249,7 +250,7 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { runtime.block_on(async { let tli = env - .make_timeline(NodeId(1), TenantTimelineId::generate()) + .make_timeline(NodeId(1), TenantTimelineId::generate(), Lsn(0)) .await? .wal_residence_guard() .await?; diff --git a/safekeeper/client/Cargo.toml b/safekeeper/client/Cargo.toml new file mode 100644 index 000000000000..6c5a52de3acf --- /dev/null +++ b/safekeeper/client/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "safekeeper_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +safekeeper_api.workspace = true +thiserror.workspace = true +reqwest = { workspace = true, features = [ "stream" ] } +serde.workspace = true +utils.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/safekeeper/client/src/lib.rs b/safekeeper/client/src/lib.rs new file mode 100644 index 000000000000..3963fd466cc8 --- /dev/null +++ b/safekeeper/client/src/lib.rs @@ -0,0 +1 @@ +pub mod mgmt_api; diff --git a/safekeeper/src/http/client.rs b/safekeeper/client/src/mgmt_api.rs similarity index 95% rename from safekeeper/src/http/client.rs rename to safekeeper/client/src/mgmt_api.rs index a166fc1ab9b0..f78745043a35 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -2,12 +2,9 @@ //! //! Partially copied from pageserver client; some parts might be better to be //! united. -//! -//! It would be also good to move it out to separate crate, but this needs -//! duplication of internal-but-reported structs like WalSenderState, ServerInfo -//! etc. use reqwest::{IntoUrl, Method, StatusCode}; +use safekeeper_api::models::TimelineStatus; use std::error::Error as _; use utils::{ http::error::HttpErrorBody, @@ -15,8 +12,6 @@ use utils::{ logging::SecretString, }; -use super::routes::TimelineStatus; - #[derive(Debug, Clone)] pub struct Client { mgmt_api_endpoint: String, diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 13f6e3457589..e0ba38d638e6 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,10 +51,12 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). -#[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +// TODO: disabled because concurrent CPU profiles cause seg faults. See: +// https://github.com/neondatabase/neon/issues/10225. 
+//#[allow(non_upper_case_globals)] +//#[export_name = "malloc_conf"] +//pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index a4b4670e423b..dd152fd4cce8 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,11 +1,12 @@ //! Code to deal with safekeeper control file upgrades use crate::{ - safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, + safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}, state::{EvictionState, PersistedPeers, TimelinePersistentState}, wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::*; use utils::{ diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 93011eddec07..19362a0992d4 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -14,6 +14,7 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; use postgres_ffi::MAX_SEND_SIZE; +use safekeeper_api::models::WalSenderState; use serde::Deserialize; use serde::Serialize; @@ -25,7 +26,6 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use crate::safekeeper::TermHistory; -use crate::send_wal::WalSenderState; use crate::state::TimelineMemState; use crate::state::TimelinePersistentState; use crate::timeline::get_timeline_dir; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 2ca6333ba835..bb639bfb3221 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -4,6 +4,8 @@ use anyhow::Context; use pageserver_api::models::ShardParameters; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; +use safekeeper_api::models::ConnectionId; +use safekeeper_api::Term; use std::future::Future; use std::str::{self, FromStr}; use std::sync::Arc; @@ -16,9 +18,7 @@ use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE}; -use crate::safekeeper::Term; use crate::timeline::TimelineError; -use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; use postgres_backend::PostgresBackend; use postgres_backend::QueryError; diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 7229ccb7390b..d82a713f8a93 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,4 +1,3 @@ -pub mod client; pub mod routes; pub use routes::make_router; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 71c36f1d4631..6186f4c3ba05 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,5 +1,9 @@ use hyper::{Body, Request, Response, StatusCode}; -use serde::{Deserialize, Serialize}; +use safekeeper_api::models::AcceptorStateStatus; +use safekeeper_api::models::SafekeeperStatus; +use safekeeper_api::models::TermSwitchApiEntry; +use safekeeper_api::models::TimelineStatus; +use safekeeper_api::ServerInfo; use std::collections::HashMap; use std::fmt; use std::io::Write as _; @@ -31,26 +35,17 @@ use utils::{ request::{ensure_no_body, parse_request_param}, RequestExt, RouterBuilder, }, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, }; use 
crate::debug_dump::TimelineDigestRequest; -use crate::receive_wal::WalReceiverState; -use crate::safekeeper::Term; -use crate::safekeeper::{ServerInfo, TermLsn}; -use crate::send_wal::WalSenderState; -use crate::timeline::PeerInfo; +use crate::safekeeper::TermLsn; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; use crate::SafeKeeperConf; use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; -#[derive(Debug, Serialize)] -struct SafekeeperStatus { - id: NodeId, -} - /// Healthcheck handler. async fn status_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -73,50 +68,6 @@ fn get_global_timelines(request: &Request) -> Arc { .clone() } -/// Same as TermLsn, but serializes LSN using display serializer -/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct TermSwitchApiEntry { - pub term: Term, - pub lsn: Lsn, -} - -impl From for TermLsn { - fn from(api_val: TermSwitchApiEntry) -> Self { - TermLsn { - term: api_val.term, - lsn: api_val.lsn, - } - } -} - -/// Augment AcceptorState with last_log_term for convenience -#[derive(Debug, Serialize, Deserialize)] -pub struct AcceptorStateStatus { - pub term: Term, - pub epoch: Term, // aka last_log_term - pub term_history: Vec, -} - -/// Info about timeline on safekeeper ready for reporting. -#[derive(Debug, Serialize, Deserialize)] -pub struct TimelineStatus { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub acceptor_state: AcceptorStateStatus, - pub pg_info: ServerInfo, - pub flush_lsn: Lsn, - pub timeline_start_lsn: Lsn, - pub local_start_lsn: Lsn, - pub commit_lsn: Lsn, - pub backup_lsn: Lsn, - pub peer_horizon_lsn: Lsn, - pub remote_consistent_lsn: Lsn, - pub peers: Vec, - pub walsenders: Vec, - pub walreceivers: Vec, -} - fn check_permission(request: &Request, tenant_id: Option) -> Result<(), ApiError> { check_permission_with(request, |claims| { crate::auth::check_permission(claims, tenant_id) @@ -187,6 +138,15 @@ async fn timeline_list_handler(request: Request) -> Result, json_response(StatusCode::OK, res) } +impl From for TermLsn { + fn from(api_val: TermSwitchApiEntry) -> Self { + TermLsn { + term: api_val.term, + lsn: api_val.lsn, + } + } +} + /// Report info about timeline. 
async fn timeline_status_handler(request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( @@ -604,7 +564,7 @@ pub fn make_router( if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { const ALLOWLIST_ROUTES: &[&str] = - &["/v1/status", "/metrics", "/profile/cpu", "profile/heap"]; + &["/v1/status", "/metrics", "/profile/cpu", "/profile/heap"]; if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index dc4ad3706e6c..256e350ceba5 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -8,16 +8,17 @@ use anyhow::Context; use postgres_backend::QueryError; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::*; use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; +use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; use crate::safekeeper::{ AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, }; -use crate::safekeeper::{Term, TermHistory, TermLsn}; +use crate::safekeeper::{TermHistory, TermLsn}; use crate::state::TimelinePersistentState; use crate::timeline::WalResidentTimeline; use postgres_backend::PostgresBackend; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index abe6e00a665b..7acf355e6a71 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -43,6 +43,9 @@ pub mod wal_reader_stream; pub mod wal_service; pub mod wal_storage; +#[cfg(any(test, feature = "benchmarking"))] +pub mod test_utils; + mod timelines_global_map; use std::sync::Arc; pub use timelines_global_map::GlobalTimelines; diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f58a9dca1dbc..f2d8e4c85fd7 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -4,6 +4,9 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use safekeeper_api::{models::TimelineStatus, Term}; +use safekeeper_client::mgmt_api; +use safekeeper_client::mgmt_api::Client; use serde::{Deserialize, Serialize}; use std::{ cmp::min, @@ -21,11 +24,6 @@ use tracing::{error, info, instrument}; use crate::{ control_file::CONTROL_FILE_NAME, debug_dump, - http::{ - client::{self, Client}, - routes::TimelineStatus, - }, - safekeeper::Term, state::{EvictionState, TimelinePersistentState}, timeline::{Timeline, WalResidentTimeline}, timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, @@ -422,7 +420,7 @@ pub async fn handle_request( let http_hosts = request.http_hosts.clone(); // Figure out statuses of potential donors. 
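// Illustrative sketch, not from the patch itself: why the receive_wal bench declares
// `required-features = ["benchmarking"]`. A bench is a separate crate, so the
// library's `cfg(test)` is not set when the bench is compiled; the extra cargo
// feature is what makes the shared helpers visible to benches while release builds
// compile them out entirely. The inline module body here is a hypothetical stand-in
// for the real safekeeper::test_utils.
#[cfg(any(test, feature = "benchmarking"))]
pub mod test_utils {
    /// Hypothetical helper standing in for the real Env type.
    pub struct Env {
        pub fsync: bool,
    }
}

// In benches/receive_wal.rs (built only with `cargo bench --features benchmarking`):
// use safekeeper::test_utils::Env;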
- let responses: Vec> = + let responses: Vec> = futures::future::join_all(http_hosts.iter().map(|url| async { let cclient = Client::new(url.clone(), sk_auth_token.clone()); let info = cclient diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 2a49890d618f..3e9ce1da8eb8 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -9,9 +9,7 @@ use crate::metrics::{ }; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; -use crate::safekeeper::ServerInfo; use crate::timeline::WalResidentTimeline; -use crate::wal_service::ConnectionId; use crate::GlobalTimelines; use anyhow::{anyhow, Context}; use bytes::BytesMut; @@ -23,8 +21,8 @@ use postgres_backend::PostgresBackend; use postgres_backend::PostgresBackendReader; use postgres_backend::QueryError; use pq_proto::BeMessage; -use serde::Deserialize; -use serde::Serialize; +use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; +use safekeeper_api::ServerInfo; use std::future; use std::net::SocketAddr; use std::sync::Arc; @@ -171,21 +169,6 @@ impl WalReceiversShared { } } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WalReceiverState { - /// None means it is recovery initiated by us (this safekeeper). - pub conn_id: Option, - pub status: WalReceiverStatus, -} - -/// Walreceiver status. Currently only whether it passed voting stage and -/// started receiving the stream, but it is easy to add more if needed. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub enum WalReceiverStatus { - Voting, - Streaming, -} - /// Scope guard to access slot in WalReceivers registry and unregister from /// it in Drop. pub struct WalReceiverGuard { @@ -335,7 +318,7 @@ struct NetworkReader<'a, IO> { global_timelines: Arc, } -impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { +impl NetworkReader<'_, IO> { async fn read_first_message( &mut self, ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 7b87166aa052..61647c16b00a 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -7,6 +7,8 @@ use std::{fmt, pin::pin}; use anyhow::{bail, Context}; use futures::StreamExt; use postgres_protocol::message::backend::ReplicationMessage; +use safekeeper_api::models::{PeerInfo, TimelineStatus}; +use safekeeper_api::Term; use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::time::timeout; use tokio::{ @@ -24,13 +26,11 @@ use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; use crate::safekeeper::{AppendRequest, AppendRequestHeader}; use crate::timeline::WalResidentTimeline; use crate::{ - http::routes::TimelineStatus, receive_wal::MSG_QUEUE_SIZE, safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory, - TermLsn, VoteRequest, + AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, TermHistory, TermLsn, + VoteRequest, }, - timeline::PeerInfo, SafeKeeperConf, }; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 6eb69f0b7ce2..6ceaf325b049 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -5,6 +5,9 @@ use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; +use safekeeper_api::models::HotStandbyFeedback; +use safekeeper_api::Term; +use safekeeper_api::INVALID_TERM; use serde::{Deserialize, Serialize}; 
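// Illustrative sketch, not from the patch itself: the fan-out used in
// pull_timeline::handle_request above, where every candidate donor is queried
// concurrently and each host keeps its own Result, so one unreachable safekeeper
// does not fail the whole request. `fetch` is a placeholder for the mgmt_api call.
use futures::future::join_all;
use std::future::Future;

async fn gather_statuses<T, E, F, Fut>(hosts: &[String], fetch: F) -> Vec<Result<T, E>>
where
    F: Fn(&str) -> Fut,
    Fut: Future<Output = Result<T, E>>,
{
    // One future per host, driven concurrently; results come back in host order.
    join_all(hosts.iter().map(|h| fetch(h))).await
}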
use std::cmp::max; use std::cmp::min; @@ -16,7 +19,6 @@ use tracing::*; use crate::control_file; use crate::metrics::MISC_OPERATION_SECONDS; -use crate::send_wal::HotStandbyFeedback; use crate::state::TimelineState; use crate::wal_storage; @@ -31,10 +33,6 @@ use utils::{ const SK_PROTOCOL_VERSION: u32 = 2; pub const UNKNOWN_SERVER_VERSION: u32 = 0; -/// Consensus logical timestamp. -pub type Term = u64; -pub const INVALID_TERM: Term = 0; - #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] pub struct TermLsn { pub term: Term, @@ -127,10 +125,7 @@ impl TermHistory { ); last_common_idx = Some(i); } - let last_common_idx = match last_common_idx { - None => return None, // no common point - Some(lci) => lci, - }; + let last_common_idx = last_common_idx?; // Now find where it ends at both prop and sk and take min. End of // (common) term is the start of the next except it is the last one; // there it is flush_lsn in case of safekeeper or, in case of proposer @@ -198,16 +193,6 @@ impl AcceptorState { } } -/// Information about Postgres. Safekeeper gets it once and then verifies -/// all further connections from computes match. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ServerInfo { - /// Postgres server version - pub pg_version: u32, - pub system_id: SystemId, - pub wal_seg_size: u32, -} - #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct PersistedPeerInfo { /// LSN up to which safekeeper offloaded WAL to s3. @@ -1041,6 +1026,7 @@ where mod tests { use futures::future::BoxFuture; use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; + use safekeeper_api::ServerInfo; use super::*; use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 25890304221e..7d215176dd91 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -94,9 +94,14 @@ impl InterpretedWalSender<'_, IO> { } } + let max_next_record_lsn = match max_next_record_lsn { + Some(lsn) => lsn, + None => { continue; } + }; + let batch = InterpretedWalRecords { records, - next_record_lsn: max_next_record_lsn + next_record_lsn: Some(max_next_record_lsn), }; tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap(); diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 0887cf726418..84632219984a 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -4,11 +4,10 @@ use crate::handler::SafekeeperPostgresHandler; use crate::metrics::RECEIVED_PS_FEEDBACKS; use crate::receive_wal::WalReceivers; -use crate::safekeeper::{Term, TermLsn}; +use crate::safekeeper::TermLsn; use crate::send_interpreted_wal::InterpretedWalSender; use crate::timeline::WalResidentTimeline; use crate::wal_reader_stream::WalReaderStreamBuilder; -use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use anyhow::{bail, Context as AnyhowContext}; use bytes::Bytes; @@ -19,7 +18,11 @@ use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; -use serde::{Deserialize, Serialize}; +use safekeeper_api::models::{ + ConnectionId, HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, + WalSenderState, INVALID_FULL_TRANSACTION_ID, +}; +use safekeeper_api::Term; use 
tokio::io::{AsyncRead, AsyncWrite}; use utils::failpoint_support; use utils::id::TenantTimelineId; @@ -28,7 +31,6 @@ use utils::postgres_client::PostgresClientProtocol; use std::cmp::{max, min}; use std::net::SocketAddr; -use std::str; use std::sync::Arc; use std::time::Duration; use tokio::sync::watch::Receiver; @@ -42,65 +44,6 @@ const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r'; // neon extension of replication protocol const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; -type FullTransactionId = u64; - -/// Hot standby feedback received from replica -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct HotStandbyFeedback { - pub ts: TimestampTz, - pub xmin: FullTransactionId, - pub catalog_xmin: FullTransactionId, -} - -const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0; - -impl HotStandbyFeedback { - pub fn empty() -> HotStandbyFeedback { - HotStandbyFeedback { - ts: 0, - xmin: 0, - catalog_xmin: 0, - } - } -} - -/// Standby status update -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct StandbyReply { - pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby. - pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby. - pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby. - pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01. - pub reply_requested: bool, -} - -impl StandbyReply { - fn empty() -> Self { - StandbyReply { - write_lsn: Lsn::INVALID, - flush_lsn: Lsn::INVALID, - apply_lsn: Lsn::INVALID, - reply_ts: 0, - reply_requested: false, - } - } -} - -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct StandbyFeedback { - pub reply: StandbyReply, - pub hs_feedback: HotStandbyFeedback, -} - -impl StandbyFeedback { - pub fn empty() -> Self { - StandbyFeedback { - reply: StandbyReply::empty(), - hs_feedback: HotStandbyFeedback::empty(), - } - } -} - /// WalSenders registry. Timeline holds it (wrapped in Arc). pub struct WalSenders { mutex: Mutex, @@ -341,25 +284,6 @@ impl WalSendersShared { } } -// Serialized is used only for pretty printing in json. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WalSenderState { - ttid: TenantTimelineId, - addr: SocketAddr, - conn_id: ConnectionId, - // postgres application_name - appname: Option, - feedback: ReplicationFeedback, -} - -// Receiver is either pageserver or regular standby, which have different -// feedbacks. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -enum ReplicationFeedback { - Pageserver(PageserverFeedback), - Standby(StandbyFeedback), -} - // id of the occupied slot in WalSenders to access it (and save in the // WalSenderGuard). 
We could give Arc directly to the slot, but there is not // much sense in that as values aggregation which is performed on each feedback @@ -888,6 +812,7 @@ impl ReplyReader { #[cfg(test)] mod tests { + use safekeeper_api::models::FullTransactionId; use utils::id::{TenantId, TimelineId}; use super::*; diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 941b7e67d0a9..c6ae6c1d2b0e 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -5,7 +5,7 @@ use std::{cmp::max, ops::Deref}; use anyhow::{bail, Result}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::TimelineTermBumpResponse; +use safekeeper_api::{models::TimelineTermBumpResponse, ServerInfo, Term}; use serde::{Deserialize, Serialize}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -14,10 +14,7 @@ use utils::{ use crate::{ control_file, - safekeeper::{ - AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory, - UNKNOWN_SERVER_VERSION, - }, + safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION}, timeline::TimelineError, wal_backup_partial::{self}, }; diff --git a/safekeeper/benches/benchutils.rs b/safekeeper/src/test_utils.rs similarity index 78% rename from safekeeper/benches/benchutils.rs rename to safekeeper/src/test_utils.rs index 48d796221b43..c40a8bae5a7d 100644 --- a/safekeeper/benches/benchutils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,18 +1,18 @@ use std::sync::Arc; +use crate::rate_limit::RateLimiter; +use crate::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; +use crate::state::{TimelinePersistentState, TimelineState}; +use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; +use crate::timelines_set::TimelinesSet; +use crate::wal_backup::remote_timeline_path; +use crate::{control_file, wal_storage, SafeKeeperConf}; use camino_tempfile::Utf8TempDir; -use safekeeper::rate_limit::RateLimiter; -use safekeeper::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; -use safekeeper::state::{TimelinePersistentState, TimelineState}; -use safekeeper::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; -use safekeeper::timelines_set::TimelinesSet; -use safekeeper::wal_backup::remote_timeline_path; -use safekeeper::{control_file, wal_storage, SafeKeeperConf}; use tokio::fs::create_dir_all; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; -/// A Safekeeper benchmarking environment. Uses a tempdir for storage, removed on drop. +/// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. pub struct Env { /// Whether to enable fsync. pub fsync: bool, @@ -21,7 +21,7 @@ pub struct Env { } impl Env { - /// Creates a new benchmarking environment in a temporary directory. fsync controls whether to + /// Creates a new test or benchmarking environment in a temporary directory. fsync controls whether to /// enable fsyncing. 
pub fn new(fsync: bool) -> anyhow::Result { let tempdir = camino_tempfile::tempdir()?; @@ -47,6 +47,7 @@ impl Env { &self, node_id: NodeId, ttid: TenantTimelineId, + start_lsn: Lsn, ) -> anyhow::Result> { let conf = self.make_conf(node_id); @@ -67,9 +68,9 @@ impl Env { safekeeper .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected { term: 1, - start_streaming_at: Lsn(0), - term_history: TermHistory(vec![(1, Lsn(0)).into()]), - timeline_start_lsn: Lsn(0), + start_streaming_at: start_lsn, + term_history: TermHistory(vec![(1, start_lsn).into()]), + timeline_start_lsn: start_lsn, })) .await?; @@ -82,12 +83,13 @@ impl Env { &self, node_id: NodeId, ttid: TenantTimelineId, + start_lsn: Lsn, ) -> anyhow::Result> { let conf = Arc::new(self.make_conf(node_id)); let timeline_dir = get_timeline_dir(&conf, &ttid); let remote_path = remote_timeline_path(&ttid)?; - let safekeeper = self.make_safekeeper(node_id, ttid).await?; + let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?; let shared_state = SharedState::new(StateSK::Loaded(safekeeper)); let timeline = Timeline::new( diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 94d6ef106160..36860a0da2b4 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,8 +4,8 @@ use anyhow::{anyhow, bail, Result}; use camino::{Utf8Path, Utf8PathBuf}; use remote_storage::RemotePath; -use safekeeper_api::models::TimelineTermBumpResponse; -use serde::{Deserialize, Serialize}; +use safekeeper_api::models::{PeerInfo, TimelineTermBumpResponse}; +use safekeeper_api::Term; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; use utils::id::TenantId; @@ -31,9 +31,7 @@ use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use crate::control_file; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; -use crate::safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, Term, TermLsn, -}; +use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; use crate::send_wal::WalSenders; use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; use crate::timeline_guard::ResidenceGuard; @@ -47,40 +45,17 @@ use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::SafeKeeperConf; use crate::{debug_dump, timeline_manager, wal_storage}; -/// Things safekeeper should know about timeline state on peers. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct PeerInfo { - pub sk_id: NodeId, - pub term: Term, - /// Term of the last entry. - pub last_log_term: Term, - /// LSN of the last record. - pub flush_lsn: Lsn, - pub commit_lsn: Lsn, - /// Since which LSN safekeeper has WAL. - pub local_start_lsn: Lsn, - /// When info was received. Serde annotations are not very useful but make - /// the code compile -- we don't rely on this field externally. 
- #[serde(skip)] - #[serde(default = "Instant::now")] - ts: Instant, - pub pg_connstr: String, - pub http_connstr: String, -} - -impl PeerInfo { - fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { - PeerInfo { - sk_id: NodeId(sk_info.safekeeper_id), - term: sk_info.term, - last_log_term: sk_info.last_log_term, - flush_lsn: Lsn(sk_info.flush_lsn), - commit_lsn: Lsn(sk_info.commit_lsn), - local_start_lsn: Lsn(sk_info.local_start_lsn), - pg_connstr: sk_info.safekeeper_connstr.clone(), - http_connstr: sk_info.http_connstr.clone(), - ts, - } +fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { + PeerInfo { + sk_id: NodeId(sk_info.safekeeper_id), + term: sk_info.term, + last_log_term: sk_info.last_log_term, + flush_lsn: Lsn(sk_info.flush_lsn), + commit_lsn: Lsn(sk_info.commit_lsn), + local_start_lsn: Lsn(sk_info.local_start_lsn), + pg_connstr: sk_info.safekeeper_connstr.clone(), + http_connstr: sk_info.http_connstr.clone(), + ts, } } @@ -697,7 +672,7 @@ impl Timeline { { let mut shared_state = self.write_shared_state().await; shared_state.sk.record_safekeeper_info(&sk_info).await?; - let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); + let peer_info = peer_info_from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); } Ok(()) diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index c02fb904cf63..a33994dcabaa 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -14,6 +14,7 @@ use std::{ use futures::channel::oneshot; use postgres_ffi::XLogSegNo; +use safekeeper_api::{models::PeerInfo, Term}; use serde::{Deserialize, Serialize}; use tokio::{ task::{JoinError, JoinHandle}, @@ -32,10 +33,9 @@ use crate::{ rate_limit::{rand_duration, RateLimiter}, recovery::recovery_main, remove_wal::calc_horizon_lsn, - safekeeper::Term, send_wal::WalSenders, state::TimelineState, - timeline::{ManagerTimeline, PeerInfo, ReadGuardSharedState, StateSK, WalResidentTimeline}, + timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}, timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index e1241ceb9b84..ad29c9f66c2c 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -4,7 +4,6 @@ use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; use crate::rate_limit::RateLimiter; -use crate::safekeeper::ServerInfo; use crate::state::TimelinePersistentState; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; @@ -13,6 +12,7 @@ use crate::{control_file, wal_storage, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; +use safekeeper_api::ServerInfo; use serde::Serialize; use std::collections::HashMap; use std::str::FromStr; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 34b5dbeaa1cf..8517fa03443c 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -3,6 +3,7 @@ use anyhow::{Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; use futures::stream::FuturesOrdered; use futures::StreamExt; +use safekeeper_api::models::PeerInfo; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use utils::backoff; @@ 
-30,7 +31,7 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; -use crate::timeline::{PeerInfo, WalResidentTimeline}; +use crate::timeline::WalResidentTimeline; use crate::timeline_manager::{Manager, StateSnapshot}; use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index bddfca50e4fb..4e5b34a9bf65 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -22,6 +22,7 @@ use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; +use safekeeper_api::Term; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; @@ -31,7 +32,6 @@ use utils::{id::NodeId, lsn::Lsn}; use crate::{ metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, rate_limit::{rand_duration, RateLimiter}, - safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, wal_backup::{self}, diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index f8c0c502cdbc..aea628c20808 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -4,12 +4,12 @@ use async_stream::try_stream; use bytes::Bytes; use futures::Stream; use postgres_backend::CopyStreamHandlerEnd; +use safekeeper_api::Term; use std::time::Duration; use tokio::time::timeout; use utils::lsn::Lsn; use crate::{ - safekeeper::Term, send_wal::{EndWatch, WalSenderGuard}, timeline::WalResidentTimeline, }; diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ff83918a76c..1ebcb060e776 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,6 +4,7 @@ //! use anyhow::{Context, Result}; use postgres_backend::QueryError; +use safekeeper_api::models::ConnectionId; use std::sync::Arc; use std::time::Duration; use tokio::net::TcpStream; @@ -114,8 +115,6 @@ async fn handle_socket( .await } -/// Unique WAL service connection ids are logged in spans for observability. 
-pub type ConnectionId = u32; pub type ConnectionCount = u32; pub fn issue_connection_id(count: &mut ConnectionCount) -> ConnectionId { diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 12aa02577185..efcdd89e7da7 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -15,12 +15,13 @@ use desim::{ }; use http::Uri; use safekeeper::{ - safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION}, + safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION}, state::{TimelinePersistentState, TimelineState}, timeline::TimelineError, wal_storage::Storage, SafeKeeperConf, }; +use safekeeper_api::ServerInfo; use tracing::{debug, info_span, warn}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index aefb3919a1b3..7dc7f485487b 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -18,7 +18,7 @@ impl DiskWalProposer { internal_available_lsn: Lsn(0), prev_lsn: Lsn(0), disk: BlockStorage::new(), - wal_generator: WalGenerator::new(LogicalMessageGenerator::new(c"", &[])), + wal_generator: WalGenerator::new(LogicalMessageGenerator::new(c"", &[]), Lsn(0)), }), }) } diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 2b2ece3f0271..69db48f8d18c 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::error::Error as _; use std::sync::Arc; use std::{collections::HashMap, time::Duration}; @@ -6,6 +7,7 @@ use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use futures::StreamExt; use hyper::StatusCode; +use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; @@ -28,6 +30,9 @@ struct UnshardedComputeHookTenant { // Which node is this tenant attached to node_id: NodeId, + // The tenant's preferred AZ, so that we may pass this on to the control plane + preferred_az: Option, + // Must hold this lock to send a notification. send_lock: Arc>>, } @@ -36,6 +41,9 @@ struct ShardedComputeHookTenant { shard_count: ShardCount, shards: Vec<(ShardNumber, NodeId)>, + // The tenant's preferred AZ, so that we may pass this on to the control plane + preferred_az: Option, + // Must hold this lock to send a notification. The contents represent // the last successfully sent notification, and are used to coalesce multiple // updates by only sending when there is a chance since our last successful send. 
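// Illustrative sketch, not from the patch itself: the two unwrap-free Option idioms
// used in the safekeeper hunks above — `?` to propagate "no common point" from a
// function returning Option, and a match/continue to skip batches that produced no
// records instead of unwrapping a missing next_record_lsn.
fn last_common(points: &[(u64, u64)]) -> Option<u64> {
    let mut last = None;
    for (i, (a, b)) in points.iter().enumerate() {
        if a == b {
            last = Some(i);
        }
    }
    // Early-return None if the loop never found a match.
    let idx = last?;
    Some(points[idx].0)
}

fn drain_batches(batches: &[Option<u64>]) {
    for batch in batches {
        // Skip empty batches rather than unwrapping.
        let lsn = match batch {
            Some(lsn) => lsn,
            None => continue,
        };
        println!("flush up to {lsn}");
    }
}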
@@ -64,17 +72,24 @@ enum ComputeHookTenant { impl ComputeHookTenant { /// Construct with at least one shard's information - fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self { + fn new( + tenant_shard_id: TenantShardId, + stripe_size: ShardStripeSize, + preferred_az: Option, + node_id: NodeId, + ) -> Self { if tenant_shard_id.shard_count.count() > 1 { Self::Sharded(ShardedComputeHookTenant { shards: vec![(tenant_shard_id.shard_number, node_id)], stripe_size, shard_count: tenant_shard_id.shard_count, + preferred_az, send_lock: Arc::default(), }) } else { Self::Unsharded(UnshardedComputeHookTenant { node_id, + preferred_az, send_lock: Arc::default(), }) } @@ -120,15 +135,20 @@ impl ComputeHookTenant { /// Set one shard's location. If stripe size or shard count have changed, Self is reset /// and drops existing content. - fn update( - &mut self, - tenant_shard_id: TenantShardId, - stripe_size: ShardStripeSize, - node_id: NodeId, - ) { + fn update(&mut self, shard_update: ShardUpdate) { + let tenant_shard_id = shard_update.tenant_shard_id; + let node_id = shard_update.node_id; + let stripe_size = shard_update.stripe_size; + let preferred_az = shard_update.preferred_az; + match self { Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => { - unsharded_tenant.node_id = node_id + unsharded_tenant.node_id = node_id; + if unsharded_tenant.preferred_az.as_ref() + != preferred_az.as_ref().map(|az| az.as_ref()) + { + unsharded_tenant.preferred_az = preferred_az.map(|az| az.as_ref().clone()); + } } Self::Sharded(sharded_tenant) if sharded_tenant.stripe_size == stripe_size @@ -146,10 +166,21 @@ impl ComputeHookTenant { .push((tenant_shard_id.shard_number, node_id)); sharded_tenant.shards.sort_by_key(|s| s.0) } + + if sharded_tenant.preferred_az.as_ref() + != preferred_az.as_ref().map(|az| az.as_ref()) + { + sharded_tenant.preferred_az = preferred_az.map(|az| az.as_ref().clone()); + } } _ => { // Shard count changed: reset struct. - *self = Self::new(tenant_shard_id, stripe_size, node_id); + *self = Self::new( + tenant_shard_id, + stripe_size, + preferred_az.map(|az| az.into_owned()), + node_id, + ); } } } @@ -165,6 +196,7 @@ struct ComputeHookNotifyRequestShard { #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] struct ComputeHookNotifyRequest { tenant_id: TenantId, + preferred_az: Option, stripe_size: Option, shards: Vec, } @@ -238,6 +270,10 @@ impl ComputeHookTenant { node_id: unsharded_tenant.node_id, }], stripe_size: None, + preferred_az: unsharded_tenant + .preferred_az + .as_ref() + .map(|az| az.0.clone()), }), Self::Sharded(sharded_tenant) if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => @@ -253,6 +289,7 @@ impl ComputeHookTenant { }) .collect(), stripe_size: Some(sharded_tenant.stripe_size), + preferred_az: sharded_tenant.preferred_az.as_ref().map(|az| az.0.clone()), }) } Self::Sharded(sharded_tenant) => { @@ -313,6 +350,17 @@ pub(super) struct ComputeHook { client: reqwest::Client, } +/// Callers may give us a list of these when asking us to send a bulk batch +/// of notifications in the background. This is a 'notification' in the sense of +/// other code notifying us of a shard's status, rather than being the final notification +/// that we send upwards to the control plane for the whole tenant. 
+pub(crate) struct ShardUpdate<'a> { + pub(crate) tenant_shard_id: TenantShardId, + pub(crate) node_id: NodeId, + pub(crate) stripe_size: ShardStripeSize, + pub(crate) preferred_az: Option>, +} + impl ComputeHook { pub(super) fn new(config: Config) -> Self { let authorization_header = config @@ -363,6 +411,7 @@ impl ComputeHook { tenant_id, shards, stripe_size, + preferred_az: _preferred_az, } = reconfigure_request; let compute_pageservers = shards @@ -503,24 +552,30 @@ impl ComputeHook { } /// Synchronous phase: update the per-tenant state for the next intended notification - fn notify_prepare( - &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - stripe_size: ShardStripeSize, - ) -> MaybeSendResult { + fn notify_prepare(&self, shard_update: ShardUpdate) -> MaybeSendResult { let mut state_locked = self.state.lock().unwrap(); use std::collections::hash_map::Entry; + let tenant_shard_id = shard_update.tenant_shard_id; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), + Entry::Vacant(e) => { + let ShardUpdate { + tenant_shard_id, + node_id, + stripe_size, + preferred_az, + } = shard_update; + e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + preferred_az.map(|az| az.into_owned()), + node_id, + )) + } Entry::Occupied(e) => { let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); + tenant.update(shard_update); tenant } }; @@ -608,13 +663,14 @@ impl ComputeHook { /// if something failed. pub(super) fn notify_background( self: &Arc, - notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, + notifications: Vec, result_tx: tokio::sync::mpsc::Sender>, cancel: &CancellationToken, ) { let mut maybe_sends = Vec::new(); - for (tenant_shard_id, node_id, stripe_size) in notifications { - let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + for shard_update in notifications { + let tenant_shard_id = shard_update.tenant_shard_id; + let maybe_send_result = self.notify_prepare(shard_update); maybe_sends.push((tenant_shard_id, maybe_send_result)) } @@ -678,15 +734,14 @@ impl ComputeHook { /// periods, but we don't retry forever. The **caller** is responsible for handling failures and /// ensuring that they eventually call again to ensure that the compute is eventually notified of /// the proper pageserver nodes for a tenant. - #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify( + #[tracing::instrument(skip_all, fields(tenant_id=%shard_update.tenant_shard_id.tenant_id, shard_id=%shard_update.tenant_shard_id.shard_slug(), node_id))] + pub(super) async fn notify<'a>( &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - stripe_size: ShardStripeSize, + shard_update: ShardUpdate<'a>, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + let tenant_shard_id = shard_update.tenant_shard_id; + let maybe_send_result = self.notify_prepare(shard_update); self.notify_execute(maybe_send_result, tenant_shard_id, cancel) .await } @@ -739,6 +794,7 @@ pub(crate) mod tests { shard_number: ShardNumber(0), }, ShardStripeSize(12345), + None, NodeId(1), ); @@ -765,30 +821,32 @@ pub(crate) mod tests { // Writing the first shard of a multi-sharded situation (i.e. 
in a split) // resets the tenant state and puts it in an non-notifying state (need to // see all shards) - tenant_state.update( - TenantShardId { + tenant_state.update(ShardUpdate { + tenant_shard_id: TenantShardId { tenant_id, shard_count: ShardCount::new(2), shard_number: ShardNumber(1), }, - ShardStripeSize(32768), - NodeId(1), - ); + stripe_size: ShardStripeSize(32768), + preferred_az: None, + node_id: NodeId(1), + }); assert!(matches!( tenant_state.maybe_send(tenant_id, None), MaybeSendResult::Noop )); // Writing the second shard makes it ready to notify - tenant_state.update( - TenantShardId { + tenant_state.update(ShardUpdate { + tenant_shard_id: TenantShardId { tenant_id, shard_count: ShardCount::new(2), shard_number: ShardNumber(0), }, - ShardStripeSize(32768), - NodeId(1), - ); + stripe_size: ShardStripeSize(32768), + preferred_az: None, + node_id: NodeId(1), + }); let send_result = tenant_state.maybe_send(tenant_id, None); let MaybeSendResult::Transmit((request, mut guard)) = send_result else { diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index e17fe78d2584..cc377e606ee8 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -11,6 +11,7 @@ use diesel::Connection; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; +use pageserver_api::controller_api::SafekeeperDescribeResponse; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; @@ -1241,6 +1242,18 @@ impl SafekeeperPersistence { availability_zone_id: &self.availability_zone_id, } } + pub(crate) fn as_describe_response(&self) -> SafekeeperDescribeResponse { + // omit the `active` flag on purpose: it is deprecated. + SafekeeperDescribeResponse { + id: NodeId(self.id as u64), + region_id: self.region_id.clone(), + version: self.version, + host: self.host.clone(), + port: self.port, + http_port: self.http_port, + availability_zone_id: self.availability_zone_id.clone(), + } + } } #[derive(Insertable, AsChangeset)] diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 3ad386a95b57..475f91eff48d 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,13 +1,14 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; -use crate::service; -use pageserver_api::controller_api::PlacementPolicy; +use crate::{compute_hook, service}; +use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; use reqwest::StatusCode; +use std::borrow::Cow; use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -45,6 +46,7 @@ pub(super) struct Reconciler { pub(crate) reconciler_config: ReconcilerConfig, pub(crate) config: TenantConfig, + pub(crate) preferred_az: Option, /// Observed state from the point of view of the reconciler. /// This gets updated as the reconciliation makes progress. 
@@ -834,9 +836,12 @@ impl Reconciler { let result = self .compute_hook .notify( - self.tenant_shard_id, - node.get_id(), - self.shard.stripe_size, + compute_hook::ShardUpdate { + tenant_shard_id: self.tenant_shard_id, + node_id: node.get_id(), + stripe_size: self.shard.stripe_size, + preferred_az: self.preferred_az.as_ref().map(Cow::Borrowed), + }, &self.cancel, ) .await; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 746177c08978..c0c5bc371aed 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -18,7 +18,7 @@ use crate::{ background_node_operations::{ Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, }, - compute_hook::NotifyError, + compute_hook::{self, NotifyError}, drain_utils::{self, TenantShardDrain, TenantShardIterator}, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, leadership::Leadership, @@ -46,10 +46,11 @@ use pageserver_api::{ controller_api::{ AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, - ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, - TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, - TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, - TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, + SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, }, models::{ SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest, @@ -656,11 +657,14 @@ impl Service { // emit a compute notification for this. In the case where our observed state does not // yet match our intent, we will eventually reconcile, and that will emit a compute notification. 
if let Some(attached_at) = tenant_shard.stably_attached() { - compute_notifications.push(( - *tenant_shard_id, - attached_at, - tenant_shard.shard.stripe_size, - )); + compute_notifications.push(compute_hook::ShardUpdate { + tenant_shard_id: *tenant_shard_id, + node_id: attached_at, + stripe_size: tenant_shard.shard.stripe_size, + preferred_az: tenant_shard + .preferred_az() + .map(|az| Cow::Owned(az.clone())), + }); } } } @@ -4786,7 +4790,15 @@ impl Service { for (child_id, child_ps, stripe_size) in child_locations { if let Err(e) = self .compute_hook - .notify(child_id, child_ps, stripe_size, &self.cancel) + .notify( + compute_hook::ShardUpdate { + tenant_shard_id: child_id, + node_id: child_ps, + stripe_size, + preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed), + }, + &self.cancel, + ) .await { tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", @@ -6873,10 +6885,7 @@ impl Service { let mut plan = Vec::new(); for (node_id, attached) in nodes_by_load { - let available = locked - .nodes - .get(&node_id) - .map_or(false, |n| n.is_available()); + let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available()); if !available { continue; } @@ -7161,15 +7170,24 @@ impl Service { pub(crate) async fn safekeepers_list( &self, - ) -> Result, DatabaseError> { - self.persistence.list_safekeepers().await + ) -> Result, DatabaseError> { + Ok(self + .persistence + .list_safekeepers() + .await? + .into_iter() + .map(|v| v.as_describe_response()) + .collect::>()) } pub(crate) async fn get_safekeeper( &self, id: i64, - ) -> Result { - self.persistence.safekeeper_get(id).await + ) -> Result { + self.persistence + .safekeeper_get(id) + .await + .map(|v| v.as_describe_response()) } pub(crate) async fn upsert_safekeeper( diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index f1b921646f44..cba579e8a749 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1198,6 +1198,7 @@ impl TenantShard { detach, reconciler_config, config: self.config.clone(), + preferred_az: self.preferred_az_id.clone(), observed: self.observed.clone(), original_observed: self.observed.clone(), compute_hook: compute_hook.clone(), diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index f759f54d19b7..32c86052efaa 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -310,7 +310,7 @@ pub(crate) enum BlobDataParseResult { index_part_generation: Generation, s3_layers: HashSet<(LayerName, Generation)>, }, - /// The remains of a deleted Timeline (i.e. an initdb archive only) + /// The remains of an uncleanly deleted Timeline or aborted timeline creation(e.g. an initdb archive only, or some layer without an index) Relic, Incorrect { errors: Vec, @@ -346,7 +346,7 @@ pub(crate) async fn list_timeline_blobs( match res { ListTimelineBlobsResult::Ready(data) => Ok(data), ListTimelineBlobsResult::MissingIndexPart(_) => { - // Retry if index is missing. + // Retry if listing raced with removal of an index let data = list_timeline_blobs_impl(remote_client, id, root_target) .await? .into_data(); @@ -358,7 +358,7 @@ pub(crate) async fn list_timeline_blobs( enum ListTimelineBlobsResult { /// Blob data is ready to be intepreted. Ready(RemoteTimelineBlobData), - /// List timeline blobs has layer files but is missing [`IndexPart`]. 
+ /// The listing contained an index but when we tried to fetch it, we couldn't MissingIndexPart(RemoteTimelineBlobData), } @@ -467,19 +467,19 @@ async fn list_timeline_blobs_impl( match index_part_object.as_ref() { Some(selected) => index_part_keys.retain(|k| k != selected), None => { - // It is possible that the branch gets deleted after we got some layer files listed - // and we no longer have the index file in the listing. - errors.push( + // This case does not indicate corruption, but it should be very unusual. It can + // happen if: + // - timeline creation is in progress (first layer is written before index is written) + // - timeline deletion happened while a stale pageserver was still attached, it might upload + // a layer after the deletion is done. + tracing::info!( "S3 list response got no index_part.json file but still has layer files" - .to_string(), ); - return Ok(ListTimelineBlobsResult::MissingIndexPart( - RemoteTimelineBlobData { - blob_data: BlobDataParseResult::Incorrect { errors, s3_layers }, - unused_index_keys: index_part_keys, - unknown_keys, - }, - )); + return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData { + blob_data: BlobDataParseResult::Relic, + unused_index_keys: index_part_keys, + unknown_keys, + })); } } diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 887bfef478b8..9e32469d69cc 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -8,6 +8,7 @@ "fixtures.compute_reconfigure", "fixtures.storage_controller_proxy", "fixtures.paths", + "fixtures.compute_migrations", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/compute_migrations.py b/test_runner/fixtures/compute_migrations.py new file mode 100644 index 000000000000..ea99785af092 --- /dev/null +++ b/test_runner/fixtures/compute_migrations.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +import os +from typing import TYPE_CHECKING + +import pytest + +from fixtures.paths import BASE_DIR + +if TYPE_CHECKING: + from collections.abc import Iterator + from pathlib import Path + +COMPUTE_MIGRATIONS_DIR = BASE_DIR / "compute_tools" / "src" / "migrations" +COMPUTE_MIGRATIONS_TEST_DIR = COMPUTE_MIGRATIONS_DIR / "tests" + +COMPUTE_MIGRATIONS = sorted(next(os.walk(COMPUTE_MIGRATIONS_DIR))[2]) +NUM_COMPUTE_MIGRATIONS = len(COMPUTE_MIGRATIONS) + + +@pytest.fixture(scope="session") +def compute_migrations_dir() -> Iterator[Path]: + """ + Retrieve the path to the compute migrations directory. + """ + yield COMPUTE_MIGRATIONS_DIR + + +@pytest.fixture(scope="session") +def compute_migrations_test_dir() -> Iterator[Path]: + """ + Retrieve the path to the compute migrations test directory. 
+ """ + yield COMPUTE_MIGRATIONS_TEST_DIR diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 1cd9158c688c..aa0d95fe802a 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -55,3 +55,17 @@ def metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") res.raise_for_status() return res.text + + def configure_failpoints(self, *args: tuple[str, str]) -> None: + body: list[dict[str, str]] = [] + + for fp in args: + body.append( + { + "name": fp[0], + "action": fp[1], + } + ) + + res = self.post(f"http://localhost:{self.port}/failpoints", json=body) + res.raise_for_status() diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index a85a1914553e..adbd6414a7eb 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -522,14 +522,15 @@ def endpoint_start( safekeepers: list[int] | None = None, remote_ext_config: str | None = None, pageserver_id: int | None = None, - allow_multiple=False, + allow_multiple: bool = False, basebackup_request_tries: int | None = None, + env: dict[str, str] | None = None, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", "start", ] - extra_env_vars = {} + extra_env_vars = env or {} if basebackup_request_tries is not None: extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) if remote_ext_config is not None: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 13ada1361e63..a0c642163d9d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -54,6 +54,7 @@ TimelineArchivalState, TimelineId, ) +from fixtures.compute_migrations import NUM_COMPUTE_MIGRATIONS from fixtures.endpoint.http import EndpointHttpClient from fixtures.h2server import H2Server from fixtures.log_helper import log @@ -134,6 +135,9 @@ BASE_PORT: int = 15000 +# By default we create pageservers with this phony AZ +DEFAULT_AZ_ID: str = "us-east-2a" + @pytest.fixture(scope="session") def neon_api_key() -> str: @@ -1093,7 +1097,7 @@ def __init__(self, config: NeonEnvBuilder): "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` - "availability_zone": "us-east-2a", + "availability_zone": DEFAULT_AZ_ID, # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, @@ -3219,7 +3223,6 @@ def extra_args(self) -> list[str]: # Link auth backend params *["--auth-backend", "link"], *["--uri", NeonProxy.link_auth_uri], - *["--allow-self-signed-compute", "true"], ] class ProxyV1(AuthBackend): @@ -3853,6 +3856,7 @@ def start( safekeepers: list[int] | None = None, allow_multiple: bool = False, basebackup_request_tries: int | None = None, + env: dict[str, str] | None = None, ) -> Self: """ Start the Postgres instance. 
@@ -3873,6 +3877,7 @@ def start( pageserver_id=pageserver_id, allow_multiple=allow_multiple, basebackup_request_tries=basebackup_request_tries, + env=env, ) self._running.release(1) self.log_config_value("shared_buffers") @@ -3986,14 +3991,17 @@ def respec_deep(self, **kwargs: Any) -> None: log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) json.dump(data_dict, file, indent=4) - # Please note: Migrations only run if pg_skip_catalog_updates is false - def wait_for_migrations(self, num_migrations: int = 11): + def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None: + """ + Wait for all compute migrations to run. Remember that migrations only + run if "pg_skip_catalog_updates" is set to false in the compute spec. + """ with self.cursor() as cur: def check_migrations_done(): cur.execute("SELECT id FROM neon_migration.migration_id") migration_id: int = cur.fetchall()[0][0] - assert migration_id >= num_migrations + assert migration_id >= wait_for wait_until(check_migrations_done) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index eabdeb10539c..378e56862252 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -738,6 +738,18 @@ def timeline_offload( res_json = res.json() assert res_json is None + def timeline_compact_info( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + ) -> Any: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_compact( self, tenant_id: TenantId | TenantShardId, @@ -749,7 +761,6 @@ def timeline_compact( enhanced_gc_bottom_most_compaction=False, body: dict[str, Any] | None = None, ): - self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 80777d65e9b0..fc4fb3629bb0 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -21,8 +21,8 @@ BASE_DIR = Path(__file__).parents[2] -COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc" DEFAULT_OUTPUT_DIR: str = "test_output" +COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc" def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | None = None) -> Path: diff --git a/test_runner/performance/many_relations/create_many_relations.sql b/test_runner/performance/many_relations/create_many_relations.sql new file mode 100644 index 000000000000..1b3673c9e185 --- /dev/null +++ b/test_runner/performance/many_relations/create_many_relations.sql @@ -0,0 +1,199 @@ +-- create a schema that simulates the Neon control plane operations table +-- however use partitioned operations tables with many (e.g.
500) child partition tables per table +-- in summary we create multiple of these partitioned operations tables (with 500 childs each) - until we reach the requested number of tables + + +-- first we need some other tables that can be referenced by the operations table + +-- Table for branches +CREATE TABLE public.branches ( + id text PRIMARY KEY +); + +-- Table for endpoints +CREATE TABLE public.endpoints ( + id text PRIMARY KEY +); + +-- Table for projects +CREATE TABLE public.projects ( + id text PRIMARY KEY +); + +INSERT INTO public.branches (id) +VALUES ('branch_1'); + +-- Insert one row into endpoints +INSERT INTO public.endpoints (id) +VALUES ('endpoint_1'); + +-- Insert one row into projects +INSERT INTO public.projects (id) +VALUES ('project_1'); + +-- now we create a procedure that can create n operations tables +-- we do that in a procedure to save roundtrip latency when scaling the test to many tables +-- prefix is the base table name, e.g. 'operations_scale_1000' if we create 1000 tables +CREATE OR REPLACE PROCEDURE create_partitioned_tables(prefix text, n INT) +LANGUAGE plpgsql AS $$ +DECLARE + table_name TEXT; -- Variable to hold table names dynamically + i INT; -- Counter for the loop +BEGIN + -- Loop to create n partitioned tables + FOR i IN 1..n LOOP + table_name := format('%s_%s', prefix, i); + + -- Create the partitioned table + EXECUTE format( + 'CREATE TABLE public.%s ( + project_id character varying NOT NULL, + id uuid NOT NULL, + status integer, + action character varying NOT NULL, + error character varying, + created_at timestamp with time zone NOT NULL DEFAULT now(), + updated_at timestamp with time zone NOT NULL DEFAULT now(), + spec jsonb, + retry_at timestamp with time zone, + failures_count integer DEFAULT 0, + metadata jsonb NOT NULL DEFAULT ''{}''::jsonb, + executor_id text NOT NULL, + attempt_duration_ms integer, + metrics jsonb DEFAULT ''{}''::jsonb, + branch_id text, + endpoint_id text, + next_operation_id uuid, + compute_id text, + connection_attempt_at timestamp with time zone, + concurrency_key text, + queue_id text, + CONSTRAINT %s_pkey PRIMARY KEY (id, created_at), + CONSTRAINT %s_branch_id_fk FOREIGN KEY (branch_id) REFERENCES branches(id) ON DELETE CASCADE, + CONSTRAINT %s_endpoint_id_fk FOREIGN KEY (endpoint_id) REFERENCES endpoints(id) ON DELETE CASCADE, + CONSTRAINT %s_next_operation_id_fk FOREIGN KEY (next_operation_id, created_at) REFERENCES %s(id, created_at), + CONSTRAINT %s_project_id_fk FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE + ) PARTITION BY RANGE (created_at)', + table_name, table_name, table_name, table_name, table_name, table_name, table_name + ); + + -- Add indexes for the partitioned table + EXECUTE format('CREATE INDEX index_%s_on_next_operation_id ON public.%s (next_operation_id)', table_name, table_name); + EXECUTE format('CREATE INDEX index_%s_on_project_id ON public.%s (project_id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_branch_id ON public.%s (branch_id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_branch_id_created_idx ON public.%s (branch_id, created_at)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_created_at_idx ON public.%s (created_at)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_created_at_project_id_id_cond_idx ON public.%s (created_at, project_id, id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_endpoint_id ON public.%s (endpoint_id)', table_name, table_name); + EXECUTE format( + 'CREATE INDEX 
%s_for_redo_worker_idx ON public.%s (executor_id) WHERE status <> 1', + table_name, table_name + ); + EXECUTE format( + 'CREATE INDEX %s_project_id_status_index ON public.%s ((project_id::text), status)', + table_name, table_name + ); + EXECUTE format( + 'CREATE INDEX %s_status_not_finished ON public.%s (status) WHERE status <> 1', + table_name, table_name + ); + EXECUTE format('CREATE INDEX %s_updated_at_desc_idx ON public.%s (updated_at DESC)', table_name, table_name); + EXECUTE format( + 'CREATE INDEX %s_with_failures ON public.%s (failures_count) WHERE failures_count > 0', + table_name, table_name + ); + END LOOP; +END; +$$; + +-- next we create a procedure that can add the child partitions (one per day) to each of the operations tables +CREATE OR REPLACE PROCEDURE create_operations_partitions( + table_name TEXT, + start_date DATE, + end_date DATE +) +LANGUAGE plpgsql AS $$ +DECLARE + partition_date DATE; + partition_name TEXT; + counter INT := 0; -- Counter to track the number of tables created in the current transaction +BEGIN + partition_date := start_date; + + -- Create partitions in batches + WHILE partition_date < end_date LOOP + partition_name := format('%s_%s', table_name, to_char(partition_date,'YYYY_MM_DD')); + + EXECUTE format( + 'CREATE TABLE IF NOT EXISTS public.%s PARTITION OF public.%s + FOR VALUES FROM (''%s'') TO (''%s'')', + partition_name, + table_name, + partition_date, + partition_date + INTERVAL '1 day' + ); + + counter := counter + 1; + + -- Commit and reset counter after every 100 partitions + IF counter >= 100 THEN + COMMIT; + counter := 0; -- Reset the counter + END IF; + + -- Advance to the next day + partition_date := partition_date + INTERVAL '1 day'; + END LOOP; + + -- Final commit for remaining partitions + IF counter > 0 THEN + COMMIT; + END IF; + + -- Insert synthetic rows into each partition + EXECUTE format( + 'INSERT INTO %I ( + project_id, + branch_id, + endpoint_id, + id, + status, + action, + created_at, + updated_at, + spec, + metadata, + executor_id, + failures_count + ) + SELECT + ''project_1'', -- project_id + ''branch_1'', -- branch_id + ''endpoint_1'', -- endpoint_id + ''e8bba687-0df9-4291-bfcd-7d5f6aa7c158'', -- unique id + 1, -- status + ''SYNTHETIC_ACTION'', -- action + gs::timestamp + interval ''0 ms'', -- created_at + gs::timestamp + interval ''1 minute'', -- updated_at + ''{"key": "value"}'', -- spec (JSONB) + ''{"metadata_key": "metadata_value"}'', -- metadata (JSONB) + ''executor_1'', -- executor_id + 0 -- failures_count + FROM generate_series(%L, %L::DATE - INTERVAL ''1 day'', INTERVAL ''1 day'') AS gs', + table_name, start_date, end_date + ); + + -- Commit the inserted rows + COMMIT; +END; +$$; + +-- we can now create partitioned tables using something like +-- CALL create_partitioned_tables('operations_scale_1000' ,10); + +-- and we can create the child partitions for a table using something like +-- CALL create_operations_partitions( +-- 'operations_scale_1000_1', +-- '2000-01-01', -- Start date +-- ('2000-01-01'::DATE + INTERVAL '1 day' * 500)::DATE -- End date (start date + number of days) +-- ); diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 07f244da0cef..acb7b56fd073 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -22,7 +22,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma "checkpoint_distance": f"{1024 ** 2}", "compaction_target_size": f"{1024 ** 2}", # set PITR 
interval to be small, so we can do GC - "pitr_interval": "60 s", + "pitr_interval": "10 s", # "compaction_threshold": "3", # "image_creation_threshold": "2", } @@ -32,6 +32,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma n_steps = 10 n_update_iters = 100 step_size = 10000 + branch_created = 0 with endpoint.cursor() as cur: cur.execute("SET statement_timeout='1000s'") cur.execute( @@ -66,6 +67,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma if mode == "with_snapshots": if step == n_steps / 2: env.create_branch("child") + branch_created += 1 max_num_of_deltas_above_image = 0 max_total_num_of_deltas = 0 @@ -142,6 +144,15 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma with layer_map_path.open("w") as f: f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) + # We should have collected all garbage + if mode == "normal": + # in theory we should get physical size ~= logical size, but given that gc interval is 10s, + # and the layer has indexes that might contribute to the fluctuation, we allow a small margin + # of 1 here, and the end ratio we are asserting is 1 (margin) + 1 (expected) = 2. + assert physical_size / logical_size < 2 + elif mode == "with_snapshots": + assert physical_size / logical_size < (2 + branch_created) + @pytest.mark.timeout(10000) def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py new file mode 100644 index 000000000000..0ee0efe8b9c1 --- /dev/null +++ b/test_runner/performance/test_perf_many_relations.py @@ -0,0 +1,66 @@ +import os +from pathlib import Path + +import pytest +from fixtures.compare_fixtures import RemoteCompare +from fixtures.log_helper import log + + +def get_num_relations(default: int = 1000) -> list[int]: + # We parametrize each run with scale specifying the number of wanted child partitions. + # Databases are pre-created and passed through BENCHMARK_CONNSTR env variable. + scales = os.getenv("TEST_NUM_RELATIONS", default=str(default)) + rv = [] + for s in scales.split(","): + scale = int(s) + rv.append(scale) + return rv + + +@pytest.mark.parametrize("num_relations", get_num_relations()) +@pytest.mark.remote_cluster +def test_perf_many_relations(remote_compare: RemoteCompare, num_relations: int): + """ + Test creating many relations in a single database. + We use partitioned tables with child tables, indexes and constraints to have a realistic schema. + Also we include some common data types like text, uuid, timestamp, JSONB, etc. 
+ + see many_relations/create_many_relations.sql + """ + env = remote_compare + + # prepare some base tables and the plpgsql procedures that we use to create the tables + sql_file = Path(__file__).parent / "many_relations" / "create_many_relations.sql" + env.pg_bin.run_capture(["psql", env.pg.connstr(), "-f", str(sql_file)]) + + num_parent_tables = num_relations // 500 + 1 + log.info(f"Creating {num_relations} relations in {num_parent_tables} parent tables") + + log.info(f"Creating {num_parent_tables} parent tables") + sql = f"CALL create_partitioned_tables('operations_scale_{num_relations}', {num_parent_tables})" + log.info(sql) + env.pg_bin.run_capture(["psql", env.pg.connstr(), "-c", sql]) + + current_table = 0 + num_relations_remaining = num_relations + + # now run and measure the actual relation creation + while num_relations_remaining > 0: + current_table += 1 + parent_table_name = f"operations_scale_{num_relations}_{current_table}" + if num_relations_remaining > 500: + num_relations_to_create = 500 + else: + num_relations_to_create = num_relations_remaining + num_relations_remaining -= num_relations_to_create + log.info( + f"Creating {num_relations_to_create} child tables in partitioned parent table '{parent_table_name}'" + ) + sql = f"CALL create_operations_partitions( '{parent_table_name}', '2000-01-01', ('2000-01-01'::DATE + INTERVAL '1 day' * {num_relations_to_create})::DATE)" + log.info(sql) + with env.zenbenchmark.record_duration( + f"CREATE_TABLE/{current_table}/{num_relations_to_create}" + ): + env.pg_bin.run_capture( + ["psql", env.pg.connstr(options="-cstatement_timeout=1000s "), "-c", sql] + ) diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index 10027ce6891b..2ae38e6d8887 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -1,18 +1,19 @@ from __future__ import annotations import os -import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv -from fixtures.utils import query_scalar +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import query_scalar, wait_until # # Test compute node start after clog truncation # -def test_clog_truncate(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_clog_truncate(neon_env_builder: NeonEnvBuilder): + # Use a multi-sharded tenant because WAL ingest logic is shard-dependent, and + # this test is one of the very few that exercises a CLogTruncate WAL record. + env = neon_env_builder.init_start(initial_tenant_shard_count=2) # set aggressive autovacuum to make sure that truncation will happen config = [ @@ -31,6 +32,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): endpoint.safe_psql("CREATE EXTENSION neon_test_utils") # Consume many xids to advance clog + log.info("Consuming xids...") with endpoint.cursor() as cur: cur.execute("select test_consume_xids(1000*1000*10);") log.info("xids consumed") @@ -47,11 +49,17 @@ def test_clog_truncate(neon_simple_env: NeonEnv): pg_xact_0000_path = os.path.join(endpoint.pg_xact_dir_path(), "0000") log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") - while os.path.isfile(pg_xact_0000_path): - log.info(f"file exists. wait for truncation: {pg_xact_0000_path=}") - time.sleep(5) + def assert_file_removed(): + exists = os.path.isfile(pg_xact_0000_path) + if exists: + log.info(f"file exists. 
wait for truncation: {pg_xact_0000_path=}") + assert not exists + + log.info("Waiting for truncation...") + wait_until(assert_file_removed) # checkpoint to advance latest lsn + log.info("Checkpointing...") with endpoint.cursor() as cur: cur.execute("CHECKPOINT;") lsn_after_truncation = query_scalar(cur, "select pg_current_wal_insert_lsn()") diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 88873c63c24c..ae48a8fc27d0 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -134,6 +134,10 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): } env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + env.pageserver.allowed_errors.append( + r".*failed to acquire partition lock during gc-compaction.*" + ) + env.pageserver.allowed_errors.append(r".*repartition() called concurrently.*") tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -172,6 +176,12 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): workload.churn_rows(row_count, env.pageserver.id) + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + wait_until(compaction_finished, timeout=60) + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) env.pageserver.assert_log_contains( "scheduled_compact_timeline.*picked .* layers for compaction" diff --git a/test_runner/regress/test_compute_migrations.py b/test_runner/regress/test_compute_migrations.py new file mode 100644 index 000000000000..803702a6f8ac --- /dev/null +++ b/test_runner/regress/test_compute_migrations.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, cast + +import pytest +from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_dir: Path): + """ + Test that compute_ctl can recover from migration failures next time it + starts, and that the persisted migration ID is correct in such cases. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) + + for i in range(1, NUM_COMPUTE_MIGRATIONS + 1): + endpoint.start(env={"FAILPOINTS": f"compute-migration=return({i})"}) + + # Make sure that the migrations ran + endpoint.wait_for_migrations(wait_for=i - 1) + + # Confirm that we correctly recorded that in the + # neon_migration.migration_id table + with endpoint.cursor() as cur: + cur.execute("SELECT id FROM neon_migration.migration_id") + migration_id = cast("int", cur.fetchall()[0][0]) + assert migration_id == i - 1 + + endpoint.stop() + + endpoint.start() + + # Now wait for the rest of the migrations + endpoint.wait_for_migrations() + + with endpoint.cursor() as cur: + cur.execute("SELECT id FROM neon_migration.migration_id") + migration_id = cast("int", cur.fetchall()[0][0]) + assert migration_id == NUM_COMPUTE_MIGRATIONS + + for i, m in enumerate(COMPUTE_MIGRATIONS, start=1): + migration_query = (compute_migrations_dir / m).read_text(encoding="utf-8") + if not migration_query.startswith("-- SKIP"): + pattern = rf"Skipping migration id={i}" + else: + pattern = rf"Running migration id={i}" + + endpoint.log_contains(pattern) + + +@pytest.mark.parametrize( + "migration", + (pytest.param((i, m), id=str(i)) for i, m in enumerate(COMPUTE_MIGRATIONS, start=1)), +) +def test_compute_migrations_e2e( + neon_simple_env: NeonEnv, + compute_migrations_dir: Path, + compute_migrations_test_dir: Path, + migration: tuple[int, str], +): + """ + Test that the migrations perform as advertised. + """ + env = neon_simple_env + + migration_id = migration[0] + migration_filename = migration[1] + + migration_query = (compute_migrations_dir / migration_filename).read_text(encoding="utf-8") + if migration_query.startswith("-- SKIP"): + pytest.skip("The migration is marked as SKIP") + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) + + # Stop applying migrations after the one we want to test, so that we can + # test the state of the cluster at the given migration ID + endpoint.start(env={"FAILPOINTS": f"compute-migration=return({migration_id + 1})"}) + + endpoint.wait_for_migrations(wait_for=migration_id) + + check_query = (compute_migrations_test_dir / migration_filename).read_text(encoding="utf-8") + endpoint.safe_psql(check_query) diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index de44bbcbc895..b10e38885e42 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -60,14 +60,12 @@ def ddl_forward_handler( if request.json is None: log.info("Received invalid JSON") return Response(status=400) - json = request.json + json: dict[str, list[str]] = request.json # Handle roles first - if "roles" in json: - for operation in json["roles"]: - handle_role(dbs, roles, operation) - if "dbs" in json: - for operation in json["dbs"]: - handle_db(dbs, roles, operation) + for operation in json.get("roles", []): + handle_role(dbs, roles, operation) + for operation in json.get("dbs", []): + handle_db(dbs, roles, operation) return Response(status=200) @@ -207,6 +205,23 @@ def test_ddl_forwarding(ddl: DdlForwardingContext): ddl.wait() assert ddl.roles == {} + cur.execute("CREATE ROLE bork WITH PASSWORD 'newyork'") + cur.execute("BEGIN") + cur.execute("SAVEPOINT point") + cur.execute("DROP ROLE bork") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {} + + cur.execute("CREATE ROLE bork WITH PASSWORD 
'oldyork'") + cur.execute("BEGIN") + cur.execute("SAVEPOINT point") + cur.execute("ALTER ROLE bork PASSWORD NULL") + cur.execute("COMMIT") + cur.execute("DROP ROLE bork") + ddl.wait() + assert ddl.roles == {} + cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'") cur.execute("CREATE DATABASE stork WITH OWNER=bork") cur.execute("ALTER ROLE bork RENAME TO cork") diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 29229b73c156..6ea2393a9d81 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -84,6 +84,8 @@ def handler(request: Request) -> Response: elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD: target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2 elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS: + # Postgres uses a 1GiB segment size, fixed at compile time, so we must use >2GB of data + # to exercise multiple segments. target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192) else: raise ValueError @@ -111,9 +113,15 @@ def handler(request: Request) -> Response: def validate_vanilla_equivalence(ep): # TODO: would be nicer to just compare pgdump - assert ep.safe_psql("select count(*), sum(data::bigint)::bigint from t") == [ - (expect_nrows, expect_sum) - ] + + # Enable IO concurrency for batching on large sequential scan, to avoid making + # this test unnecessarily onerous on CPU + assert ep.safe_psql_many( + [ + "set effective_io_concurrency=32;", + "select count(*), sum(data::bigint)::bigint from t", + ] + ) == [[], [(expect_nrows, expect_sum)]] validate_vanilla_equivalence(vanilla_pg) diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py deleted file mode 100644 index 7211619a9906..000000000000 --- a/test_runner/regress/test_migrations.py +++ /dev/null @@ -1,33 +0,0 @@ -from __future__ import annotations - -import time -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from fixtures.neon_fixtures import NeonEnv - - -def test_migrations(neon_simple_env: NeonEnv): - env = neon_simple_env - - endpoint = env.endpoints.create("main") - endpoint.respec(skip_pg_catalog_updates=False) - endpoint.start() - - num_migrations = 11 - endpoint.wait_for_migrations(num_migrations=num_migrations) - - with endpoint.cursor() as cur: - cur.execute("SELECT id FROM neon_migration.migration_id") - migration_id = cur.fetchall() - assert migration_id[0][0] == num_migrations - - endpoint.stop() - endpoint.start() - # We don't have a good way of knowing that the migrations code path finished executing - # in compute_ctl in the case that no migrations are being run - time.sleep(1) - with endpoint.cursor() as cur: - cur.execute("SELECT id FROM neon_migration.migration_id") - migration_id = cur.fetchall() - assert migration_id[0][0] == num_migrations diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 706da1e35e00..fcc465f90a55 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -22,7 +22,10 @@ async def run_worker_for_tenant( - env: NeonEnv, entries: int, tenant: TenantId, offset: int | None = None + env: NeonEnv, + entries: int, + tenant: TenantId, + offset: int | None = None, ) -> Lsn: if offset is None: offset = 0 @@ -37,12 +40,20 @@ async def run_worker_for_tenant( finally: await conn.close(timeout=10) - last_flush_lsn = Lsn(ep.safe_psql("SELECT 
pg_current_wal_flush_lsn()")[0][0]) + loop = asyncio.get_running_loop() + sql = await loop.run_in_executor( + None, lambda ep: ep.safe_psql("SELECT pg_current_wal_flush_lsn()"), ep + ) + last_flush_lsn = Lsn(sql[0][0]) return last_flush_lsn async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.create_tenant(conf=tenant_conf) + loop = asyncio.get_running_loop() + # capture tenant_conf by specifying `tenant_conf=tenant_conf`, otherwise it will be evaluated to some random value + tenant, timeline = await loop.run_in_executor( + None, lambda tenant_conf, env: env.create_tenant(conf=tenant_conf), tenant_conf, env + ) last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) return tenant, timeline, last_flush_lsn diff --git a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py index ad2d0871b82c..3f9824ee677a 100644 --- a/test_runner/regress/test_physical_and_logical_replicaiton.py +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -2,7 +2,7 @@ import time -from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync, wait_replica_caughtup def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonEnv, vanilla_pg): @@ -38,6 +38,8 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE for pk in range(n_records): p_cur.execute("insert into t (pk) values (%s)", (pk,)) + wait_replica_caughtup(primary, secondary) + s_cur.execute("select count(*) from t") assert s_cur.fetchall()[0][0] == n_records diff --git a/test_runner/regress/test_prefetch_buffer_resize.py b/test_runner/regress/test_prefetch_buffer_resize.py index 7676b78b0e90..99fe80e6218c 100644 --- a/test_runner/regress/test_prefetch_buffer_resize.py +++ b/test_runner/regress/test_prefetch_buffer_resize.py @@ -7,7 +7,6 @@ @pytest.mark.parametrize("shard_count", [None, 4]) -@pytest.mark.timeout(600) def test_prefetch(neon_env_builder: NeonEnvBuilder, shard_count: int | None): if shard_count is not None: neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 743ab0088b3d..4c381b563fe4 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -11,6 +11,7 @@ from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + DEFAULT_AZ_ID, NeonEnv, NeonEnvBuilder, StorageControllerApiException, @@ -793,6 +794,7 @@ def handler(request: Request): "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect @@ -812,6 +814,7 @@ def handler(request: Request): {"node_id": int(env.pageservers[0].id), "shard_number": 0}, {"node_id": int(env.pageservers[0].id), "shard_number": 1}, ], + "preferred_az": DEFAULT_AZ_ID, } log.info(f"Got notification: {notifications[1]}") assert notifications[1] == expect_after diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index ae9b596a1b98..7062c35e05ab 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -16,6 +16,7 @@ from fixtures.compute_reconfigure import ComputeReconfigure from 
fixtures.log_helper import log from fixtures.neon_fixtures import ( + DEFAULT_AZ_ID, NeonEnv, NeonEnvBuilder, NeonPageserver, @@ -599,6 +600,7 @@ def handler(request: Request): "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect @@ -616,6 +618,7 @@ def node_evacuated(node_id: int) -> None: "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } def received_migration_notification(): @@ -643,6 +646,7 @@ def received_restart_notification(): {"node_id": int(env.pageservers[1].id), "shard_number": 0}, {"node_id": int(env.pageservers[1].id), "shard_number": 1}, ], + "preferred_az": DEFAULT_AZ_ID, } def received_split_notification(): @@ -714,6 +718,7 @@ def handler(request: Request): "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect @@ -3004,7 +3009,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] - masked_keys = ["created_at", "updated_at"] + masked_keys = ["created_at", "updated_at", "active"] for d in compared: # keep deleting these in case we are comparing the body as it will be uploaded by real scripts diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index e808dd13966c..9b3a48add90b 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -398,6 +398,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): # Offloading is off by default at time of writing: remove this line when it's on by default neon_env_builder.pageserver_config_override = "timeline_offloading = true" + neon_env_builder.storage_controller_config = {"heartbeat_interval": "100msec"} neon_env_builder.enable_pageserver_remote_storage(s3_storage()) # We will exercise migrations, so need multiple pageservers @@ -426,6 +427,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): [ ".*removing local file.*because it has unexpected length.*", ".*__temp.*", + ".*method=POST path=\\S+/timeline .*: Not activating a Stopping timeline.*", # FIXME: there are still anyhow::Error paths in timeline creation/deletion which # generate 500 results when called during shutdown (https://github.com/neondatabase/neon/issues/9768) ".*InternalServerError.*", @@ -435,6 +437,14 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): ] ) + env.storage_scrubber.allowed_errors.extend( + [ + # Unclean shutdowns of the pageserver can legitimately result in orphan layers + # (https://github.com/neondatabase/neon/issues/9988#issuecomment-2520558211) + f".*Orphan layer detected: tenants/{tenant_id}/.*" + ] + ) + class TimelineState: def __init__(self): self.timeline_id = TimelineId.generate() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 23d4f23cdb84..0a8900b351e4 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1090,6 +1090,62 @@ def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder): endpoint.safe_psql("SELECT 'works'") + +# Test
restarting compute at WAL page boundary. +def test_restart_endpoint_wal_page_boundary(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + ep = env.endpoints.create_start("main") + ep.safe_psql("create table t (i int)") + + with ep.cursor() as cur: + # measure how much space logical message takes. Sometimes first attempt + # creates huge message and then it stabilizes, have no idea why. + for _ in range(3): + lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"current_lsn={lsn_before}") + # Non-transactional logical message doesn't write WAL, only XLogInsert's + # it, so use transactional. Which is a bit problematic as transactional + # necessitates commit record. Alternatively we can do smth like + # select neon_xlogflush(pg_current_wal_insert_lsn()); + # but isn't much better + that particular call complains on 'xlog flush + # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips + # page headers. + payload = "blahblah" + cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')") + lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before + logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload) + log.info( + f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}" + ) + + # and write logical message spanning exactly as we want + lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"current_lsn={lsn_before}") + curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + offs = int(curr_lsn) % 8192 + till_page = 8192 - offs + target_lsn = curr_lsn + till_page + payload_len = ( + till_page - logical_message_base - 8 + ) # not sure why 8 is here, it is deduced from experiments + log.info( + f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}, target_lsn {target_lsn}" + ) + + cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')") + supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"supposedly_page_boundary={supposedly_contrecord_end}") + # The calculations to hit the page boundary are very fuzzy, so just + # ignore test if we fail to reach it. + if not (int(supposedly_contrecord_end) % 8192 == 0): + pytest.skip(f"missed page boundary, bad luck: lsn is {supposedly_contrecord_end}") + + ep.stop(mode="immediate") + ep = env.endpoints.create_start("main") + ep.safe_psql("insert into t values (42)") # should be ok + + # Context manager which logs passed time on exit. 
class DurationLogger: def __init__(self, desc): diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 13ff324150fc..c2f65b320159 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 13ff324150fceaac72920e01742addc053db9462 +Subproject commit c2f65b3201591e02ce45b66731392f98d3388e73 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 8736b10c1d93..f262d631ad47 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8736b10c1d93d11b9c0489872dd529c4c0f5338f +Subproject commit f262d631ad477a1819e84a183e5a7ef561830085 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 81428621f7c0..97f9fde349c6 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 81428621f7c04aed03671cf80a928e0a36d92505 +Subproject commit 97f9fde349c6de6d573f5ce96db07eca60ce6185 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 01fa3c48664c..7e3f3974bc88 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 01fa3c48664ca030cfb69bb4a350aa9df4691d88 +Subproject commit 7e3f3974bc8895938308f94d0e96879ffae638cd diff --git a/vendor/revisions.json b/vendor/revisions.json index 7329aa437f05..bff2f709318a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "01fa3c48664ca030cfb69bb4a350aa9df4691d88" + "7e3f3974bc8895938308f94d0e96879ffae638cd" ], "v16": [ "16.6", - "81428621f7c04aed03671cf80a928e0a36d92505" + "97f9fde349c6de6d573f5ce96db07eca60ce6185" ], "v15": [ "15.10", - "8736b10c1d93d11b9c0489872dd529c4c0f5338f" + "f262d631ad477a1819e84a183e5a7ef561830085" ], "v14": [ "14.15", - "13ff324150fceaac72920e01742addc053db9462" + "c2f65b3201591e02ce45b66731392f98d3388e73" ] }